# iseal-core/util/export-controlled-vocabularies.py

#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error prone.
#
# SPDX-License-Identifier: GPL-3.0-only

import argparse
import os
import re
import sys
from shutil import rmtree

import pandas as pd


def parseSchema(schema_df):
    """Export the controlled vocabulary embedded in each row of the schema.

    For every row that has a DSpace field name, derive a "safe" element name
    from it, print that name, and pass any controlled vocabulary found in the
    row to ``exportVocabulary``.

    Rows whose "dspace field name" cell is empty, whitespace-only, or not a
    string (for example a missing value) are skipped, because no file name can
    be derived for them.

    Args:
        schema_df: DataFrame providing the "dspace field name" and
            "element controlled values or terms" columns.
    """
    # Iterate with the "index, row" form so that columns can be accessed by
    # heading; the index itself is not needed.
    for _, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]

        # Without a usable field name there is nothing to name the output file
        # after, so skip the row instead of failing on a non-string value.
        if not isinstance(dspace_field_name, str) or not dspace_field_name.strip():
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist. The
        # isinstance check keeps missing (non-string) cells from being treated
        # as a vocabulary.
        vocabulary = row["element controlled values or terms"]
        if isinstance(vocabulary, str) and vocabulary:
            exportVocabulary(vocabulary, element_name_safe)


def exportVocabulary(vocabulary: str, element_name_safe: str):
    """Write a "||"-separated controlled vocabulary to a text file.

    The vocabulary is split on "||", surrounding whitespace is removed from
    each term, empty terms are dropped, and duplicates are removed while
    keeping the order of first appearance. The terms are written one per line
    to ``data/controlled-vocabularies/<element_name_safe>.txt`` as UTF-8. The
    target directory must already exist.

    Reads the module-level ``args.debug`` flag to decide whether to print a
    confirmation message.

    Args:
        vocabulary: Controlled vocabulary terms joined with "||".
        element_name_safe: Base name (without extension) of the output file.
    """
    stripped_terms = (term.strip() for term in vocabulary.split("||"))
    # dict.fromkeys de-duplicates in a single pass while preserving the order
    # in which terms first appear; empty terms (for example from a trailing
    # "||") are dropped so they do not become blank lines in the output.
    terms = dict.fromkeys(term for term in stripped_terms if term)

    # Use an explicit encoding so that non-ASCII terms are written the same
    # way regardless of the platform's locale.
    with open(
        f"data/controlled-vocabularies/{element_name_safe}.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.writelines(f"{term}\n" for term in terms)

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")


def _build_argument_parser():
    """Return the command-line parser for this script.

    The parser accepts an optional --clean flag, an optional -d/--debug flag,
    and a required -i/--input-file option that argparse opens for reading.
    """
    cli = argparse.ArgumentParser(
        description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
    )

    # Optional switches, both off unless given on the command line.
    cli.add_argument(
        "--clean",
        action="store_true",
        help="Clean controlled vocabularies directory before exporting.",
    )
    cli.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Print debug messages.",
    )

    # The schema CSV is mandatory; argparse opens it in text mode.
    cli.add_argument(
        "-i",
        "--input-file",
        type=argparse.FileType("r"),
        required=True,
        help="Path to schema fields file (ie, iseal-core.csv).",
    )

    return cli


parser = _build_argument_parser()
args = parser.parse_args()

# Optionally start from an empty output directory.
if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    # ignore_errors=True so that a directory that does not exist yet is not
    # treated as a failure.
    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")

# Make sure the controlled vocabularies directory exists; the exported
# vocabularies are written there (presumably for Hugo to process).
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

input_path = args.input_file.name

if args.debug:
    print(f"Opening {input_path}")

# argparse.FileType has already opened the input file, but pandas reads it by
# path below, so close the unused handle rather than leaking it.
args.input_file.close()

df = pd.read_csv(input_path)
# Drop columns that are entirely empty, then replace the remaining missing
# cells with empty strings so that blank cells are falsy when inspected.
df = df.dropna(how="all", axis=1).fillna("")

parseSchema(df)