diff --git a/util/export-controlled-vocabularies.py b/util/export-controlled-vocabularies.py
new file mode 100755
index 00000000..7e806336
--- /dev/null
+++ b/util/export-controlled-vocabularies.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+#
+# export-controlled-vocabularies.py v0.0.1
+#
+# This is a legacy script used to export controlled vocabularies from a CSV
+# file. Originally we embedded the controlled vocabularies directly inside the
+# CSV, but we eventually decided that this was unwieldy and error-prone.
+#
+# SPDX-License-Identifier: GPL-3.0-only
+
+import argparse
+import os
+from shutil import rmtree
+
+import pandas as pd
+
+
+def parseSchema(schema_df):
+    # Iterate over all rows. iterrows() yields (index, row) pairs, where row is
+    # a Series whose values can be accessed by column heading.
+    for index, row in schema_df.iterrows():
+
+        if row["dspace field name"] is not None and row["dspace field name"] != "":
+            dspace_field_name = row["dspace field name"]
+        else:
+            # Skip rows without a DSpace field name, since we can't derive a
+            # safe element name (or output file name) without one.
+            continue
+
+        # Generate a "safe" version of the element name for use in URLs and
+        # files by using the DSpace field name with dots replaced by dashes.
+        element_name_safe = dspace_field_name.replace(".", "-").lower()
+
+        print(f"element name: {element_name_safe}")
+
+        # Export controlled vocabularies from this row if they exist
+        if row["element controlled values or terms"]:
+            exportVocabulary(
+                row["element controlled values or terms"], element_name_safe
+            )
+
+
+def exportVocabulary(vocabulary: str, element_name_safe: str):
+    # Collect the values in a list, using a membership check so we de-duplicate
+    # them while preserving their original order.
+    controlledVocabularyLines = []
+    for value in vocabulary.split("||"):
+        if value not in controlledVocabularyLines:
+            controlledVocabularyLines.append(value)
+
+    with open(f"data/controlled-vocabularies/{element_name_safe}.txt", "w") as f:
+        for value in controlledVocabularyLines:
+            f.write(f"{value}\n")
+
+    if args.debug:
+        print(f"Exported controlled vocabulary: {element_name_safe}")
+
+
+parser = argparse.ArgumentParser(
+    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
+)
+parser.add_argument(
+    "--clean",
+    help="Clean controlled vocabularies directory before exporting.",
+    action="store_true",
+)
+parser.add_argument(
+    "-d",
+    "--debug",
+    help="Print debug messages.",
+    action="store_true",
+)
+parser.add_argument(
+    "-i",
+    "--input-file",
+    help="Path to schema fields file (i.e., iseal-core.csv).",
+    required=True,
+    type=argparse.FileType("r"),
+)
+args = parser.parse_args()
+
+if args.clean:
+    if args.debug:
+        print("Cleaning controlled vocabularies directory")
+
+    rmtree("data/controlled-vocabularies", ignore_errors=True)
+
+if args.debug:
+    print("Creating controlled vocabularies directory")
+
+# Make sure the controlled vocabularies directory exists so there are
+# controlled vocabularies for Hugo to process.
+os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)
+
+if args.debug:
+    print(f"Opening {args.input_file.name}")
+
+df = pd.read_csv(args.input_file.name)
+df.dropna(how="all", axis=1, inplace=True)
+df.fillna("", inplace=True)
+
+parseSchema(df)
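
As a rough illustration of the per-row transformation this patch performs, here is a minimal standalone sketch. The "dc.subject" field name and the vocabulary cell value are made up for the example; only the "||" delimiter, the dot-to-dash renaming, and the order-preserving de-duplication come from the script itself:

    # Hypothetical row values; the real ones come from the schema CSV.
    dspace_field_name = "dc.subject"
    vocabulary = "Agriculture||Fisheries||Agriculture||Forestry"

    # Same renaming the script uses to build a URL/file-safe element name.
    element_name_safe = dspace_field_name.replace(".", "-").lower()  # "dc-subject"

    # Order-preserving de-duplication, equivalent to the script's list
    # membership check.
    values = list(dict.fromkeys(vocabulary.split("||")))

    # The script would write these values to
    # data/controlled-vocabularies/dc-subject.txt, one per line.
    print(element_name_safe, values)

In practice the script is run against the schema CSV, e.g. ./util/export-controlled-vocabularies.py -i iseal-core.csv --clean -d, where the CSV path is only an example; -i accepts whatever file holds the embedded vocabularies.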