Add util/export-controlled-vocabularies.py

This script is only used to export the controlled vocabularies from
the schema CSV files. Eventually we will remove them from there and
it won't be needed anymore.
Alan Orth 2022-01-06 12:09:59 +02:00
parent 75076124a5
commit d629a1ab17
1 changed files with 102 additions and 0 deletions

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error-prone.
#
# SPDX-License-Identifier: GPL-3.0-only
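#
# A typical invocation might look like this (the input file name is illustrative,
# taken from the --input-file help text below rather than from a real path):
#
#   $ ./util/export-controlled-vocabularies.py -i iseal-core.csv --clean --debug
#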

import argparse
import os
import re
import sys
from shutil import rmtree

import pandas as pd


def parseSchema(schema_df):
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for index, row in schema_df.iterrows():
        if row["dspace field name"] is not None and row["dspace field name"] != "":
            dspace_field_name = row["dspace field name"]
        else:
            # Skip rows without a DSpace field name, since we can't derive a
            # file name for them.
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from the CSV row if they exist.
        if row["element controlled values or terms"]:
            exportVocabulary(
                row["element controlled values or terms"], element_name_safe
            )


def exportVocabulary(vocabulary: str, element_name_safe: str):
    # Create an empty list where we'll add all the values (we don't need to do
    # it this way, but using a list allows us to de-duplicate the values).
    controlledVocabularyLines = []

    for value in vocabulary.split("||"):
        if value not in controlledVocabularyLines:
            controlledVocabularyLines.append(value)

    with open(f"data/controlled-vocabularies/{element_name_safe}.txt", "w") as f:
        for value in controlledVocabularyLines:
            f.write(f"{value}\n")

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")

parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (i.e., iseal-core.csv).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")

# Make sure the controlled vocabularies directory exists. Hugo processes the
# metadata and controlled vocabularies from the data directory.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

df = pd.read_csv(args.input_file.name)

# Drop columns that are entirely empty and replace remaining NaN values with
# empty strings so the checks in parseSchema() are straightforward.
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)