Alan Orth 2022-01-06 12:09:59 +02:00
Alan Orth 2022-01-06 12:09:59 +02:00
@ -0,0 +1,102 @@
#!/usr/bin/env python3
# v0.0.1
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error prone.
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import os
import re
import sys
from shutil import rmtree
import pandas as pd
def parseSchema(schema_df):
    """Export the controlled vocabulary embedded in each row of the schema.

    Walks ``schema_df`` row by row, derives a file-safe element name from the
    "dspace field name" column, prints it, and passes any value found in the
    "element controlled values or terms" column to ``exportVocabulary``.

    Args:
        schema_df: pandas DataFrame containing the columns
            "dspace field name" and "element controlled values or terms".

    Returns:
        None. The function prints each element name it processes.
    """
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for _index, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]

        # A row without a usable field name cannot be turned into a file name.
        # Skip it rather than calling .replace() on a non-string (None, NaN or
        # a placeholder such as False), which would raise AttributeError.
        if not isinstance(dspace_field_name, str) or not dspace_field_name:
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist. The
        # isinstance check keeps an unfilled (NaN) cell, which is truthy, from
        # being treated as a vocabulary.
        vocabulary = row["element controlled values or terms"]
        if isinstance(vocabulary, str) and vocabulary:
            exportVocabulary(vocabulary, element_name_safe)
def exportVocabulary(
    vocabulary: str,
    element_name_safe: str,
    debug=None,
    output_dir: str = "data/controlled-vocabularies",
):
    """Write one controlled vocabulary to a text file, one term per line.

    Args:
        vocabulary: The terms of the vocabulary joined with "||".
        element_name_safe: File-safe element name; the output file is
            ``<output_dir>/<element_name_safe>.txt``.
        debug: Whether to print a confirmation message. When left as None the
            module-level ``args.debug`` command-line flag is used.
        output_dir: Directory the file is written to. It must already exist.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # De-duplicate the terms while keeping their first-seen order (dict keys
    # are unique and preserve insertion order).
    terms = dict.fromkeys(vocabulary.split("||"))

    # Write UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(f"{output_dir}/{element_name_safe}.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{term}\n" for term in terms)

    # Fall back to the command-line flag only when the caller did not decide.
    if debug is None:
        debug = args.debug

    if debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")
# Command-line interface.
# NOTE(review): the option flags below were reconstructed from the help strings
# and from the args.clean / args.debug usages further down; confirm that
# "-i/--input-file" matches the intended name of the schema file option.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
)
# Kept at module level because exportVocabulary() reads args.debug.
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    # ignore_errors=True makes cleaning a no-op when the directory is absent.
    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")
# Make sure controlled vocabularies directory exists. This is where the
# controlled vocabularies are written for Hugo to process.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file}")

df = pd.read_csv(args.input_file)
# Drop columns that are entirely empty, then replace the remaining missing
# cells with empty strings so the row checks in parseSchema() see "" rather
# than NaN.
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)