Add util/export-controlled-vocabularies.py

This script is only used to export the controlled vocabularies from
the schema CSV files. Eventually we will remove them from there and
it won't be needed anymore.
Alan Orth 2022-01-06 12:09:59 +02:00
parent 75076124a5
commit d629a1ab17
1 changed files with 102 additions and 0 deletions

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error-prone.
#
# SPDX-License-Identifier: GPL-3.0-only
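#
# A typical invocation might look like this (the input file name is illustrative,
# taken from the --input-file help text below rather than from a real path):
#
#   $ ./util/export-controlled-vocabularies.py -i iseal-core.csv --clean --debug
#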

import argparse
import os
import re
import sys
from shutil import rmtree

import pandas as pd


def parseSchema(schema_df):
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for index, row in schema_df.iterrows():
        if row["dspace field name"] is not None and row["dspace field name"] != "":
            dspace_field_name = row["dspace field name"]
        else:
            # Skip rows without a DSpace field name, since we can't derive a
            # file name for them.
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from the CSV row if they exist.
        if row["element controlled values or terms"]:
            exportVocabulary(
                row["element controlled values or terms"], element_name_safe
            )


def exportVocabulary(vocabulary: str, element_name_safe: str):
    # Create an empty list where we'll add all the values (we don't need to do
    # it this way, but using a list allows us to de-duplicate the values).
    controlledVocabularyLines = []

    for value in vocabulary.split("||"):
        if value not in controlledVocabularyLines:
            controlledVocabularyLines.append(value)

    with open(f"data/controlled-vocabularies/{element_name_safe}.txt", "w") as f:
        for value in controlledVocabularyLines:
            f.write(f"{value}\n")

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")

parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (i.e., iseal-core.csv).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")

# Make sure the controlled vocabularies directory exists. Hugo processes the
# metadata and controlled vocabularies from the data directory.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

df = pd.read_csv(args.input_file.name)

# Drop columns that are entirely empty and replace remaining NaN values with
# empty strings so the checks in parseSchema() are straightforward.
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)