iseal-core/util/export-controlled-vocabularies.py
Alan Orth d629a1ab17 Add util/export-controlled-vocabularies.py
This script is only used to export the controlled vocabularies from
the schema CSV files. Eventually we will remove them from there and
it won't be needed anymore.
2022-01-06 12:40:20 +02:00

103 lines
3.1 KiB
Python
Executable File

#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error-prone.
#
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import os
import re
import sys
from shutil import rmtree
import pandas as pd
def parseSchema(schema_df):
    """Export the controlled vocabulary embedded in each row of the schema.

    For every row, a file-safe element name is derived from the
    "dspace field name" column (dots replaced by dashes, lowercased) and
    printed. When the row's "element controlled values or terms" column is
    non-empty, its value is handed to ``exportVocabulary`` together with
    that name.

    Rows without a usable DSpace field name (missing column, empty string,
    or a non-string value such as NaN) are skipped, because there is nothing
    to name the exported file after.

    Args:
        schema_df: pandas DataFrame holding the schema, one field per row.
    """
    # iterrows() yields (index, row) pairs; the row can be addressed by
    # column heading, which is what we need here. The index is unused.
    for _, row in schema_df.iterrows():
        # Use .get() so a column that is absent from the frame is treated
        # like an empty cell instead of raising KeyError.
        dspace_field_name = row.get("dspace field name", "")
        if not isinstance(dspace_field_name, str) or not dspace_field_name:
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()
        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist
        vocabulary = row.get("element controlled values or terms", "")
        if vocabulary:
            exportVocabulary(vocabulary, element_name_safe)
def exportVocabulary(vocabulary: str, element_name_safe: str):
    """Write a "||"-separated vocabulary to a text file, one value per line.

    The values are de-duplicated (first occurrence wins, original order is
    kept) and written to
    ``data/controlled-vocabularies/<element_name_safe>.txt``. The target
    directory must already exist; ``open`` raises otherwise.

    Reads the module-level ``args`` namespace to decide whether to print a
    debug message.

    Args:
        vocabulary: the raw cell contents, values separated by "||".
        element_name_safe: file-safe element name used as the file stem.
    """
    # dict.fromkeys() drops duplicates while preserving first-seen order,
    # without the quadratic cost of repeated list membership tests.
    unique_values = dict.fromkeys(vocabulary.split("||"))

    output_path = f"data/controlled-vocabularies/{element_name_safe}.txt"
    # Pin the encoding so non-ASCII terms are written identically regardless
    # of the platform's locale.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{value}\n" for value in unique_values)

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")
# Command-line interface. The parsed namespace is kept in the module-level
# name "args" because exportVocabulary() reads args.debug.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
    # Open as UTF-8 explicitly so the handle decodes the CSV the same way
    # on every platform.
    type=argparse.FileType("r", encoding="utf-8"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    # ignore_errors=True: a directory that does not exist yet is fine.
    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")
# Make sure controlled vocabularies directory exists.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

# argparse already opened the file; read from that handle and close it when
# done rather than leaking it and reopening the file by name.
with args.input_file as schema_file:
    df = pd.read_csv(schema_file)

# Drop columns that are entirely empty, then replace the remaining missing
# cells with empty strings so they test as falsy in parseSchema().
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)