mirror of
https://github.com/ISEAL-Community/iseal-core.git
synced 2024-11-25 00:00:19 +01:00
Alan Orth
d629a1ab17
This script is only used to export the controlled vocabularies from the schema CSV files. Eventually we will remove them from there and it won't be needed anymore.
103 lines
3.1 KiB
Python
Executable File
103 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# export-controlled-vocabularies.py v0.0.1
|
|
#
|
|
# This is a legacy script used to export the controlled vocabularies from a CSV
|
|
# file. Originally we were embedding the controlled vocabularies directly inside
|
|
# the CSV, but we eventually decided that this was unwieldy and error prone.
|
|
#
|
|
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
from shutil import rmtree
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def parseSchema(schema_df):
    """Export the controlled vocabulary embedded in each row of the schema.

    For every row that has a DSpace field name, print the file-safe element
    name and, when the row's "element controlled values or terms" cell is
    non-empty, pass that cell to exportVocabulary().

    Args:
        schema_df: pandas DataFrame with at least the columns
            "dspace field name" and "element controlled values or terms".

    Returns:
        None.
    """
    # iterrows() yields (index, row) pairs; the index is not needed, but the
    # row lets us look up cells by column heading.
    for _, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]

        # Without a field name there is nothing to derive a file name from.
        # Skip the row instead of calling .replace() on a placeholder value
        # (the isinstance check also covers NaN cells that were not filled).
        if not isinstance(dspace_field_name, str) or not dspace_field_name:
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist
        vocabulary = row["element controlled values or terms"]
        if vocabulary:
            exportVocabulary(vocabulary, element_name_safe)
|
|
|
|
|
|
def exportVocabulary(vocabulary: str, element_name_safe: str):
    """Write a "||"-separated controlled vocabulary to a text file.

    The values are de-duplicated (first occurrence wins, order preserved) and
    written one per line to
    data/controlled-vocabularies/<element_name_safe>.txt, overwriting any
    existing file.

    Args:
        vocabulary: the vocabulary terms joined with "||".
        element_name_safe: file-safe element name used as the file stem.

    Returns:
        None.
    """
    # dict.fromkeys() keeps only the first occurrence of each value while
    # preserving the order in which the values appeared.
    unique_values = dict.fromkeys(vocabulary.split("||"))

    # Write UTF-8 explicitly so terms with non-ASCII characters are exported
    # identically regardless of the platform's locale encoding.
    with open(
        f"data/controlled-vocabularies/{element_name_safe}.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.writelines(f"{value}\n" for value in unique_values)

    # args is the module-level argparse namespace parsed at script start.
    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")
|
|
|
|
|
|
# Command-line interface: parse the options, prepare the output directory,
# load the schema CSV and export every embedded controlled vocabulary.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    # ignore_errors=True: a directory that does not exist yet is not an error.
    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")

# Make sure the controlled vocabularies directory exists before exporting.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

# argparse.FileType already opened the file (which validates the path), but
# pandas re-opens it by name below, so release that handle instead of leaking
# it for the rest of the run.
args.input_file.close()

df = pd.read_csv(args.input_file.name)
# Drop columns that are entirely empty, then replace the remaining missing
# cells with empty strings so that blank cells are falsy in parseSchema().
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)
|