mirror of
https://github.com/ISEAL-Community/iseal-core.git
synced 2024-11-25 08:10:18 +01:00
103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
|
#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error-prone.
#
# SPDX-License-Identifier: GPL-3.0-only
|
||
|
|
||
|
import argparse
|
||
|
import os
|
||
|
import re
|
||
|
import sys
|
||
|
from shutil import rmtree
|
||
|
|
||
|
import pandas as pd
|
||
|
|
||
|
|
||
|
def parseSchema(schema_df):
    """Export the controlled vocabulary embedded in each row of the schema.

    For every row that has a DSpace field name, print the "safe" element name
    (the field name lower-cased, with dots replaced by dashes) and, when the
    row's "element controlled values or terms" cell is a non-empty string,
    pass that cell and the safe name to exportVocabulary().

    Rows without a usable DSpace field name are skipped. Previously the name
    was set to False for such rows and the function then crashed with an
    AttributeError on ``False.replace(...)``.

    Args:
        schema_df: pandas DataFrame with at least the columns
            "dspace field name" and "element controlled values or terms".
    """
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for _index, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]

        # An empty cell may show up as "", None or NaN depending on how the
        # caller cleaned the frame; none of them can yield an element name.
        if not isinstance(dspace_field_name, str) or not dspace_field_name:
            continue

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist. Only a
        # non-empty string can be split into terms by exportVocabulary().
        vocabulary = row["element controlled values or terms"]
        if isinstance(vocabulary, str) and vocabulary:
            exportVocabulary(vocabulary, element_name_safe)
|
||
|
|
||
|
|
||
|
def exportVocabulary(vocabulary: str, element_name_safe: str):
    """Write a "||"-separated controlled vocabulary to a text file.

    The values are de-duplicated (keeping the first occurrence of each, in
    their original order) and written one per line to
    ``data/controlled-vocabularies/<element_name_safe>.txt``. The directory
    must already exist, since the file is opened directly for writing.

    Reads the module-level ``args`` namespace to decide whether to print a
    debug message after the export.

    Args:
        vocabulary: Controlled values joined by "||".
        element_name_safe: Element name used as the stem of the output file.
    """
    # dict.fromkeys() preserves insertion order, so this drops duplicates
    # while keeping the first occurrence of every value, without the
    # quadratic "value not in list" scan.
    controlledVocabularyLines = list(dict.fromkeys(vocabulary.split("||")))

    # Write UTF-8 explicitly so non-ASCII terms do not depend on the
    # platform's locale encoding.
    with open(
        f"data/controlled-vocabularies/{element_name_safe}.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.writelines(f"{value}\n" for value in controlledVocabularyLines)

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")
|
||
|
|
||
|
|
||
|
# Command-line interface. The parsed result is kept in the module-level
# "args" because exportVocabulary() reads args.debug.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning controlled vocabularies directory")

    # ignore_errors=True: a missing directory is fine, it is recreated below.
    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print("Creating controlled vocabularies directory")

# Make sure the controlled vocabularies directory exists before
# exportVocabulary() writes into it (the exported vocabularies are later
# processed by Hugo).
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

# argparse.FileType has already opened the input file (which validates that
# it is readable), but pandas re-opens it by name below, so release the
# handle instead of leaking it until interpreter exit.
args.input_file.close()

df = pd.read_csv(args.input_file.name)
# Drop columns that are entirely empty, then turn the remaining missing
# cells into empty strings so parseSchema() can treat them as "no value".
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)
|