iseal-core/util/export-controlled-vocabularies.py

#!/usr/bin/env python3
#
# export-controlled-vocabularies.py v0.0.1
#
# This is a legacy script used to export the controlled vocabularies from a CSV
# file. Originally we were embedding the controlled vocabularies directly inside
# the CSV, but we eventually decided that this was unwieldy and error prone.
#
# SPDX-License-Identifier: GPL-3.0-only

import argparse
import os
import re
import sys
from shutil import rmtree

import pandas as pd


def parseSchema(schema_df):
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for index, row in schema_df.iterrows():

        if row["dspace field name"] is not None and row["dspace field name"] != "":
            dspace_field_name = row["dspace field name"]
        else:
            dspace_field_name = False

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes.
        element_name_safe = dspace_field_name.replace(".", "-").lower()

        print(f"element name: {element_name_safe}")

        # Export controlled vocabularies from CSV file if they exist
        if row["element controlled values or terms"]:
            exportVocabulary(
                row["element controlled values or terms"], element_name_safe
            )


def exportVocabulary(vocabulary: str, element_name_safe: str):
    # Create an empty list where we'll add all the values (we don't need to do
    # it this way, but using a list allows us to de-duplicate the values).
    controlledVocabularyLines = []
    for value in vocabulary.split("||"):
        if value not in controlledVocabularyLines:
            controlledVocabularyLines.append(value)

    with open(f"data/controlled-vocabularies/{element_name_safe}.txt", "w") as f:
        for value in controlledVocabularyLines:
            f.write(f"{value}\n")

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")


parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."
)
parser.add_argument(
    "--clean",
    help="Clean controlled vocabularies directory before exporting.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print(f"Cleaning controlled vocabularies directory")

    rmtree("data/controlled-vocabularies", ignore_errors=True)

if args.debug:
    print(f"Creating controlled vocabularies directory")

# Make sure controlled vocabularies directory exists.
# metadata and controlled vocabularies for Hugo to process.
os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

df = pd.read_csv(args.input_file.name)
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)
Add util/export-controlled-vocabularies.py This script is only used to export the controlled vocabularies from the schema CSV files. Eventually we will remove them from there and it won't be needed anymore. 2022-01-06 11:09:59 +01:00			`#!/usr/bin/env python3`
			`#`
			`# export-controlled-vocabularies.py v0.0.1`
			`#`
			`# This is a legacy script used to export the controlled vocabularies from a CSV`
			`# file. Originally we were embedding the controlled vocabularies directly inside`
			`# the CSV, but we eventually decided that this was unwieldy and error prone.`
			`#`
			`# SPDX-License-Identifier: GPL-3.0-only`

			`import argparse`
			`import os`
			`import re`
			`import sys`
			`from shutil import rmtree`

			`import pandas as pd`


			`def parseSchema(schema_df):`
			`# Iterate over all rows (the "index, row" syntax allows us to access column`
			`# headings in each row, which isn't possible if we just do row).`
			`for index, row in schema_df.iterrows():`

			`if row["dspace field name"] is not None and row["dspace field name"] != "":`
			`dspace_field_name = row["dspace field name"]`
			`else:`
			`dspace_field_name = False`

			`# Generate a "safe" version of the element name for use in URLs and`
			`# files by using the DSpace field name with dots replaced by dashes.`
			`element_name_safe = dspace_field_name.replace(".", "-").lower()`

			`print(f"element name: {element_name_safe}")`

			`# Export controlled vocabularies from CSV file if they exist`
			`if row["element controlled values or terms"]:`
			`exportVocabulary(`
			`row["element controlled values or terms"], element_name_safe`
			`)`


			`def exportVocabulary(vocabulary: str, element_name_safe: str):`
			`# Create an empty list where we'll add all the values (we don't need to do`
			`# it this way, but using a list allows us to de-duplicate the values).`
			`controlledVocabularyLines = []`
			`for value in vocabulary.split("\|\|"):`
			`if value not in controlledVocabularyLines:`
			`controlledVocabularyLines.append(value)`

			`with open(f"data/controlled-vocabularies/{element_name_safe}.txt", "w") as f:`
			`for value in controlledVocabularyLines:`
			`f.write(f"{value}\n")`

			`if args.debug:`
			`print(f"Exported controlled vocabulary: {element_name_safe}")`


			`parser = argparse.ArgumentParser(`
			`description="Parse an ISEAL schema CSV file to extract embedded controlled vocabularies."`
			`)`
			`parser.add_argument(`
			`"--clean",`
			`help="Clean controlled vocabularies directory before exporting.",`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"-d",`
			`"--debug",`
			`help="Print debug messages.",`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"-i",`
			`"--input-file",`
			`help="Path to schema fields file (ie, iseal-core.csv).",`
			`required=True,`
			`type=argparse.FileType("r"),`
			`)`
			`args = parser.parse_args()`

			`if args.clean:`
			`if args.debug:`
			`print(f"Cleaning controlled vocabularies directory")`

			`rmtree("data/controlled-vocabularies", ignore_errors=True)`

			`if args.debug:`
			`print(f"Creating controlled vocabularies directory")`

			`# Make sure controlled vocabularies directory exists.`
			`# metadata and controlled vocabularies for Hugo to process.`
			`os.makedirs("data/controlled-vocabularies", mode=0o755, exist_ok=True)`

			`if args.debug:`
			`print(f"Opening {args.input_file.name}")`

			`df = pd.read_csv(args.input_file.name)`
			`df.dropna(how="all", axis=1, inplace=True)`
			`df.fillna("", inplace=True)`

			`parseSchema(df)`