iseal-core/util/generate-hugo-content.py
Commit 2b27a79840 by Alan Orth, 2021-11-26 11:58:06 +02:00
util/generate-hugo-content.py: update for clusters and modules

Peter re-worked the schema to incorporate the concept of clusters
and modules.


#!/usr/bin/env python3
#
# generate-hugo-content.py v0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import os
import re
import sys
from shutil import rmtree
import pandas as pd


def parseSchema(schema_df):
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for index, row in schema_df.iterrows():
        # Split the element name on " - " because the category is duplicated
        # here, but in a format that is more difficult to use than the "idss
        # module category" field. I will encourage Peter to modify this field
        # so it is less descriptive because that's what the "idss module
        # category" field does (and more consistently).
        if " - " in row["element name"]:
            # We only want the specific element name, not the category, ie:
            #
            # [Category] [Element]
            # FSC audit - sampling system
            element_name = row["element name"].split(" - ")[1]
        else:
            element_name = row["element name"]

        # Make sure the element name is URL friendly because we need to use it
        # in the file system and in the URL.
        #
        # Replace two or more whitespaces with one
        element_name = re.sub(r"\s{2,}", " ", element_name)
        # Replace unnecessary stuff in some element names (I should tell Peter
        # that these belong in the description)
        element_name = re.sub(r"\s?\(\w+\)", "", element_name)
        # Remove commas and question marks
        element_name = re.sub(r"[,?]", "", element_name)
        # Replace ": " with a dash (as in "Evaluation: ")
        element_name = element_name.replace(": ", "-")
        # Replace " / " with a dash (as in "biome / zone")
        element_name = element_name.replace(" / ", "-")
        # Replace any remaining whitespace and slashes with dashes
        element_name = re.sub(r"[\s/]", "-", element_name)
        # Lower case it
        element_name = element_name.lower()
        # Strip just in case
        element_name = element_name.strip()
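
        # Illustrative walk-through of the cleanup above: the element name
        # "FSC audit - sampling system" (from the comment earlier) reduces to
        # "sampling-system", and if that row's cluster were "FSC audit" the
        # URL-safe name built below would be "fsc-audit-sampling-system".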
        # For example Assurance, Certification, Core, Impact, etc
        module = row["idss schema module"].capitalize()
        # For example Certifying Body, FSC audit, Certificate, etc
        cluster = row["idss element cluster"].capitalize()

        # Generate a URL-safe version of the element name, though we need to
        # think about what field we want to use here.
        element_name_safe = cluster.replace(" ", "-").lower() + "-" + element_name

        print(f"element name: {element_name_safe}")

        # Create output directory for term using the URL-safe version
        outputDirectory = f"site/content/terms/{element_name_safe}"
        os.makedirs(outputDirectory, mode=0o755, exist_ok=True)

        if args.debug:
            print(f"Created terms directory: site/content/terms/{element_name_safe}")
        # Take the element description as is, but remove quotes
        element_description = row["element description"].replace("'", "")

        # Take the element guidance as is
        if row["element guidance"]:
            comment = row["element guidance"]
        else:
            comment = False

        example = row["element link for more information"]

        # How to use these in the HTML, slightly overlapping?
        cardinality = row["element options"].capitalize()
        prop_type = row["element type"].capitalize()

        if row["element controlled values or terms"]:
            controlled_vocab = True

            exportVocabulary(
                row["element controlled values or terms"], element_name_safe
            )
        else:
            controlled_vocab = False

        if row["mandatory?"] == "MANDATORY":
            required = True
        else:
            required = False

        if row["dspace field name"] is not None and row["dspace field name"] != "":
            dspace_field_name = row["dspace field name"]
        else:
            dspace_field_name = False

        # Combine element type and options into a "policy" of sorts and convert
        # them to sentence case because they are ALL CAPS in the Excel. We don't
        # need to do any checks because these fields should always exist.
        policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'
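
        # For example, assuming a row with an element type of "TEXT" and
        # element options of "REPEATABLE", the policy string would come out as
        # "Text. Repeatable." (the actual values depend on the Excel file).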

        if args.debug:
            print(f"Processed: {row['element name']}")

        # Create an empty list with lines we'll write to the term's index.md in
        # YAML front matter format for Hugo.
        indexLines = []
        indexLines.append("---\n")

        # Use the full title for now (even though it's ugly). Better to fix the
        # schema spreadsheet than try to process the title here.
        indexLines.append("title: '" + row["element name"] + "'\n")

        if dspace_field_name:
            indexLines.append(f"field: '{dspace_field_name}'\n")

        indexLines.append(f"slug: '{element_name_safe}'\n")

        if element_description:
            indexLines.append(f"description: '{element_description}'\n")

        if comment:
            indexLines.append(f"comment: '{comment}'\n")

        indexLines.append(f"required: {required}\n")

        if controlled_vocab:
            indexLines.append(f"vocabulary: '{element_name_safe}.txt'\n")

        if module:
            indexLines.append(f"module: '{module}'\n")

        if cluster:
            indexLines.append(f"cluster: '{cluster}'\n")

        indexLines.append(f"policy: '{policy}'\n")

        ## TODO: use some real date...?
        # indexLines.append(f"date: '2019-05-04T00:00:00+00:00'\n")
        indexLines.append("---")

        with open(f"site/content/terms/{element_name_safe}/index.md", "w") as f:
            f.writelines(indexLines)
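
# For reference, a generated index.md starts with front matter roughly like
# the following (values here are illustrative, based on the examples in the
# comments above rather than on the real spreadsheet):
#
#   ---
#   title: 'FSC audit - sampling system'
#   slug: 'fsc-audit-sampling-system'
#   required: False
#   module: 'Assurance'
#   cluster: 'Fsc audit'
#   policy: 'Text. Repeatable.'
#   ---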


def exportVocabulary(vocabulary: str, element_name_safe: str):
    # Create an empty list where we'll add all the values (we don't need to do
    # it this way, but using a list allows us to de-duplicate the values).
    controlledVocabularyLines = []

    for value in vocabulary.split("||"):
        if value not in controlledVocabularyLines:
            controlledVocabularyLines.append(value)

    with open(
        f"site/content/terms/{element_name_safe}/{element_name_safe}.txt", "w"
    ) as f:
        for value in controlledVocabularyLines:
            f.write(f"{value}\n")

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")


parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema Excel file to produce documentation about metadata requirements."
)
parser.add_argument(
    "--clean",
    help="Clean output directory before building.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (idss_schema_fields.xlsx).",
    required=True,
    type=argparse.FileType("r"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning terms output directory")

    rmtree("site/content/terms", ignore_errors=True)

if args.debug:
    print("Creating terms output directory")

# Make sure content directory exists. This is where we will deposit all the term
# metadata and controlled vocabularies for Hugo to process.
os.makedirs("site/content/terms", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

df = pd.read_excel(args.input_file.name)

# Drop columns that are entirely empty, then replace remaining NaN cells with
# empty strings so the per-row checks in parseSchema can treat missing values
# as "".
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)
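
# Example invocation (the input filename is the one suggested in the
# --input-file help text; adjust the path as needed):
#
#   ./util/generate-hugo-content.py --clean -d -i idss_schema_fields.xlsx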