iseal-core/util/generate-hugo-content.py

#!/usr/bin/env python3
#
# generate-hugo-content.py v0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only

import argparse
import os
import re
import sys
from shutil import copyfile, rmtree

import pandas as pd


def _slugify_element_name(element_name):
    """Return a lower-case, dash-separated, URL-friendly form of *element_name*.

    Only used as a fallback identifier for rows that have no DSpace field
    name.
    """
    slug = str(element_name).strip()
    # Replace two or more whitespaces with one
    slug = re.sub(r"\s{2,}", " ", slug)
    # Drop parenthesised one-word qualifiers such as " (optional)"; these
    # belong in the description, not in the name.
    slug = re.sub(r"\s?\(\w+\)", "", slug)
    # Remove commas and question marks
    slug = re.sub(r"[,?]", "", slug)
    # Replace ": " (as in "Evaluation: ") and " / " (as in "biome / zone")
    # with a single dash
    slug = slug.replace(": ", "-").replace(" / ", "-")
    # Replace any remaining whitespace and slashes with dashes
    slug = re.sub(r"[\s/]", "-", slug)
    return slug.lower().strip("-")


def _yaml_quote(value):
    """Return *value* as a YAML single-quoted scalar.

    Inside a single-quoted YAML scalar the only escape is a doubled single
    quote, so an apostrophe in the value must be written as two.
    """
    return "'" + str(value).replace("'", "''") + "'"


def parseSchema(schema_df):
    """Write one Hugo content directory per schema element in *schema_df*.

    For every row a directory ``site/content/terms/<slug>/`` is created
    (relative to the current working directory) containing an ``index.md``
    with YAML front matter. If ``data/controlled-vocabularies/<slug>.txt``
    exists it is copied next to it as ``vocabulary.txt``.

    The slug is the row's "dspace field name" with dots replaced by dashes and
    lower-cased. Rows without a DSpace field name fall back to a sanitised
    version of "element name"; rows with neither are skipped with a message on
    stderr.

    Columns read from each row: "element name", "idss element cluster",
    "element description", "element type", "element options", and, when
    present, "dspace field name", "element guidance", "mandatory?" and one of
    "idss schema module" / "fsc extension module".

    Debug output is controlled by the module-level ``args.debug`` flag.

    Args:
        schema_df: pandas DataFrame with one schema element per row. Cells are
            expected to be strings (empty string for missing values).
    """
    columns = schema_df.columns

    # The module column name tells us which schema this is, and therefore
    # which Hugo layout to use. This is the same for every row, so decide once.
    if "idss schema module" in columns:
        # Use the default home layout
        module_column, layout = "idss schema module", "home"
    elif "fsc extension module" in columns:
        # This is the FSC schema, so use the custom fsc layout
        module_column, layout = "fsc extension module", "fsc"
    else:
        # Neither column exists: emit no module/layout rather than failing
        module_column, layout = None, ""

    has_mandatory_column = "mandatory?" in columns

    # iterrows() yields (index, row); the index itself is not needed.
    for _, row in schema_df.iterrows():
        # For example Certifying Body, FSC audit, Certificate, etc
        cluster = row["idss element cluster"].capitalize()

        # For example Assurance, Certification, Core, Impact, etc
        module = row[module_column] if module_column else ""

        # The column may be absent entirely or the cell may be empty.
        raw_field_name = row.get("dspace field name", "")
        dspace_field_name = raw_field_name if isinstance(raw_field_name, str) else ""

        # Generate a "safe" version of the element name for use in URLs and
        # files by using the DSpace field name with dots replaced by dashes,
        # falling back to the sanitised element name when there is none.
        if dspace_field_name:
            element_name_safe = dspace_field_name.replace(".", "-").lower()
        else:
            element_name_safe = _slugify_element_name(row["element name"])

        if not element_name_safe:
            # Without an identifier we would write into the terms root itself.
            print(
                f"Skipping element without a usable name: {row['element name']!r}",
                file=sys.stderr,
            )
            continue

        print(f"element name: {element_name_safe}")

        # Create output directory for term using the URL-safe version
        term_directory = f"site/content/terms/{element_name_safe}"
        os.makedirs(term_directory, mode=0o755, exist_ok=True)

        if args.debug:
            print(f"Created terms directory: {term_directory}")

        # Take the element description as is, but remove quotes
        element_description = row["element description"].replace("'", "")

        # Take the element guidance as is (column may have been dropped if it
        # was empty for every row)
        comment = row.get("element guidance", "")

        # A controlled vocabulary exists for this element if there is a
        # matching file in the data directory.
        controlled_vocabulary_src = (
            f"data/controlled-vocabularies/{element_name_safe}.txt"
        )
        controlled_vocab = os.path.isfile(controlled_vocabulary_src)
        if controlled_vocab:
            copyfile(controlled_vocabulary_src, f"{term_directory}/vocabulary.txt")

            if args.debug:
                print(f"Copied controlled vocabulary: {element_name_safe}")

        required = bool(has_mandatory_column and row["mandatory?"] == "mandatory")

        # Combine element type and options into a "policy" of sorts and convert
        # them to sentence case because they are lowercase in the CSV. We don't
        # need to do any checks because these fields should always exist.
        policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'

        if args.debug:
            print(f"Processed: {row['element name']}")

        # Lines we'll write to the term's index.md in YAML front matter format
        # for Hugo.
        indexLines = ["---\n"]
        # Use the full title for now (even though it's ugly). Better to fix the
        # schema spreadsheet than try to process the title here.
        indexLines.append(f"title: {_yaml_quote(row['element name'])}\n")
        if dspace_field_name:
            indexLines.append(f"field: {_yaml_quote(dspace_field_name)}\n")
        indexLines.append(f"slug: '{element_name_safe}'\n")
        if element_description:
            indexLines.append(f"description: '{element_description}'\n")
        if comment:
            indexLines.append(f"comment: {_yaml_quote(comment)}\n")
        indexLines.append(f"required: {required}\n")
        if controlled_vocab:
            indexLines.append("vocabulary: 'vocabulary.txt'\n")
        if module:
            indexLines.append(f"module: {_yaml_quote(module)}\n")
        if cluster:
            indexLines.append(f"cluster: {_yaml_quote(cluster)}\n")
        indexLines.append(f"policy: {_yaml_quote(policy)}\n")
        if layout:
            indexLines.append(f"layout: '{layout}'\n")
        ## TODO: use some real date...?
        indexLines.append("---")

        # Write UTF-8 explicitly so the output does not depend on the locale.
        with open(f"{term_directory}/index.md", "w", encoding="utf-8") as f:
            f.writelines(indexLines)


# Command-line interface. ``args`` and ``df`` are deliberately module-level
# names: parseSchema() reads ``args.debug`` as a global.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema CSV file to produce documentation about metadata requirements."
)
parser.add_argument(
    "--clean",
    help="Clean output directory before building.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (ie, iseal-core.csv).",
    required=True,
    # argparse opens the file for us; decode as UTF-8 regardless of locale,
    # which is what pandas assumes when it is given a path.
    type=argparse.FileType("r", encoding="utf-8"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning terms output directory")

    # ignore_errors=True so a missing directory is not a failure
    rmtree("site/content/terms", ignore_errors=True)

if args.debug:
    print("Creating terms output directory")
# Make sure content directory exists. This is where we will deposit all the term
# metadata and controlled vocabularies for Hugo to process.
os.makedirs("site/content/terms", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

# Read from the handle argparse already opened (instead of re-opening the file
# by name) and close it as soon as the CSV has been parsed.
with args.input_file:
    df = pd.read_csv(args.input_file)

# Drop columns that are empty for every row, then turn the remaining missing
# cells into empty strings so string methods can be used on every value.
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`#!/usr/bin/env python3`
			`#`
util/generate-hugo-content.py: Update header 2021-11-17 16:16:22 +01:00			`# generate-hugo-content.py v0.0.1`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`#`
			`# SPDX-License-Identifier: GPL-3.0-only`

			`import argparse`
			`import os`
			`import re`
			`import sys`
util/*.py: run isort Sorts and organizes the Python imports. 2022-01-30 10:52:52 +01:00			`from shutil import copyfile, rmtree`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00
			`import pandas as pd`


			`def parseSchema(schema_df):`
			`# Iterate over all rows (the "index, row" syntax allows us to access column`
			`# headings in each row, which isn't possible if we just do row).`
			`for index, row in schema_df.iterrows():`
util/generate-hugo-content.py: don't split element name on dash We don't need to do this anymore since we adopted the new clusters and cleaned up the element name field. 2021-12-13 12:16:39 +01:00			`element_name = row["element name"]`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00
			`# Make sure element name is URL friendly because we need to use it in`
			`# the file system and in the URL.`
			`#`
			`# Replace two or more whitespaces with one`
			`element_name = re.sub(r"\s{2,}", " ", element_name)`
			`# Replace unnecessary stuff in some element names (I should tell Peter`
			`# that these belong in the description)`
			`element_name = re.sub(r"\s?\(\w+\)", "", element_name)`
			`# Remove commas and question marks`
			`element_name = re.sub(r"[,?]", "", element_name)`
			`# Replace ": " with a dash (as in "Evaluation: ")`
			`element_name = element_name.replace(": ", "-")`
			`# Replace " / " with a dash (as in "biome / zone")`
			`element_name = element_name.replace(" / ", "-")`
			`# Replace whitespace, colons, and slashes with dashes`
			`element_name = re.sub(r"[\s/]", "-", element_name)`
			`# Lower case it`
			`element_name = element_name.lower()`
			`# Strip just in case`
			`element_name = element_name.strip()`

			`# For example Certifying Body, FSC audit, Certificate, etc`
util/generate-hugo-content.py: update for clusters and modules Peter re-worked the schema to incorporate the concept of clusters and modules. 2021-11-26 10:58:06 +01:00			`cluster = row["idss element cluster"].capitalize()`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00
util/generate-hugo-content.py: add support for fsc.csv Currently the only difference is the name of the module column and that there are no mandatory fields so we need to be careful there. 2021-12-21 17:17:59 +01:00			`# Extract the module (whether from IDSS or extension), for example`
			`# Assurance, Certification, Core, Impact, etc`
			`if "idss schema module" in df.columns:`
			`module = row["idss schema module"]`

			`# Use the default home layout`
			`layout = "home"`
			`elif "fsc extension module" in df.columns:`
			`module = row["fsc extension module"]`

			`# Since we know this is the FSC schema we can set the layout to`
			`# use the custom fsc layout instead of the default home layout.`
			`layout = "fsc"`
util/generate-hugo-content.py: minor re-org We get the cluster first since clusters encompass modules. 2021-12-13 12:20:09 +01:00
util/generate-hugo-content.py: use dspace field name Now all elements have DSpace field names so we can use that as the unique identifier for each element. 2022-01-06 11:09:24 +01:00			`if row["dspace field name"] is not None and row["dspace field name"] != "":`
			`dspace_field_name = row["dspace field name"]`
			`else:`
			`dspace_field_name = False`

util/generate-hugo-content.py: update comment 2021-12-13 12:31:29 +01:00			`# Generate a "safe" version of the element name for use in URLs and`
util/generate-hugo-content.py: use dspace field name Now all elements have DSpace field names so we can use that as the unique identifier for each element. 2022-01-06 11:09:24 +01:00			`# files by using the DSpace field name with dots replaced by dashes.`
			`element_name_safe = dspace_field_name.replace(".", "-").lower()`
util/generate-hugo-content.py: update for clusters and modules Peter re-worked the schema to incorporate the concept of clusters and modules. 2021-11-26 10:58:06 +01:00
			`print(f"element name: {element_name_safe}")`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00
			`# Create output directory for term using the URL-safe version`
			`outputDirectory = f"site/content/terms/{element_name_safe}"`
			`os.makedirs(outputDirectory, mode=0o755, exist_ok=True)`

			`if args.debug:`
			`print(f"Created terms directory: site/content/terms/{element_name_safe}")`

			`# Take the element description as is, but remove quotes`
			`element_description = row["element description"].replace("'", "")`

			`# Take the element guidance as is`
			`if row["element guidance"]:`
			`comment = row["element guidance"]`
			`else:`
			`comment = False`

			`example = row["element link for more information"]`

			`# How to use these in the HTML, slightly overlapping?`
			`cardinality = row["element options"].capitalize()`
			`prop_type = row["element type"].capitalize()`

util/generate-hugo-content.py: controlled vocabularies We are planning to remove the controlled vocabularies from the CSV files so we should not expect that this column will exist. Instead, check if there is a controlled vocabulary in the data directory. The controlled vocabularies were already exported once using the util/export-controlled-vocabularies.py script so we don't actually need them in the CSVs anymore. 2022-01-30 18:02:30 +01:00			`if os.path.isfile(f"data/controlled-vocabularies/{element_name_safe}.txt"):`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`controlled_vocab = True`

util/generate-hugo-content.py: add support for fsc.csv Currently the only difference is the name of the module column and that there are no mandatory fields so we need to be careful there. 2021-12-21 17:17:59 +01:00			`controlled_vocabulary_src = (`
			`f"data/controlled-vocabularies/{element_name_safe}.txt"`
			`)`
			`controlled_vocabulary_dst = (`
			`f"site/content/terms/{element_name_safe}/vocabulary.txt"`
			`)`
util/generate-hugo-content.py: re-work vocabularies Read vocabularies from the data/controlled-vocabularies directory instead of exporting them from the schema itself. Also, I use the name vocabulary.txt for all of them on the site since they are in each field's directory already. 2021-12-13 14:03:20 +01:00
			`copyfile(controlled_vocabulary_src, controlled_vocabulary_dst)`

			`if args.debug:`
			`print(f"Copied controlled vocabulary: {element_name_safe}")`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`else:`
			`controlled_vocab = False`

Use lowercase "mandatory" Closes #12 2022-04-16 17:20:06 +02:00			`if "mandatory?" in df.columns and row["mandatory?"] == "mandatory":`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`required = True`
			`else:`
			`required = False`

			`# Combine element type and options into a "policy" of sorts and convert`
Don't use ALL CAPS in CSVs Convert to lower case and update scripts. Closes #13 2022-04-16 17:44:20 +02:00			`# them to sentence case because they are lowercase in the CSV. We don't`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`# need to do any checks because these fields should always exist.`
			`policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'`

			`if args.debug:`
			`print(f"Processed: {row['element name']}")`

			`# Create an empty list with lines we'll write to the term's index.md in`
			`# TOML frontmatter format for Hugo.`
			`indexLines = []`
			`indexLines.append("---\n")`
util/generate-hugo-content.py: update for clusters and modules Peter re-worked the schema to incorporate the concept of clusters and modules. 2021-11-26 10:58:06 +01:00			`# Use the full title for now (even though it's ugly). Better to fix the`
			`# schema spreadsheet than try to process the title here.`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`indexLines.append("title: '" + row["element name"] + "'\n")`
			`if dspace_field_name:`
			`indexLines.append(f"field: '{dspace_field_name}'\n")`
			`indexLines.append(f"slug: '{element_name_safe}'\n")`
			`if element_description:`
			`indexLines.append(f"description: '{element_description}'\n")`
			`if comment:`
			`indexLines.append(f"comment: '{comment}'\n")`
			`indexLines.append(f"required: {required}\n")`
			`if controlled_vocab:`
util/generate-hugo-content.py: re-work vocabularies Read vocabularies from the data/controlled-vocabularies directory instead of exporting them from the schema itself. Also, I use the name vocabulary.txt for all of them on the site since they are in each field's directory already. 2021-12-13 14:03:20 +01:00			`indexLines.append(f"vocabulary: 'vocabulary.txt'\n")`
util/generate-hugo-content.py: write module to content We need to write the IDSS module to the site content. 2021-11-11 14:05:18 +01:00			`if module:`
			`indexLines.append(f"module: '{module}'\n")`
util/generate-hugo-content.py: update for clusters and modules Peter re-worked the schema to incorporate the concept of clusters and modules. 2021-11-26 10:58:06 +01:00			`if cluster:`
			`indexLines.append(f"cluster: '{cluster}'\n")`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`indexLines.append(f"policy: '{policy}'\n")`
util/generate-hugo-content.py: add support for fsc.csv Currently the only difference is the name of the module column and that there are no mandatory fields so we need to be careful there. 2021-12-21 17:17:59 +01:00			`if layout:`
			`indexLines.append(f"layout: '{layout}'\n")`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`## TODO: use some real date...?`
			`# indexLines.append(f"date: '2019-05-04T00:00:00+00:00'\n")`
			`indexLines.append("---")`

			`with open(f"site/content/terms/{element_name_safe}/index.md", "w") as f:`
			`f.writelines(indexLines)`


			`parser = argparse.ArgumentParser(`
Update references to shema-fields.csv 2021-12-20 11:04:11 +01:00			`description="Parse an ISEAL schema CSV file to produce documentation about metadata requirements."`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`)`
			`parser.add_argument(`
			`"--clean",`
			`help="Clean output directory before building.",`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"-d",`
			`"--debug",`
			`help="Print debug messages.",`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"-i",`
			`"--input-file",`
Update references to shema-fields.csv 2021-12-20 11:04:11 +01:00			`help="Path to schema fields file (ie, iseal-core.csv).",`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`required=True,`
			`type=argparse.FileType("r"),`
			`)`
			`args = parser.parse_args()`

			`if args.clean:`
			`if args.debug:`
			`print(f"Cleaning terms output directory")`

			`rmtree("site/content/terms", ignore_errors=True)`

			`if args.debug:`
			`print(f"Creating terms output directory")`
			`# Make sure content directory exists. This is where we will deposit all the term`
			`# metadata and controlled vocabularies for Hugo to process.`
			`os.makedirs("site/content/terms", mode=0o755, exist_ok=True)`

			`if args.debug:`
			`print(f"Opening {args.input_file.name}")`

util/generate-hugo-content.py: read schema from CSV Read the schema fields from CSV instead of Excel now that there is a copy here in the repository. 2021-12-07 21:57:09 +01:00			`df = pd.read_csv(args.input_file.name)`
Add util/generate-docs.py This script parses the Excel schema file to produce term metadata and extract controlled vocabularies to the site content directory. After running this we can generate the site using Hugo. 2021-11-01 07:21:50 +01:00			`# Added inplace=True`
			`df.dropna(how="all", axis=1, inplace=True)`
			`df.fillna("", inplace=True)`

			`parseSchema(df)`