From 30310e6db61790dcd91b46de4b030beed2ee0b71 Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Mon, 1 Nov 2021 08:21:50 +0200
Subject: [PATCH] Add util/generate-docs.py

This script parses the Excel schema file to produce term metadata
and extract controlled vocabularies to the site content directory.
After running this we can generate the site using Hugo.
---
 util/generate-docs.py | 199 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100755 util/generate-docs.py

diff --git a/util/generate-docs.py b/util/generate-docs.py
new file mode 100755
index 00000000..e0275624
--- /dev/null
+++ b/util/generate-docs.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+#
+# generate-docs.py v0.0.1
+#
+# SPDX-License-Identifier: GPL-3.0-only
+
+import argparse
+import os
+import re
+import sys
+from shutil import rmtree
+
+import pandas as pd
+
+
+def parseSchema(schema_df):
+    # Iterate over all rows (the "index, row" syntax allows us to access column
+    # headings in each row, which isn't possible if we just do row).
+    for index, row in schema_df.iterrows():
+        # Split the element name on " - " because the category is duplicated
+        # here, but in a format that is more difficult to use than the "idss
+        # module category" field. I will # encourage Peter to modify this
+        # field so it is less descriptive because that's what the "idss module
+        # category" field does (and more consistently).
+        if " - " in row["element name"]:
+            # We only want the specific element name, not the category, ie:
+            #
+            #   [Category]  [Element]
+            #   FSC audit - sampling system
+            element_name = row["element name"].split(" - ")[1]
+        else:
+            element_name = row["element name"]
+
+        # Make sure element name is URL friendly because we need to use it in
+        # the file system and in the URL.
+        #
+        # Replace two or more whitespaces with one
+        element_name = re.sub(r"\s{2,}", " ", element_name)
+        # Replace unnecessary stuff in some element names (I should tell Peter
+        # that these belong in the description)
+        element_name = re.sub(r"\s?\(\w+\)", "", element_name)
+        # Remove commas and question marks
+        element_name = re.sub(r"[,?]", "", element_name)
+        # Replace ": " with a dash (as in "Evaluation: ")
+        element_name = element_name.replace(": ", "-")
+        # Replace " / " with a dash (as in "biome / zone")
+        element_name = element_name.replace(" / ", "-")
+        # Replace whitespace, colons, and slashes with dashes
+        element_name = re.sub(r"[\s/]", "-", element_name)
+        # Lower case it
+        element_name = element_name.lower()
+        # Strip just in case
+        element_name = element_name.strip()
+
+        # For example Assurance, Certification, Core, Impact, etc
+        module = row["idss module"].lower()
+        # For example Certifying Body, FSC audit, Certificate, etc
+        module_cat = row["idss module category"]
+
+        # Generate a URL-safe version of the element name, though we need to
+        # think about what field we want to use here.
+        element_name_safe = module_cat.replace(" ", "-").lower() + "-" + element_name
+
+        # Create output directory for term using the URL-safe version
+        outputDirectory = f"site/content/terms/{element_name_safe}"
+        os.makedirs(outputDirectory, mode=0o755, exist_ok=True)
+
+        if args.debug:
+            print(f"Created terms directory: site/content/terms/{element_name_safe}")
+
+        # Take the element description as is, but remove quotes
+        element_description = row["element description"].replace("'", "")
+
+        # Take the element guidance as is
+        if row["element guidance"]:
+            comment = row["element guidance"]
+        else:
+            comment = False
+
+        example = row["element link for more information"]
+
+        # How to use these in the HTML, slightly overlapping?
+        cardinality = row["element options"].capitalize()
+        prop_type = row["element type"].capitalize()
+
+        if row["element controlled values or terms"]:
+            controlled_vocab = True
+
+            exportVocabulary(
+                row["element controlled values or terms"], element_name_safe
+            )
+        else:
+            controlled_vocab = False
+
+        if row["mandatory?"] == "MANDATORY":
+            required = True
+        else:
+            required = False
+
+        if row["dspace field name"] is not None and row["dspace field name"] != "":
+            dspace_field_name = row["dspace field name"]
+        else:
+            dspace_field_name = False
+
+        # Combine element type and options into a "policy" of sorts and convert
+        # them to sentence case because they are ALL CAPS in the Excel. We don't
+        # need to do any checks because these fields should always exist.
+        policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'
+
+        if args.debug:
+            print(f"Processed: {row['element name']}")
+
+        # Create an empty list with lines we'll write to the term's index.md in
+        # TOML frontmatter format for Hugo.
+        indexLines = []
+        indexLines.append("---\n")
+        # Use the full title for now (even though it's ugly)
+        indexLines.append("title: '" + row["element name"] + "'\n")
+        if dspace_field_name:
+            indexLines.append(f"field: '{dspace_field_name}'\n")
+        indexLines.append(f"slug: '{element_name_safe}'\n")
+        if element_description:
+            indexLines.append(f"description: '{element_description}'\n")
+        if comment:
+            indexLines.append(f"comment: '{comment}'\n")
+        indexLines.append(f"required: {required}\n")
+        if controlled_vocab:
+            indexLines.append(f"vocabulary: '{element_name_safe}.txt'\n")
+        indexLines.append(f"policy: '{policy}'\n")
+        ## TODO: use some real date...?
+        # indexLines.append(f"date: '2019-05-04T00:00:00+00:00'\n")
+        indexLines.append("---")
+
+        with open(f"site/content/terms/{element_name_safe}/index.md", "w") as f:
+            f.writelines(indexLines)
+
+
+def exportVocabulary(vocabulary: str, element_name_safe: str):
+    # Create an empty list where we'll add all the values (we don't need to do
+    # it this way, but using a list allows us to de-duplicate the values).
+    controlledVocabularyLines = []
+    for value in vocabulary.split("||"):
+        if value not in controlledVocabularyLines:
+            controlledVocabularyLines.append(value)
+
+    with open(
+        f"site/content/terms/{element_name_safe}/{element_name_safe}.txt", "w"
+    ) as f:
+        for value in controlledVocabularyLines:
+            f.write(f"{value}\n")
+
+    if args.debug:
+        print(f"Exported controlled vocabulary: {element_name_safe}")
+
+
+parser = argparse.ArgumentParser(
+    description="Parse an ISEAL schema Excel file to produce documentation about metadata requirements."
+)
+parser.add_argument(
+    "--clean",
+    help="Clean output directory before building.",
+    action="store_true",
+)
+parser.add_argument(
+    "-d",
+    "--debug",
+    help="Print debug messages.",
+    action="store_true",
+)
+parser.add_argument(
+    "-i",
+    "--input-file",
+    help="Path to schema fields file (idss_schema_fields.xlsx).",
+    required=True,
+    type=argparse.FileType("r"),
+)
+args = parser.parse_args()
+
+if args.clean:
+    if args.debug:
+        print(f"Cleaning terms output directory")
+
+    rmtree("site/content/terms", ignore_errors=True)
+
+if args.debug:
+    print(f"Creating terms output directory")
+# Make sure content directory exists. This is where we will deposit all the term
+# metadata and controlled vocabularies for Hugo to process.
+os.makedirs("site/content/terms", mode=0o755, exist_ok=True)
+
+if args.debug:
+    print(f"Opening {args.input_file.name}")
+
+df = pd.read_excel(args.input_file.name)
+# Added inplace=True
+df.dropna(how="all", axis=1, inplace=True)
+df.fillna("", inplace=True)
+
+parseSchema(df)