#!/usr/bin/env python3
#
# generate-docs.py v0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
"""Generate Hugo term pages from the schema Excel file.

This script parses the Excel schema file to produce term metadata and extract
controlled vocabularies to the site content directory. After running this the
site can be generated using Hugo.
"""

import argparse
import os
import re
import sys
from shutil import rmtree

import pandas as pd

# Directory (relative to the current working directory) where one
# sub-directory per term is written for Hugo to process.
TERMS_DIRECTORY = "site/content/terms"

# Parsed command-line options. parseSchema() and exportVocabulary() read
# `args.debug`; main() replaces this default with the real options so the
# functions also work when the module is imported rather than executed.
args = argparse.Namespace(clean=False, debug=False, input_file=None)


def _cell(row, column):
    """Return row[column], or "" when the column is missing or the cell is NaN.

    Columns that are entirely empty are removed by the dropna() call in
    main(), so optional columns cannot be assumed to be present.
    """
    value = row.get(column, "")

    return "" if pd.isna(value) else value


def _yaml_quote(value):
    """Return value as a YAML single-quoted scalar.

    Inside YAML single quotes the only escape is a doubled single quote, so
    apostrophes in the spreadsheet text no longer break the front matter.
    """
    return "'" + str(value).replace("'", "''") + "'"


def _slugify_element_name(element_name):
    """Turn an element name into a lower-case, dash-separated slug.

    The result is used both in the file system and in the URL.
    """
    # The category is duplicated in the element name, ie:
    #
    #   [Category] [Element]
    #   FSC audit - sampling system
    #
    # We only want the specific element name because the category is taken
    # from the "idss module category" field instead.
    if " - " in element_name:
        element_name = element_name.split(" - ")[1]

    # Strip *before* whitespace is converted to dashes, otherwise leading or
    # trailing spaces end up as leading or trailing dashes in the slug.
    element_name = element_name.strip()
    # Replace two or more whitespaces with one
    element_name = re.sub(r"\s{2,}", " ", element_name)
    # Remove parenthesized words like " (text)" from element names
    element_name = re.sub(r"\s?\(\w+\)", "", element_name)
    # Remove commas and question marks
    element_name = re.sub(r"[,?]", "", element_name)
    # Replace ": " with a dash (as in "Evaluation: ")
    element_name = element_name.replace(": ", "-")
    # Replace " / " with a dash (as in "biome / zone")
    element_name = element_name.replace(" / ", "-")
    # Replace remaining whitespace and slashes with dashes (strip again in
    # case removing a parenthesized word exposed whitespace at either end)
    element_name = re.sub(r"[\s/]", "-", element_name.strip())

    return element_name.lower()


def parseSchema(schema_df: pd.DataFrame) -> None:
    """Write a Hugo page bundle for every row of the schema.

    For each row a directory named after the term's slug is created below
    TERMS_DIRECTORY and an index.md with YAML front matter is written to it.
    Rows that have controlled values also get a vocabulary text file via
    exportVocabulary().

    The columns "element name", "idss module category", "element type" and
    "element options" are required and must contain strings; the columns
    "element description", "element guidance", "element controlled values or
    terms", "mandatory?" and "dspace field name" are optional.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # The "index, row" syntax allows us to access column headings in each
    # row, which isn't possible if we just do row.
    for _, row in schema_df.iterrows():
        element_name = _slugify_element_name(row["element name"])

        # For example Certifying Body, FSC audit, Certificate, etc
        module_cat = row["idss module category"]

        # URL-safe identifier of the term: category plus element name
        element_name_safe = module_cat.replace(" ", "-").lower() + "-" + element_name

        # Create output directory for term using the URL-safe version
        output_directory = f"{TERMS_DIRECTORY}/{element_name_safe}"
        os.makedirs(output_directory, mode=0o755, exist_ok=True)

        if args.debug:
            print(f"Created terms directory: {output_directory}")

        element_description = _cell(row, "element description")
        comment = _cell(row, "element guidance")
        dspace_field_name = _cell(row, "dspace field name")
        required = _cell(row, "mandatory?") == "MANDATORY"

        controlled_terms = _cell(row, "element controlled values or terms")
        controlled_vocab = bool(controlled_terms)
        if controlled_vocab:
            exportVocabulary(controlled_terms, element_name_safe)

        # Combine element type and options into a "policy" of sorts and
        # convert them to sentence case because they are ALL CAPS in the
        # Excel. We don't need to do any checks because these fields should
        # always exist.
        policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'

        if args.debug:
            print(f"Processed: {row['element name']}")

        # Lines we'll write to the term's index.md in YAML front matter
        # format for Hugo ("---" delimiters, "key: value" pairs).
        index_lines = ["---\n"]
        # Use the full title for now (even though it's ugly)
        index_lines.append(f"title: {_yaml_quote(row['element name'])}\n")
        if dspace_field_name:
            index_lines.append(f"field: {_yaml_quote(dspace_field_name)}\n")
        index_lines.append(f"slug: {_yaml_quote(element_name_safe)}\n")
        if element_description:
            index_lines.append(f"description: {_yaml_quote(element_description)}\n")
        if comment:
            index_lines.append(f"comment: {_yaml_quote(comment)}\n")
        index_lines.append(f"required: {required}\n")
        if controlled_vocab:
            index_lines.append(f"vocabulary: {_yaml_quote(element_name_safe + '.txt')}\n")
        index_lines.append(f"policy: {_yaml_quote(policy)}\n")
        # TODO: add a "date" key once we know which date to use
        index_lines.append("---")

        with open(f"{output_directory}/index.md", "w", encoding="utf-8") as f:
            f.writelines(index_lines)


def exportVocabulary(vocabulary: str, element_name_safe: str) -> None:
    """Write the values of a controlled vocabulary to a text file.

    Args:
        vocabulary: the controlled values, separated by "||".
        element_name_safe: slug of the term; used for both the directory and
            the file name (<slug>/<slug>.txt below TERMS_DIRECTORY).

    Duplicate values are written only once, keeping the order of first
    appearance. The term directory is created if it does not exist yet.
    """
    # dict.fromkeys() de-duplicates while preserving insertion order
    values = list(dict.fromkeys(vocabulary.split("||")))

    output_directory = f"{TERMS_DIRECTORY}/{element_name_safe}"
    os.makedirs(output_directory, mode=0o755, exist_ok=True)

    with open(
        f"{output_directory}/{element_name_safe}.txt", "w", encoding="utf-8"
    ) as f:
        f.writelines(f"{value}\n" for value in values)

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")


def _build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Parse an ISEAL schema Excel file to produce documentation about metadata requirements."
    )
    parser.add_argument(
        "--clean",
        help="Clean output directory before building.",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Print debug messages.",
        action="store_true",
    )
    # A plain path rather than argparse.FileType: pandas opens the workbook
    # itself, so a text-mode handle would only be opened and never closed.
    parser.add_argument(
        "-i",
        "--input-file",
        help="Path to schema fields file (idss_schema_fields.xlsx).",
        required=True,
    )

    return parser


def main(argv=None) -> None:
    """Parse the command line, read the schema and generate the term pages.

    Args:
        argv: argument list to parse; defaults to sys.argv[1:].
    """
    global args

    parser = _build_parser()
    args = parser.parse_args(argv)

    # Fail with a usage error (like argparse.FileType did) instead of a
    # traceback when the input file does not exist.
    if not os.path.isfile(args.input_file):
        parser.error(f"can't open '{args.input_file}': no such file")

    if args.clean:
        if args.debug:
            print("Cleaning terms output directory")

        rmtree(TERMS_DIRECTORY, ignore_errors=True)

    if args.debug:
        print("Creating terms output directory")
    # Make sure content directory exists. This is where we will deposit all
    # the term metadata and controlled vocabularies for Hugo to process.
    os.makedirs(TERMS_DIRECTORY, mode=0o755, exist_ok=True)

    if args.debug:
        print(f"Opening {args.input_file}")

    df = pd.read_excel(args.input_file)
    # Drop columns that are entirely empty, then blank out the remaining
    # empty cells so they are falsy strings instead of NaN.
    df = df.dropna(how="all", axis=1).fillna("")

    parseSchema(df)


if __name__ == "__main__":
    sys.exit(main())