Add util/generate-docs.py

This script parses the Excel schema file to produce term metadata
and extract controlled vocabularies to the site content directory.
After running this we can generate the site using Hugo.
This commit is contained in:
Alan Orth 2021-11-01 08:21:50 +02:00
parent c7c0d0efcc
commit 30310e6db6
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
1 changed file with 199 additions and 0 deletions

199
util/generate-docs.py Executable file
View File

@ -0,0 +1,199 @@
#!/usr/bin/env python3
#
# generate-docs.py v0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import os
import re
import sys
from shutil import rmtree
import pandas as pd
def _element_slug(raw_name):
    """Return the URL- and file-system-friendly form of an element name.

    The result is used both as part of a directory name and as part of the
    term's URL, so it must not contain whitespace, slashes, commas or
    question marks.
    """
    # The category is duplicated in the element name, but in a format that
    # is more difficult to use than the "idss module category" field, so we
    # only keep the specific element name, ie:
    #
    #   [Category]  [Element]
    #   FSC audit - sampling system
    if " - " in raw_name:
        slug = raw_name.split(" - ")[1]
    else:
        slug = raw_name

    # Replace two or more whitespaces with one
    slug = re.sub(r"\s{2,}", " ", slug)
    # Remove parenthesised one-word suffixes such as " (text)"; these
    # really belong in the element description.
    slug = re.sub(r"\s?\(\w+\)", "", slug)
    # Remove commas and question marks
    slug = re.sub(r"[,?]", "", slug)
    # Replace ": " with a dash (as in "Evaluation: ")
    slug = slug.replace(": ", "-")
    # Replace " / " with a dash (as in "biome / zone")
    slug = slug.replace(" / ", "-")
    # Trim BEFORE whitespace is turned into dashes; stripping afterwards
    # would be a no-op and leave leading/trailing dashes in the slug.
    slug = slug.strip()
    # Replace remaining whitespace and slashes with dashes
    slug = re.sub(r"[\s/]", "-", slug)

    return slug.lower()


def _yaml_quote(value):
    """Return *value* as a YAML single-quoted scalar.

    Inside a single-quoted YAML scalar the only character that needs
    escaping is the single quote itself, which is written doubled.
    """
    return "'" + str(value).replace("'", "''") + "'"


def parseSchema(schema_df):
    """Write one Hugo content directory per row of the schema.

    For every row of *schema_df* this creates
    ``site/content/terms/<slug>/index.md`` containing YAML front matter
    (title, field, slug, description, comment, required, vocabulary and
    policy) and, when the row lists controlled values, calls
    ``exportVocabulary`` to write them next to it.

    The columns "element name", "idss module category", "element type" and
    "element options" must be present. The remaining columns are read with
    a default of ``""`` so a missing column is treated like an empty cell.

    Reads the module-level ``args`` namespace for the ``debug`` flag.
    """
    # The "index, row" form of iterrows() lets us access the values of each
    # row by column heading.
    for _, row in schema_df.iterrows():
        element_name = _element_slug(row["element name"])

        # For example Certifying Body, FSC audit, Certificate, etc
        module_cat = row["idss module category"]

        # Generate a URL-safe version of the element name, though we need to
        # think about what field we want to use here.
        element_name_safe = module_cat.replace(" ", "-").lower() + "-" + element_name

        # Create output directory for term using the URL-safe version
        outputDirectory = f"site/content/terms/{element_name_safe}"
        os.makedirs(outputDirectory, mode=0o755, exist_ok=True)

        if args.debug:
            print(f"Created terms directory: {outputDirectory}")

        # Optional cells: an empty string means "not provided".
        element_description = row.get("element description", "")
        comment = row.get("element guidance", "")
        dspace_field_name = row.get("dspace field name", "")
        vocabulary = row.get("element controlled values or terms", "")
        required = bool(row.get("mandatory?", "") == "MANDATORY")

        controlled_vocab = bool(vocabulary)
        if controlled_vocab:
            exportVocabulary(vocabulary, element_name_safe)

        # Combine element type and options into a "policy" of sorts and
        # convert them to sentence case because they are ALL CAPS in the
        # Excel. We don't need to do any checks because these fields should
        # always exist.
        policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'

        if args.debug:
            print(f"Processed: {row['element name']}")

        # Lines we'll write to the term's index.md as YAML front matter
        # (delimited by "---") for Hugo.
        indexLines = ["---\n"]
        # Use the full title for now (even though it's ugly)
        indexLines.append(f"title: {_yaml_quote(row['element name'])}\n")
        if dspace_field_name:
            indexLines.append(f"field: {_yaml_quote(dspace_field_name)}\n")
        indexLines.append(f"slug: {_yaml_quote(element_name_safe)}\n")
        if element_description:
            indexLines.append(f"description: {_yaml_quote(element_description)}\n")
        if comment:
            indexLines.append(f"comment: {_yaml_quote(comment)}\n")
        indexLines.append(f"required: {required}\n")
        if controlled_vocab:
            indexLines.append(f"vocabulary: {_yaml_quote(element_name_safe + '.txt')}\n")
        indexLines.append(f"policy: {_yaml_quote(policy)}\n")
        # TODO: emit a real "date" value once we know which one to use.
        indexLines.append("---")

        # Write UTF-8 explicitly so the output does not depend on the
        # platform's default encoding.
        with open(f"{outputDirectory}/index.md", "w", encoding="utf-8") as f:
            f.writelines(indexLines)
def exportVocabulary(vocabulary: str, element_name_safe: str):
    """Write a term's controlled vocabulary to a text file, one value per line.

    *vocabulary* is a single string with values separated by ``||``. Values
    are trimmed of surrounding whitespace, empty values are dropped, and
    duplicates are removed while keeping the order of first appearance. The
    result is written to
    ``site/content/terms/<element_name_safe>/<element_name_safe>.txt``; the
    directory must already exist.

    Reads the module-level ``args`` namespace for the ``debug`` flag.
    """
    # Trim before de-duplicating so "Yes" and " Yes" count as the same
    # value, and skip empty segments such as the one after a trailing "||".
    trimmed = (value.strip() for value in vocabulary.split("||"))
    # dict.fromkeys() de-duplicates while preserving insertion order.
    controlledVocabularyLines = list(
        dict.fromkeys(value for value in trimmed if value)
    )

    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default encoding.
    with open(
        f"site/content/terms/{element_name_safe}/{element_name_safe}.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.writelines(f"{value}\n" for value in controlledVocabularyLines)

    if args.debug:
        print(f"Exported controlled vocabulary: {element_name_safe}")
# Command-line interface. The parsed namespace is stored in the module-level
# name "args", which parseSchema() and exportVocabulary() read for the debug
# flag.
parser = argparse.ArgumentParser(
    description="Parse an ISEAL schema Excel file to produce documentation about metadata requirements."
)
parser.add_argument(
    "--clean",
    help="Clean output directory before building.",
    action="store_true",
)
parser.add_argument(
    "-d",
    "--debug",
    help="Print debug messages.",
    action="store_true",
)
parser.add_argument(
    "-i",
    "--input-file",
    help="Path to schema fields file (idss_schema_fields.xlsx).",
    required=True,
    # An .xlsx workbook is binary, so open it in binary mode. argparse still
    # reports a missing or unreadable file as a usage error.
    type=argparse.FileType("rb"),
)
args = parser.parse_args()

if args.clean:
    if args.debug:
        print("Cleaning terms output directory")

    rmtree("site/content/terms", ignore_errors=True)

if args.debug:
    print("Creating terms output directory")
# Make sure content directory exists. This is where we will deposit all the term
# metadata and controlled vocabularies for Hugo to process.
os.makedirs("site/content/terms", mode=0o755, exist_ok=True)

if args.debug:
    print(f"Opening {args.input_file.name}")

# Read from the handle argparse already opened (rather than opening the file
# a second time by name) and close it as soon as the sheet is loaded.
with args.input_file as schema_file:
    df = pd.read_excel(schema_file)

# Drop columns that are empty for every row, then replace the remaining
# missing cells with empty strings so they can be tested for truthiness.
df.dropna(how="all", axis=1, inplace=True)
df.fillna("", inplace=True)

parseSchema(df)