mirror of
https://github.com/ISEAL-Community/iseal-core.git
synced 2024-11-04 14:23:03 +01:00
207 lines
7.5 KiB
Python
Executable File
207 lines
7.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# generate-hugo-content.py v0.0.1
|
|
#
|
|
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
from shutil import rmtree
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def parseSchema(schema_df):
|
|
# Iterate over all rows (the "index, row" syntax allows us to access column
|
|
# headings in each row, which isn't possible if we just do row).
|
|
for index, row in schema_df.iterrows():
|
|
# Split the element name on " - " because the category is duplicated
|
|
# here, but in a format that is more difficult to use than the "idss
|
|
# module category" field. I will # encourage Peter to modify this
|
|
# field so it is less descriptive because that's what the "idss module
|
|
# category" field does (and more consistently).
|
|
if " - " in row["element name"]:
|
|
# We only want the specific element name, not the category, ie:
|
|
#
|
|
# [Category] [Element]
|
|
# FSC audit - sampling system
|
|
element_name = row["element name"].split(" - ")[1]
|
|
else:
|
|
element_name = row["element name"]
|
|
|
|
# Make sure element name is URL friendly because we need to use it in
|
|
# the file system and in the URL.
|
|
#
|
|
# Replace two or more whitespaces with one
|
|
element_name = re.sub(r"\s{2,}", " ", element_name)
|
|
# Replace unnecessary stuff in some element names (I should tell Peter
|
|
# that these belong in the description)
|
|
element_name = re.sub(r"\s?\(\w+\)", "", element_name)
|
|
# Remove commas and question marks
|
|
element_name = re.sub(r"[,?]", "", element_name)
|
|
# Replace ": " with a dash (as in "Evaluation: ")
|
|
element_name = element_name.replace(": ", "-")
|
|
# Replace " / " with a dash (as in "biome / zone")
|
|
element_name = element_name.replace(" / ", "-")
|
|
# Replace whitespace, colons, and slashes with dashes
|
|
element_name = re.sub(r"[\s/]", "-", element_name)
|
|
# Lower case it
|
|
element_name = element_name.lower()
|
|
# Strip just in case
|
|
element_name = element_name.strip()
|
|
|
|
# For example Assurance, Certification, Core, Impact, etc
|
|
module = row["idss schema module"].capitalize()
|
|
# For example Certifying Body, FSC audit, Certificate, etc
|
|
cluster = row["idss element cluster"].capitalize()
|
|
|
|
# Generate a URL-safe version of the element name, though we need to
|
|
# think about what field we want to use here.
|
|
element_name_safe = cluster.replace(" ", "-").lower() + "-" + element_name
|
|
|
|
print(f"element name: {element_name_safe}")
|
|
|
|
# Create output directory for term using the URL-safe version
|
|
outputDirectory = f"site/content/terms/{element_name_safe}"
|
|
os.makedirs(outputDirectory, mode=0o755, exist_ok=True)
|
|
|
|
if args.debug:
|
|
print(f"Created terms directory: site/content/terms/{element_name_safe}")
|
|
|
|
# Take the element description as is, but remove quotes
|
|
element_description = row["element description"].replace("'", "")
|
|
|
|
# Take the element guidance as is
|
|
if row["element guidance"]:
|
|
comment = row["element guidance"]
|
|
else:
|
|
comment = False
|
|
|
|
example = row["element link for more information"]
|
|
|
|
# How to use these in the HTML, slightly overlapping?
|
|
cardinality = row["element options"].capitalize()
|
|
prop_type = row["element type"].capitalize()
|
|
|
|
if row["element controlled values or terms"]:
|
|
controlled_vocab = True
|
|
|
|
exportVocabulary(
|
|
row["element controlled values or terms"], element_name_safe
|
|
)
|
|
else:
|
|
controlled_vocab = False
|
|
|
|
if row["mandatory?"] == "MANDATORY":
|
|
required = True
|
|
else:
|
|
required = False
|
|
|
|
if row["dspace field name"] is not None and row["dspace field name"] != "":
|
|
dspace_field_name = row["dspace field name"]
|
|
else:
|
|
dspace_field_name = False
|
|
|
|
# Combine element type and options into a "policy" of sorts and convert
|
|
# them to sentence case because they are ALL CAPS in the Excel. We don't
|
|
# need to do any checks because these fields should always exist.
|
|
policy = f'{row["element type"].capitalize()}. {row["element options"].capitalize()}.'
|
|
|
|
if args.debug:
|
|
print(f"Processed: {row['element name']}")
|
|
|
|
# Create an empty list with lines we'll write to the term's index.md in
|
|
# TOML frontmatter format for Hugo.
|
|
indexLines = []
|
|
indexLines.append("---\n")
|
|
# Use the full title for now (even though it's ugly). Better to fix the
|
|
# schema spreadsheet than try to process the title here.
|
|
indexLines.append("title: '" + row["element name"] + "'\n")
|
|
if dspace_field_name:
|
|
indexLines.append(f"field: '{dspace_field_name}'\n")
|
|
indexLines.append(f"slug: '{element_name_safe}'\n")
|
|
if element_description:
|
|
indexLines.append(f"description: '{element_description}'\n")
|
|
if comment:
|
|
indexLines.append(f"comment: '{comment}'\n")
|
|
indexLines.append(f"required: {required}\n")
|
|
if controlled_vocab:
|
|
indexLines.append(f"vocabulary: '{element_name_safe}.txt'\n")
|
|
if module:
|
|
indexLines.append(f"module: '{module}'\n")
|
|
if cluster:
|
|
indexLines.append(f"cluster: '{cluster}'\n")
|
|
indexLines.append(f"policy: '{policy}'\n")
|
|
## TODO: use some real date...?
|
|
# indexLines.append(f"date: '2019-05-04T00:00:00+00:00'\n")
|
|
indexLines.append("---")
|
|
|
|
with open(f"site/content/terms/{element_name_safe}/index.md", "w") as f:
|
|
f.writelines(indexLines)
|
|
|
|
|
|
def exportVocabulary(vocabulary: str, element_name_safe: str):
|
|
# Create an empty list where we'll add all the values (we don't need to do
|
|
# it this way, but using a list allows us to de-duplicate the values).
|
|
controlledVocabularyLines = []
|
|
for value in vocabulary.split("||"):
|
|
if value not in controlledVocabularyLines:
|
|
controlledVocabularyLines.append(value)
|
|
|
|
with open(
|
|
f"site/content/terms/{element_name_safe}/{element_name_safe}.txt", "w"
|
|
) as f:
|
|
for value in controlledVocabularyLines:
|
|
f.write(f"{value}\n")
|
|
|
|
if args.debug:
|
|
print(f"Exported controlled vocabulary: {element_name_safe}")
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="Parse an ISEAL schema Excel file to produce documentation about metadata requirements."
|
|
)
|
|
parser.add_argument(
|
|
"--clean",
|
|
help="Clean output directory before building.",
|
|
action="store_true",
|
|
)
|
|
parser.add_argument(
|
|
"-d",
|
|
"--debug",
|
|
help="Print debug messages.",
|
|
action="store_true",
|
|
)
|
|
parser.add_argument(
|
|
"-i",
|
|
"--input-file",
|
|
help="Path to schema fields file (idss_schema_fields.xlsx).",
|
|
required=True,
|
|
type=argparse.FileType("r"),
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
if args.clean:
|
|
if args.debug:
|
|
print(f"Cleaning terms output directory")
|
|
|
|
rmtree("site/content/terms", ignore_errors=True)
|
|
|
|
if args.debug:
|
|
print(f"Creating terms output directory")
|
|
# Make sure content directory exists. This is where we will deposit all the term
|
|
# metadata and controlled vocabularies for Hugo to process.
|
|
os.makedirs("site/content/terms", mode=0o755, exist_ok=True)
|
|
|
|
if args.debug:
|
|
print(f"Opening {args.input_file.name}")
|
|
|
|
df = pd.read_excel(args.input_file.name)
|
|
# Added inplace=True
|
|
df.dropna(how="all", axis=1, inplace=True)
|
|
df.fillna("", inplace=True)
|
|
|
|
parseSchema(df)
|