iseal-core/util/create-rdf.py
Alan Orth fe36c5c25b util/*.py: run isort
Sorts and organizes the Python imports.
2022-01-30 13:29:07 +03:00

468 lines
20 KiB
Python
Executable File

#!/usr/bin/env python
# coding: utf-8
import json
import os
import re
import pandas as pd
from rdflib import BNode, Graph, Literal, URIRef
from rdflib.namespace import DC, DCTERMS, OWL, RDF, RDFS, SKOS, XSD
def make_core():
g = Graph()
##namespace
# NS = "http://iseal.org/terms/"
NS = "https://alanorth.github.io/iseal-schema/#"
## create ontology
iseal = URIRef(NS)
g.add((iseal, RDF.type, OWL.Ontology))
df = pd.read_csv("../data/iseal-core.csv")
df.dropna(how="all", axis=1)
df.fillna("", inplace=True)
for index, row in df.iterrows():
element_name = row["element name"]
element_description = row["element description"]
comment = row["element guidance"]
example = row["element link for more information"]
cardinality = row["element options"]
prop_type = row["element type"]
controlled_vocab = row["element controlled values or terms"]
module = row["idss element cluster"]
module_cat = row["idss schema module"]
dc = row["element link for dublin core attributes"]
dspace = row["dspace field name"]
##module
moduleUri = URIRef(NS + module)
if not (None, SKOS.prefLabel, Literal(module)) in g:
##create module as skos concept
g.add((moduleUri, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((moduleUri, SKOS.prefLabel, Literal(module)))
##element
# if '-' not in element_name:
if True: ## lazy reindenting
concept = module_cat # element_name.split(' - ')[0]
# element = element_name.strip()
## code from Alan
# Make sure element name is URL friendly because we need to use it in
# the file system and in the URL.
#
# Replace two or more whitespaces with one
element_name = re.sub(r"\s{2,}", " ", element_name)
# Replace unnecessary stuff in some element names (I should tell Peter
# that these belong in the description)
element_name = re.sub(r"\s?\(\w+\)", "", element_name)
# Remove commas and question marks
element_name = re.sub(r"[,?]", "", element_name)
# Replace ": " with a dash (as in "Evaluation: ")
element_name = element_name.replace(": ", "-")
# Replace " / " with a dash (as in "biome / zone")
element_name = element_name.replace(" / ", "-")
# Replace whitespace, colons, and slashes with dashes
element_name = re.sub(r"[\s/]", "-", element_name)
# Lower case it
element_name = element_name.lower()
# Strip just in case
element_name = element_name.strip()
# For example Certifying Body, FSC audit, Certificate, etc
cluster = row["idss element cluster"].capitalize()
# For example Assurance, Certification, Core, Impact, etc
module = row["idss schema module"].capitalize()
# Generate a "safe" version of the element name for use in URLs and
# files by combining the cluster and the element name. This could
# change in the future.
element_name_safe = cluster.replace(" ", "-").lower() + "-" + element_name
element = element_name_safe
conceptUri = URIRef(NS + concept.replace(" ", "_"))
if not (None, SKOS.prefLabel, Literal(concept)) in g:
##create concept as skos concept
g.add((conceptUri, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((conceptUri, SKOS.prefLabel, Literal(concept)))
g.add((conceptUri, RDFS.subClassOf, moduleUri))
## create properties
elementURI = URIRef(NS + element.replace(" ", "_"))
if prop_type == "CONTROLLED VALUE": ## object property
g.add((elementURI, SKOS.prefLabel, Literal(element)))
g.add((elementURI, RDF.type, OWL.ObjectProperty))
g.add((elementURI, OWL.domain, conceptUri))
## add suproperty link
if dc:
dct = dc.split(":")[1]
if "wgs84" in dc:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef(
"http://www.w3.org/2003/01/geo/wgs84_pos#" + dct
),
)
)
else:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef("http://purl.org/dc/terms/" + dct),
)
)
## add dspace alternative ID
g.add(
(
elementURI,
URIRef("http://purl.org/dc/terms/alternative"),
Literal(dspace),
)
)
## create controlled vocab
cvURI = URIRef(NS + "VOCAB_" + element.replace(" ", "_"))
g.add(
(cvURI, RDF.type, OWL.Class)
) ## SKOS.Concept ## SKOS.Collection??
g.add((cvURI, SKOS.prefLabel, Literal("VOCAB " + element)))
for term in controlled_vocab.split("||"):
termURI = URIRef(NS + term.replace(" ", "_").replace("|", ""))
g.add((termURI, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((termURI, SKOS.prefLabel, Literal(term)))
g.add((termURI, RDFS.subClassOf, cvURI)) ## SKOS.member???
g.add((elementURI, OWL.range, cvURI))
## add the controlled vocab information on properties directly
g.add(
(
elementURI,
URIRef("http://purl.org/dc/dcam/rangeIncludes"),
Literal(
"https://raw.githubusercontent.com/alanorth/iseal-schema/main/data/controlled-vocabularies/"
+ element
+ ".txt"
),
)
)
## cardinality
if cardinality == "MULTI SELECT FROM CONTROL LIST":
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.minQualifiedCardinality, Literal(1)))
g.add((br, OWL.someValuesFrom, cvURI))
g.add((conceptUri, RDFS.subClassOf, br))
else:
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.maxQualifiedCardinality, Literal(1)))
g.add((br, OWL.onClass, cvURI))
g.add((conceptUri, RDFS.subClassOf, br))
# elif prop_type == 'URL': ## object property
# g.add((elementURI, RDF.type, OWL.ObjectProperty))
# g.add((elementURI, OWL.domain, conceptUri))
# g.add((elementURI, OWL.range, URIRef("") ))
# g.add((elementURI, SKOS.prefLabel, Literal(element)))
else: ## datatype properties
g.add((elementURI, SKOS.prefLabel, Literal(element)))
g.add((elementURI, RDF.type, OWL.DatatypeProperty))
g.add((elementURI, OWL.domain, conceptUri))
if dc:
dct = dc.split(":")[1]
if "wgs84" in dc:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef(
"http://www.w3.org/2003/01/geo/wgs84_pos#" + dct
),
)
)
else:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef("http://purl.org/dc/terms/" + dct),
)
)
## add dspace alternative ID
g.add(
(
elementURI,
URIRef("http://purl.org/dc/terms/alternative"),
Literal(dspace),
)
)
range = None
if prop_type == "DATE":
g.add((elementURI, OWL.range, XSD.date))
range = XSD.date
elif prop_type == "NUMERIC VALUE":
g.add((elementURI, OWL.range, XSD.float))
range = XSD.float
else:
g.add((elementURI, OWL.range, XSD.string))
range = XSD.string
##cardinality
if cardinality == "REPEAT VALUES":
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.someValuesFrom, range))
g.add((conceptUri, RDFS.subClassOf, br))
else:
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.maxQualifiedCardinality, Literal(1)))
g.add((br, OWL.onDataRange, range))
g.add((conceptUri, RDFS.subClassOf, br))
if comment:
g.add((elementURI, SKOS.scopeNote, Literal(comment)))
if example:
g.add((elementURI, RDFS.comment, Literal(example)))
if element_description:
g.add((elementURI, SKOS.definition, Literal(element_description)))
# else:
# print(element_name)
## save graph
g.serialize(destination="idds_new3.ttl", format="turtle")
def make_fsc():
g = Graph()
##namespace
# NS = "http://iseal.org/terms/"
NS = "https://alanorth.github.io/iseal-schema/FSC#"
## create ontology
iseal = URIRef(NS)
g.add((iseal, RDF.type, OWL.Ontology))
df = pd.read_excel("./idss_schema_fields_new2.xlsx", "fsc extension")
df.dropna(how="all", axis=1)
df.fillna("", inplace=True)
for index, row in df.iterrows():
element_name = row["element name"]
element_description = row["element description"]
comment = row["element guidance"]
example = row["element link for more information"]
cardinality = row["element options"]
prop_type = row["element type"]
controlled_vocab = row["element controlled values or terms"]
module = row["idss element cluster"]
module_cat = row["fsc extension module"]
dc = row["element link for dublin core attributes"]
dspace = row["dspace field name"]
##module
moduleUri = URIRef(NS + module)
if not (None, SKOS.prefLabel, Literal(module)) in g:
##create module as skos concept
g.add((moduleUri, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((moduleUri, SKOS.prefLabel, Literal(module)))
##element
# if '-' not in element_name:
if True: ## lazy reindenting
concept = module_cat # element_name.split(' - ')[0]
# element = element_name.strip()
## code from Alan
# Make sure element name is URL friendly because we need to use it in
# the file system and in the URL.
#
# Replace two or more whitespaces with one
element_name = re.sub(r"\s{2,}", " ", element_name)
# Replace unnecessary stuff in some element names (I should tell Peter
# that these belong in the description)
element_name = re.sub(r"\s?\(\w+\)", "", element_name)
# Remove commas and question marks
element_name = re.sub(r"[,?]", "", element_name)
# Replace ": " with a dash (as in "Evaluation: ")
element_name = element_name.replace(": ", "-")
# Replace " / " with a dash (as in "biome / zone")
element_name = element_name.replace(" / ", "-")
# Replace whitespace, colons, and slashes with dashes
element_name = re.sub(r"[\s/]", "-", element_name)
# Lower case it
element_name = element_name.lower()
# Strip just in case
element_name = element_name.strip()
# For example Certifying Body, FSC audit, Certificate, etc
cluster = row["idss element cluster"].capitalize()
# For example Assurance, Certification, Core, Impact, etc
module = row["fsc extension module"].capitalize()
# Generate a "safe" version of the element name for use in URLs and
# files by combining the cluster and the element name. This could
# change in the future.
element_name_safe = cluster.replace(" ", "-").lower() + "-" + element_name
element = element_name_safe
# remove extra fsc in name
element = element.replace("fsc-fsc-", "fsc-")
conceptUri = URIRef(NS + concept.replace(" ", "_"))
if not (None, SKOS.prefLabel, Literal(concept)) in g:
##create concept as skos concept
g.add((conceptUri, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((conceptUri, SKOS.prefLabel, Literal(concept)))
g.add((conceptUri, RDFS.subClassOf, moduleUri))
## create properties
elementURI = URIRef(NS + element.replace(" ", "_"))
if prop_type == "CONTROLLED VALUE": ## object property
g.add((elementURI, SKOS.prefLabel, Literal(element)))
g.add((elementURI, RDF.type, OWL.ObjectProperty))
g.add((elementURI, OWL.domain, conceptUri))
## add suproperty link
if dc:
dct = dc.split(":")[1]
if "wgs84" in dc:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef(
"http://www.w3.org/2003/01/geo/wgs84_pos#" + dct
),
)
)
else:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef("http://purl.org/dc/terms/" + dct),
)
)
## add dspace alternative ID
# g.add((elementURI, URIRef("http://purl.org/dc/terms/alternative"), Literal(dspace)))
## create controlled vocab
cvURI = URIRef(NS + "VOCAB_" + element.replace(" ", "_"))
g.add(
(cvURI, RDF.type, OWL.Class)
) ## SKOS.Concept ## SKOS.Collection??
g.add((cvURI, SKOS.prefLabel, Literal("VOCAB " + element)))
for term in controlled_vocab.split("||"):
termURI = URIRef(NS + term.replace(" ", "_").replace("|", ""))
g.add((termURI, RDF.type, OWL.Class)) ## SKOS.Concept
g.add((termURI, SKOS.prefLabel, Literal(term)))
g.add((termURI, RDFS.subClassOf, cvURI)) ## SKOS.member???
g.add((elementURI, OWL.range, cvURI))
## add the controlled vocab information on properties directly
g.add(
(
elementURI,
URIRef("http://purl.org/dc/dcam/rangeIncludes"),
Literal(
"https://raw.githubusercontent.com/alanorth/iseal-schema/main/data/controlled-vocabularies/"
+ element
+ ".txt"
),
)
)
## cardinality
if cardinality == "MULTI SELECT FROM CONTROL LIST":
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.minQualifiedCardinality, Literal(1)))
g.add((br, OWL.someValuesFrom, cvURI))
g.add((conceptUri, RDFS.subClassOf, br))
else:
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.maxQualifiedCardinality, Literal(1)))
g.add((br, OWL.onClass, cvURI))
g.add((conceptUri, RDFS.subClassOf, br))
# elif prop_type == 'URL': ## object property
# g.add((elementURI, RDF.type, OWL.ObjectProperty))
# g.add((elementURI, OWL.domain, conceptUri))
# g.add((elementURI, OWL.range, URIRef("") ))
# g.add((elementURI, SKOS.prefLabel, Literal(element)))
else: ## datatype properties
g.add((elementURI, SKOS.prefLabel, Literal(element)))
g.add((elementURI, RDF.type, OWL.DatatypeProperty))
g.add((elementURI, OWL.domain, conceptUri))
if dc:
dct = dc.split(":")[1]
if "wgs84" in dc:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef(
"http://www.w3.org/2003/01/geo/wgs84_pos#" + dct
),
)
)
else:
g.add(
(
elementURI,
RDFS.subPropertyOf,
URIRef("http://purl.org/dc/terms/" + dct),
)
)
## add dspace alternative ID
# g.add((elementURI, URIRef("http://purl.org/dc/terms/alternative"), Literal(dspace)))
range = None
if prop_type == "DATE":
g.add((elementURI, OWL.range, XSD.date))
range = XSD.date
elif prop_type == "NUMERIC VALUE":
g.add((elementURI, OWL.range, XSD.float))
range = XSD.float
else:
g.add((elementURI, OWL.range, XSD.string))
range = XSD.string
##cardinality
if cardinality == "REPEAT VALUES":
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.someValuesFrom, range))
g.add((conceptUri, RDFS.subClassOf, br))
else:
br = BNode()
g.add((br, RDF.type, OWL.Restriction))
g.add((br, OWL.onProperty, elementURI))
g.add((br, OWL.maxQualifiedCardinality, Literal(1)))
g.add((br, OWL.onDataRange, range))
g.add((conceptUri, RDFS.subClassOf, br))
if comment:
g.add((elementURI, SKOS.scopeNote, Literal(comment)))
if example:
g.add((elementURI, RDFS.comment, Literal(example)))
if element_description:
g.add((elementURI, SKOS.definition, Literal(element_description)))
# else:
# print(element_name)
## save graph
g.serialize(destination="fsc.ttl", format="turtle")
make_core()