Add util/create-schema-dspace.py

Script to read the ISEAL Core and FSC metadata CSVs and create the
appropriate schemas and metadata fields in a DSpace 6.x repository.
This commit is contained in:
Alan Orth 2022-01-09 10:10:00 +02:00
parent f21038663e
commit 57cc35c839
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

254
util/create-schema-dspace.py Executable file
View File

@ -0,0 +1,254 @@
#!/usr/bin/env python3
#
# create-schema-dspace.py 0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# A quick and dirty script to read schema fields from a CSV file and create
# them in the DSpace metadata registry using the REST API. Specify an email and
# password for a DSpace user with administrator privileges when running:
#
# $ ./util/create-schema-dspace.py -e me@example.com -p 'fuu!'
#
# You can optionally specify the URL of a DSpace REST application (default is to
# use http://localhost:8080/rest).
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
# $ pip install requests pandas
#
import argparse
import signal
import sys
import pandas as pd
import requests
def signal_handler(signal, frame):
    """Terminate the script with exit status 1 when a signal is received.

    Both arguments are supplied by the signal machinery and are not used.
    """
    raise SystemExit(1)
# Try to log in, for example:
#
# $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu'
#
def login(user, password):
    """Log in to the DSpace REST API and return the session ID.

    Posts the credentials as form data to the login endpoint and returns the
    value of the JSESSIONID cookie from the response.

    Parameters:
        user: email address of the DSpace user.
        password: password of the DSpace user.

    Exits the script with status 1 if the API cannot be reached, if the
    response status is not OK, or if no JSESSIONID cookie is returned.
    """
    request_url = rest_login_endpoint
    headers = {"user-agent": rest_user_agent}
    # Use the function's own arguments rather than reaching into the global
    # argparse namespace, so callers control which credentials are sent.
    data = {"email": user, "password": password}

    print("Logging in...")

    try:
        request = requests.post(request_url, headers=headers, data=data)
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(" Login failed.\n")
        sys.exit(1)

    session_id = request.cookies.get("JSESSIONID")
    if session_id is None:
        # A successful status without a session cookie is unusable for the
        # authenticated requests that follow, so fail here with a clear error.
        sys.stderr.write(" Login did not return a JSESSIONID cookie.\n")
        sys.exit(1)

    return session_id
# Check the authentication status of the specified JSESSIONID.
def check_session(sessionid):
    """Verify that a previously obtained JSESSIONID is still authenticated.

    Sends the session cookie to the status endpoint and inspects the
    "authenticated" key of the JSON response.

    Parameters:
        sessionid: value of the JSESSIONID cookie to check.

    Returns None when the session is authenticated. Exits the script with
    status 1 if the API cannot be reached, if the response status is not OK,
    or if the session is no longer authenticated.
    """
    request_url = rest_status_endpoint
    headers = {"user-agent": rest_user_agent, "Accept": "application/json"}
    cookies = {"JSESSIONID": sessionid}

    print("Checking session status...")

    try:
        request = requests.get(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(" Error checking session status.\n")
        sys.exit(1)

    if not request.json()["authenticated"]:
        sys.stderr.write(f" Session expired: {sessionid}.\n")
        sys.exit(1)
# Create a new schema by passing a Schema Object, for example:
#
# $ http POST http://localhost:8080/rest/registries/schema Cookie:JSESSIONID=549756EB08169F17697A56A7D56901B3 < schema.json
#
# See: https://wiki.lyrasis.org/display/DSDOC6x/REST+API#RESTAPI-SchemaObject
def create_schema(schema):
    """Create a metadata schema in the DSpace registry.

    Parameters:
        schema: dict describing the schema; it is sent as the JSON request
            body and its "prefix" key is used in the progress message.

    Returns True if the response status is OK and False otherwise. Exits the
    script with status 1 if the API cannot be reached.
    """
    request_url = rest_schema_registry_endpoint
    headers = {"user-agent": rest_user_agent}
    cookies = {"JSESSIONID": session}

    print(f" Attempting to create schema: {schema['prefix']}")

    try:
        # headers must be passed by keyword: the second positional parameter
        # of requests.post() is `data`, which would send the headers as a form
        # body and cause the JSON payload to be dropped.
        request = requests.post(
            request_url,
            headers=headers,
            json=schema,
            cookies=cookies,
        )
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    return request.status_code == requests.codes.ok
# Create a new metadata field by passing a MetadataField Object, for example:
#
# $ http POST http://localhost:8080/rest/registries/schema/fuuu/metadata-fields \
# Cookie:JSESSIONID=549756EB08169F17697A56A7D56901B3 < field.json
#
# See: https://wiki.lyrasis.org/display/DSDOC6x/REST+API#RESTAPI-MetadataFieldObject
def create_field(schema_prefix, field):
    """Create a metadata field under an existing schema in the DSpace registry.

    Parameters:
        schema_prefix: prefix of the schema that the field belongs to; it is
            used to build the request URL.
        field: dict describing the field; it is sent as the JSON request body
            and its "name" key is used in the progress message.

    Returns True if the response status is OK and False otherwise. Exits the
    script with status 1 if the API cannot be reached.
    """
    request_url = f"{rest_schema_registry_endpoint}/{schema_prefix}/metadata-fields"
    headers = {"user-agent": rest_user_agent}
    cookies = {"JSESSIONID": session}

    print(f" Attempting to create field: {field['name']}")

    try:
        request = requests.post(
            request_url,
            headers=headers,
            json=field,
            cookies=cookies,
        )
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    return request.status_code == requests.codes.ok
def parse_fields(schema_df):
    """Create a DSpace metadata field for each IS or FSC row of a schema table.

    Parameters:
        schema_df: pandas DataFrame with the columns "dspace field name"
            (for example is.link.url) and "element name".

    Each field name is split on "." into prefix, element and optional
    qualifier. Rows whose prefix is not exactly "is" or "fsc" are skipped, as
    are rows with a blank field name. A name with a prefix but no element is
    reported on stderr and skipped. Returns None.
    """
    # Only create fields in the IS and FSC schemas for now. In the future we
    # may create fields for other schema extensions, but we don't want to
    # create dcterms fields, for example. The prefix must match exactly: a
    # substring test would also accept prefixes such as "iso".
    allowed_prefixes = ("is", "fsc")

    # iterrows() yields (index, row) pairs; indexing the row by column heading
    # is what we need here, so the index itself is discarded.
    for _, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]

        # Blank cells are read by pandas as NaN (a float), which has no
        # split() method; there is nothing to create for such rows.
        if not isinstance(dspace_field_name, str):
            continue

        # Split once, ie is.link.url -> ["is", "link", "url"]
        parts = dspace_field_name.split(".")
        dspace_field_prefix = parts[0]

        if dspace_field_prefix not in allowed_prefixes:
            continue

        if len(parts) < 2:
            sys.stderr.write(f" Skipping field without an element: {dspace_field_name}\n")
            continue

        # The element, ie "link" in is.link.url
        dspace_field_element = parts[1]
        # The qualifier, ie "url" in is.link.url, if it exists
        dspace_field_qualifier = parts[2] if len(parts) > 2 else None

        # Build the metadata field description. We can pass the dict to
        # requests directly and it will be converted to JSON.
        field = {
            "element": dspace_field_element,
            "qualifier": dspace_field_qualifier,
            "name": dspace_field_name,
            "description": row["element name"],
        }

        if create_field(dspace_field_prefix, field):
            print(f" Created field: {dspace_field_name}")
        else:
            print(f" Field probably already exists: {dspace_field_name}")
# Command-line interface. Either an email and password or an existing
# JSESSIONID must be supplied.
parser = argparse.ArgumentParser(
    description="Create ISEAL and FSC schemas in a DSpace 6.x repository."
)
parser.add_argument("-d", "--debug", help="Print debug messages.", action="store_true")
parser.add_argument(
    "-u",
    "--rest-url",
    help="URL of the DSpace 6.x REST API.",
    default="http://localhost:8080/rest",
)
parser.add_argument("-e", "--user", help="Email of administrator user.")
parser.add_argument("-p", "--password", help="Password of administrator user.")
parser.add_argument(
    "-s", "--jsessionid", help="JSESSIONID, if previously authenticated."
)
args = parser.parse_args()

# Without an existing session we have to log in, which needs both credentials.
# Fail early with a usage message instead of posting empty values to the API.
if not args.jsessionid and not (args.user and args.password):
    parser.error("specify --user and --password, or an existing --jsessionid")

# DSpace 6.x REST API base URL and endpoints
rest_base_url = args.rest_url
rest_login_endpoint = f"{rest_base_url}/login"
rest_status_endpoint = f"{rest_base_url}/status"
rest_schema_registry_endpoint = f"{rest_base_url}/registries/schema"
rest_user_agent = "Alan Test Python Requests Bot"
session = args.jsessionid

# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)

# Try to login if no session was passed, otherwise make sure the session that
# was passed is still valid.
if not args.jsessionid:
    session = login(args.user, args.password)
else:
    check_session(args.jsessionid)

if args.debug:
    sys.stderr.write(f" Logged in, using JSESSIONID: {session}\n")

print("\nCreating schemas...")

iseal_schema = {
    "namespace": "https://iseal-community.github.io/iseal-core",
    "prefix": "is",
}
fsc_schema = {
    "namespace": "https://iseal-community.github.io/iseal-core/fsc",
    "prefix": "fsc",
}

# A failed request is reported as "probably already exists" because the
# script does not inspect the response beyond its status code.
for schema_label, schema in [("ISEAL Core", iseal_schema), ("FSC", fsc_schema)]:
    if create_schema(schema):
        print(f" Created {schema_label} schema")
    else:
        print(f" {schema_label} schema probably already exists")

print("\nCreating fields...")

for csv_path in ["data/iseal-core.csv", "data/fsc.csv"]:
    if args.debug:
        sys.stderr.write(f" Opening {csv_path}\n")

    # Keep the try body down to the one call that opens the file, so the
    # handler only reports a missing CSV.
    try:
        df = pd.read_csv(csv_path, usecols=["dspace field name", "element name"])
    except FileNotFoundError:
        sys.stderr.write(f" Could not open {csv_path}\n")
        continue

    parse_fields(df)