#!/usr/bin/env python3
#
# create-schema-dspace.py 0.0.1
#
# SPDX-License-Identifier: GPL-3.0-only
#
# ---
#
# A quick and dirty script to read schema fields from a CSV file and create
# them in the DSpace metadata registry using the REST API. Specify an email and
# password for a DSpace user with administrator privileges when running:
#
#   $ ./util/create-schema-dspace.py -e me@example.com -p 'fuu!'
#
# You can optionally specify the URL of a DSpace REST application (default is to
# use http://localhost:8080/rest).
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
#   $ pip install requests pandas
#

import argparse
import signal
import sys

import pandas as pd
import requests


def signal_handler(signal, frame):
    """Exit with status 1 when the registered signal is received.

    The parameters follow the handler signature used by ``signal.signal``;
    neither is used here.
    """
    sys.exit(1)


def login(user, password):
    """Log in to the DSpace REST API and return the session ID.

    Posts the credentials as form data to the login endpoint, for example:

        $ http -f POST http://localhost:8080/rest/login email=aorth@fuuu.com password='fuuuuuu'

    Args:
        user: Email address of the DSpace user.
        password: Password of the DSpace user.

    Returns:
        The value of the ``JSESSIONID`` cookie from the login response.

    Exits the script with status 1 if the connection fails or the response
    status is not HTTP 200.
    """
    request_url = rest_login_endpoint
    headers = {"user-agent": rest_user_agent}
    # Use the credentials that were passed in rather than reaching for the
    # global argparse namespace.
    data = {"email": user, "password": password}

    print("Logging in...")

    try:
        request = requests.post(request_url, headers=headers, data=data)
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(" Login failed.\n")
        sys.exit(1)

    return request.cookies["JSESSIONID"]


# Check the authentication status of the specified JSESSIONID.
def check_session(sessionid):
    """Verify that an existing JSESSIONID is still authenticated.

    Requests the REST status endpoint with the given session cookie.

    Args:
        sessionid: Value of the ``JSESSIONID`` cookie to check.

    Exits the script with status 1 if the connection fails, the response
    status is not HTTP 200, or the response's ``authenticated`` flag is false.
    """
    request_url = rest_status_endpoint
    headers = {"user-agent": rest_user_agent, "Accept": "application/json"}
    cookies = {"JSESSIONID": sessionid}

    print("Checking session status...")

    try:
        request = requests.get(request_url, headers=headers, cookies=cookies)
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    if request.status_code != requests.codes.ok:
        sys.stderr.write(" Error checking session status.\n")
        sys.exit(1)

    if not request.json()["authenticated"]:
        sys.stderr.write(f" Session expired: {sessionid}.\n")
        sys.exit(1)


def create_schema(schema):
    """Create a metadata schema by posting a Schema Object, for example:

        $ http POST http://localhost:8080/rest/registries/schema \
            Cookie:JSESSIONID=549756EB08169F17697A56A7D56901B3 < schema.json

    See: https://wiki.lyrasis.org/display/DSDOC6x/REST+API#RESTAPI-SchemaObject

    Args:
        schema: Dict with at least a ``prefix`` key; sent as the JSON body.

    Returns:
        True if the response status is HTTP 200, otherwise False.

    Exits the script with status 1 if the connection fails.
    """
    request_url = rest_schema_registry_endpoint
    headers = {"user-agent": rest_user_agent}
    cookies = {"JSESSIONID": session}

    print(f" Attempting to create schema: {schema['prefix']}")

    try:
        # headers must be passed by keyword: the second positional parameter
        # of requests.post is `data`, and a non-empty `data` causes `json` to
        # be ignored.
        request = requests.post(
            request_url,
            headers=headers,
            json=schema,
            cookies=cookies,
        )
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    return request.status_code == requests.codes.ok


def create_field(schema_prefix, field):
    """Create a metadata field by posting a MetadataField Object, for example:

        $ http POST http://localhost:8080/rest/registries/schema/fuuu/metadata-fields \
            Cookie:JSESSIONID=549756EB08169F17697A56A7D56901B3 < field.json

    See: https://wiki.lyrasis.org/display/DSDOC6x/REST+API#RESTAPI-MetadataFieldObject

    Args:
        schema_prefix: Prefix of the schema the field belongs to, ie "is".
        field: Dict with at least a ``name`` key; sent as the JSON body.

    Returns:
        True if the response status is HTTP 200, otherwise False.

    Exits the script with status 1 if the connection fails.
    """
    request_url = f"{rest_schema_registry_endpoint}/{schema_prefix}/metadata-fields"
    headers = {"user-agent": rest_user_agent}
    cookies = {"JSESSIONID": session}

    print(f" Attempting to create field: {field['name']}")

    try:
        request = requests.post(
            request_url,
            headers=headers,
            json=field,
            cookies=cookies,
        )
    except requests.ConnectionError:
        sys.stderr.write(f" Could not connect to REST API: {request_url}.\n")
        sys.exit(1)

    return request.status_code == requests.codes.ok


def _split_field_name(field_name):
    """Split "prefix.element[.qualifier]" into a (prefix, element, qualifier) tuple.

    Missing components are returned as None.
    """
    parts = field_name.split(".")
    prefix = parts[0]
    element = parts[1] if len(parts) > 1 else None
    qualifier = parts[2] if len(parts) > 2 else None

    return prefix, element, qualifier


def parse_fields(schema_df):
    """Create a DSpace metadata field for every IS or FSC row of a schema table.

    Args:
        schema_df: DataFrame with "dspace field name" and "element name"
            columns.

    Rows whose field name is not a string (for example an empty cell) or whose
    schema prefix is not exactly "is" or "fsc" are skipped. Rows in those
    schemas that lack an element are reported on stderr and skipped.
    """
    # Iterate over all rows (the "index, row" syntax allows us to access column
    # headings in each row, which isn't possible if we just do row).
    for _, row in schema_df.iterrows():
        dspace_field_name = row["dspace field name"]
        element_name = row["element name"]

        # Non-string values cannot be split into components.
        if not isinstance(dspace_field_name, str):
            continue

        # Extract the components of the field name, ie is.link.url
        prefix, element, qualifier = _split_field_name(dspace_field_name)

        # Make sure we only try to create fields in IS and FSC schemas for now.
        # In the future we may create fields for other schema extensions, but
        # we don't want to create dcterms fields, for example. Compare the
        # whole prefix: a substring test would also accept "this" or "iso".
        if prefix not in ("is", "fsc"):
            continue

        if not element:
            sys.stderr.write(f" Skipping malformed field name: {dspace_field_name}\n")
            continue

        # We can pass the dict to requests directly and it will be converted
        # to JSON.
        field = {
            "element": element,
            "qualifier": qualifier,
            "name": dspace_field_name,
            "description": element_name,
        }

        if create_field(prefix, field):
            print(f" Created field: {dspace_field_name}")
        else:
            print(f" Field probably already exists: {dspace_field_name}")


def main():
    """Parse arguments, authenticate, then create the schemas and their fields."""
    # The functions in this file read their configuration from module-level
    # names, so publish the values computed here as globals.
    global args, rest_base_url, rest_login_endpoint, rest_status_endpoint
    global rest_schema_registry_endpoint, rest_user_agent, session

    parser = argparse.ArgumentParser(
        description="Create ISEAL and FSC schemas in a DSpace 6.x repository."
    )
    parser.add_argument(
        "-d", "--debug", help="Print debug messages.", action="store_true"
    )
    parser.add_argument(
        "-u",
        "--rest-url",
        help="URL of the DSpace 6.x REST API.",
        default="http://localhost:8080/rest",
    )
    parser.add_argument("-e", "--user", help="Email of administrator user.")
    parser.add_argument("-p", "--password", help="Password of administrator user.")
    parser.add_argument(
        "-s", "--jsessionid", help="JSESSIONID, if previously authenticated."
    )
    args = parser.parse_args()

    # Without an existing session we need both credentials to log in.
    if not args.jsessionid and not (args.user and args.password):
        parser.error("specify --user and --password, or an existing --jsessionid")

    # DSpace 6.x REST API base URL and endpoints (a trailing slash on the base
    # URL would otherwise produce "//" in the endpoint URLs)
    rest_base_url = args.rest_url.rstrip("/")
    rest_login_endpoint = f"{rest_base_url}/login"
    rest_status_endpoint = f"{rest_base_url}/status"
    rest_schema_registry_endpoint = f"{rest_base_url}/registries/schema"
    rest_user_agent = "Alan Test Python Requests Bot"
    session = args.jsessionid

    # set the signal handler for SIGINT (^C)
    signal.signal(signal.SIGINT, signal_handler)

    # Try to login if no session was passed
    if not args.jsessionid:
        session = login(args.user, args.password)
    else:
        check_session(args.jsessionid)

    if args.debug:
        sys.stderr.write(f" Logged in, using JSESSIONID: {session}\n")

    print("\nCreating schemas...")

    schemas = [
        (
            "ISEAL Core",
            {
                "namespace": "https://iseal-community.github.io/iseal-core",
                "prefix": "is",
            },
        ),
        (
            "FSC",
            {
                "namespace": "https://iseal-community.github.io/iseal-core/fsc",
                "prefix": "fsc",
            },
        ),
    ]

    for label, schema in schemas:
        if create_schema(schema):
            print(f" Created {label} schema")
        else:
            print(f" {label} schema probably already exists")

    print("\nCreating fields...")

    for csv_path in ["data/iseal-core.csv", "data/fsc.csv"]:
        if args.debug:
            sys.stderr.write(f" Opening {csv_path}\n")

        try:
            df = pd.read_csv(csv_path, usecols=["dspace field name", "element name"])
        except FileNotFoundError:
            sys.stderr.write(f" Could not open {csv_path}\n")
            continue

        parse_fields(df)


if __name__ == "__main__":
    main()