1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.
This commit is contained in:
2021-03-11 10:33:16 +02:00
parent b16fa9121f
commit 6e4b0e5c1b
4 changed files with 27 additions and 1 deletions

View File

@ -150,6 +150,11 @@ def run(argv):
if column == "filename":
df[column] = df[column].apply(check.filename_extension)
# Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column)
if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier)
##
# Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether

View File

@ -3,6 +3,7 @@ from datetime import datetime, timedelta
import pandas as pd
import requests
import requests_cache
import spdx_license_list
from colorama import Fore
from pycountry import languages
@ -317,3 +318,23 @@ def filename_extension(field):
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field
def spdx_license_identifier(field):
    """Check whether each license in a field is a valid SPDX identifier.

    The field may hold several values separated by "||". Every value that is
    not found in ``spdx_license_list.LICENSES`` is reported with a warning on
    standard output; nothing is corrected.

    Returns the field unchanged, including when it is missing, so the result
    can be assigned back to the column without altering the data.
    """
    # Skip fields with missing values, handing them back untouched so that
    # applying this check over a column does not overwrite NaN with None.
    if pd.isna(field):
        return field

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field