mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-08 06:06:00 +02:00
Add validation of SPDX license identifiers
Currently this only checks the dcterms.license field and the result will only be a warning.
This commit is contained in:
@ -150,6 +150,11 @@ def run(argv):
|
||||
if column == "filename":
|
||||
df[column] = df[column].apply(check.filename_extension)
|
||||
|
||||
# Check: SPDX license identifier
|
||||
match = re.match(r"dcterms\.license.*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.spdx_license_identifier)
|
||||
|
||||
##
|
||||
# Perform some checks on rows so we can consider items as a whole rather
|
||||
# than simply on a field-by-field basis. This allows us to check whether
|
||||
|
@ -3,6 +3,7 @@ from datetime import datetime, timedelta
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
import spdx_license_list
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
|
||||
@ -317,3 +318,23 @@ def filename_extension(field):
|
||||
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def spdx_license_identifier(field):
    """Check whether each license in a field is a known SPDX identifier.

    The field may hold several values separated by "||". Every value that is
    not found in spdx_license_list.LICENSES is printed as a warning; values
    are looked up as-is, without any trimming.

    Returns the field unchanged in all cases, including missing values, so
    that storing the result back into a column never alters the data.
    """

    # Skip fields with missing values, handing them back untouched so a
    # caller that stores the result does not swap NaN for None
    if pd.isna(field):
        return field

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field
|
||||
|
Reference in New Issue
Block a user