1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-10 23:26:01 +02:00

Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.
This commit is contained in:
2021-03-11 10:33:16 +02:00
parent b16fa9121f
commit 6e4b0e5c1b
4 changed files with 27 additions and 1 deletions

View File

@ -3,6 +3,7 @@ from datetime import datetime, timedelta
import pandas as pd
import requests
import requests_cache
import spdx_license_list
from colorama import Fore
from pycountry import languages
@ -317,3 +318,23 @@ def filename_extension(field):
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field
def spdx_license_identifier(field):
    """Check if a license is a valid SPDX identifier.

    The field is split on the "||" multi-value separator and each value is
    looked up in ``spdx_license_list.LICENSES``. Every value that is not
    found there is printed as a warning; the field itself is never modified.

    Args:
        field: the raw cell value to check, possibly holding several
            licenses joined by "||".

    Returns:
        None when the field is missing (``pd.isna``), otherwise the field
        exactly as it was passed in.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        # Membership is tested on the value as-is: no trimming or case
        # folding happens before the lookup.
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field