1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 12:12:18 +01:00

Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.
This commit is contained in:
Alan Orth 2021-03-11 10:33:16 +02:00
parent b16fa9121f
commit 6e4b0e5c1b
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 27 additions and 1 deletions

View File

@ -103,7 +103,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Better logging, for example with INFO, WARN, and ERR levels - Better logging, for example with INFO, WARN, and ERR levels
- Verbose, debug, or quiet options - Verbose, debug, or quiet options
- Warn if an author is shorter than 3 characters? - Warn if an author is shorter than 3 characters?
- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006 - Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
- Warn if two items use the same file in `filename` column - Warn if two items use the same file in `filename` column
- Add an option to drop invalid AGROVOC subjects? - Add an option to drop invalid AGROVOC subjects?

View File

@ -150,6 +150,11 @@ def run(argv):
if column == "filename": if column == "filename":
df[column] = df[column].apply(check.filename_extension) df[column] = df[column].apply(check.filename_extension)
# Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column)
if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier)
## ##
# Perform some checks on rows so we can consider items as a whole rather # Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether # than simply on a field-by-field basis. This allows us to check whether

View File

@ -3,6 +3,7 @@ from datetime import datetime, timedelta
import pandas as pd import pandas as pd
import requests import requests
import requests_cache import requests_cache
import spdx_license_list
from colorama import Fore from colorama import Fore
from pycountry import languages from pycountry import languages
@ -317,3 +318,23 @@ def filename_extension(field):
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field return field
def spdx_license_identifier(field):
    """Check if a license is a valid SPDX identifier.

    The field may carry several values separated by "||"; each value is
    looked up in ``spdx_license_list.LICENSES``. Every value that is not
    found there is printed as a warning. Nothing is raised and the data is
    never modified.

    Args:
        field: The cell value to check, or a missing value (NaN/None).

    Returns:
        The field exactly as it was passed in, including missing values, so
        that applying this check over a column leaves its cells untouched.
    """
    # Skip fields with missing values, handing them back as-is rather than
    # returning None (which would rewrite NaN cells when used with apply).
    if pd.isna(field):
        return field

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field

View File

@ -20,6 +20,7 @@ requests-cache = "^0.5.2"
pycountry = "^19.8.18" pycountry = "^19.8.18"
langid = "^1.1.6" langid = "^1.1.6"
colorama = "^0.4.4" colorama = "^0.4.4"
spdx-license-list = "^0.5.2"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^6.1.1" pytest = "^6.1.1"