Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result will only be a warning.
2025-07-25 07:18:04 +02:00 · 2021-03-11 10:33:16 +02:00
parent b16fa9121f
commit 6e4b0e5c1b
4 changed files with 27 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -103,7 +103,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
 - Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
 - Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
 - Add an option to drop invalid AGROVOC subjects?
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -150,6 +150,11 @@ def run(argv):
        if column == "filename":
            df[column] = df[column].apply(check.filename_extension)
        # Check: SPDX license identifier
        match = re.match(r"dcterms\.license.*$", column)
        if match is not None:
            df[column] = df[column].apply(check.spdx_license_identifier)
    ##
    # Perform some checks on rows so we can consider items as a whole rather
    # than simply on a field-by-field basis. This allows us to check whether
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -3,6 +3,7 @@ from datetime import datetime, timedelta
 import pandas as pd
 import requests
 import requests_cache
 import spdx_license_list
 from colorama import Fore
 from pycountry import languages
@ -317,3 +318,23 @@ def filename_extension(field):
            print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
    return field
 def spdx_license_identifier(field):
     """Check if a license is a valid SPDX identifier.

     Prints a warning for every value that is not found in
     ``spdx_license_list.LICENSES``. This is a check only: the field is
     handed back exactly as it was received.

     Args:
         field: A single metadata cell. Either a string, which may hold
             several values separated by "||", or a missing value.

     Returns:
         The field, unchanged.
     """
     # Skip fields with missing values, but return the value itself so that
     # applying this check over a column does not overwrite NaN with None.
     if pd.isna(field):
         return field

     # Multi-value fields use "||" as the separator; check each value alone
     for value in field.split("||"):
         if value not in spdx_license_list.LICENSES:
             print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

     return field
--- a/pyproject.toml
+++ b/pyproject.toml
@ -20,6 +20,7 @@ requests-cache = "^0.5.2"
 pycountry = "^19.8.18"
 langid = "^1.1.6"
 colorama = "^0.4.4"
 spdx-license-list = "^0.5.2"
 [tool.poetry.dev-dependencies]
 pytest = "^6.1.1"