diff --git a/README.md b/README.md
index 6899320..00009e5 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
-- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
 - Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
 - Add an option to drop invalid AGROVOC subjects?
diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index c5794d8..bdea4ec 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -150,6 +150,11 @@ def run(argv):
         if column == "filename":
             df[column] = df[column].apply(check.filename_extension)
 
+        # Check: SPDX license identifier
+        match = re.match(r"dcterms\.license.*$", column)
+        if match is not None:
+            df[column] = df[column].apply(check.spdx_license_identifier)
+
     ##
     # Perform some checks on rows so we can consider items as a whole rather
     # than simple on a field-by-field basis. This allows us to check whether
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index f5a7eda..d0ba1f9 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -3,6 +3,7 @@ from datetime import datetime, timedelta
 import pandas as pd
 import requests
 import requests_cache
+import spdx_license_list
 from colorama import Fore
 from pycountry import languages
 
@@ -317,3 +318,24 @@ def filename_extension(field):
             print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
 
     return field
+
+
+def spdx_license_identifier(field):
+    """Check if a license is a valid SPDX identifier.
+
+    A multi-value field is split on the "||" separator and every value is
+    looked up in the SPDX license list; each unknown value is printed.
+
+    Returns the field unchanged, or None if the field is missing (NaN).
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split("||"):
+        if value not in spdx_license_list.LICENSES:
+            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")
+
+    return field
diff --git a/pyproject.toml b/pyproject.toml
index 01998ed..af6a221 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ requests-cache = "^0.5.2"
 pycountry = "^19.8.18"
 langid = "^1.1.6"
 colorama = "^0.4.4"
+spdx-license-list = "^0.5.2"
 
 [tool.poetry.dev-dependencies]
 pytest = "^6.1.1"