mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Add validation of SPDX license identifiers
Currently this only checks the dcterms.license field and the result will only be a warning.
This commit is contained in:
parent
b16fa9121f
commit
6e4b0e5c1b
@ -103,7 +103,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Better logging, for example with INFO, WARN, and ERR levels
|
- Better logging, for example with INFO, WARN, and ERR levels
|
||||||
- Verbose, debug, or quiet options
|
- Verbose, debug, or quiet options
|
||||||
- Warn if an author is shorter than 3 characters?
|
- Warn if an author is shorter than 3 characters?
|
||||||
- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
|
|
||||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
||||||
- Warn if two items use the same file in `filename` column
|
- Warn if two items use the same file in `filename` column
|
||||||
- Add an option to drop invalid AGROVOC subjects?
|
- Add an option to drop invalid AGROVOC subjects?
|
||||||
|
@ -150,6 +150,11 @@ def run(argv):
|
|||||||
if column == "filename":
|
if column == "filename":
|
||||||
df[column] = df[column].apply(check.filename_extension)
|
df[column] = df[column].apply(check.filename_extension)
|
||||||
|
|
||||||
|
# Check: SPDX license identifier
|
||||||
|
match = re.match(r"dcterms\.license.*$", column)
|
||||||
|
if match is not None:
|
||||||
|
df[column] = df[column].apply(check.spdx_license_identifier)
|
||||||
|
|
||||||
##
|
##
|
||||||
# Perform some checks on rows so we can consider items as a whole rather
|
# Perform some checks on rows so we can consider items as a whole rather
|
||||||
# than simply on a field-by-field basis. This allows us to check whether
|
# than simply on a field-by-field basis. This allows us to check whether
|
||||||
|
@ -3,6 +3,7 @@ from datetime import datetime, timedelta
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import requests_cache
|
import requests_cache
|
||||||
|
import spdx_license_list
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
|
|
||||||
@ -317,3 +318,23 @@ def filename_extension(field):
|
|||||||
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
|
def spdx_license_identifier(field):
    """Warn about license values that are not SPDX identifiers.

    The field is treated as a multi-value string separated by "||". Each
    value that is not present in ``spdx_license_list.LICENSES`` is printed
    as a warning; nothing is modified.

    Returns the field unchanged, or None when the field is missing
    (``pd.isna(field)`` is true).
    """

    # Nothing to validate when the cell is empty
    if pd.isna(field):
        return None

    known_licenses = spdx_license_list.LICENSES

    # Look at every value of a "||"-separated multi-value field
    for value in field.split("||"):
        if value in known_licenses:
            continue

        print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field
|
||||||
|
@ -20,6 +20,7 @@ requests-cache = "^0.5.2"
|
|||||||
pycountry = "^19.8.18"
|
pycountry = "^19.8.18"
|
||||||
langid = "^1.1.6"
|
langid = "^1.1.6"
|
||||||
colorama = "^0.4.4"
|
colorama = "^0.4.4"
|
||||||
|
spdx-license-list = "^0.5.2"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
pytest = "^6.1.1"
|
pytest = "^6.1.1"
|
||||||
|
Loading…
Reference in New Issue
Block a user