mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 11:42:20 +01:00
Use licenses.json from SPDX instead of spdx-license-list
spdx-license-list has been deprecated[1] and already has outdated information compared to recent SPDX data releases. Now I use the JSON license data directly from SPDX[2] (currently version 3.19).

The JSON file is loaded from the package's data directory using Python 3's stdlib functions from importlib[3], though we now need Python 3.9 as a minimum for importlib.resources.files[4].

Also note that the data directory is not properly packaged via setuptools, so this only works for local installs, and not for versions published to PyPI, for example (I'm currently not doing this anyway). If I want to publish this in the future I will need to modify setup.py/pyproject.toml to include the data files.

[1] https://gitlab.com/uniqx/spdx-license-list
[2] https://github.com/spdx/license-list-data/blob/main/json/licenses.json
[3] https://copdips.com/2022/09/adding-data-files-to-python-package-with-setup-py.html
[4] https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
This commit is contained in:
parent
051777bcec
commit
7cc49b500d
@ -9,13 +9,12 @@ import country_converter as coco
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
import spdx_license_list
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
from stdnum import issn as stdnum_issn
|
||||
|
||||
from csv_metadata_quality.util import is_mojibake
|
||||
from csv_metadata_quality.util import is_mojibake, load_spdx_licenses
|
||||
|
||||
|
||||
def issn(field):
|
||||
@ -317,9 +316,11 @@ def spdx_license_identifier(field):
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
spdx_licenses = load_spdx_licenses()
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
if value not in spdx_license_list.LICENSES:
|
||||
if value not in spdx_licenses:
|
||||
print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")
|
||||
|
||||
return
|
||||
|
6308
csv_metadata_quality/data/licenses.json
Normal file
6308
csv_metadata_quality/data/licenses.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,9 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
|
||||
import json
|
||||
from importlib.resources import files
|
||||
|
||||
from ftfy.badness import is_bad
|
||||
|
||||
|
||||
@ -49,3 +53,13 @@ def is_mojibake(field):
|
||||
else:
|
||||
# Encodable as CP-1252, Mojibake alert level high
|
||||
return True
|
||||
|
||||
|
||||
def load_spdx_licenses():
    """Return a list of SPDX short license identifiers.

    The identifiers are read from the ``data/licenses.json`` resource of the
    ``csv_metadata_quality`` package: the ``licenseId`` value of every entry
    under the top-level ``licenses`` key, in the order they appear in the
    file.

    Returns:
        list: One ``licenseId`` value per license entry.

    Raises:
        KeyError: If the JSON document has no ``licenses`` key, or an entry
            has no ``licenseId`` key.
        json.JSONDecodeError: If the resource is not valid JSON.
    """
    resource = files("csv_metadata_quality").joinpath("data/licenses.json")

    # Open through the Traversable API instead of the built-in open():
    # files() is only guaranteed to return a Traversable, not a filesystem
    # path. Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with resource.open("r", encoding="utf-8") as f:
        spdx_data = json.load(f)

    # Extract the license ID for each license. The loop variable is not
    # called "license" to avoid shadowing the builtin of the same name.
    return [entry["licenseId"] for entry in spdx_data["licenses"]]
|
||||
|
@ -18,7 +18,6 @@ requests = "^2.28.1"
|
||||
requests-cache = "^0.9.7"
|
||||
langid = "^1.1.6"
|
||||
colorama = "^0.4.5"
|
||||
spdx-license-list = "^0.5.2"
|
||||
ftfy = "^6.1.1"
|
||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
|
||||
|
Loading…
Reference in New Issue
Block a user