mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-24 14:50:17 +01:00
Alan Orth
7cc49b500d
spdx-license-list has been deprecated[1] and already has outdated information compared to recent SPDX data releases. Now I use the JSON license data directly from SPDX[2] (currently version 3.19). The JSON file is loaded from the package's data directory using Python 3's stdlib functions from importlib[3], though we now need Python 3.9 as a minimum for importlib.resources.files[4]. Also note that the data directory is not properly packaged via setuptools, so this only works for local installs, and not via versions published to pypi, for example (I'm currently not doing this anyways). If I want to publish this in the future I will need to modify setup.py/pyproject.toml to include the data files. [1] https://gitlab.com/uniqx/spdx-license-list [2] https://github.com/spdx/license-list-data/blob/main/json/licenses.json [3] https://copdips.com/2022/09/adding-data-files-to-python-package-with-setup-py.html [4] https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files
66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
|
|
import json
|
|
from importlib.resources import files
|
|
|
|
from ftfy.badness import is_bad
|
|
|
|
|
|
def is_nfc(field):
    """Utility function to check whether a string is using normalized Unicode.

    Checks the string against Unicode Normalization Form C (NFC) using the
    standard library's unicodedata.is_normalized(), which was introduced in
    Python 3.8. This module already imports importlib.resources.files, which
    needs Python 3.9 or newer, so a hand-rolled comparison against a freshly
    normalized copy of the string is not needed for compatibility.

    See: https://docs.python.org/3/library/unicodedata.html

    Return boolean.
    """

    from unicodedata import is_normalized

    # Equivalent to: field == normalize("NFC", field)
    return is_normalized("NFC", field)
|
|
|
|
|
|
def is_mojibake(field):
    """Report whether a string appears to contain mojibake.

    CSV files are frequently *encoded* as UTF-8 and then *decoded* with a
    different codec such as CP-1252 (Windows Latin). The resulting garbage
    is known as "mojibake", for example:

        - CIAT PublicaÃ§ao   (intended: CIAT Publicaçao)
        - CIAT PublicaciÃ³n  (intended: CIAT Publicación)

    The check has two stages. First, the "fixes text for you" (ftfy)
    library's is_bad() is asked whether the text looks suspicious at all.
    Only if it does, the text is test-encoded with the
    "sloppy-windows-1252" codec: a successful encode means every character
    fits in CP-1252, which is treated as strong evidence of mojibake.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    if is_bad(field):
        try:
            field.encode("sloppy-windows-1252")
        except UnicodeEncodeError:
            # Contains characters outside CP-1252, so it is probably fine
            looks_garbled = False
        else:
            # Fully encodable as CP-1252: mojibake alert level high
            looks_garbled = True

        return looks_garbled

    # ftfy found nothing weird, so the text should be okay
    return False
|
|
|
|
|
|
def load_spdx_licenses():
    """Returns a Python list of SPDX short license identifiers.

    Reads data/licenses.json from the csv_metadata_quality package and
    collects the "licenseId" value of every entry in its top-level
    "licenses" list, in file order.

    Raises whatever the underlying calls raise, for example
    ModuleNotFoundError if the package cannot be imported, OSError if the
    data file is missing, json.JSONDecodeError if the file is not valid
    JSON, or KeyError if an expected key is absent.
    """

    licenses_json = files("csv_metadata_quality").joinpath("data/licenses.json")

    # Open through the resource object itself rather than the built-in
    # open(): the object returned by files() is only guaranteed to support
    # the Traversable interface and need not be a real filesystem path.
    # The encoding is explicit so decoding does not depend on the locale.
    with licenses_json.open(encoding="utf-8") as f:
        spdx_data = json.load(f)

    # List comprehension to extract the license ID for each license
    return [entry["licenseId"] for entry in spdx_data["licenses"]]
|