mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 11:42:20 +01:00
Alan Orth
1491e1edb0
All checks were successful
continuous-integration/drone/push Build is passing
When we install and run this from CI, this file needs to exist in the package's folder inside site-packages. Then we can use __file__ to get the path relative to the package. See: https://python-packaging.readthedocs.io/en/latest/non-code-files.html
66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
|
|
import json
|
|
import os
|
|
|
|
from ftfy.badness import is_bad
|
|
|
|
|
|
def is_nfc(field: str) -> bool:
    """Utility function to check whether a string is using normalized Unicode.

    Python's built-in unicodedata library has the is_normalized() function, but
    it was only introduced in Python 3.8. By using a simple utility function we
    are able to run on Python >= 3.6 again.

    See: https://docs.python.org/3/library/unicodedata.html

    The check normalizes the string to NFC and compares the result with the
    input, so a string is considered normalized exactly when NFC normalization
    leaves it unchanged (the empty string is therefore normalized).

    Return boolean.
    """
    # Function-scope import: only this helper needs unicodedata.
    from unicodedata import normalize

    return field == normalize("NFC", field)
|
|
|
|
|
|
def is_mojibake(field: str) -> bool:
    """Determines whether a string contains mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the form
    of "mojibake", for example:

        - CIAT PublicaÃ§ao  (intended: CIAT Publicaçao)
        - CIAT PublicaciÃ³n (intended: CIAT Publicación)

    This uses the excellent "fixes text for you" (ftfy) library to determine
    whether a string contains characters that have been encoded in one encoding
    and decoded in another.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean: True only when ftfy flags the text as bad AND the text can
    be encoded as "sloppy-windows-1252"; False otherwise.
    """
    if not is_bad(field):
        # Nothing weird, should be okay
        return False

    try:
        # NOTE(review): "sloppy-windows-1252" is not a standard-library codec;
        # presumably it is registered when ftfy is imported — confirm.
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return False

    # Encodable as CP-1252, Mojibake alert level high
    return True
|
|
|
|
|
|
def load_spdx_licenses() -> list:
    """Returns a Python list of SPDX short license identifiers.

    Reads "data/licenses.json" from the directory that contains this module
    (resolved via __file__, so it works when the package is installed) and
    collects the "licenseId" value of every entry under the top-level
    "licenses" key, in file order.

    Raises whatever open() / json.load() raise if the file is missing or is
    not valid JSON, and KeyError if the expected keys are absent.
    """
    licenses_path = os.path.join(os.path.dirname(__file__), "data/licenses.json")

    # Decode explicitly as UTF-8 (the encoding JSON interchange uses) rather
    # than relying on the platform's locale default, which differs on Windows.
    with open(licenses_path, encoding="utf-8") as f:
        licenses = json.load(f)

    # List comprehension to extract the license ID for each license. The loop
    # variable is deliberately not called "license", to avoid shadowing the
    # builtin of that name.
    return [entry["licenseId"] for entry in licenses["licenses"]]
|