mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-10-31 19:43:00 +01:00
Alan Orth
e4faf114dc
The sequence_weirdness() heuristic is deprecated. Now we should use is_bad(). See: https://ftfy.readthedocs.io/en/v6.0/heuristic.html See: https://github.com/rspeer/python-ftfy/blob/master/CHANGELOG.md#version-60-april-2-2021
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
from ftfy.badness import is_bad
|
|
|
|
|
|
def is_nfc(field: str) -> bool:
    """Check whether a string is already in Unicode NFC normal form.

    The standard library's unicodedata.is_normalized() would do this
    directly, but it only exists on Python 3.8 and newer. Comparing the
    string against its own NFC normalization gives the same answer and keeps
    this helper usable on Python >= 3.6.

    See: https://docs.python.org/3/library/unicodedata.html

    Args:
        field: the text to check.

    Returns:
        True if normalizing ``field`` to NFC leaves it unchanged (which
        includes the empty string), otherwise False.
    """
    # Imported locally so the helper is self-contained.
    import unicodedata

    normalized = unicodedata.normalize("NFC", field)

    return normalized == field
|
|
|
|
|
|
def is_mojibake(field: str) -> bool:
    """Determine whether a string appears to contain mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the form
    of "mojibake", for example:

        - "CIAT PublicaÃ§ao" (intended: "CIAT Publicaçao")
        - "CIAT PublicaciÃ³n" (intended: "CIAT Publicación")

    This uses the "fixes text for you" (ftfy) library's is_bad() heuristic to
    decide whether the text looks like it was encoded in one encoding and
    decoded in another, and then confirms the suspicion by checking that the
    text can be encoded as CP-1252.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Args:
        field: the text to inspect.

    Returns:
        True only when is_bad() flags the text AND the text is encodable with
        the "sloppy-windows-1252" codec; False otherwise.
    """
    # Nothing weird according to ftfy's heuristic, should be okay.
    if not is_bad(field):
        return False

    # NOTE: "sloppy-windows-1252" is not a standard library codec; it is
    # expected to have been registered by importing ftfy at module level.
    try:
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine.
        return False

    # Flagged by ftfy and encodable as CP-1252: mojibake alert level high.
    return True
|