# SPDX-License-Identifier: GPL-3.0-only

from ftfy.badness import is_bad


def is_nfc(field):
    """Utility function to check whether a string is using normalized Unicode.

    Python's built-in unicodedata library has the is_normalized() function, but
    it was only introduced in Python 3.8. Using a simple utility function
    instead allows us to run on Python >= 3.6 again.

    See: https://docs.python.org/3/library/unicodedata.html

    Return boolean.
    """

    from unicodedata import normalize

    return field == normalize("NFC", field)
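

# Illustrative usage (a sketch, not part of the original module): NFC
# composes "o" + U+0301 (combining acute accent) into the single code point
# "ó" (U+00F3), so the decomposed spelling is reported as not normalized:
#
#   is_nfc("Publicaci\u00f3n")   # precomposed ó, already NFC -> True
#   is_nfc("Publicacio\u0301n")  # o + combining accent, not NFC -> False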


def is_mojibake(field):
    """Determines whether a string contains mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the form
    of "mojibake", for example:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

    This uses the excellent "fixes text for you" (ftfy) library to determine
    whether a string contains characters that have been encoded in one encoding
    and decoded in another.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    if not is_bad(field):
        # Nothing weird, should be okay
        return False

    try:
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return False
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return True
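

# A minimal usage sketch (not part of the original module), assuming ftfy is
# installed. The mojibake string is produced by the round trip described in
# the docstring above: encode as UTF-8, then mis-decode as CP-1252. The exact
# verdict of is_mojibake() depends on the heuristics of the installed ftfy
# version, so treat the expected values as indicative.
if __name__ == "__main__":
    clean = "CIAT Publicaci\u00f3n"  # precomposed ó (U+00F3)
    broken = clean.encode("utf-8").decode("windows-1252")  # "CIAT PublicaciÃ³n"

    print(is_nfc(clean))        # expected True: already NFC-normalized
    print(is_mojibake(clean))   # expected False: nothing looks mis-decoded
    print(is_mojibake(broken))  # expected True: UTF-8 bytes read as CP-1252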