1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-17 01:53:52 +02:00

Add Unicode normalization

This will check all strings for un-normalized Unicode characters.
Normalization is done using NFC. This includes tests and updated
sample data (data/test.csv).

See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
This commit is contained in:
2020-01-15 11:37:54 +02:00
parent 403b253762
commit 49e3543878
5 changed files with 63 additions and 1 deletions
README.md
csv_metadata_quality
data
tests

@ -201,3 +201,27 @@ def comma_space(field, field_name):
field = re.sub(r",(\w)", r", \1", field)
return field
def normalize_unicode(field, field_name):
    """Normalize decomposed Unicode characters to their canonical (NFC) form.

    For example, a decomposed "Oue\u0301draogo, Mathieu" (e + combining
    acute accent) becomes the precomposed "Ouédraogo, Mathieu".

    Returns the normalized string, or None for missing values.
    """

    import unicodedata

    # Missing values (NaN/None) are passed through untouched
    if pd.isna(field):
        return

    # Only re-encode — and report — fields that are not already NFC
    if not unicodedata.is_normalized("NFC", field):
        print(f"Normalizing Unicode ({field_name}): {field}")
        field = unicodedata.normalize("NFC", field)

    return field