mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-08 06:06:00 +02:00
Add Unicode normalization
This will check all strings for un-normalized Unicode characters. Normalization is done using NFC. This includes tests and updated sample data (data/test.csv). See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
This commit is contained in:
@ -94,6 +94,11 @@ def run(argv):
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||
|
||||
# Fix: perform Unicode normalization (NFC) to convert decomposed
|
||||
# characters into their canonical forms.
|
||||
if args.unsafe_fixes:
|
||||
df[column] = df[column].apply(fix.normalize_unicode, field_name=column)
|
||||
|
||||
# Fix: unnecessary Unicode
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
|
@ -201,3 +201,27 @@ def comma_space(field, field_name):
|
||||
field = re.sub(r",(\w)", r", \1", field)
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def normalize_unicode(field, field_name):
    """Normalize decomposed Unicode characters to their canonical composed
    (NFC) forms, for example:

    Ouédraogo, Mathieu → Ouédraogo, Mathieu

    Return normalized string.
    """

    import unicodedata

    # Missing values cannot be normalized; propagate them unchanged.
    if pd.isna(field):
        return

    # Fields already in canonical (NFC) form pass through untouched.
    if unicodedata.is_normalized("NFC", field):
        return field

    # Report which field is being rewritten, then recompose it.
    print(f"Normalizing Unicode ({field_name}): {field}")
    return unicodedata.normalize("NFC", field)
|
||||
|
Reference in New Issue
Block a user