1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-17 01:53:52 +02:00

Add Unicode normalization

This will check all strings for un-normalized Unicode characters.
Normalization is done using NFC. This includes tests and updated
sample data (data/test.csv).

See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
This commit is contained in:
2020-01-15 11:37:54 +02:00
parent 403b253762
commit 49e3543878
5 changed files with 63 additions and 1 deletions
README.md
csv_metadata_quality
data
tests

@ -201,3 +201,27 @@ def comma_space(field, field_name):
field = re.sub(r",(\w)", r", \1", field)
return field
def normalize_unicode(field, field_name):
    """Normalize decomposed Unicode characters to their canonical (NFC) form.

    For example, a decomposed "Oue\u0301draogo, Mathieu" (e + combining
    acute accent) becomes the precomposed "Ouédraogo, Mathieu".

    Returns the normalized string, or None for missing values.
    """

    import unicodedata

    # Missing values (NaN/None) are passed through untouched
    if pd.isna(field):
        return

    # Only re-encode — and report — fields that are not already NFC
    if not unicodedata.is_normalized("NFC", field):
        print(f"Normalizing Unicode ({field_name}): {field}")
        field = unicodedata.normalize("NFC", field)

    return field