Add utility function to check normalization

Python's built-in unicodedata library includes the is_normalized() function starting with Python 3.8. This utility function allows us to do the same thing with earlier Python versions. See: https://docs.python.org/3/library/unicodedata.html
2025-08-23 13:21:50 +02:00 · 2020-01-15 12:17:52 +02:00
parent 550ce7fb7e
commit 365ecda324
2 changed files with 16 additions and 2 deletions
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
    Return normalized string.
    """

-    from unicodedata import is_normalized
+    from csv_metadata_quality.util import is_nfc
    from unicodedata import normalize

    # Skip fields with missing values
@@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
        return

    # Check if the current string is using normalized Unicode (NFC)
-    if not is_normalized("NFC", field):
+    if not is_nfc(field):
        print(f"Normalizing Unicode ({field_name}): {field}")
        field = normalize("NFC", field)

--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@@ -0,0 +1,14 @@
+def is_nfc(field):
+    """Check whether a string is in Unicode Normalization Form C (NFC).
+
+    Stand-in for unicodedata.is_normalized(), which was only introduced in
+    Python 3.8; this comparison lets us run on Python >= 3.6 again.
+    See: https://docs.python.org/3/library/unicodedata.html
+
+    Return True if field equals its NFC-normalized form, False otherwise.
+    """
+
+    from unicodedata import normalize
+
+    # A string is already NFC exactly when normalizing it changes nothing.
+    return field == normalize("NFC", field)