Add utility function to check normalization

Python's standard unicodedata module has provided the is_normalized() function since Python 3.8. The is_nfc() utility function added here performs the equivalent NFC check by comparing a string with its NFC-normalized form, so the check also works on earlier Python versions. See: https://docs.python.org/3/library/unicodedata.html
2025-08-23 13:21:50 +02:00 · 2020-01-15 12:17:52 +02:00
parent 550ce7fb7e
commit 365ecda324
2 changed files with 16 additions and 2 deletions
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
    Return normalized string.
    """
-    from unicodedata import is_normalized
+    from csv_metadata_quality.util import is_nfc
    from unicodedata import normalize
    # Skip fields with missing values
@@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
        return
    # Check if the current string is using normalized Unicode (NFC)
-    if not is_normalized("NFC", field):
+    if not is_nfc(field):
        print(f"Normalizing Unicode ({field_name}): {field}")
        field = normalize("NFC", field)
--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@@ -0,0 +1,14 @@
 def is_nfc(field):
    """Tell whether a string is already in Unicode NFC form.

    The standard library's unicodedata.is_normalized() would answer this
    directly, but it only exists from Python 3.8 onwards. Comparing the
    string against its own NFC normalization gives the same answer and keeps
    the package usable on Python >= 3.6.

    See: https://docs.python.org/3/library/unicodedata.html

    Return boolean.
    """
    import unicodedata

    # A string is in NFC exactly when normalizing it to NFC changes nothing.
    nfc_form = unicodedata.normalize("NFC", field)
    return field == nfc_form