diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py
index 85e5077..2270750 100755
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
     Return normalized string.
     """
 
-    from unicodedata import is_normalized
+    from csv_metadata_quality.util import is_nfc
     from unicodedata import normalize
 
     # Skip fields with missing values
@@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
         return
 
     # Check if the current string is using normalized Unicode (NFC)
-    if not is_normalized("NFC", field):
+    if not is_nfc(field):
         print(f"Normalizing Unicode ({field_name}): {field}")
         field = normalize("NFC", field)
 
diff --git a/csv_metadata_quality/util.py b/csv_metadata_quality/util.py
new file mode 100644
index 0000000..bcd0c06
--- /dev/null
+++ b/csv_metadata_quality/util.py
@@ -0,0 +1,14 @@
+def is_nfc(field):
+    """Utility function to check whether a string is using normalized Unicode.
+    Python's built-in unicodedata library has the is_normalized() function, but
+    it was only introduced in Python 3.8. By using a simple utility function we
+    are able to run on Python >= 3.6 again.
+
+    See: https://docs.python.org/3/library/unicodedata.html
+
+    Return boolean.
+    """
+
+    from unicodedata import normalize
+
+    return field == normalize("NFC", field)