1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-01-24 11:13:22 +01:00

Add utility function to check normalization

Python's built-in unicodedata library includes the is_normalized()
function starting with Python 3.8. This utility function allows us
to do the same thing with earlier Python versions.

See: https://docs.python.org/3/library/unicodedata.html
This commit is contained in:
Alan Orth 2020-01-15 12:17:52 +02:00
parent 550ce7fb7e
commit 365ecda324
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 16 additions and 2 deletions

View File

@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
Return normalized string.
"""
from unicodedata import is_normalized
from csv_metadata_quality.util import is_nfc
from unicodedata import normalize
# Skip fields with missing values
@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
return
# Check if the current string is using normalized Unicode (NFC)
if not is_normalized("NFC", field):
if not is_nfc(field):
print(f"Normalizing Unicode ({field_name}): {field}")
field = normalize("NFC", field)

View File

@ -0,0 +1,14 @@
def is_nfc(field):
    """Report whether a string is already in Unicode NFC form.

    The standard library's unicodedata.is_normalized() only exists from
    Python 3.8 onwards. This helper gives the same answer for the NFC form
    on older interpreters (Python >= 3.6) by normalizing the string and
    comparing the result with the input.

    See: https://docs.python.org/3/library/unicodedata.html

    Return True when normalizing to NFC leaves the string unchanged,
    otherwise False.
    """
    import unicodedata

    # A string is in NFC exactly when NFC normalization is a no-op on it.
    nfc_form = unicodedata.normalize("NFC", field)
    return nfc_form == field