mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
Add utility function to check normalization
Python's built-in unicodedata library includes the is_normalized() function starting with Python 3.8. This utility function allows us to do the same thing with earlier Python versions. See: https://docs.python.org/3/library/unicodedata.html
This commit is contained in:
parent
550ce7fb7e
commit
365ecda324
@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
|
|||||||
Return normalized string.
|
Return normalized string.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from unicodedata import is_normalized
|
from csv_metadata_quality.util import is_nfc
|
||||||
from unicodedata import normalize
|
from unicodedata import normalize
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Check if the current string is using normalized Unicode (NFC)
|
# Check if the current string is using normalized Unicode (NFC)
|
||||||
if not is_normalized("NFC", field):
|
if not is_nfc(field):
|
||||||
print(f"Normalizing Unicode ({field_name}): {field}")
|
print(f"Normalizing Unicode ({field_name}): {field}")
|
||||||
field = normalize("NFC", field)
|
field = normalize("NFC", field)
|
||||||
|
|
||||||
|
14
csv_metadata_quality/util.py
Normal file
14
csv_metadata_quality/util.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
def is_nfc(field):
    """Check whether a string is already in Unicode NFC normalized form.

    Python's built-in unicodedata library has the is_normalized() function,
    but it was only introduced in Python 3.8. Comparing the string against
    its own NFC normalization gives the same answer and lets us run on
    Python >= 3.6 again.

    See: https://docs.python.org/3/library/unicodedata.html

    field: the string to check.

    Return boolean: True if normalizing the string to NFC leaves it
    unchanged, False otherwise.
    """

    from unicodedata import normalize

    # A string is in NFC exactly when NFC normalization is a no-op on it.
    return field == normalize("NFC", field)
|
Loading…
Reference in New Issue
Block a user