1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Add Unicode normalization

This will check all strings for un-normalized Unicode characters.
Normalization is done using NFC. This includes tests and updated
sample data (data/test.csv).

See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
This commit is contained in:
2020-01-15 11:37:54 +02:00
parent 403b253762
commit 49e3543878
5 changed files with 63 additions and 1 deletions

View File

@ -66,3 +66,25 @@ def test_fix_comma_space():
field_name = "dc.contributor.author"
assert fix.comma_space(value, field_name) == "Orth, Alan S."
def test_fix_normalized_unicode():
"""Test fixing a string that is already in its normalized (NFC) Unicode form."""
# string using the normalized canonical form of é
value = "Ouédraogo, Mathieu"
field_name = "dc.contributor.author"
assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"
def test_fix_decomposed_unicode():
"""Test fixing a string that contains Unicode string."""
# string using the decomposed form of é
value = "Ouédraogo, Mathieu"
field_name = "dc.contributor.author"
assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"