Add Unicode normalization

This will check all strings for un-normalized Unicode characters. Normalization is done using NFC. This includes tests and updated sample data (data/test.csv). See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
2025-07-03 21:13:28 +02:00 · 2020-01-15 11:37:54 +02:00
parent 403b253762
commit 49e3543878
5 changed files with 63 additions and 1 deletions
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -66,3 +66,25 @@ def test_fix_comma_space():
    field_name = "dc.contributor.author"

    assert fix.comma_space(value, field_name) == "Orth, Alan S."
+
+
+def test_fix_normalized_unicode():
+    """Test that a string already in normalized (NFC) form comes back unchanged.
+
+    The accented character is written as an explicit escape so the code point
+    under test does not depend on how an editor or tool saved this file.
+    """
+
+    # "\u00e9" is the precomposed é (LATIN SMALL LETTER E WITH ACUTE), which is
+    # already the NFC form.
+    value = "Ou\u00e9draogo, Mathieu"
+
+    field_name = "dc.contributor.author"
+
+    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
+
+
+def test_fix_decomposed_unicode():
+    """Test that a string containing decomposed Unicode is normalized to NFC."""
+
+    # "e\u0301" is the decomposed é: "e" followed by COMBINING ACUTE ACCENT.
+    # It is written as an escape because the composed and decomposed forms
+    # render identically and tools may silently convert one into the other,
+    # which would leave this test passing without exercising anything.
+    value = "Oue\u0301draogo, Mathieu"
+
+    field_name = "dc.contributor.author"
+
+    # NFC composes the pair into the single code point "\u00e9".
+    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"