Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters, etc., that add nothing to the metadata and often cause errors when parsing or displaying it in a UI.
2025-08-04 20:17:04 +02:00 · 2019-07-29 16:38:10 +03:00
parent ae66382046
commit 8047a57cc5
4 changed files with 54 additions and 0 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -25,6 +25,9 @@ def main(argv):
        # Fix: whitespace
        df[column] = df[column].apply(fix.whitespace)

+        # Fix: unnecessary Unicode
+        df[column] = df[column].apply(fix.unnecessary_unicode)
+
        # Check: invalid multi-value separator
        df[column] = df[column].apply(check.separators)

--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -65,3 +65,45 @@ def separators(field):
    new_field = '||'.join(values)

    return new_field
+
+
+def unnecessary_unicode(field):
+    """Remove or replace unnecessary Unicode characters.
+
+    Handles the following characters, in this order:
+        - Zero-width space (U+200B): removed
+        - Replacement character (U+FFFD): removed
+        - No-break space (U+00A0): replaced with a regular space
+
+    The no-break space is replaced rather than removed because it
+    separates words; deleting it would turn 'Alan<U+00A0>Orth' into
+    'AlanOrth'.
+
+    A message showing the field as it was before the fix is printed
+    for each kind of character that is found.
+
+    Parameters:
+        field: a string, or a missing value (NaN/None).
+
+    Return the cleaned string, or None if the field is missing.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # (character, code point shown in the message, verb, replacement)
+    fixes = (
+        ('\u200b', 'U+200B', 'Removing', ''),
+        ('\ufffd', 'U+FFFD', 'Removing', ''),
+        ('\u00a0', 'U+00A0', 'Replacing', ' '),
+    )
+
+    for character, code_point, verb, replacement in fixes:
+        # Plain substring test; no regular expression is needed for a
+        # single literal character
+        if character in field:
+            print(f'{verb} unnecessary Unicode ({code_point}): {field}')
+            field = field.replace(character, replacement)
+
+    return field
--- a/data/test.csv
+++ b/data/test.csv
@ -5,3 +5,4 @@ Sophia,2019-06-15,,
 Test,2019-06-150,,
 "Doe, J.",2019-06-15||2019-01-10,,
 Someone,,0378-5955|0378-5955,
+Unnecessary Unicode,2019-07-29,,
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@ -31,3 +31,11 @@ def test_fix_invalid_separators():
    value = 'Alan|Orth'

    assert fix.separators(value) == 'Alan||Orth'
+
+
+def test_fix_unnecessary_unicode():
+    '''Test that zero-width spaces and replacement characters are removed.'''
+
+    value = 'Alan\u200b Orth\ufffd'  # escapes keep the invisible characters reviewable
+
+    assert fix.unnecessary_unicode(value) == 'Alan Orth'