csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes

Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail.
2025-08-05 04:25:41 +02:00 · 2019-08-11 00:07:21 +03:00
parent 13d5221378
commit 232ff99898
1 changed files with 13 additions and 2 deletions
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -68,14 +68,17 @@ def separators(field):


 def unnecessary_unicode(field):
-    """Remove unnecessary Unicode characters.
+    """Remove and replace unnecessary Unicode characters.

    Removes unnecessary Unicode characters like:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)
        - No-break space (U+00A0)

-    Return string with characters removed.
+    Replaces unnecessary Unicode characters like:
+        - Soft hyphen (U+00AD) → hyphen
+
+    Return string with characters removed or replaced.
    """

    # Skip fields with missing values
@@ -106,6 +109,14 @@ def unnecessary_unicode(field):
        print(f'Removing unnecessary Unicode (U+00A0): {field}')
        field = re.sub(pattern, '', field)

+    # Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
+    pattern = re.compile(r'\u002D*?\u00AD')
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f'Replacing unnecessary Unicode (U+00AD): {field}')
+        field = re.sub(pattern, '-', field)
+
    return field