From 232ff99898dd9fb629517d2f3cf7e1605fb7028e Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 11 Aug 2019 00:07:21 +0300 Subject: [PATCH] csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. --- csv_metadata_quality/fix.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 90b897d..5fddf62 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -68,14 +68,17 @@ def separators(field): def unnecessary_unicode(field): - """Remove unnecessary Unicode characters. + """Remove and replace unnecessary Unicode characters. Removes unnecessary Unicode characters like: - Zero-width space (U+200B) - Replacement character (U+FFFD) - No-break space (U+00A0) - Return string with characters removed. + Replaces unnecessary Unicode characters like: + - Soft hyphen (U+00AD) → hyphen + + Return string with characters removed or replaced. """ # Skip fields with missing values @@ -106,6 +109,14 @@ def unnecessary_unicode(field): print(f'Removing unnecessary Unicode (U+00A0): {field}') field = re.sub(pattern, '', field) + # Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen + pattern = re.compile(r'\u002D*?\u00AD') + match = re.findall(pattern, field) + + if match: + print(f'Replacing unnecessary Unicode (U+00AD): {field}') + field = re.sub(pattern, '-', field) + return field