1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 12:12:18 +01:00

csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes

Add a check for soft hyphens (U+00AD). In one sample CSV I have a
normal hyphen followed by a soft hyphen in an ISBN. This causes the
ISBN validation to fail.
This commit is contained in:
Alan Orth 2019-08-11 00:07:21 +03:00
parent 13d5221378
commit 232ff99898
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -68,14 +68,17 @@ def separators(field):
def unnecessary_unicode(field): def unnecessary_unicode(field):
"""Remove unnecessary Unicode characters. """Remove and replace unnecessary Unicode characters.
Removes unnecessary Unicode characters like: Removes unnecessary Unicode characters like:
- Zero-width space (U+200B) - Zero-width space (U+200B)
- Replacement character (U+FFFD) - Replacement character (U+FFFD)
- No-break space (U+00A0) - No-break space (U+00A0)
Return string with characters removed. Replaces unnecessary Unicode characters like:
- Soft hyphen (U+00AD) → hyphen
Return string with characters removed or replaced.
""" """
# Skip fields with missing values # Skip fields with missing values
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
print(f'Removing unnecessary Unicode (U+00A0): {field}') print(f'Removing unnecessary Unicode (U+00A0): {field}')
field = re.sub(pattern, '', field) field = re.sub(pattern, '', field)
# Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
pattern = re.compile(r'\u002D*?\u00AD')
match = re.findall(pattern, field)
if match:
print(f'Replacing unnecessary Unicode (U+00AD): {field}')
field = re.sub(pattern, '-', field)
return field return field