1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 14:16:00 +02:00

Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters,
etc. that add nothing to the metadata and often cause errors when
parsing the data or displaying it in a UI.
This commit is contained in:
2019-07-29 16:38:10 +03:00
parent ae66382046
commit 8047a57cc5
4 changed files with 54 additions and 0 deletions

View File

@ -25,6 +25,9 @@ def main(argv):
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace)
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: invalid multi-value separator
df[column] = df[column].apply(check.separators)

View File

@ -65,3 +65,45 @@ def separators(field):
new_field = '||'.join(values)
return new_field
def unnecessary_unicode(field):
    """Remove unnecessary Unicode characters from a metadata value.

    The following characters are stripped wherever they occur:

    - Zero-width space (U+200B)
    - Replacement character (U+FFFD)
    - No-break space (U+00A0)

    For each kind of character that is present, a message is printed
    showing the value as it stood just before that character was removed.

    Args:
        field: The value to clean; string operations are applied to it
            unless it is missing.

    Returns:
        The string with the characters removed, or None when the value
        is missing according to ``pd.isna``.
    """
    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Processed in this fixed order so that the messages appear in the
    # same sequence every time and each one reflects earlier removals.
    # NOTE(review): the no-break space is deleted, not replaced by a
    # regular space, so words separated only by U+00A0 end up joined --
    # confirm this is intended.
    unwanted = (
        ('U+200B', '\u200B'),  # zero-width space
        ('U+FFFD', '\uFFFD'),  # replacement character
        ('U+00A0', '\u00A0'),  # no-break space
    )

    for codepoint, character in unwanted:
        # Each target is a single literal character, so a substring test
        # and str.replace are sufficient; no regex is needed.
        if character in field:
            print(f'Removing unnecessary Unicode ({codepoint}): {field}')
            field = field.replace(character, '')

    return field