mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
Add support for fixing "unnecessary" Unicode
These are things like non-breaking spaces, "replacement" characters, etc., that add nothing to the metadata and often cause errors when it is parsed or displayed in a UI.
This commit is contained in:
parent
ae66382046
commit
8047a57cc5
@ -25,6 +25,9 @@ def main(argv):
|
||||
# Fix: whitespace
|
||||
df[column] = df[column].apply(fix.whitespace)
|
||||
|
||||
# Fix: unnecessary Unicode
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
# Check: invalid multi-value separator
|
||||
df[column] = df[column].apply(check.separators)
|
||||
|
||||
|
@ -65,3 +65,45 @@ def separators(field):
|
||||
new_field = '||'.join(values)
|
||||
|
||||
return new_field
|
||||
|
||||
|
||||
def unnecessary_unicode(field):
    """Remove or replace unnecessary Unicode characters.

    Removes:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)

    Replaces with a regular space:
        - No-break space (U+00A0)

    A message is printed for each kind of character found, showing the
    field as it was just before that character was handled.

    Missing values (anything pd.isna() reports as missing) are returned
    unchanged. Otherwise return the cleaned string.
    """

    # Skip fields with missing values, handing them back untouched so a
    # NaN is not silently turned into None.
    if pd.isna(field):
        return field

    # Every target is a single literal character, so plain substring
    # tests and str.replace() are enough; no regular expression needed.
    removable = (
        ('\u200b', 'U+200B'),  # zero-width space
        ('\ufffd', 'U+FFFD'),  # replacement character
    )

    for character, code_point in removable:
        if character in field:
            print(f'Removing unnecessary Unicode ({code_point}): {field}')
            field = field.replace(character, '')

    # A no-break space still separates the text on either side of it, so
    # deleting it would fuse two words together ("Alan<NBSP>Orth" would
    # become "AlanOrth"). Swap it for a regular space instead.
    if '\u00a0' in field:
        print(f'Replacing unnecessary Unicode (U+00A0): {field}')
        field = field.replace('\u00a0', ' ')

    return field
|
||||
|
@ -5,3 +5,4 @@ Sophia,2019-06-15,,
|
||||
Test,2019-06-150,,
|
||||
"Doe, J.",2019-06-15||2019-01-10,,
|
||||
Someone,,0378-5955|0378-5955,
|
||||
Unnecessary Unicode,2019-07-29,,
|
||||
|
|
@ -31,3 +31,11 @@ def test_fix_invalid_separators():
|
||||
value = 'Alan|Orth'
|
||||
|
||||
assert fix.separators(value) == 'Alan||Orth'
|
||||
|
||||
|
||||
def test_fix_unnecessary_unicode():
    '''Test fixing unnecessary Unicode.'''

    # The offending characters are written as explicit escapes: they are
    # invisible when typed literally and easily lost when the file is
    # copied or reformatted, which would leave the test checking nothing.

    # Zero-width space (U+200B) between "Alan" and the regular space
    value = 'Alan\u200b Orth'

    assert fix.unnecessary_unicode(value) == 'Alan Orth'

    # Replacement character (U+FFFD) between "Alan" and the regular space
    value = 'Alan\ufffd Orth'

    assert fix.unnecessary_unicode(value) == 'Alan Orth'
|
||||
|
Loading…
Reference in New Issue
Block a user