mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
Add support for fixing "unnecessary" Unicode
These are things like non-breaking spaces, "replacement" characters, etc., that add nothing to the metadata and often cause errors when the data is parsed or displayed in a UI.
This commit is contained in:
parent
ae66382046
commit
8047a57cc5
@ -25,6 +25,9 @@ def main(argv):
|
|||||||
# Fix: whitespace
|
# Fix: whitespace
|
||||||
df[column] = df[column].apply(fix.whitespace)
|
df[column] = df[column].apply(fix.whitespace)
|
||||||
|
|
||||||
|
# Fix: unnecessary Unicode
|
||||||
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Check: invalid multi-value separator
|
# Check: invalid multi-value separator
|
||||||
df[column] = df[column].apply(check.separators)
|
df[column] = df[column].apply(check.separators)
|
||||||
|
|
||||||
|
@ -65,3 +65,45 @@ def separators(field):
|
|||||||
new_field = '||'.join(values)
|
new_field = '||'.join(values)
|
||||||
|
|
||||||
return new_field
|
return new_field
|
||||||
|
|
||||||
|
|
||||||
|
def unnecessary_unicode(field):
    """Remove or replace unnecessary Unicode characters.

    Handles the following characters:
        - Zero-width space (U+200B): removed
        - Replacement character (U+FFFD): removed
        - No-break space (U+00A0): replaced with a regular space, because
          simply deleting it would fuse the two words it separates

    A message is printed for each kind of character found, showing the
    field as it looked before that particular fix was applied.

    Return the cleaned string, or None when the field is a missing value.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # These characters carry no content at all, so they are dropped.
    removable = (
        ('U+200B', '\u200b'),  # zero-width space
        ('U+FFFD', '\ufffd'),  # replacement character
    )

    for codepoint, character in removable:
        if character in field:
            print(f'Removing unnecessary Unicode ({codepoint}): {field}')
            field = field.replace(character, '')

    # A no-break space still acts as a word separator, so keep the
    # separation by substituting an ordinary space instead of deleting it.
    no_break_space = '\u00a0'

    if no_break_space in field:
        print(f'Replacing unnecessary Unicode (U+00A0): {field}')
        field = field.replace(no_break_space, ' ')

    return field
|
||||||
|
@ -5,3 +5,4 @@ Sophia,2019-06-15,,
|
|||||||
Test,2019-06-150,,
|
Test,2019-06-150,,
|
||||||
"Doe, J.",2019-06-15||2019-01-10,,
|
"Doe, J.",2019-06-15||2019-01-10,,
|
||||||
Someone,,0378-5955|0378-5955,
|
Someone,,0378-5955|0378-5955,
|
||||||
|
Unnecessary Unicode,2019-07-29,,
|
||||||
|
|
@ -31,3 +31,11 @@ def test_fix_invalid_separators():
|
|||||||
value = 'Alan|Orth'
|
value = 'Alan|Orth'
|
||||||
|
|
||||||
assert fix.separators(value) == 'Alan||Orth'
|
assert fix.separators(value) == 'Alan||Orth'
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_unnecessary_unicode():
    '''Test fixing unnecessary Unicode.'''

    # The offending characters are written as escapes on purpose: as raw
    # invisible characters they are easily lost by editors, diffs or
    # copy/paste, which would silently turn this test into a no-op.
    # \u200b is a zero-width space, \ufffd is the replacement character.
    value = 'Alan\u200b Orth\ufffd'

    assert fix.unnecessary_unicode(value) == 'Alan Orth'
|
||||||
|
Loading…
Reference in New Issue
Block a user