Add support for fixing "unnecessary" Unicode

These are characters such as zero-width spaces, non-breaking spaces, and
"replacement" characters that add nothing to the metadata and often cause
errors during parsing or display in a UI.
This commit is contained in:
Alan Orth 2019-07-29 16:38:10 +03:00
parent ae66382046
commit 8047a57cc5
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 54 additions and 0 deletions

View File

@ -25,6 +25,9 @@ def main(argv):
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace)
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: invalid multi-value separator
df[column] = df[column].apply(check.separators)

View File

@ -65,3 +65,45 @@ def separators(field):
new_field = '||'.join(values)
return new_field
def unnecessary_unicode(field):
    """Remove unnecessary Unicode characters.

    Removes unnecessary Unicode characters like:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)
        - No-break space (U+00A0)

    For each kind of character found, a message is printed showing the
    field as it was just before that character was removed.

    Missing values (anything ``pd.isna`` reports as missing) are handed
    back unchanged rather than being turned into ``None``.

    Return string with characters removed.
    """

    # Skip fields with missing values, passing the value through untouched
    if pd.isna(field):
        return field

    # (character, code point label) pairs, checked in this order so the
    # printed messages come out in the same sequence for every field.
    unnecessary = (
        ('\u200B', 'U+200B'),  # zero-width space
        ('\uFFFD', 'U+FFFD'),  # replacement character
        ('\u00A0', 'U+00A0'),  # no-break space
    )

    for character, label in unnecessary:
        if character in field:
            print(f'Removing unnecessary Unicode ({label}): {field}')

            # NOTE: the character is deleted outright, not replaced, so a
            # no-break space between two words joins those words together.
            field = field.replace(character, '')

    return field

View File

@ -5,3 +5,4 @@ Sophia,2019-06-15,,
Test,2019-06-150,,
"Doe, J.",2019-06-15||2019-01-10,,
Someone,,0378-5955|0378-5955,
Unnecessary Unicode,2019-07-29,,

1 dc.contributor.author birthdate dc.identifier.issn dc.identifier.isbn
5 Test 2019-06-150
6 Doe, J. 2019-06-15||2019-01-10
7 Someone 0378-5955|0378-5955
8 Unnecessary Unicode​ 2019-07-29

View File

@ -31,3 +31,11 @@ def test_fix_invalid_separators():
value = 'Alan|Orth'
assert fix.separators(value) == 'Alan||Orth'
def test_fix_unnecessary_unicode():
    '''Test fixing unnecessary Unicode.'''

    # The characters under test are invisible in most editors, so they are
    # written as escape sequences to keep the inputs readable and to stop
    # them being lost when the file is copied or re-encoded.

    # Zero-width space (U+200B)
    value = 'Alan\u200b Orth'
    assert fix.unnecessary_unicode(value) == 'Alan Orth'

    # Replacement character (U+FFFD)
    value = 'Alan Orth\ufffd'
    assert fix.unnecessary_unicode(value) == 'Alan Orth'

    # No-break space (U+00A0)
    value = 'Alan Orth\u00a0'
    assert fix.unnecessary_unicode(value) == 'Alan Orth'