1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 13:55:03 +01:00

csv_metadata_quality/fix.py: Replace non-breaking spaces

We should be replacing non-breaking spaces (U+00A0) with normal spaces
instead of removing them.
This commit is contained in:
Alan Orth 2019-10-01 16:55:04 +03:00
parent 1c75608d54
commit c42f8b4812
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -74,10 +74,10 @@ def unnecessary_unicode(field):
Removes unnecessary Unicode characters like: Removes unnecessary Unicode characters like:
- Zero-width space (U+200B) - Zero-width space (U+200B)
- Replacement character (U+FFFD) - Replacement character (U+FFFD)
- No-break space (U+00A0)
Replaces unnecessary Unicode characters like: Replaces unnecessary Unicode characters like:
- Soft hyphen (U+00AD) hyphen - Soft hyphen (U+00AD) hyphen
- No-break space (U+00A0) space
Return string with characters removed or replaced. Return string with characters removed or replaced.
""" """
@ -107,8 +107,8 @@ def unnecessary_unicode(field):
match = re.findall(pattern, field) match = re.findall(pattern, field)
if match: if match:
print(f"Removing unnecessary Unicode (U+00A0): {field}") print(f"Replacing unnecessary Unicode (U+00A0): {field}")
field = re.sub(pattern, "", field) field = re.sub(pattern, " ", field)
# Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen
pattern = re.compile(r"\u002D*?\u00AD") pattern = re.compile(r"\u002D*?\u00AD")