mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
csv_metadata_quality/fix.py: Replace non-breaking spaces
We should be replacing non-breaking spaces (U+00A0) with normal spaces instead of removing them.
This commit is contained in:
parent
1c75608d54
commit
c42f8b4812
@ -74,10 +74,10 @@ def unnecessary_unicode(field):
|
|||||||
Removes unnecessary Unicode characters like:
|
Removes unnecessary Unicode characters like:
|
||||||
- Zero-width space (U+200B)
|
- Zero-width space (U+200B)
|
||||||
- Replacement character (U+FFFD)
|
- Replacement character (U+FFFD)
|
||||||
- No-break space (U+00A0)
|
|
||||||
|
|
||||||
Replaces unnecessary Unicode characters like:
|
Replaces unnecessary Unicode characters like:
|
||||||
- Soft hyphen (U+00AD) → hyphen
|
- Soft hyphen (U+00AD) → hyphen
|
||||||
|
- No-break space (U+00A0) → space
|
||||||
|
|
||||||
Return string with characters removed or replaced.
|
Return string with characters removed or replaced.
|
||||||
"""
|
"""
|
||||||
@ -107,8 +107,8 @@ def unnecessary_unicode(field):
|
|||||||
match = re.findall(pattern, field)
|
match = re.findall(pattern, field)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
print(f"Removing unnecessary Unicode (U+00A0): {field}")
|
print(f"Replacing unnecessary Unicode (U+00A0): {field}")
|
||||||
field = re.sub(pattern, "", field)
|
field = re.sub(pattern, " ", field)
|
||||||
|
|
||||||
# Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen
|
# Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen
|
||||||
pattern = re.compile(r"\u002D*?\u00AD")
|
pattern = re.compile(r"\u002D*?\u00AD")
|
||||||
|
Loading…
Reference in New Issue
Block a user