mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes
Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail.
This commit is contained in:
parent
13d5221378
commit
232ff99898
@ -68,14 +68,17 @@ def separators(field):
|
|||||||
|
|
||||||
|
|
||||||
def unnecessary_unicode(field):
|
def unnecessary_unicode(field):
|
||||||
"""Remove unnecessary Unicode characters.
|
"""Remove and replace unnecessary Unicode characters.
|
||||||
|
|
||||||
Removes unnecessary Unicode characters like:
|
Removes unnecessary Unicode characters like:
|
||||||
- Zero-width space (U+200B)
|
- Zero-width space (U+200B)
|
||||||
- Replacement character (U+FFFD)
|
- Replacement character (U+FFFD)
|
||||||
- No-break space (U+00A0)
|
- No-break space (U+00A0)
|
||||||
|
|
||||||
Return string with characters removed.
|
Replaces unnecessary Unicode characters like:
|
||||||
|
- Soft hyphen (U+00AD) → hyphen
|
||||||
|
|
||||||
|
Return string with characters removed or replaced.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
|
|||||||
print(f'Removing unnecessary Unicode (U+00A0): {field}')
|
print(f'Removing unnecessary Unicode (U+00A0): {field}')
|
||||||
field = re.sub(pattern, '', field)
|
field = re.sub(pattern, '', field)
|
||||||
|
|
||||||
|
# Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
|
||||||
|
pattern = re.compile(r'\u002D*?\u00AD')
|
||||||
|
match = re.findall(pattern, field)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
print(f'Replacing unnecessary Unicode (U+00AD): {field}')
|
||||||
|
field = re.sub(pattern, '-', field)
|
||||||
|
|
||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user