mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
csv_metadata_quality/fix.py: Add more unnecessary Unicode fixes
Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail.
This commit is contained in:
parent
13d5221378
commit
232ff99898
@ -68,14 +68,17 @@ def separators(field):
|
||||
|
||||
|
||||
def unnecessary_unicode(field):
|
||||
"""Remove unnecessary Unicode characters.
|
||||
"""Remove and replace unnecessary Unicode characters.
|
||||
|
||||
Removes unnecessary Unicode characters like:
|
||||
- Zero-width space (U+200B)
|
||||
- Replacement character (U+FFFD)
|
||||
- No-break space (U+00A0)
|
||||
|
||||
Return string with characters removed.
|
||||
Replaces unnecessary Unicode characters like:
|
||||
- Soft hyphen (U+00AD) → hyphen
|
||||
|
||||
Return string with characters removed or replaced.
|
||||
"""
|
||||
|
||||
# Skip fields with missing values
|
||||
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
|
||||
print(f'Removing unnecessary Unicode (U+00A0): {field}')
|
||||
field = re.sub(pattern, '', field)
|
||||
|
||||
# Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
|
||||
pattern = re.compile(r'\u002D*?\u00AD')
|
||||
match = re.findall(pattern, field)
|
||||
|
||||
if match:
|
||||
print(f'Replacing unnecessary Unicode (U+00AD): {field}')
|
||||
field = re.sub(pattern, '-', field)
|
||||
|
||||
return field
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user