1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.
This commit is contained in:
2019-07-29 17:08:49 +03:00
parent 8047a57cc5
commit fa4fa3491b
5 changed files with 39 additions and 0 deletions

View File

@ -31,6 +31,9 @@ def main(argv):
# Check: invalid multi-value separator
df[column] = df[column].apply(check.separators)
# Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters)
# Fix: invalid multi-value separator
if args.unsafe_fixes:
df[column] = df[column].apply(fix.separators)

View File

@ -124,3 +124,26 @@ def date(field):
return field
except ValueError:
print(f'Invalid date: {field}')
def suspicious_characters(field):
    """Warn about suspicious characters.

    Look for standalone characters that could indicate encoding or copy/paste
    errors for languages with accents. For example: foreˆt should be forêt.

    The issue cannot be fixed automatically, so a single warning containing
    the whole field is printed when at least one suspicious character occurs.

    Args:
        field: A single cell value; either a string or a missing value
            (anything ``pd.isna`` reports as missing).

    Returns:
        ``None`` when the field is a missing value, otherwise the field
        itself, unchanged.
    """
    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Standalone accent-like characters: acute accent (U+00B4), modifier
    # letter circumflex accent (U+02C6), tilde (U+007E), grave accent (U+0060).
    suspicious = ('\u00B4', '\u02C6', '\u007E', '\u0060')

    # Warn once per field, no matter how many suspicious characters it holds;
    # the message is identical for every character, so repeating it per match
    # would only add noise.
    if any(character in field for character in suspicious):
        print(f'Suspicious character: {field}')

    return field