mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-11 23:56:00 +02:00
Add check for "suspicious" characters
These standalone characters often indicate issues with encoding or copy/paste in languages with accents like French and Spanish. For example: foreˆt should be forêt. It is not possible to fix these issues automatically, but this will print a warning so you can notify the owner of the data.
This commit is contained in:
@ -124,3 +124,26 @@ def date(field):
|
||||
return field
|
||||
except ValueError:
|
||||
print(f'Invalid date: {field}')
|
||||
|
||||
|
||||
def suspicious_characters(field):
    """Warn about suspicious characters.

    Look for standalone characters that could indicate encoding or copy/paste
    errors for languages with accents. For example: foreˆt should be forêt.
    These issues cannot be fixed automatically, so the field is only reported
    on standard output.

    Returns None when the field is missing (per ``pd.isna``); otherwise
    returns the field unchanged.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Standalone accent-like characters: acute accent (U+00B4), modifier
    # letter circumflex (U+02C6), tilde (U+007E) and grave accent (U+0060).
    suspects = ('\u00B4', '\u02C6', '\u007E', '\u0060')

    # Warn once per field, even when it contains several suspicious
    # characters, so the same message is not repeated for a single value.
    if any(character in field for character in suspects):
        print(f'Suspicious character: {field}')

    return field
|
||||
|
Reference in New Issue
Block a user