mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-30 16:04:29 +01:00
csv_metadata_quality/app.py: move unnecessary Unicode fix
We actually want to remove "unnecessary" Unicode characters only after we try to fix mojibake with ftfy. These characters can help ftfy in some cases, because they often indicate that a character from another encoding was there before (like an accent, dash, or smart quote).
This commit is contained in:
parent
95015febbd
commit
e7322efadd
@ -103,9 +103,6 @@ def run(argv):
|
||||
if args.unsafe_fixes:
|
||||
df[column] = df[column].apply(fix.normalize_unicode, field_name=column)
|
||||
|
||||
# Fix: unnecessary Unicode
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
# Check: suspicious characters
|
||||
df[column].apply(check.suspicious_characters, field_name=column)
|
||||
|
||||
@ -115,6 +112,9 @@ def run(argv):
|
||||
else:
|
||||
df[column].apply(check.mojibake, field_name=column)
|
||||
|
||||
# Fix: unnecessary Unicode
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
# Fix: invalid and unnecessary multi-value separators
|
||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||
# Run whitespace fix again after fixing invalid separators
|
||||
|
Loading…
Reference in New Issue
Block a user