1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-21 11:42:20 +01:00

csv_metadata_quality/app.py: move unnecessary Unicode fix

We want to remove "unnecessary" Unicode characters only after we try
to fix mojibake with ftfy. These characters can actually help ftfy in
some cases, because they often indicate that a character from another
encoding was there before (like an accent, dash, or smart quote).
This commit is contained in:
Alan Orth 2021-12-15 13:53:25 +02:00
parent 95015febbd
commit e7322efadd
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -103,9 +103,6 @@ def run(argv):
if args.unsafe_fixes:
df[column] = df[column].apply(fix.normalize_unicode, field_name=column)
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: suspicious characters
df[column].apply(check.suspicious_characters, field_name=column)
@@ -115,6 +112,9 @@ def run(argv):
else:
df[column].apply(check.mojibake, field_name=column)
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Fix: invalid and unnecessary multi-value separators
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators