From e7322efaddde6f3a105a20bddabe9b0bb3687465 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 15 Dec 2021 13:53:25 +0200 Subject: [PATCH] csv_metadata_quality/app.py: move unnecessary Unicode fix We actually want to do this after we try to fix mojibake with ftfy. These "unnecessary" Unicode characters could actually help ftfy in some cases because they often indicate that some character from another encoding was there before (like an accent, dash, or smart quote). --- csv_metadata_quality/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 362958e..d025be9 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -103,9 +103,6 @@ def run(argv): if args.unsafe_fixes: df[column] = df[column].apply(fix.normalize_unicode, field_name=column) - # Fix: unnecessary Unicode - df[column] = df[column].apply(fix.unnecessary_unicode) - # Check: suspicious characters df[column].apply(check.suspicious_characters, field_name=column) @@ -115,6 +112,9 @@ def run(argv): else: df[column].apply(check.mojibake, field_name=column) + # Fix: unnecessary Unicode + df[column] = df[column].apply(fix.unnecessary_unicode) + # Fix: invalid and unnecessary multi-value separators df[column] = df[column].apply(fix.separators, field_name=column) # Run whitespace fix again after fixing invalid separators