From ff49a80432df645e531d5c48f7ae77782a7060f5 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 15 Dec 2021 21:51:51 +0200 Subject: [PATCH] csv_metadata_quality/fix.py: configure ftfy Don't replace smart quotes in ftfy. If our text has them we should keep them. --- csv_metadata_quality/fix.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 2e9cd85..83d57ca 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -5,7 +5,7 @@ from unicodedata import normalize import pandas as pd from colorama import Fore -from ftfy import fix_text +from ftfy import fix_text, TextFixerConfig from csv_metadata_quality.util import is_mojibake, is_nfc @@ -280,9 +280,12 @@ def mojibake(field, field_name): if pd.isna(field): return field + # We don't want ftfy to change “smart quotes” to "ASCII quotes" + config = TextFixerConfig(uncurl_quotes=False) + if is_mojibake(field): print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}") - return fix_text(field) + return fix_text(field, config) else: return field