csv_metadata_quality/fix.py: configure ftfy

Configure ftfy so that it doesn't replace smart quotes with ASCII
quotes. If our text has them we should keep them.
This commit is contained in:
Alan Orth 2021-12-15 21:51:51 +02:00
parent 8b15154285
commit ff49a80432
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
1 changed file with 5 additions and 2 deletions

View File

@@ -5,7 +5,7 @@ from unicodedata import normalize
import pandas as pd
from colorama import Fore
from ftfy import fix_text
from ftfy import fix_text, TextFixerConfig
from csv_metadata_quality.util import is_mojibake, is_nfc
@@ -280,9 +280,12 @@ def mojibake(field, field_name):
if pd.isna(field):
return field
# We don't want ftfy to change “smart quotes” to "ASCII quotes"
config = TextFixerConfig(uncurl_quotes=False)
if is_mojibake(field):
print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
return fix_text(field)
return fix_text(field, config)
else:
return field