1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 05:45:02 +01:00

csv_metadata_quality/fix.py: configure ftfy

Configure ftfy so that it does not replace smart quotes with ASCII
quotes. If our text has them, we should keep them.
This commit is contained in:
Alan Orth 2021-12-15 21:51:51 +02:00
parent 8b15154285
commit ff49a80432
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -5,7 +5,7 @@ from unicodedata import normalize
import pandas as pd import pandas as pd
from colorama import Fore from colorama import Fore
from ftfy import fix_text from ftfy import fix_text, TextFixerConfig
from csv_metadata_quality.util import is_mojibake, is_nfc from csv_metadata_quality.util import is_mojibake, is_nfc
@@ -280,9 +280,12 @@ def mojibake(field, field_name):
if pd.isna(field): if pd.isna(field):
return field return field
# We don't want ftfy to change “smart quotes” to "ASCII quotes"
config = TextFixerConfig(uncurl_quotes=False)
if is_mojibake(field): if is_mojibake(field):
print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}") print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
return fix_text(field) return fix_text(field, config)
else: else:
return field return field