mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-21 21:35:02 +01:00
csv_metadata_quality/fix.py: configure ftfy
Don't let ftfy replace “smart quotes” with ASCII quotes. If our text has them, we should keep them.
This commit is contained in:
parent
8b15154285
commit
ff49a80432
@ -5,7 +5,7 @@ from unicodedata import normalize
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from ftfy import fix_text
|
from ftfy import fix_text, TextFixerConfig
|
||||||
|
|
||||||
from csv_metadata_quality.util import is_mojibake, is_nfc
|
from csv_metadata_quality.util import is_mojibake, is_nfc
|
||||||
|
|
||||||
@ -280,9 +280,12 @@ def mojibake(field, field_name):
|
|||||||
if pd.isna(field):
|
if pd.isna(field):
|
||||||
return field
|
return field
|
||||||
|
|
||||||
|
# We don't want ftfy to change “smart quotes” to "ASCII quotes"
|
||||||
|
config = TextFixerConfig(uncurl_quotes=False)
|
||||||
|
|
||||||
if is_mojibake(field):
|
if is_mojibake(field):
|
||||||
print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
|
print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
|
||||||
|
|
||||||
return fix_text(field)
|
return fix_text(field, config)
|
||||||
else:
|
else:
|
||||||
return field
|
return field
|
||||||
|
Loading…
Reference in New Issue
Block a user