Mirror of https://github.com/ilri/csv-metadata-quality.git, synced 2025-05-10 07:06:00 +02:00
Add checks and unsafe fixes for mojibake
This detects whether text has likely been encoded in one encoding and decoded in another, perhaps multiple times. This often results in display of "mojibake" characters.

For example, a file encoded in UTF-8 is opened as CP-1252 (the Windows Latin code page) in Microsoft Excel and saved again as UTF-8. You will see strings like this in the resulting file:

- CIAT PublicaÃ§ao
- CIAT PublicaciÃ³n

The correct versions of these in UTF-8 would be:

- CIAT Publicaçao
- CIAT Publicación

I use a code snippet from Martijn Pieters on Stack Overflow to detect whether a string is "weird" as determined by the excellent "fixes text for you" (ftfy) Python library, then check whether the weird string encodes as CP-1252 or not. If so, I can try to fix it.

See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python
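As a rough illustration of the approach described above, here is a minimal sketch of such a detector, assuming ftfy < 6.0 (which still exposes sequence_weirdness; it was removed in 6.0). The actual helper referenced by this commit lives in util.py and may differ:

    import ftfy.bad_codecs  # noqa: F401 -- importing registers the "sloppy-" codec variants
    from ftfy.badness import sequence_weirdness  # removed in ftfy >= 6.0


    def is_mojibake(field):
        """Guess whether a string is mojibake, per the cited snippet."""
        if not sequence_weirdness(field):
            # Nothing weird: most likely legitimate text.
            return False
        try:
            field.encode("sloppy-windows-1252")
        except UnicodeEncodeError:
            # Not representable in CP-1252, so probably genuine non-Latin text.
            return False
        else:
            # Weird *and* representable in CP-1252: likely mojibake.
            return True


    # Reproduce the round trip from the commit message: UTF-8 bytes read as CP-1252.
    garbled = "CIAT Publicación".encode("utf-8").decode("cp1252")
    print(garbled)                          # CIAT PublicaciÃ³n
    print(is_mojibake(garbled))             # True
    print(is_mojibake("CIAT Publicación"))  # False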
@@ -3,8 +3,9 @@ from unicodedata import normalize
 
 import pandas as pd
 from colorama import Fore
+from ftfy import fix_text
 
-from csv_metadata_quality.util import is_nfc
+from csv_metadata_quality.util import is_mojibake, is_nfc
 
 
 def whitespace(field, field_name):
@@ -253,3 +254,22 @@ def normalize_unicode(field, field_name):
     field = normalize("NFC", field)
 
     return field
+
+
+def mojibake(field, field_name):
+    """Attempts to fix mojibake (text that was encoded in one encoding and
+    decoded in another, perhaps multiple times). See util.py.
+
+    Return fixed string.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return field
+
+    if is_mojibake(field):
+        print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
+
+        return fix_text(field)
+    else:
+        return field
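For context, a hypothetical usage sketch: the input file, column name, and the module path (csv_metadata_quality.fix) are assumptions for illustration, not taken from this commit. It applies the new fix across a CSV column with pandas, which forwards keyword arguments through Series.apply:

    import pandas as pd

    from csv_metadata_quality import fix  # assumed module housing mojibake()

    df = pd.read_csv("items.csv")  # hypothetical input file
    # Series.apply passes field_name through to the fix function.
    df["dc.title"] = df["dc.title"].apply(fix.mojibake, field_name="dc.title")
    df.to_csv("items-fixed.csv", index=False)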