Add checks and unsafe fixes for mojibake

This detects whether text has likely been encoded in one encoding and decoded in another, perhaps multiple times. This often results in the display of "mojibake" characters. For example, a file encoded in UTF-8 is opened as CP-1252 (Windows Latin codepage) in Microsoft Excel, and saved again as UTF-8. You will see strings like this in the resulting file: - CIAT PublicaÃ§ao - CIAT PublicaciÃ³n The correct version of these in UTF-8 would be: - CIAT Publicaçao - CIAT Publicación I use a code snippet from Martijn Pieters on StackOverflow to detect whether a string is "weird" as determined by the excellent "fixes text for you" (ftfy) Python library, then check if a weird string encodes as CP-1252 or not. If so, I can try to fix it. See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python
2025-09-12 22:57:02 +02:00 · 2021-03-19 10:22:21 +02:00
parent e92ec5d371
commit 898bb412c3
5 changed files with 85 additions and 1 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -107,6 +107,13 @@ def run(argv):
        # Check: suspicious characters
        df[column].apply(check.suspicious_characters, field_name=column)
        # Check: mojibake
        df[column].apply(check.mojibake, field_name=column)
        # Fix: mojibake
        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.mojibake, field_name=column)
        # Fix: invalid and unnecessary multi-value separators
        df[column] = df[column].apply(fix.separators, field_name=column)
        # Run whitespace fix again after fixing invalid separators
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -11,6 +11,8 @@ from pycountry import languages
 from stdnum import isbn as stdnum_isbn
 from stdnum import issn as stdnum_issn
 from csv_metadata_quality.util import is_mojibake
 def issn(field):
    """Check if an ISSN is valid.
@@ -345,3 +347,22 @@ def duplicate_items(df):
                )
            else:
                items.append(item_title_type_date)
 def mojibake(field, field_name):
    """Report a field value that appears to contain mojibake.

    Mojibake is text that was encoded in one encoding and decoded in another,
    perhaps multiple times. Detection is delegated to is_mojibake (see
    util.py); this check only prints a warning and never alters the value.

    field: the cell value to inspect (missing values are ignored).
    field_name: the column name, used only in the printed message.

    Returns None in every case.
    """

    # Nothing to report for missing values or for text that looks fine
    if pd.isna(field) or not is_mojibake(field):
        return None

    warning = (
        f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
    )
    print(warning)

    return None
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -3,8 +3,9 @@ from unicodedata import normalize
 import pandas as pd
 from colorama import Fore
 from ftfy import fix_text
-from csv_metadata_quality.util import is_nfc
+from csv_metadata_quality.util import is_mojibake, is_nfc
 def whitespace(field, field_name):
@@ -253,3 +254,22 @@ def normalize_unicode(field, field_name):
        field = normalize("NFC", field)
    return field
 def mojibake(field, field_name):
    """Attempt to repair mojibake in a field value.

    Mojibake is text that was encoded in one encoding and decoded in another,
    perhaps multiple times. Detection is delegated to is_mojibake (see
    util.py); when it flags the value, the value is announced on stdout and
    passed through ftfy's fix_text.

    field: the cell value to repair (missing values are passed through).
    field_name: the column name, used only in the printed message.

    Returns the fixed string, or the original value when it is missing or not
    flagged as mojibake.
    """

    # Leave missing values and unflagged text exactly as they are
    if pd.isna(field) or not is_mojibake(field):
        return field

    print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")

    return fix_text(field)
--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@@ -1,3 +1,6 @@
 from ftfy.badness import sequence_weirdness
 def is_nfc(field):
    """Utility function to check whether a string is using normalized Unicode.
    Python's built-in unicodedata library has the is_normalized() function, but
@@ -12,3 +15,35 @@ def is_nfc(field):
    from unicodedata import normalize
    return field == normalize("NFC", field)
 def is_mojibake(field):
    """Decide whether a string probably contains mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the form
    of "mojibake", for example:
        - CIAT PublicaÃ§ao
        - CIAT PublicaciÃ³n

    The test has two stages. First the "fixes text for you" (ftfy) library's
    sequence_weirdness is consulted; a falsy result means the string is
    treated as fine. Otherwise the string is trial-encoded with the
    "sloppy-windows-1252" codec: if that succeeds the string is reported as
    mojibake, and if it raises UnicodeEncodeError it is not.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    if sequence_weirdness(field):
        # Something looks odd; see whether the text fits in CP-1252
        try:
            field.encode("sloppy-windows-1252")
        except UnicodeEncodeError:
            # Not CP-1252 encodable, probably fine
            return False

        # Encodable as CP-1252, Mojibake alert level high
        return True

    # Nothing weird, should be okay
    return False
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ pycountry = "^19.8.18"
 langid = "^1.1.6"
 colorama = "^0.4.4"
 spdx-license-list = "^0.5.2"
 ftfy = "^5.9"
 [tool.poetry.dev-dependencies]
 pytest = "^6.1.1"