diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 47bd4dc..36111ca 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -107,6 +107,13 @@ def run(argv):
         # Check: suspicious characters
         df[column].apply(check.suspicious_characters, field_name=column)
 
+        # Check: mojibake
+        df[column].apply(check.mojibake, field_name=column)
+
+        # Fix: mojibake
+        if args.unsafe_fixes:
+            df[column] = df[column].apply(fix.mojibake, field_name=column)
+
         # Fix: invalid and unnecessary multi-value separators
         df[column] = df[column].apply(fix.separators, field_name=column)
         # Run whitespace fix again after fixing invalid separators
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index 07b2919..7c0355b 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -11,6 +11,8 @@ from pycountry import languages
 from stdnum import isbn as stdnum_isbn
 from stdnum import issn as stdnum_issn
 
+from csv_metadata_quality.util import is_mojibake
+
 
 def issn(field):
     """Check if an ISSN is valid.
@@ -345,3 +347,22 @@ def duplicate_items(df):
             )
         else:
             items.append(item_title_type_date)
+
+
+def mojibake(field, field_name):
+    """Check for mojibake (text that was encoded in one encoding and decoded in
+    another, perhaps multiple times). See util.py.
+
+    Prints the string if it contains suspected mojibake.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    if is_mojibake(field):
+        print(
+            f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
+        )
+
+    return
diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py
index ccba484..c4d741d 100755
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -3,8 +3,9 @@ from unicodedata import normalize
 
 import pandas as pd
 from colorama import Fore
+from ftfy import fix_text
 
-from csv_metadata_quality.util import is_nfc
+from csv_metadata_quality.util import is_mojibake, is_nfc
 
 
 def whitespace(field, field_name):
@@ -253,3 +254,22 @@ def normalize_unicode(field, field_name):
         field = normalize("NFC", field)
 
     return field
+
+
+def mojibake(field, field_name):
+    """Attempts to fix mojibake (text that was encoded in one encoding and
+    decoded in another, perhaps multiple times). See util.py.
+
+    Return fixed string.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return field
+
+    if is_mojibake(field):
+        print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")
+
+        return fix_text(field)
+    else:
+        return field
diff --git a/csv_metadata_quality/util.py b/csv_metadata_quality/util.py
index bcd0c06..32b0dbe 100644
--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@@ -1,3 +1,6 @@
+from ftfy.badness import sequence_weirdness
+
+
 def is_nfc(field):
     """Utility function to check whether a string is using normalized Unicode.
     Python's built-in unicodedata library has the is_normalized() function, but
@@ -12,3 +15,35 @@ def is_nfc(field):
     from unicodedata import normalize
 
     return field == normalize("NFC", field)
+
+
+def is_mojibake(field):
+    """Determines whether a string contains mojibake.
+
+    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
+    as something else like CP-1252 (Windows Latin). This manifests in the form
+    of "mojibake", for example:
+
+        - CIAT PublicaÃ§ao
+        - CIAT PublicaciÃ³n
+
+    This uses the excellent "fixes text for you" (ftfy) library to determine
+    whether a string contains characters that have been encoded in one encoding
+    and decoded in another.
+
+    Inspired by this code snippet from Martijn Pieters on StackOverflow:
+    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python
+
+    Return boolean.
+    """
+    if not sequence_weirdness(field):
+        # Nothing weird, should be okay
+        return False
+    try:
+        field.encode("sloppy-windows-1252")
+    except UnicodeEncodeError:
+        # Not CP-1252 encodable, probably fine
+        return False
+    else:
+        # Encodable as CP-1252, Mojibake alert level high
+        return True
diff --git a/pyproject.toml b/pyproject.toml
index b564d9b..8886d13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ pycountry = "^19.8.18"
 langid = "^1.1.6"
 colorama = "^0.4.4"
 spdx-license-list = "^0.5.2"
+ftfy = "^5.9"
 
 [tool.poetry.dev-dependencies]
 pytest = "^6.1.1"