Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or copy/paste in languages with accents like French and Spanish. For example: foreˆt should be forêt. It is not possible to fix these issues automatically, but this will print a warning so you can notify the owner of the data.
2025-08-07 05:25:37 +02:00 · 2019-07-29 17:08:49 +03:00
parent 8047a57cc5
commit fa4fa3491b
5 changed files with 39 additions and 0 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -31,6 +31,9 @@ def main(argv):
        # Check: invalid multi-value separator
        df[column] = df[column].apply(check.separators)

+        # Check: suspicious characters
+        df[column] = df[column].apply(check.suspicious_characters)
+
        # Fix: invalid multi-value separator
        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.separators)
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -124,3 +124,26 @@ def date(field):
        return field
    except ValueError:
        print(f'Invalid date: {field}')
+
+
+def suspicious_characters(field):
+    """Warn about standalone accent characters in a field.
+
+    A standalone ´, ˆ, ~ or ` often indicates an encoding or copy/paste
+    error in languages with accents. For example: foreˆt should be forêt.
+
+    Prints at most one warning per field. Returns the field unchanged, or
+    None when the value is missing.
+    """
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Spacing (standalone) forms of the accents: ´ ˆ ~ `
+    suspects = ('\u00B4', '\u02C6', '\u007E', '\u0060')
+
+    # Warn once per field; str() tolerates non-string values such as numbers
+    if any(suspect in str(field) for suspect in suspects):
+        print(f'Suspicious character: {field}')
+
+    return field