Improve suspicious character detection

The check now prints only the part of the metadata value that starts at the suspicious character, with the whole message limited to 80 characters so that it does not wrap on terminals that default to an 80-character width. It also prints the name of the field that contains the metadata value, so that the user can locate it more easily.
2025-09-16 00:16:40 +02:00 · 2019-08-09 01:22:59 +03:00
parent 8772bdec51
commit 62fea95087
2 changed files with 16 additions and 5 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -48,7 +48,7 @@ def run(argv):
        df[column] = df[column].apply(check.separators)
        # Check: suspicious characters
-        df[column] = df[column].apply(check.suspicious_characters)
+        df[column] = df[column].apply(check.suspicious_characters, field_name=column)
        # Fix: invalid multi-value separator
        if args.unsafe_fixes:
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -128,7 +128,7 @@ def date(field):
        return field
-def suspicious_characters(field):
+def suspicious_characters(field, field_name):
    """Warn about suspicious characters.
    Look for standalone characters that could indicate encoding or copy/paste
@@ -143,10 +143,21 @@ def suspicious_characters(field):
    suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
    for character in suspicious_characters:
-        character_set = set(character)
+        # Find the position of the suspicious character in the string
        suspicious_character_position = field.find(character)
-        if character_set.issubset(field):
+        # Python returns -1 if there is no match
-            print(f'Suspicious character: {field}')
+        if suspicious_character_position != -1:
            # Create a temporary new string starting from the position of the
            # suspicious character
            field_subset = field[suspicious_character_position:]
            # Print part of the metadata value starting from the suspicious
            # character and spanning enough of the rest to give a preview,
            # but not too much to cause the line to break in terminals with
            # a default of 80 characters width.
            suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
            print(f'{suspicious_character_msg:1.80}')
    return field