From 62fea950877b336711831ac81502d551b9567b4b Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Fri, 9 Aug 2019 01:22:59 +0300
Subject: [PATCH] Improve suspicious character detection

Now it prints only the part of the metadata value that starts at the
suspicious character, truncated to 80 characters so the message does
not wrap on terminals that default to an 80-character width.

Also print the name of the field that contains the metadata value so
that it is easier for the user to locate.
---
 csv_metadata_quality/app.py   |  2 +-
 csv_metadata_quality/check.py | 19 +++++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 2b0b5ab..7af4388 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -48,7 +48,7 @@ def run(argv):
         df[column] = df[column].apply(check.separators)
 
         # Check: suspicious characters
-        df[column] = df[column].apply(check.suspicious_characters)
+        df[column] = df[column].apply(check.suspicious_characters, field_name=column)
 
         # Fix: invalid multi-value separator
         if args.unsafe_fixes:
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index f057bc9..478612d 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -128,7 +128,7 @@ def date(field):
         return field
 
 
-def suspicious_characters(field):
+def suspicious_characters(field, field_name):
     """Warn about suspicious characters.
 
     Look for standalone characters that could indicate encoding or copy/paste
@@ -143,10 +143,21 @@ def suspicious_characters(field):
     suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
 
     for character in suspicious_characters:
-        character_set = set(character)
+        # Find the position of the suspicious character in the string
+        suspicious_character_position = field.find(character)
 
-        if character_set.issubset(field):
-            print(f'Suspicious character: {field}')
+        # str.find() returns -1 if there is no match
+        if suspicious_character_position != -1:
+            # Create a temporary new string starting from the position of the
+            # suspicious character
+            field_subset = field[suspicious_character_position:]
+
+            # Print the metadata value starting from the suspicious character,
+            # which gives enough context for a preview. The whole message is
+            # truncated to 80 characters so it does not wrap in terminals that
+            # default to an 80-character width.
+            suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
+            print(f'{suspicious_character_msg:1.80}')
 
     return field