From 62fea950877b336711831ac81502d551b9567b4b Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 9 Aug 2019 01:22:59 +0300 Subject: [PATCH] Improve suspicious character detection Now it prints only the part of the metadata value starting at the suspicious character, with the whole message truncated to 80 characters so that it does not wrap on terminals that default to an 80-character width. It also prints the name of the field that contains the metadata value so that the value is easier for the user to locate. --- csv_metadata_quality/app.py | 2 +- csv_metadata_quality/check.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 2b0b5ab..7af4388 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -48,7 +48,7 @@ def run(argv): df[column] = df[column].apply(check.separators) # Check: suspicious characters - df[column] = df[column].apply(check.suspicious_characters) + df[column] = df[column].apply(check.suspicious_characters, field_name=column) # Fix: invalid multi-value separator if args.unsafe_fixes: diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index f057bc9..478612d 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -128,7 +128,7 @@ def date(field): return field -def suspicious_characters(field): +def suspicious_characters(field, field_name): """Warn about suspicious characters. 
Look for standalone characters that could indicate encoding or copy/paste @@ -143,10 +143,21 @@ def suspicious_characters(field): suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060'] for character in suspicious_characters: - character_set = set(character) + # Find the position of the suspicious character in the string + suspicious_character_position = field.find(character) - if character_set.issubset(field): - print(f'Suspicious character: {field}') + # Python returns -1 if there is no match + if suspicious_character_position != -1: + # Create a temporary new string starting from the position of the + # suspicious character + field_subset = field[suspicious_character_position:] + + # Print part of the metadata value starting from the suspicious + # character and spanning enough of the rest to give a preview, + # but not too much to cause the line to break in terminals with + # a default of 80 characters width. + suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}' + print(f'{suspicious_character_msg:1.80}') return field