1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 12:12:18 +01:00

Improve suspicious character detection

Now it will print just the part of the metadata value that contains
the suspicious character (up to 80 characters, so that the line does
not wrap on terminals that use an 80-character width by default).

Also, print the name of the field that contains the metadata value so
that it is easier for the user to locate.
This commit is contained in:
Alan Orth 2019-08-09 01:22:59 +03:00
parent 8772bdec51
commit 62fea95087
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 16 additions and 5 deletions

View File

@ -48,7 +48,7 @@ def run(argv):
df[column] = df[column].apply(check.separators) df[column] = df[column].apply(check.separators)
# Check: suspicious characters # Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters) df[column] = df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid multi-value separator # Fix: invalid multi-value separator
if args.unsafe_fixes: if args.unsafe_fixes:

View File

@ -128,7 +128,7 @@ def date(field):
return field return field
def suspicious_characters(field): def suspicious_characters(field, field_name):
"""Warn about suspicious characters. """Warn about suspicious characters.
Look for standalone characters that could indicate encoding or copy/paste Look for standalone characters that could indicate encoding or copy/paste
@ -143,10 +143,21 @@ def suspicious_characters(field):
suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060'] suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
for character in suspicious_characters: for character in suspicious_characters:
character_set = set(character) # Find the position of the suspicious character in the string
suspicious_character_position = field.find(character)
if character_set.issubset(field): # Python returns -1 if there is no match
print(f'Suspicious character: {field}') if suspicious_character_position != -1:
# Create a temporary new string starting from the position of the
# suspicious character
field_subset = field[suspicious_character_position:]
# Print part of the metadata value starting from the suspicious
# character and spanning enough of the rest to give a preview,
# but not too much to cause the line to break in terminals with
# a default of 80 characters width.
suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
print(f'{suspicious_character_msg:1.80}')
return field return field