mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-17 19:47:03 +01:00
Improve suspicious character detection
Now it prints only the part of the metadata value that starts at the suspicious character, with the output line limited to 80 characters so that it does not wrap on terminals that default to an 80-character width. It also prints the name of the field that contains the metadata value, so that the value is easier for the user to locate.
This commit is contained in:
parent
8772bdec51
commit
62fea95087
@ -48,7 +48,7 @@ def run(argv):
|
||||
df[column] = df[column].apply(check.separators)
|
||||
|
||||
# Check: suspicious characters
|
||||
df[column] = df[column].apply(check.suspicious_characters)
|
||||
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
||||
|
||||
# Fix: invalid multi-value separator
|
||||
if args.unsafe_fixes:
|
||||
|
@ -128,7 +128,7 @@ def date(field):
|
||||
return field
|
||||
|
||||
|
||||
def suspicious_characters(field):
|
||||
def suspicious_characters(field, field_name):
|
||||
"""Warn about suspicious characters.
|
||||
|
||||
Look for standalone characters that could indicate encoding or copy/paste
|
||||
@ -143,10 +143,21 @@ def suspicious_characters(field):
|
||||
suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
|
||||
|
||||
for character in suspicious_characters:
|
||||
character_set = set(character)
|
||||
# Find the position of the suspicious character in the string
|
||||
suspicious_character_position = field.find(character)
|
||||
|
||||
if character_set.issubset(field):
|
||||
print(f'Suspicious character: {field}')
|
||||
# Python returns -1 if there is no match
|
||||
if suspicious_character_position != -1:
|
||||
# Create a temporary new string starting from the position of the
|
||||
# suspicious character
|
||||
field_subset = field[suspicious_character_position:]
|
||||
|
||||
# Print part of the metadata value starting from the suspicious
|
||||
# character and spanning enough of the rest to give a preview,
|
||||
# but not too much to cause the line to break in terminals with
|
||||
# a default of 80 characters width.
|
||||
suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
|
||||
print(f'{suspicious_character_msg:1.80}')
|
||||
|
||||
return field
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user