mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-25 23:28:18 +01:00
Add check for "suspicious" characters
These standalone characters often indicate issues with encoding or copy/paste in languages with accents like French and Spanish. For example: foreˆt should be forêt. It is not possible to fix these issues automatically, but this will print a warning so you can notify the owner of the data.
This commit is contained in:
parent
8047a57cc5
commit
fa4fa3491b
@ -11,6 +11,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
|
|||||||
- Fix leading, trailing, and excessive whitespace ✓
|
- Fix leading, trailing, and excessive whitespace ✓
|
||||||
- Fix invalid multi-value separators ("|") using `--unsafe-fixes` ✓
|
- Fix invalid multi-value separators ("|") using `--unsafe-fixes` ✓
|
||||||
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc ✓
|
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc ✓
|
||||||
|
- Check for "suspicious" characters that could indicate encoding or copy/paste issues, for example "foreˆt" should be "forêt" ✓
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
The easiest way to install CSV Metadata Quality is with [pipenv](https://github.com/pypa/pipenv):
|
The easiest way to install CSV Metadata Quality is with [pipenv](https://github.com/pypa/pipenv):
|
||||||
|
@ -31,6 +31,9 @@ def main(argv):
|
|||||||
# Check: invalid multi-value separator
|
# Check: invalid multi-value separator
|
||||||
df[column] = df[column].apply(check.separators)
|
df[column] = df[column].apply(check.separators)
|
||||||
|
|
||||||
|
# Check: suspicious characters
|
||||||
|
df[column] = df[column].apply(check.suspicious_characters)
|
||||||
|
|
||||||
# Fix: invalid multi-value separator
|
# Fix: invalid multi-value separator
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
df[column] = df[column].apply(fix.separators)
|
df[column] = df[column].apply(fix.separators)
|
||||||
|
@ -124,3 +124,26 @@ def date(field):
|
|||||||
return field
|
return field
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print(f'Invalid date: {field}')
|
print(f'Invalid date: {field}')
|
||||||
|
|
||||||
|
|
||||||
|
def suspicious_characters(field):
    """Warn about suspicious characters.

    Look for standalone characters that could indicate encoding or copy/paste
    errors for languages with accents. For example: foreˆt should be forêt.

    A single ``Suspicious character: <field>`` line is printed to standard
    output when the field contains at least one of the characters below, no
    matter how many of them occur. The value itself is never modified.

    Args:
        field: The cell value to inspect; either a string or a missing value
            as recognised by ``pd.isna``.

    Returns:
        The field unchanged, or None when the field is a missing value.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Standalone accent marks that normally should have been combined with a
    # letter: acute accent (U+00B4), modifier letter circumflex (U+02C6),
    # tilde (U+007E) and grave accent (U+0060).
    suspects = ('\u00B4', '\u02C6', '\u007E', '\u0060')

    # Warn once per field: the message only names the field, so repeating it
    # for every matching character would just produce duplicate output.
    if any(character in field for character in suspects):
        print(f'Suspicious character: {field}')

    return field
|
||||||
|
@ -6,3 +6,4 @@ Test,2019-06-150,,
|
|||||||
"Doe, J.",2019-06-15||2019-01-10,,
|
"Doe, J.",2019-06-15||2019-01-10,,
|
||||||
Someone,,0378-5955|0378-5955,
|
Someone,,0378-5955|0378-5955,
|
||||||
Unnecessary Unicode,2019-07-29,,
|
Unnecessary Unicode,2019-07-29,,
|
||||||
|
Suspicious Character||foreˆt,2019-07-29,,
|
||||||
|
|
@ -105,3 +105,14 @@ def test_check_valid_date():
|
|||||||
result = check.date(value)
|
result = check.date(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_suspicious_characters(capsys):
    '''Check that a standalone circumflex triggers the suspicious character warning.'''

    field = 'foreˆt'
    expected_output = f'Suspicious character: {field}\n'

    check.suspicious_characters(field)

    # Only standard output is of interest; the warning is emitted with print().
    assert capsys.readouterr().out == expected_output
|
||||||
|
Loading…
Reference in New Issue
Block a user