From fa4fa3491bcd20319e8ae482009968859dd3c5e9 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 29 Jul 2019 17:08:49 +0300 Subject: [PATCH] Add check for "suspicious" characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These standalone characters often indicate issues with encoding or copy/paste in languages with accents like French and Spanish. For example: foreˆt should be forêt. It is not possible to fix these issues automatically, but this will print a warning so you can notify the owner of the data. --- README.md | 1 + csv_metadata_quality/app.py | 3 +++ csv_metadata_quality/check.py | 23 +++++++++++++++++++++++ data/test.csv | 1 + tests/test_check.py | 11 +++++++++++ 5 files changed, 39 insertions(+) diff --git a/README.md b/README.md index 45cc9cc..2d3b364 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht - Fix leading, trailing, and excessive whitespace ✓ - Fix invalid multi-value separators ("|") using `--unsafe-fixes` ✓ - Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc ✓ +- Check for "suspicious" characters that could indicate encoding or copy/paste issues, for example "foreˆt" should be "forêt" ✓ ## Installation The easiest way to install CSV Metadata Quality is with [pipenv](https://github.com/pypa/pipenv): diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index a753f12..3d7745a 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -31,6 +31,9 @@ def main(argv): # Check: invalid multi-value separator df[column] = df[column].apply(check.separators) + # Check: suspicious characters + df[column] = df[column].apply(check.suspicious_characters) + # Fix: invalid multi-value separator if args.unsafe_fixes: df[column] = df[column].apply(fix.separators) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 28640cd..bd87272 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -124,3 +124,26 @@ def date(field): return field except ValueError: print(f'Invalid date: {field}') + + +def suspicious_characters(field): + """Warn about suspicious characters. + + Look for standalone characters that could indicate encoding or copy/paste + errors for languages with accents. For example: foreˆt should be forêt. + """ + + # Skip fields with missing values + if pd.isna(field): + return + + # List of suspicious characters, for example: ́ˆ~` + suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060'] + + for character in suspicious_characters: + character_set = set(character) + + if character_set.issubset(field): + print(f'Suspicious character: {field}') + + return field diff --git a/data/test.csv b/data/test.csv index 9e244f6..5a1192a 100644 --- a/data/test.csv +++ b/data/test.csv @@ -6,3 +6,4 @@ Test,2019-06-150,, "Doe, J.",2019-06-15||2019-01-10,, Someone,,0378-5955|0378-5955, Unnecessary Unicode​,2019-07-29,, +Suspicious Character||foreˆt,2019-07-29,, diff --git a/tests/test_check.py b/tests/test_check.py index dad28d0..dd45075 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -105,3 +105,14 @@ def test_check_valid_date(): result = check.date(value) assert result == value + + +def test_check_suspicious_characters(capsys): + '''Test checking for suspicious characters.''' + + value = 'foreˆt' + + check.suspicious_characters(value) + + captured = capsys.readouterr() + assert captured.out == f'Suspicious character: {value}\n'