diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 1695673..a753f12 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -25,6 +25,9 @@ def main(argv):
         # Fix: whitespace
         df[column] = df[column].apply(fix.whitespace)
 
+        # Fix: unnecessary Unicode
+        df[column] = df[column].apply(fix.unnecessary_unicode)
+
         # Check: invalid multi-value separator
         df[column] = df[column].apply(check.separators)
 
diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py
index 1d273ef..32c218d 100755
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -65,3 +65,51 @@ def separators(field):
     new_field = '||'.join(values)
 
     return new_field
+
+
+def unnecessary_unicode(field):
+    """Remove or replace unnecessary Unicode characters.
+
+    Removes characters that carry no content:
+        - Zero-width space (U+200B)
+        - Replacement character (U+FFFD)
+
+    Replaces with a regular space, so adjacent words are not fused:
+        - No-break space (U+00A0)
+
+    Prints a message for each kind of character found.
+
+    Return the cleaned string, or None when the field is missing.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Check for zero-width space characters (U+200B)
+    pattern = re.compile(r'\u200B')
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f'Removing unnecessary Unicode (U+200B): {field}')
+        field = re.sub(pattern, '', field)
+
+    # Check for replacement characters (U+FFFD)
+    pattern = re.compile(r'\uFFFD')
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f'Removing unnecessary Unicode (U+FFFD): {field}')
+        field = re.sub(pattern, '', field)
+
+    # Check for no-break spaces (U+00A0)
+    pattern = re.compile(r'\u00A0')
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f'Replacing unnecessary Unicode (U+00A0): {field}')
+        # Substitute a regular space rather than deleting: the no-break
+        # space separates words, so removing it would join them.
+        field = re.sub(pattern, ' ', field)
+
+    return field
diff --git a/data/test.csv b/data/test.csv
index 852b0f5..9e244f6 100644
--- a/data/test.csv
+++ b/data/test.csv
@@ -5,3 +5,4 @@ Sophia,2019-06-15,,
 Test,2019-06-150,,
 "Doe, J.",2019-06-15||2019-01-10,,
 Someone,,0378-5955|0378-5955,
+Unnecessary 
Unicode​,2019-07-29,,
diff --git a/tests/test_fix.py b/tests/test_fix.py
index dd20af8..e02514a 100644
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -31,3 +31,13 @@ def test_fix_invalid_separators():
     value = 'Alan|Orth'
 
     assert fix.separators(value) == 'Alan||Orth'
+
+
+def test_fix_unnecessary_unicode():
+    '''Test fixing unnecessary Unicode.'''
+
+    # Zero-width space (U+200B) after "Alan", written as an escape so the
+    # invisible character cannot be lost silently by an editor or formatter.
+    value = 'Alan\u200b Orth'
+
+    assert fix.unnecessary_unicode(value) == 'Alan Orth'