From c42f8b4812c5f2500c7f86e797925918448b1c13 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 1 Oct 2019 16:55:04 +0300 Subject: [PATCH] csv_metadata_quality/fix.py: Replace non-breaking spaces We should be replacing non-breaking spaces (U+00A0) with normal spaces instead of removing them. --- csv_metadata_quality/fix.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 38ebb56..4ea0ad9 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -74,10 +74,10 @@ def unnecessary_unicode(field): Removes unnecessary Unicode characters like: - Zero-width space (U+200B) - Replacement character (U+FFFD) - - No-break space (U+00A0) Replaces unnecessary Unicode characters like: - Soft hyphen (U+00AD) → hyphen + - No-break space (U+00A0) → space Return string with characters removed or replaced. """ @@ -107,8 +107,8 @@ def unnecessary_unicode(field): match = re.findall(pattern, field) if match: - print(f"Removing unnecessary Unicode (U+00A0): {field}") - field = re.sub(pattern, "", field) + print(f"Replacing unnecessary Unicode (U+00A0): {field}") + field = re.sub(pattern, " ", field) # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen pattern = re.compile(r"\u002D*?\u00AD")