diff --git a/README.md b/README.md index b88c22b..45d270a 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht - Validate subjects against the AGROVOC REST API - Fix leading, trailing, and excessive (ie, more than one) whitespace - Fix invalid multi-value separators (`|`) using `--unsafe-fixes` +- Fix problematic newlines (line feeds) using `--unsafe-fixes` - Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc - Check for "suspicious" characters that indicate encoding or copy/paste issues, for example "foreˆt" should be "forêt" - Remove duplicate metadata values diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index e30d4cd..bc02a1a 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -25,6 +25,10 @@ def main(argv): # Fix: whitespace df[column] = df[column].apply(fix.whitespace) + # Fix: newlines + if args.unsafe_fixes: + df[column] = df[column].apply(fix.newlines) + # Fix: unnecessary Unicode df[column] = df[column].apply(fix.unnecessary_unicode) diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 2d40558..90b897d 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -134,3 +134,34 @@ def duplicates(field): new_field = '||'.join(new_values) return new_field + + +def newlines(field): + """Fix newlines. + + Single metadata values should not span multiple lines because this is not + rendered properly in DSpace's XMLUI and even causes issues during import. + + Implementation note: this currently only detects Unix line feeds (0x0a). + This is essentially when a user presses "Enter" to move to the next line. 
+ Other newlines like the Windows carriage return are already handled with + the string stripping performed in the whitespace fixes. + + Confusingly, in Vim '\n' matches a line feed when searching, but you must + use '\r' to *insert* a line feed, ie in a search and replace expression. + + Return string with newlines removed. + """ + + # Skip fields with missing values + if pd.isna(field): + return + + # Check for Unix line feed (LF) + match = re.findall(r'\n', field) + + if match: + print(f'Removing newline: {field}') + field = field.replace('\n', '') + + return field diff --git a/data/test.csv b/data/test.csv index a533ba5..1023732 100644 --- a/data/test.csv +++ b/data/test.csv @@ -17,3 +17,5 @@ Invalid ISO 639-2 language,2019-07-29,,,jp, Invalid ISO 639-3 language,2019-07-29,,,chi, Invalid language,2019-07-29,,,Span, Invalid AGROVOC subject,2019-07-29,,,,FOREST +Newline,2019-07-30,,,,"TANZA +NIA" diff --git a/tests/test_fix.py b/tests/test_fix.py index 421aa41..beccc8c 100644 --- a/tests/test_fix.py +++ b/tests/test_fix.py @@ -47,3 +47,12 @@ def test_fix_duplicates(): value = 'Kenya||Kenya' assert fix.duplicates(value) == 'Kenya' + + +def test_fix_newlines(): + '''Test fixing newlines.''' + + value = '''Ken +ya''' + + assert fix.newlines(value) == 'Kenya'