From 40e77db713ca17b5e64573f10cc15f32db9cc660 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 28 Jul 2019 22:53:39 +0300 Subject: [PATCH] Add "unsafe fixes" runtime option In this case it fixes occurrences of invalid multi-value separators. DSpace uses "||" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "|" for some legitimate purpose. --- csv_metadata_quality/app.py | 7 +++++++ csv_metadata_quality/fix.py | 30 ++++++++++++++++++++++++++++++ data/test.csv | 2 +- tests/test_fix.py | 8 ++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 3fcc45d..5d236ba 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -9,6 +9,7 @@ def parse_args(argv): parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.') parser.add_argument('--input-file', '-i', help='Path to input file. 
Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8')) parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8')) + parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true') args = parser.parse_args() return args @@ -28,6 +29,12 @@ def main(argv): # Run invalid multi-value separator check on all columns df[column] = df[column].apply(check.separators) + # Run invalid multi-value separator fix on all columns + if args.unsafe_fixes: + df[column] = df[column].apply(fix.separators) + # Run whitespace fix again after fixing invalid separators + df[column] = df[column].apply(fix.whitespace) + # check if column is an issn column like dc.identifier.issn match = re.match(r'^.*?issn.*$', column) if match is not None: diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 8144f4e..0bec8e2 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -35,3 +35,33 @@ def whitespace(field): new_field = '||'.join(values) return new_field + + +def separators(field): + """Fix for invalid multi-value separators (ie "|").""" + + # Skip fields with missing values + if pd.isna(field): + return + + # Initialize an empty list to hold the cleaned values + values = list() + + # Try to split multi-value field on "||" separator + for value in field.split('||'): + # After splitting, see if there are any remaining "|" characters + pattern = re.compile(r'\|') + match = re.findall(pattern, value) + + if len(match) > 0: + print(f'Fixing invalid multi-value separator: {value}') + + value = re.sub(pattern, '||', value) + + # Save cleaned value + values.append(value) + + # Create a new field consisting of all values joined with "||" + new_field = '||'.join(values) + + return new_field diff --git a/data/test.csv b/data/test.csv index c21c84f..852b0f5 100644 --- a/data/test.csv +++ b/data/test.csv @@ 
-4,4 +4,4 @@ Stella|| Stella ||Stella Orth||Stella ,1984-11-27,2321-2302,99921-58-10-7 Sophia,2019-06-15,, Test,2019-06-150,, "Doe, J.",2019-06-15||2019-01-10,, -Someone,,, +Someone,,0378-5955|0378-5955, diff --git a/tests/test_fix.py b/tests/test_fix.py index 1241819..dd20af8 100644 --- a/tests/test_fix.py +++ b/tests/test_fix.py @@ -23,3 +23,11 @@ def test_fix_excessive_whitespace(): value = 'Alan Orth' assert fix.whitespace(value) == 'Alan Orth' + + +def test_fix_invalid_separators(): + '''Test fixing invalid multi-value separators.''' + + value = 'Alan|Orth' + + assert fix.separators(value) == 'Alan||Orth'