#!/usr/bin/env python3
"""Clean up whitespace in the cells of a metadata CSV file.

Initial working version of a metadata cleaning script that fixes leading
and trailing whitespace (even in DSpace multi-value fields, where several
values share one cell separated by "||").
"""

import argparse
import re

import pandas as pd

# Paths used when the script is run without any command-line arguments.
DEFAULT_INPUT_FILE = '/home/aorth/Downloads/2019-07-26-Bioversity-Migration.csv'
DEFAULT_OUTPUT_FILE = '/tmp/omg.fixed.csv'

# Ordered (pattern, replacement, debug message) rules. They are compiled once
# at import time because fix_whitespace() is called for every cell.
_WHITESPACE_FIXES = (
    (re.compile(r'^\s+'), '', 'DEBUG: Leading whitespace'),
    # SOME VALUE|| ANOTHER VALUE
    (re.compile(r'\|\|\s+'), '||',
     'DEBUG: Leading whitespace in multi-value cell'),
    (re.compile(r'\s+$'), '', 'DEBUG: Trailing whitespace'),
    # SOME VALUE ||ANOTHER VALUE
    (re.compile(r'\s+\|\|'), '||',
     'DEBUG: Trailing whitespace in multi-value cell'),
    # SOME  VALUE (runs last, so only whitespace inside a value is left)
    (re.compile(r'\s{2,}'), ' ', 'DEBUG: Consecutive whitespace'),
)


def fix_whitespace(value):
    """Fix whitespace issues in a single cell.

    Strips leading and trailing whitespace from the cell as a whole and
    around the "||" separator of multi-value cells, then collapses every run
    of two or more whitespace characters into a single space. One debug line
    is printed for each kind of problem found in the cell.

    Args:
        value: The cell contents; normally a string, or NaN/None when the
            cell is missing.

    Returns:
        The cleaned string. Anything that is not a string (including missing
        values) is returned unchanged.
    """
    # Missing values arrive as NaN/None; there is nothing to trim, and
    # handing them back untouched keeps the column's missing-value markers.
    if not isinstance(value, str):
        return value

    for pattern, replacement, message in _WHITESPACE_FIXES:
        # subn() reports how many substitutions were made, so a single pass
        # both detects and fixes the problem.
        value, count = pattern.subn(replacement, value)
        if count:
            print(message)

    return value


def main(argv=None):
    """Read a CSV file, fix whitespace in every column, and write the result.

    Args:
        argv: Command-line arguments without the program name. When None,
            argparse falls back to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(
        description='Fix whitespace issues in a metadata CSV file.'
    )
    parser.add_argument(
        '-i', '--input-file',
        default=DEFAULT_INPUT_FILE,
        help='CSV file to read (default: %(default)s)',
    )
    parser.add_argument(
        '-o', '--output-file',
        default=DEFAULT_OUTPUT_FILE,
        help='CSV file to write (default: %(default)s)',
    )
    args = parser.parse_args(argv)

    # Read all fields as strings so dates don't get converted from 1998 to
    # 1998.0
    df = pd.read_csv(args.input_file, dtype=str)

    # Fix whitespace in all columns
    for column in df.columns:
        print(column)
        df[column] = df[column].apply(fix_whitespace)

    # Write
    df.to_csv(args.output_file, index=False)


if __name__ == '__main__':
    main()