Add fix.py

Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields).
2025-09-18 01:16:41 +02:00 · 2019-07-26 19:08:28 +03:00
parent 21b78b9519
commit 801870e0ba
1 changed files with 73 additions and 0 deletions
--- a/fix.py
+++ b/fix.py
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 import pandas as pd
 def fix_whitespace(value):
    """Fix whitespace issues.
    Return string with leading, trailing, and consecutive whitespace trimmed.
    """
    import re
    # Skip cells with missing values
    if pd.isna(value):
        return
    # Try to split multi-value cells on "||" separator
    #for value in cell.split('||'):
    # Check for leading whitespace
    pattern = re.compile(r'^\s+')
    match = re.findall(pattern, value)
    if len(match) > 0:
        print('DEBUG: Leading whitespace')
        value = re.sub(pattern, '', value)
    # Check for leading whitespace in multi-value cells
    # SOME VALUE|| ANOTHER VALUE
    pattern = re.compile(r'\|\|\s+')
    match = re.findall(pattern, value)
    if len(match) > 0:
        print('DEBUG: Leading whitespace in multi-value cell')
        value = re.sub(pattern, '||', value)
    # Check for trailing whitespace
    pattern = re.compile(r'\s+$')
    match = re.findall(pattern, value)
    if len(match) > 0:
        print('DEBUG: Trailing whitespace')
        value = re.sub(pattern, '', value)
    # Check for trailing whitespace in multi-value cells
    # SOME VALUE ||ANOTHER VALUE
    pattern = re.compile(r'\s+\|\|')
    match = re.findall(pattern, value)
    if len(match) > 0:
        print('DEBUG: Trailing whitespace in multi-value cell')
        value = re.sub(pattern, '||', value)
    return value
 # Read all fields as strings so dates don't get converted from 1998 to 1998.0
 df = pd.read_csv('/home/aorth/Downloads/2019-07-26-Bioversity-Migration.csv', dtype=str)
 #df = pd.read_csv('/tmp/quality.csv')
 #df = pd.read_csv('/tmp/omg.csv')
 # Fix whitespace in all columns
 for column in df.columns.values.tolist():
    print(column)
    # Skip the id column
    #if column == 'id':
    #    continue
    df[column] = df[column].apply(fix_whitespace)
 # Write
 df.to_csv('/tmp/omg.fixed.csv', index=False)