diff --git a/csv_metadata_quality/__init__.py b/csv_metadata_quality/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/csv_metadata_quality/__main__.py b/csv_metadata_quality/__main__.py new file mode 100644 index 0000000..2e23872 --- /dev/null +++ b/csv_metadata_quality/__main__.py @@ -0,0 +1,4 @@ +from csv_metadata_quality import app + +if __name__ == '__main__': + app.run() diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py new file mode 100644 index 0000000..223121d --- /dev/null +++ b/csv_metadata_quality/app.py @@ -0,0 +1,17 @@ +import csv_metadata_quality.fix as fix +import pandas as pd + +def run(): + # Read all fields as strings so dates don't get converted from 1998 to 1998.0 + #df = pd.read_csv('/home/aorth/Downloads/2019-07-26-Bioversity-Migration.csv', dtype=str) + #df = pd.read_csv('/tmp/quality.csv', dtype=str) + df = pd.read_csv('/tmp/omg.csv', dtype=str) + + # Fix whitespace in all columns + for column in df.columns.values.tolist(): + print(f'DEBUG: {column}') + + df[column] = df[column].apply(fix.whitespace) + + # Write + df.to_csv('/tmp/omg.fixed.csv', index=False) diff --git a/fix.py b/csv_metadata_quality/fix.py similarity index 63% rename from fix.py rename to csv_metadata_quality/fix.py index c3bfe78..7a449da 100755 --- a/fix.py +++ b/csv_metadata_quality/fix.py @@ -1,14 +1,16 @@ -#!/usr/bin/env python3 - import pandas as pd +import re -def fix_whitespace(field): +def alan(): + print('Alan') + + +def whitespace(field): """Fix whitespace issues. Return string with leading, trailing, and consecutive whitespace trimmed. """ - import re # Skip fields with missing values if pd.isna(field): @@ -38,17 +40,3 @@ def fix_whitespace(field): return new_field - -# Read all fields as strings so dates don't get converted from 1998 to 1998.0 -#df = pd.read_csv('/home/aorth/Downloads/2019-07-26-Bioversity-Migration.csv', dtype=str) -#df = pd.read_csv('/tmp/quality.csv', dtype=str) -df = pd.read_csv('/tmp/omg.csv', dtype=str) - -# Fix whitespace in all columns -for column in df.columns.values.tolist(): - print(f'DEBUG: {column}') - - df[column] = df[column].apply(fix_whitespace) - -# Write -df.to_csv('/tmp/omg.fixed.csv', index=False)