Refactor as package with subpackages

This makes it cleaner to introduce checks, fixes, tests, and docs in the future. Currently it can be run with "python -m csv_metadata_quality"; CSV input and output paths are still hard coded. See: https://dev.to/codemouse92/dead-simple-python-project-structure-and-imports-38c6
2025-08-23 05:11:49 +02:00 · 2019-07-26 22:11:10 +03:00
parent ef5b8f7244
commit 232d28e13e
4 changed files with 27 additions and 18 deletions
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -0,0 +1,42 @@
+import pandas as pd
+import re
+
+def alan():
+    """Print the string 'Alan' to standard output."""
+    name = 'Alan'
+    print(name)
+
+
+def whitespace(field):
+    """Fix whitespace issues.
+
+    Split the field on the "||" multi-value separator, strip leading and
+    trailing whitespace from each value, collapse every run of two or more
+    whitespace characters into a single space, and join the cleaned values
+    back together with "||".
+
+    Return the cleaned string, or None when the field is a missing value
+    (as reported by pd.isna).
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return None
+
+    # Compile once, outside the loop: matches two or more whitespace characters
+    pattern = re.compile(r'\s{2,}')
+
+    # Hold the cleaned values
+    values = []
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+        # Strip leading and trailing whitespace
+        value = value.strip()
+
+        # Replace excessive whitespace (two or more characters) with one space
+        if pattern.search(value):
+            print('DEBUG: Excessive whitespace')
+            value = pattern.sub(' ', value)
+
+        # Save cleaned value
+        values.append(value)
+
+    # Create a new field consisting of all values joined with "||"
+    return '||'.join(values)
+