diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index ce85048..b348eb0 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -82,7 +82,7 @@ def run(argv): continue # Fix: whitespace - df[column] = df[column].apply(fix.whitespace) + df[column] = df[column].apply(fix.whitespace, field_name=column) # Fix: newlines if args.unsafe_fixes: @@ -104,19 +104,19 @@ def run(argv): df[column] = df[column].apply(fix.unnecessary_unicode) # Check: invalid multi-value separator - df[column] = df[column].apply(check.separators) + df[column] = df[column].apply(check.separators, field_name=column) # Check: suspicious characters df[column] = df[column].apply(check.suspicious_characters, field_name=column) # Fix: invalid multi-value separator if args.unsafe_fixes: - df[column] = df[column].apply(fix.separators) + df[column] = df[column].apply(fix.separators, field_name=column) # Run whitespace fix again after fixing invalid separators - df[column] = df[column].apply(fix.whitespace) + df[column] = df[column].apply(fix.whitespace, field_name=column) # Fix: duplicate metadata values - df[column] = df[column].apply(fix.duplicates) + df[column] = df[column].apply(fix.duplicates, field_name=column) # Check: invalid AGROVOC subject if args.agrovoc_fields: diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index dd94aef..bafb2c7 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -51,7 +51,7 @@ def isbn(field): return field -def separators(field): +def separators(field, field_name): """Check for invalid multi-value separators (ie "|" or "|||"). Prints the field with the invalid multi-value separator. @@ -70,7 +70,7 @@ def separators(field): match = re.findall(r"^.*?\|.*$", value) if match: - print(f"Invalid multi-value separator: {field}") + print(f"Invalid multi-value separator ({field_name}): {field}") return field diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 2270750..8d729b1 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -3,7 +3,7 @@ import re import pandas as pd -def whitespace(field): +def whitespace(field, field_name): """Fix whitespace issues. Return string with leading, trailing, and consecutive whitespace trimmed. @@ -26,7 +26,7 @@ def whitespace(field): match = re.findall(pattern, value) if match: - print(f"Removing excessive whitespace: {value}") + print(f"Removing excessive whitespace ({field_name}): {value}") value = re.sub(pattern, " ", value) # Save cleaned value @@ -38,7 +38,7 @@ def whitespace(field): return new_field -def separators(field): +def separators(field, field_name): """Fix for invalid multi-value separators (ie "|").""" # Skip fields with missing values @@ -55,7 +55,7 @@ def separators(field): match = re.findall(pattern, value) if match: - print(f"Fixing invalid multi-value separator: {value}") + print(f"Fixing invalid multi-value separator ({field_name}): {value}") value = re.sub(pattern, "||", value) @@ -121,7 +121,7 @@ def unnecessary_unicode(field): return field -def duplicates(field): +def duplicates(field, field_name): """Remove duplicate metadata values.""" # Skip fields with missing values @@ -140,7 +140,7 @@ def duplicates(field): if value not in new_values: new_values.append(value) else: - print(f"Removing duplicate value: {value}") + print(f"Removing duplicate value ({field_name}): {value}") # Create a new field consisting of all values joined with "||" new_field = "||".join(new_values) diff --git a/tests/test_check.py b/tests/test_check.py index dfbfd1d..b9f8619 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -51,10 +51,12 @@ def test_check_invalid_separators(capsys): value = "Alan|Orth" - check.separators(value) + field_name = "dc.contributor.author" + + check.separators(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Invalid multi-value separator: {value}\n" + assert captured.out == f"Invalid multi-value separator ({field_name}): {value}\n" def test_check_valid_separators(): @@ -62,7 +64,9 @@ def test_check_valid_separators(): value = "Alan||Orth" - result = check.separators(value) + field_name = "dc.contributor.author" + + result = check.separators(value, field_name) assert result == value diff --git a/tests/test_fix.py b/tests/test_fix.py index 9b56796..a2cd066 100644 --- a/tests/test_fix.py +++ b/tests/test_fix.py @@ -6,7 +6,9 @@ def test_fix_leading_whitespace(): value = " Alan" - assert fix.whitespace(value) == "Alan" + field_name = "dc.contributor.author" + + assert fix.whitespace(value, field_name) == "Alan" def test_fix_trailing_whitespace(): @@ -14,7 +16,9 @@ def test_fix_trailing_whitespace(): value = "Alan " - assert fix.whitespace(value) == "Alan" + field_name = "dc.contributor.author" + + assert fix.whitespace(value, field_name) == "Alan" def test_fix_excessive_whitespace(): @@ -22,7 +26,9 @@ def test_fix_excessive_whitespace(): value = "Alan Orth" - assert fix.whitespace(value) == "Alan Orth" + field_name = "dc.contributor.author" + + assert fix.whitespace(value, field_name) == "Alan Orth" def test_fix_invalid_separators(): @@ -30,7 +36,9 @@ def test_fix_invalid_separators(): value = "Alan|Orth" - assert fix.separators(value) == "Alan||Orth" + field_name = "dc.contributor.author" + + assert fix.separators(value, field_name) == "Alan||Orth" def test_fix_unnecessary_unicode(): @@ -46,7 +54,9 @@ def test_fix_duplicates(): value = "Kenya||Kenya" - assert fix.duplicates(value) == "Kenya" + field_name = "dc.contributor.author" + + assert fix.duplicates(value, field_name) == "Kenya" def test_fix_newlines():