1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 04:02:19 +01:00

Output field name for more fixes and checks

This helps identify which field has the error.
This commit is contained in:
Alan Orth 2020-01-16 12:35:11 +02:00
parent 40ba9bae6c
commit 28b5996aa6
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
5 changed files with 35 additions and 21 deletions

View File

@ -82,7 +82,7 @@ def run(argv):
continue continue
# Fix: whitespace # Fix: whitespace
df[column] = df[column].apply(fix.whitespace) df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines # Fix: newlines
if args.unsafe_fixes: if args.unsafe_fixes:
@ -104,19 +104,19 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode) df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: invalid multi-value separator # Check: invalid multi-value separator
df[column] = df[column].apply(check.separators) df[column] = df[column].apply(check.separators, field_name=column)
# Check: suspicious characters # Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column) df[column] = df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid multi-value separator # Fix: invalid multi-value separator
if args.unsafe_fixes: if args.unsafe_fixes:
df[column] = df[column].apply(fix.separators) df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators # Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace) df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: duplicate metadata values # Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates) df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject # Check: invalid AGROVOC subject
if args.agrovoc_fields: if args.agrovoc_fields:

View File

@ -51,7 +51,7 @@ def isbn(field):
return field return field
def separators(field): def separators(field, field_name):
"""Check for invalid multi-value separators (ie "|" or "|||"). """Check for invalid multi-value separators (ie "|" or "|||").
Prints the field with the invalid multi-value separator. Prints the field with the invalid multi-value separator.
@ -70,7 +70,7 @@ def separators(field):
match = re.findall(r"^.*?\|.*$", value) match = re.findall(r"^.*?\|.*$", value)
if match: if match:
print(f"Invalid multi-value separator: {field}") print(f"Invalid multi-value separator ({field_name}): {field}")
return field return field

View File

@ -3,7 +3,7 @@ import re
import pandas as pd import pandas as pd
def whitespace(field): def whitespace(field, field_name):
"""Fix whitespace issues. """Fix whitespace issues.
Return string with leading, trailing, and consecutive whitespace trimmed. Return string with leading, trailing, and consecutive whitespace trimmed.
@ -26,7 +26,7 @@ def whitespace(field):
match = re.findall(pattern, value) match = re.findall(pattern, value)
if match: if match:
print(f"Removing excessive whitespace: {value}") print(f"Removing excessive whitespace ({field_name}): {value}")
value = re.sub(pattern, " ", value) value = re.sub(pattern, " ", value)
# Save cleaned value # Save cleaned value
@ -38,7 +38,7 @@ def whitespace(field):
return new_field return new_field
def separators(field): def separators(field, field_name):
"""Fix for invalid multi-value separators (ie "|").""" """Fix for invalid multi-value separators (ie "|")."""
# Skip fields with missing values # Skip fields with missing values
@ -55,7 +55,7 @@ def separators(field):
match = re.findall(pattern, value) match = re.findall(pattern, value)
if match: if match:
print(f"Fixing invalid multi-value separator: {value}") print(f"Fixing invalid multi-value separator ({field_name}): {value}")
value = re.sub(pattern, "||", value) value = re.sub(pattern, "||", value)
@ -121,7 +121,7 @@ def unnecessary_unicode(field):
return field return field
def duplicates(field): def duplicates(field, field_name):
"""Remove duplicate metadata values.""" """Remove duplicate metadata values."""
# Skip fields with missing values # Skip fields with missing values
@ -140,7 +140,7 @@ def duplicates(field):
if value not in new_values: if value not in new_values:
new_values.append(value) new_values.append(value)
else: else:
print(f"Removing duplicate value: {value}") print(f"Removing duplicate value ({field_name}): {value}")
# Create a new field consisting of all values joined with "||" # Create a new field consisting of all values joined with "||"
new_field = "||".join(new_values) new_field = "||".join(new_values)

View File

@ -51,10 +51,12 @@ def test_check_invalid_separators(capsys):
value = "Alan|Orth" value = "Alan|Orth"
check.separators(value) field_name = "dc.contributor.author"
check.separators(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid multi-value separator: {value}\n" assert captured.out == f"Invalid multi-value separator ({field_name}): {value}\n"
def test_check_valid_separators(): def test_check_valid_separators():
@ -62,7 +64,9 @@ def test_check_valid_separators():
value = "Alan||Orth" value = "Alan||Orth"
result = check.separators(value) field_name = "dc.contributor.author"
result = check.separators(value, field_name)
assert result == value assert result == value

View File

@ -6,7 +6,9 @@ def test_fix_leading_whitespace():
value = " Alan" value = " Alan"
assert fix.whitespace(value) == "Alan" field_name = "dc.contributor.author"
assert fix.whitespace(value, field_name) == "Alan"
def test_fix_trailing_whitespace(): def test_fix_trailing_whitespace():
@ -14,7 +16,9 @@ def test_fix_trailing_whitespace():
value = "Alan " value = "Alan "
assert fix.whitespace(value) == "Alan" field_name = "dc.contributor.author"
assert fix.whitespace(value, field_name) == "Alan"
def test_fix_excessive_whitespace(): def test_fix_excessive_whitespace():
@ -22,7 +26,9 @@ def test_fix_excessive_whitespace():
value = "Alan  Orth" value = "Alan  Orth"
assert fix.whitespace(value) == "Alan Orth" field_name = "dc.contributor.author"
assert fix.whitespace(value, field_name) == "Alan Orth"
def test_fix_invalid_separators(): def test_fix_invalid_separators():
@ -30,7 +36,9 @@ def test_fix_invalid_separators():
value = "Alan|Orth" value = "Alan|Orth"
assert fix.separators(value) == "Alan||Orth" field_name = "dc.contributor.author"
assert fix.separators(value, field_name) == "Alan||Orth"
def test_fix_unnecessary_unicode(): def test_fix_unnecessary_unicode():
@ -46,7 +54,9 @@ def test_fix_duplicates():
value = "Kenya||Kenya" value = "Kenya||Kenya"
assert fix.duplicates(value) == "Kenya" field_name = "dc.contributor.author"
assert fix.duplicates(value, field_name) == "Kenya"
def test_fix_newlines(): def test_fix_newlines():