mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Output field name for more fixes and checks
This helps identify which field has the error.
This commit is contained in:
parent
40ba9bae6c
commit
28b5996aa6
@ -82,7 +82,7 @@ def run(argv):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Fix: whitespace
|
# Fix: whitespace
|
||||||
df[column] = df[column].apply(fix.whitespace)
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
|
|
||||||
# Fix: newlines
|
# Fix: newlines
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
@ -104,19 +104,19 @@ def run(argv):
|
|||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Check: invalid multi-value separator
|
# Check: invalid multi-value separator
|
||||||
df[column] = df[column].apply(check.separators)
|
df[column] = df[column].apply(check.separators, field_name=column)
|
||||||
|
|
||||||
# Check: suspicious characters
|
# Check: suspicious characters
|
||||||
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
||||||
|
|
||||||
# Fix: invalid multi-value separator
|
# Fix: invalid multi-value separator
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
df[column] = df[column].apply(fix.separators)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# Run whitespace fix again after fixing invalid separators
|
||||||
df[column] = df[column].apply(fix.whitespace)
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
|
|
||||||
# Fix: duplicate metadata values
|
# Fix: duplicate metadata values
|
||||||
df[column] = df[column].apply(fix.duplicates)
|
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
||||||
|
|
||||||
# Check: invalid AGROVOC subject
|
# Check: invalid AGROVOC subject
|
||||||
if args.agrovoc_fields:
|
if args.agrovoc_fields:
|
||||||
|
@ -51,7 +51,7 @@ def isbn(field):
|
|||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
def separators(field):
|
def separators(field, field_name):
|
||||||
"""Check for invalid multi-value separators (ie "|" or "|||").
|
"""Check for invalid multi-value separators (ie "|" or "|||").
|
||||||
|
|
||||||
Prints the field with the invalid multi-value separator.
|
Prints the field with the invalid multi-value separator.
|
||||||
@ -70,7 +70,7 @@ def separators(field):
|
|||||||
match = re.findall(r"^.*?\|.*$", value)
|
match = re.findall(r"^.*?\|.*$", value)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
print(f"Invalid multi-value separator: {field}")
|
print(f"Invalid multi-value separator ({field_name}): {field}")
|
||||||
|
|
||||||
return field
|
return field
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def whitespace(field):
|
def whitespace(field, field_name):
|
||||||
"""Fix whitespace issues.
|
"""Fix whitespace issues.
|
||||||
|
|
||||||
Return string with leading, trailing, and consecutive whitespace trimmed.
|
Return string with leading, trailing, and consecutive whitespace trimmed.
|
||||||
@ -26,7 +26,7 @@ def whitespace(field):
|
|||||||
match = re.findall(pattern, value)
|
match = re.findall(pattern, value)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
print(f"Removing excessive whitespace: {value}")
|
print(f"Removing excessive whitespace ({field_name}): {value}")
|
||||||
value = re.sub(pattern, " ", value)
|
value = re.sub(pattern, " ", value)
|
||||||
|
|
||||||
# Save cleaned value
|
# Save cleaned value
|
||||||
@ -38,7 +38,7 @@ def whitespace(field):
|
|||||||
return new_field
|
return new_field
|
||||||
|
|
||||||
|
|
||||||
def separators(field):
|
def separators(field, field_name):
|
||||||
"""Fix for invalid multi-value separators (ie "|")."""
|
"""Fix for invalid multi-value separators (ie "|")."""
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
@ -55,7 +55,7 @@ def separators(field):
|
|||||||
match = re.findall(pattern, value)
|
match = re.findall(pattern, value)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
print(f"Fixing invalid multi-value separator: {value}")
|
print(f"Fixing invalid multi-value separator ({field_name}): {value}")
|
||||||
|
|
||||||
value = re.sub(pattern, "||", value)
|
value = re.sub(pattern, "||", value)
|
||||||
|
|
||||||
@ -121,7 +121,7 @@ def unnecessary_unicode(field):
|
|||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
def duplicates(field):
|
def duplicates(field, field_name):
|
||||||
"""Remove duplicate metadata values."""
|
"""Remove duplicate metadata values."""
|
||||||
|
|
||||||
# Skip fields with missing values
|
# Skip fields with missing values
|
||||||
@ -140,7 +140,7 @@ def duplicates(field):
|
|||||||
if value not in new_values:
|
if value not in new_values:
|
||||||
new_values.append(value)
|
new_values.append(value)
|
||||||
else:
|
else:
|
||||||
print(f"Removing duplicate value: {value}")
|
print(f"Removing duplicate value ({field_name}): {value}")
|
||||||
|
|
||||||
# Create a new field consisting of all values joined with "||"
|
# Create a new field consisting of all values joined with "||"
|
||||||
new_field = "||".join(new_values)
|
new_field = "||".join(new_values)
|
||||||
|
@ -51,10 +51,12 @@ def test_check_invalid_separators(capsys):
|
|||||||
|
|
||||||
value = "Alan|Orth"
|
value = "Alan|Orth"
|
||||||
|
|
||||||
check.separators(value)
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
check.separators(value, field_name)
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert captured.out == f"Invalid multi-value separator: {value}\n"
|
assert captured.out == f"Invalid multi-value separator ({field_name}): {value}\n"
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_separators():
|
def test_check_valid_separators():
|
||||||
@ -62,7 +64,9 @@ def test_check_valid_separators():
|
|||||||
|
|
||||||
value = "Alan||Orth"
|
value = "Alan||Orth"
|
||||||
|
|
||||||
result = check.separators(value)
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
result = check.separators(value, field_name)
|
||||||
|
|
||||||
assert result == value
|
assert result == value
|
||||||
|
|
||||||
|
@ -6,7 +6,9 @@ def test_fix_leading_whitespace():
|
|||||||
|
|
||||||
value = " Alan"
|
value = " Alan"
|
||||||
|
|
||||||
assert fix.whitespace(value) == "Alan"
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
assert fix.whitespace(value, field_name) == "Alan"
|
||||||
|
|
||||||
|
|
||||||
def test_fix_trailing_whitespace():
|
def test_fix_trailing_whitespace():
|
||||||
@ -14,7 +16,9 @@ def test_fix_trailing_whitespace():
|
|||||||
|
|
||||||
value = "Alan "
|
value = "Alan "
|
||||||
|
|
||||||
assert fix.whitespace(value) == "Alan"
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
assert fix.whitespace(value, field_name) == "Alan"
|
||||||
|
|
||||||
|
|
||||||
def test_fix_excessive_whitespace():
|
def test_fix_excessive_whitespace():
|
||||||
@ -22,7 +26,9 @@ def test_fix_excessive_whitespace():
|
|||||||
|
|
||||||
value = "Alan Orth"
|
value = "Alan Orth"
|
||||||
|
|
||||||
assert fix.whitespace(value) == "Alan Orth"
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
assert fix.whitespace(value, field_name) == "Alan Orth"
|
||||||
|
|
||||||
|
|
||||||
def test_fix_invalid_separators():
|
def test_fix_invalid_separators():
|
||||||
@ -30,7 +36,9 @@ def test_fix_invalid_separators():
|
|||||||
|
|
||||||
value = "Alan|Orth"
|
value = "Alan|Orth"
|
||||||
|
|
||||||
assert fix.separators(value) == "Alan||Orth"
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
assert fix.separators(value, field_name) == "Alan||Orth"
|
||||||
|
|
||||||
|
|
||||||
def test_fix_unnecessary_unicode():
|
def test_fix_unnecessary_unicode():
|
||||||
@ -46,7 +54,9 @@ def test_fix_duplicates():
|
|||||||
|
|
||||||
value = "Kenya||Kenya"
|
value = "Kenya||Kenya"
|
||||||
|
|
||||||
assert fix.duplicates(value) == "Kenya"
|
field_name = "dc.contributor.author"
|
||||||
|
|
||||||
|
assert fix.duplicates(value, field_name) == "Kenya"
|
||||||
|
|
||||||
|
|
||||||
def test_fix_newlines():
|
def test_fix_newlines():
|
||||||
|
Loading…
Reference in New Issue
Block a user