1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Expand check/fix for multi-value separators

I just came across some metadata that had unnecessary multi-value
separators at the end of a field, causing a blank value to be used.

For example: "Kenya||Tanzania||"
This commit is contained in:
2021-01-03 15:30:03 +02:00
parent c26ad83534
commit 0dc66c5c4e
4 changed files with 30 additions and 5 deletions

View File

@ -103,13 +103,13 @@ def run(argv):
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: invalid multi-value separator
# Check: invalid and unnecessary multi-value separators
df[column] = df[column].apply(check.separators, field_name=column)
# Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid multi-value separator
# Fix: invalid and unnecessary multi-value separators
if args.unsafe_fixes:
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators

View File

@ -57,7 +57,11 @@ def isbn(field):
def separators(field, field_name):
"""Check for invalid multi-value separators (ie "|" or "|||").
"""Check for invalid and unnecessary multi-value separators, for example:
value|value
value|||value
value||value||
Prints the field with the invalid multi-value separator.
"""
@ -70,10 +74,16 @@ def separators(field, field_name):
# Try to split multi-value field on "||" separator
for value in field.split("||"):
# Check if the current value is blank
if value == "":
print(f"Unnecessary multi-value separator ({field_name}): {field}")
continue
# After splitting, see if there are any remaining "|" characters
match = re.findall(r"^.*?\|.*$", value)
# Check if there was a match
if match:
print(f"Invalid multi-value separator ({field_name}): {field}")

View File

@ -42,7 +42,14 @@ def whitespace(field, field_name):
def separators(field, field_name):
"""Fix for invalid multi-value separators (ie "|")."""
"""Fix for invalid and unnecessary multi-value separators, for example:
value|value
value|||value
value||value||
Prints the field with the invalid multi-value separator.
"""
# Skip fields with missing values
if pd.isna(field):
@ -53,6 +60,12 @@ def separators(field, field_name):
# Try to split multi-value field on "||" separator
for value in field.split("||"):
# Check if the value is blank and skip it
if value == "":
print(f"Fixing unnecessary multi-value separator ({field_name}): {field}")
continue
# After splitting, see if there are any remaining "|" characters
pattern = re.compile(r"\|")
match = re.findall(pattern, value)