1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-07-06 06:21:36 +02:00

Expand check/fix for multi-value separators

I just came across some metadata that had unnecessary multi-value
separators at the end of a field, causing a blank value to be used.

For example: "Kenya||Tanzania||"
This commit is contained in:
2021-01-03 15:30:03 +02:00
parent c26ad83534
commit 0dc66c5c4e
4 changed files with 30 additions and 5 deletions

View File

@@ -42,7 +42,14 @@ def whitespace(field, field_name):
def separators(field, field_name):
"""Fix for invalid multi-value separators (ie "|")."""
"""Fix for invalid and unnecessary multi-value separators, for example:
value|value
value|||value
value||value||
Prints the field with the invalid multi-value separator.
"""
# Skip fields with missing values
if pd.isna(field):
@@ -53,6 +60,12 @@ def separators(field, field_name):
# Try to split multi-value field on "||" separator
for value in field.split("||"):
# Check if the value is blank and skip it
if value == "":
print(f"Fixing unnecessary multi-value separator ({field_name}): {field}")
continue
# After splitting, see if there are any remaining "|" characters
pattern = re.compile(r"\|")
match = re.findall(pattern, value)