mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-17 19:47:03 +01:00
Remove checks for invalid multi-value separators
Now that I no longer treat the fix for these as "unsafe", I don't actually need to check for them—I can just fix them when I see them.
This commit is contained in:
parent
3656e9f976
commit
10612cf891
@ -104,9 +104,6 @@ def run(argv):
|
|||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Check: invalid and unnecessary multi-value separators
|
|
||||||
df[column] = df[column].apply(check.separators, field_name=column)
|
|
||||||
|
|
||||||
# Check: suspicious characters
|
# Check: suspicious characters
|
||||||
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
||||||
|
|
||||||
|
@ -58,42 +58,6 @@ def isbn(field):
|
|||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
def separators(field, field_name):
|
|
||||||
"""Check for invalid and unnecessary multi-value separators, for example:
|
|
||||||
|
|
||||||
value|value
|
|
||||||
value|||value
|
|
||||||
value||value||
|
|
||||||
|
|
||||||
Prints the field with the invalid multi-value separator.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Skip fields with missing values
|
|
||||||
if pd.isna(field):
|
|
||||||
return
|
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
|
||||||
for value in field.split("||"):
|
|
||||||
# Check if the current value is blank
|
|
||||||
if value == "":
|
|
||||||
print(
|
|
||||||
f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}"
|
|
||||||
)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# After splitting, see if there are any remaining "|" characters
|
|
||||||
match = re.findall(r"^.*?\|.*$", value)
|
|
||||||
|
|
||||||
# Check if there was a match
|
|
||||||
if match:
|
|
||||||
print(
|
|
||||||
f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return field
|
|
||||||
|
|
||||||
|
|
||||||
def date(field, field_name):
|
def date(field, field_name):
|
||||||
"""Check if a date is valid.
|
"""Check if a date is valid.
|
||||||
|
|
||||||
|
@ -47,50 +47,6 @@ def test_check_valid_isbn():
|
|||||||
assert result == value
|
assert result == value
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_separators(capsys):
|
|
||||||
"""Test checking invalid multi-value separators."""
|
|
||||||
|
|
||||||
value = "Alan|Orth"
|
|
||||||
|
|
||||||
field_name = "dc.contributor.author"
|
|
||||||
|
|
||||||
check.separators(value, field_name)
|
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert (
|
|
||||||
captured.out
|
|
||||||
== f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_check_unnecessary_separators(capsys):
|
|
||||||
"""Test checking unnecessary multi-value separators."""
|
|
||||||
|
|
||||||
field = "Alan||Orth||"
|
|
||||||
|
|
||||||
field_name = "dc.contributor.author"
|
|
||||||
|
|
||||||
check.separators(field, field_name)
|
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert (
|
|
||||||
captured.out
|
|
||||||
== f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_separators():
|
|
||||||
"""Test checking valid multi-value separators."""
|
|
||||||
|
|
||||||
value = "Alan||Orth"
|
|
||||||
|
|
||||||
field_name = "dc.contributor.author"
|
|
||||||
|
|
||||||
result = check.separators(value, field_name)
|
|
||||||
|
|
||||||
assert result == value
|
|
||||||
|
|
||||||
|
|
||||||
def test_check_missing_date(capsys):
|
def test_check_missing_date(capsys):
|
||||||
"""Test checking missing date."""
|
"""Test checking missing date."""
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user