1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-17 19:47:03 +01:00

Remove checks for invalid multi-value separators

Now that I no longer treat the fix for these as "unsafe" I don't
actually need to check for them—I can just fix them when I see them.
This commit is contained in:
Alan Orth 2021-03-14 21:01:21 +02:00
parent 3656e9f976
commit 10612cf891
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 0 additions and 83 deletions

View File

@ -104,9 +104,6 @@ def run(argv):
# Fix: unnecessary Unicode # Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode) df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: invalid and unnecessary multi-value separators
df[column] = df[column].apply(check.separators, field_name=column)
# Check: suspicious characters # Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column) df[column] = df[column].apply(check.suspicious_characters, field_name=column)

View File

@ -58,42 +58,6 @@ def isbn(field):
return field return field
def separators(field, field_name):
    """Report invalid or unnecessary multi-value separators in a field.

    Multiple values are expected to be joined with "||". Examples of fields
    that get flagged:

        value|value
        value|||value
        value||value||

    A message containing the whole field is printed for every problem found.
    Returns the field unchanged, or None when the field is missing.
    """

    # Nothing to inspect when the value is missing
    if pd.isna(field):
        return

    # Look at each piece obtained by splitting on the "||" separator
    for part in field.split("||"):
        if not part:
            # An empty piece means a "||" had nothing on one side of it
            print(
                f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}"
            )
        elif re.search(r"^.*?\|.*$", part):
            # A "|" left over after splitting is not a valid separator
            print(
                f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}"
            )

    return field
def date(field, field_name): def date(field, field_name):
"""Check if a date is valid. """Check if a date is valid.

View File

@ -47,50 +47,6 @@ def test_check_valid_isbn():
assert result == value assert result == value
def test_check_invalid_separators(capsys):
    """A single "|" between values must be reported as an invalid separator."""

    field_name = "dc.contributor.author"
    value = "Alan|Orth"
    expected = f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n"

    check.separators(value, field_name)

    # Compare everything written to stdout against the expected message
    assert capsys.readouterr().out == expected
def test_check_unnecessary_separators(capsys):
    """A trailing "||" must be reported as an unnecessary separator."""

    field_name = "dc.contributor.author"
    field = "Alan||Orth||"
    expected = f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n"

    check.separators(field, field_name)

    # Compare everything written to stdout against the expected message
    assert capsys.readouterr().out == expected
def test_check_valid_separators():
    """A value joined with "||" must be returned unchanged."""

    value = "Alan||Orth"

    assert check.separators(value, "dc.contributor.author") == value
def test_check_missing_date(capsys): def test_check_missing_date(capsys):
"""Test checking missing date.""" """Test checking missing date."""