1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-06-09 13:25:08 +02:00

Add check for invalid multi-value separators

This commit is contained in:
Alan Orth 2019-07-26 23:48:24 +03:00
parent 02f9d8a736
commit aaf3537ba4
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 25 additions and 0 deletions

View File

@ -14,6 +14,9 @@ def run():
df[column] = df[column].apply(fix.whitespace)
# Run invalid multi-value separator check on all columns
df[column] = df[column].apply(check.separators)
if column == 'dc.identifier.issn':
df[column] = df[column].apply(check.issn)

View File

@ -44,3 +44,25 @@ def isbn(field):
if not isbn.is_valid(value):
print(f'Invalid ISBN: {value}')
def separators(field):
    """Check for invalid multi-value separators (ie "|" or "|||").

    The field is split on the "||" separator; if any of the resulting pieces
    still contains a "|" character, the whole field is printed as having an
    invalid multi-value separator (once per field).

    Returns the field unchanged, so the function can be passed to
    ``Series.apply`` and the result assigned back to the column without
    discarding the column's values. Missing values are returned as-is.
    """
    # Skip fields with missing values, handing them back untouched
    if pd.isna(field):
        return field

    # After splitting on "||", any remaining "|" means an invalid separator
    # was used. A substring test (rather than a regex using ".") also catches
    # a stray "|" that appears after a newline inside a value.
    if any('|' in value for value in field.split('||')):
        print(f'Invalid multi-value separator: {field}')

    return field