mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
Add check for invalid multi-value separators
This commit is contained in:
parent
02f9d8a736
commit
aaf3537ba4
@ -14,6 +14,9 @@ def run():
|
||||
|
||||
df[column] = df[column].apply(fix.whitespace)
|
||||
|
||||
# Run invalid multi-value separator check on all columns
|
||||
df[column] = df[column].apply(check.separators)
|
||||
|
||||
if column == 'dc.identifier.issn':
|
||||
df[column] = df[column].apply(check.issn)
|
||||
|
||||
|
@ -44,3 +44,25 @@ def isbn(field):
|
||||
|
||||
if not isbn.is_valid(value):
|
||||
print(f'Invalid ISBN: {value}')
|
||||
|
||||
|
||||
def separators(field):
    """Check for invalid multi-value separators (ie "|" or "|||").

    Splits the field on the valid "||" separator and prints the whole field
    once for each resulting value that still contains a stray "|".

    Args:
        field: A single cell value. Missing values (as reported by
            ``pd.isna``) are skipped without being inspected.

    Returns:
        The field, unchanged, so that the function can be passed to
        ``Series.apply`` and the result assigned back to the column without
        replacing its contents with ``None``.
    """

    # Skip fields with missing values, handing them back untouched
    if pd.isna(field):
        return field

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        # After splitting, any remaining "|" is an invalid separator. A plain
        # substring test also catches a "|" that follows an embedded newline,
        # which a "."-based regular expression would fail to match.
        if '|' in value:
            print(f'Invalid multi-value separator: {field}')

    return field
|
||||
|
Loading…
Reference in New Issue
Block a user