mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-25 07:10:17 +01:00
Add check for invalid multi-value separators
This commit is contained in:
parent
02f9d8a736
commit
aaf3537ba4
@ -14,6 +14,9 @@ def run():
|
|||||||
|
|
||||||
df[column] = df[column].apply(fix.whitespace)
|
df[column] = df[column].apply(fix.whitespace)
|
||||||
|
|
||||||
|
# Run invalid multi-value separator check on all columns
|
||||||
|
df[column] = df[column].apply(check.separators)
|
||||||
|
|
||||||
if column == 'dc.identifier.issn':
|
if column == 'dc.identifier.issn':
|
||||||
df[column] = df[column].apply(check.issn)
|
df[column] = df[column].apply(check.issn)
|
||||||
|
|
||||||
|
@ -44,3 +44,25 @@ def isbn(field):
|
|||||||
|
|
||||||
if not isbn.is_valid(value):
|
if not isbn.is_valid(value):
|
||||||
print(f'Invalid ISBN: {value}')
|
print(f'Invalid ISBN: {value}')
|
||||||
|
|
||||||
|
|
||||||
|
def separators(field):
    """Check for invalid multi-value separators (ie "|" or "|||").

    Multiple values in one field are expected to be joined with "||". After
    splitting on that separator, any "|" left over in a value means the field
    used a malformed separator.

    Prints the field with the invalid multi-value separator (once per field,
    no matter how many bad separators it contains).

    Returns the field unchanged, so the function is safe to use with
    ``df[column] = df[column].apply(check.separators)``.
    """

    # Skip fields with missing values, handing the missing value back so
    # that assigning the result of apply() does not alter the column.
    if pd.isna(field):
        return field

    # Non-string cells (e.g. numbers in a numeric column) cannot contain a
    # separator and have no split(); pass them through untouched.
    if not isinstance(field, str):
        return field

    # Split the multi-value field on "||" and see if any value still has a
    # "|" character. A plain substring test also catches a stray pipe that
    # comes after an embedded newline.
    if any("|" in value for value in field.split("||")):
        print(f'Invalid multi-value separator: {field}')

    return field
|
||||||
|
Loading…
Reference in New Issue
Block a user