From aaf3537ba4ea0dbb4e2729f11491f2162ccc8671 Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Fri, 26 Jul 2019 23:48:24 +0300
Subject: [PATCH] Add check for invalid multi-value separators

---
 csv_metadata_quality/app.py   |  3 +++
 csv_metadata_quality/check.py | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 3951528..63f36cd 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -14,6 +14,9 @@ def run():
 
         df[column] = df[column].apply(fix.whitespace)
 
+        # Run invalid multi-value separator check on all columns
+        df[column] = df[column].apply(check.separators)
+
         if column == 'dc.identifier.issn':
             df[column] = df[column].apply(check.issn)
 
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index 506fdb4..0568a50 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -44,3 +44,24 @@ def isbn(field):
 
         if not isbn.is_valid(value):
             print(f'Invalid ISBN: {value}')
+
+
+def separators(field):
+    """Check for invalid multi-value separators (ie "|" or "|||").
+
+    Prints the field with the invalid multi-value separator.
+
+    Returns the field unchanged, because app.py assigns the result of
+    ``Series.apply`` back to the column (returning None would erase it).
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return field
+
+    # After splitting on the valid "||" separator, any "|" still left in a
+    # value means an invalid separator was used. Report each field once.
+    if any('|' in value for value in field.split('||')):
+        print(f'Invalid multi-value separator: {field}')
+
+    return field