def separators(field):
    """Check a metadata field for invalid multi-value separators.

    Multiple values are expected to be joined with "||". A field that still
    contains a "|" after being split on "||" (ie "|" or "|||") is reported
    on stdout, once per field.

    The caller in app.py assigns the result back to the column
    (``df[column] = df[column].apply(check.separators)``), so the field is
    always returned unchanged; returning nothing would overwrite every cell
    in the column with None.

    Args:
        field: a single cell value. Missing values (None/NaN) and any other
            non-string values are skipped.

    Returns:
        The field exactly as it was passed in.
    """

    # Missing values (None/NaN) are not strings, and neither are numeric
    # cells; none of them can hold a separator, so hand them straight back.
    if not isinstance(field, str):
        return field

    # Split on the valid "||" separator; any "|" left over in a value means
    # the field used "|" or "|||" somewhere. A plain substring test is used
    # so that values containing newlines are still caught.
    if any('|' in value for value in field.split('||')):
        print(f'Invalid multi-value separator: {field}')

    return field