From 3596381d03a7e9fb50ebbb077066eb796850c907 Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Tue, 24 Jan 2023 14:13:55 +0300
Subject: [PATCH] csv_metadata_quality/app.py: separators fix

Don't run the invalid separators fix on title fields because some
items use "|" in the title to indicate something like a subtitle. For
example:

    Progress Review and Work Planning Meeting | Day 1
---
 csv_metadata_quality/app.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 0bcc61f..0278c59 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -121,10 +121,14 @@ def run(argv):
         # Fix: unnecessary Unicode
         df[column] = df[column].apply(fix.unnecessary_unicode)
 
-        # Fix: invalid and unnecessary multi-value separators
-        df[column] = df[column].apply(fix.separators, field_name=column)
-        # Run whitespace fix again after fixing invalid separators
-        df[column] = df[column].apply(fix.whitespace, field_name=column)
+        # Fix: invalid and unnecessary multi-value separators. Skip the title
+        # field because sometimes "|" is used to indicate something like a
+        # subtitle.
+        match = re.match(r"^.*?title.*$", column)
+        if match is None:
+            df[column] = df[column].apply(fix.separators, field_name=column)
+            # Run whitespace fix again after fixing invalid separators
+            df[column] = df[column].apply(fix.whitespace, field_name=column)
 
         # Fix: duplicate metadata values
         df[column] = df[column].apply(fix.duplicates, field_name=column)