1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-21 11:42:20 +01:00

csv_metadata_quality/app.py: separators fix

Don't run the invalid separators fix on title fields because some
items use "|" in the title to indicate something like a subtitle.

For example:

    Progress Review and Work Planning Meeting | Day 1

This commit is contained in:
Alan Orth 2023-01-24 14:13:55 +03:00
parent 5abd32a41f
commit 3596381d03
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -121,10 +121,14 @@ def run(argv):
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)
# Fix: invalid and unnecessary multi-value separators
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: invalid and unnecessary multi-value separators. Skip the title
# field because sometimes "|" is used to indicate something like a
# subtitle.
match = re.match(r"^.*?title.*$", column)
if match is None:
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column)