mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-17 19:47:03 +01:00
csv_metadata_quality/app.py: separators fix
Don't run the invalid separators fix on title fields, because some items use "|" in the title to indicate something like a subtitle. For example: "Progress Review and Work Planning Meeting | Day 1".
This commit is contained in:
parent
5abd32a41f
commit
3596381d03
@@ -121,10 +121,14 @@ def run(argv):
|
|||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
# field because sometimes "|" is used to indicate something like a
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# subtitle.
|
||||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
match = re.match(r"^.*?title.*$", column)
|
||||||
|
if match is None:
|
||||||
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
|
# Run whitespace fix again after fixing invalid separators
|
||||||
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
|
|
||||||
# Fix: duplicate metadata values
|
# Fix: duplicate metadata values
|
||||||
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
||||||
|
Loading…
Reference in New Issue
Block a user