1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-21 11:42:20 +01:00

csv_metadata_quality/app.py: skip newline fix on description

The description field often contains free-form text, like the abstract,
and has too many legitimate newlines for them to be corrected
automatically.
This commit is contained in:
Alan Orth 2023-04-22 12:16:13 -07:00
parent 1491e1edb0
commit e2d46e9495
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -90,7 +90,9 @@ def run(argv):
continue
if args.unsafe_fixes:
-match = re.match(r"^.*?abstract.*$", column)
+# Skip whitespace and newline fixes on abstracts and descriptions
+# because there are too many with legitimate multi-line metadata.
+match = re.match(r"^.*?(abstract|description).*$", column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)