1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-17 19:47:03 +01:00

csv_metadata_quality/app.py: disable whitespace fixes on abstracts

The whitespace and newline fixes are too aggressive on abstracts. If
people paste in text from a PDF there are often newlines, and most of
the time they want to keep them.
This commit is contained in:
Alan Orth 2023-02-07 16:48:40 +03:00
parent d5afbad788
commit 545bb8cd0c
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -90,12 +90,14 @@ def run(argv):
continue
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines
if args.unsafe_fixes:
df[column] = df[column].apply(fix.newlines, field_name=column)
match = re.match(r"^.*?abstract.*$", column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)
# Fix: newlines
df[column] = df[column].apply(fix.newlines, field_name=column)
# Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names.