# Origin: patch 81190d56bb4ff39a81fb4f0995626ff52901ac55 by Alan Orth
# (Wed, 28 Aug 2019 00:05:52 +0300), "Add fix for missing space after commas".
#
# Commit message: This happens in names very often, for example in the
# contributor and citation fields. I will limit this to those fields for now
# and hide this fix behind the "unsafe fixes" option until I test it more.
#
# The patch touches two files (31 insertions):
#
#   * csv_metadata_quality/app.py (+7): inside run(argv), after the
#     fix.newlines step and before the fix.unnecessary_unicode step, it adds:
#
#         # Fix: missing space after comma. Only run on author and citation
#         # fields for now, as this problem is mostly an issue in names.
#         if args.unsafe_fixes:
#             match = re.match(r'^.*?(author|citation).*$', column)
#             if match is not None:
#                 df[column] = df[column].apply(fix.comma_space, field_name=column)
#
#   * csv_metadata_quality/fix.py (+24): appends comma_space() (below) after
#     the existing newlines() function.
#
# NOTE(review): comma_space() relies on `re` being imported by the module it
# lives in; the patch itself adds no import lines.


def comma_space(field, field_name):
    """Fix occurrences of commas missing a trailing space, for example:

    Orth,Alan S.

    This is a very common mistake in author and citation fields.

    Args:
        field: The cell value to clean. Anything that is not a string
            (missing values such as None/NaN, numbers, ...) is returned
            unchanged.
        field_name: Name of the column the value comes from; only used in
            the message printed when a fix is applied.

    Returns:
        The string with a space inserted after every comma that is directly
        followed by a word character, or the original value untouched when
        there is nothing to fix.
    """

    # Skip fields with missing values. None and NaN are not strings, and
    # neither are numeric cells; hand them back as-is so the cell keeps its
    # original value instead of being replaced by None or raising TypeError
    # inside the regex engine.
    if not isinstance(field, str):
        return field

    # Insert the space and count the replacements in a single pass. A comma
    # already followed by whitespace (or at the end of the string) does not
    # match, so correctly formatted values are left alone.
    fixed, replacements = re.subn(r',(\w)', r', \1', field)

    if replacements:
        # Report the original (unfixed) value.
        print(f'Adding space after comma ({field_name}): {field}')

    return fixed