1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-06-09 13:25:08 +02:00

Add fix for missing space after commas

This happens in names very often, for example in the contributor
and citation fields. I will limit this to those fields for now and
hide this fix behind the "unsafe fixes" option until I test it more.
This commit is contained in:
Alan Orth 2019-08-28 00:05:52 +03:00
parent 2af714fb05
commit 81190d56bb
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 31 additions and 0 deletions

View File

@ -55,6 +55,13 @@ def run(argv):
if args.unsafe_fixes:
df[column] = df[column].apply(fix.newlines)
# Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names.
if args.unsafe_fixes:
match = re.match(r'^.*?(author|citation).*$', column)
if match is not None:
df[column] = df[column].apply(fix.comma_space, field_name=column)
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)

View File

@ -176,3 +176,27 @@ def newlines(field):
field = field.replace('\n', '')
return field
def comma_space(field, field_name):
    """Add a missing space after commas that directly precede a word character.

    For example, "Orth,Alan S." becomes "Orth, Alan S.". A comma that is
    already followed by whitespace, or that ends the string, is left alone.

    Args:
        field: the value to fix. Missing values (as judged by ``pd.isna``)
            are skipped.
        field_name: name of the field the value came from; only used in the
            message printed when a fix is applied.

    Returns:
        The string with a space inserted after each affected comma (the
        input unchanged when nothing matched), or None for missing values.
    """
    # Skip fields with missing values
    if pd.isna(field):
        return None

    # A single substitution pass both applies the fix and reports how many
    # commas were affected, so no separate search of the text is needed.
    fixed, count = re.subn(r',(\w)', r', \1', field)

    if count:
        # Report the value as it was before the fix.
        print(f'Adding space after comma ({field_name}): {field}')

    return fixed