mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Add fix for missing space after commas
This happens in names very often, for example in the contributor and citation fields. I will limit this to those fields for now and hide this fix behind the "unsafe fixes" option until I test it more.
This commit is contained in:
parent
2af714fb05
commit
81190d56bb
@ -55,6 +55,13 @@ def run(argv):
|
|||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
df[column] = df[column].apply(fix.newlines)
|
df[column] = df[column].apply(fix.newlines)
|
||||||
|
|
||||||
|
# Fix: missing space after comma. Only run on author and citation
|
||||||
|
# fields for now, as this problem is mostly an issue in names.
|
||||||
|
if args.unsafe_fixes:
|
||||||
|
match = re.match(r'^.*?(author|citation).*$', column)
|
||||||
|
if match is not None:
|
||||||
|
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||||
|
|
||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
|
@ -176,3 +176,27 @@ def newlines(field):
|
|||||||
field = field.replace('\n', '')
|
field = field.replace('\n', '')
|
||||||
|
|
||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
|
def comma_space(field, field_name):
    """Fix occurrences of commas missing a trailing space, for example:

    Orth,Alan S.

    This is a very common mistake in author and citation fields.

    Note that the pattern matches a comma followed by any word character,
    which includes digits and underscores, so a value like "1,000" is also
    rewritten to "1, 000".

    Args:
        field: The cell value to check. Missing values (as detected by
            ``pd.isna``) are skipped.
        field_name: Name of the column the value came from; only used in
            the message printed when a fix is applied.

    Returns:
        The string with a space added after every comma that was directly
        followed by a word character, or ``None`` if the value is missing.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Insert the missing space and count the replacements in a single pass,
    # instead of scanning once to detect and a second time to fix.
    fixed, replacements = re.subn(r',(\w)', r', \1', field)

    if replacements:
        # Report the original (unfixed) value so the change can be reviewed.
        print(f'Adding space after comma ({field_name}): {field}')

    return fixed
|
||||||
|
Loading…
Reference in New Issue
Block a user