1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-25 07:10:17 +01:00

csv_metadata_quality/app.py: Improve comments

This commit is contained in:
Alan Orth 2019-07-29 16:24:35 +03:00
parent 42920e9c7c
commit d73f7b54b1
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -22,29 +22,29 @@ def main(argv):
df = pd.read_csv(args.input_file, dtype=str)
for column in df.columns.values.tolist():
# Run whitespace fix on all columns
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace)
# Run invalid multi-value separator check on all columns
# Check: invalid multi-value separator
df[column] = df[column].apply(check.separators)
# Run invalid multi-value separator fix on all columns
# Fix: invalid multi-value separator
if args.unsafe_fixes:
df[column] = df[column].apply(fix.separators)
# Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace)
# check if column is an issn column like dc.identifier.issn
# Check: invalid ISSN
match = re.match(r'^.*?issn.*$', column)
if match is not None:
df[column] = df[column].apply(check.issn)
# check if column is an isbn column like dc.identifier.isbn
# Check: invalid ISBN
match = re.match(r'^.*?isbn.*$', column)
if match is not None:
df[column] = df[column].apply(check.isbn)
# check if column is a date column like dc.date.issued
# Check: invalid date
match = re.match(r'^.*?date.*$', column)
if match is not None:
df[column] = df[column].apply(check.date)