1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 13:55:03 +01:00

csv_metadata_quality/app.py: Improve comments

This commit is contained in:
Alan Orth 2019-07-29 16:24:35 +03:00
parent 42920e9c7c
commit d73f7b54b1
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -22,29 +22,29 @@ def main(argv):
df = pd.read_csv(args.input_file, dtype=str) df = pd.read_csv(args.input_file, dtype=str)
for column in df.columns.values.tolist(): for column in df.columns.values.tolist():
# Run whitespace fix on all columns # Fix: whitespace
df[column] = df[column].apply(fix.whitespace) df[column] = df[column].apply(fix.whitespace)
# Run invalid multi-value separator check on all columns # Check: invalid multi-value separator
df[column] = df[column].apply(check.separators) df[column] = df[column].apply(check.separators)
# Run invalid multi-value separator fix on all columns # Fix: invalid multi-value separator
if args.unsafe_fixes: if args.unsafe_fixes:
df[column] = df[column].apply(fix.separators) df[column] = df[column].apply(fix.separators)
# Run whitespace fix again after fixing invalid separators # Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace) df[column] = df[column].apply(fix.whitespace)
# check if column is an issn column like dc.identifier.issn # Check: invalid ISSN
match = re.match(r'^.*?issn.*$', column) match = re.match(r'^.*?issn.*$', column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.issn) df[column] = df[column].apply(check.issn)
# check if column is an isbn column like dc.identifier.isbn # Check: invalid ISBN
match = re.match(r'^.*?isbn.*$', column) match = re.match(r'^.*?isbn.*$', column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.isbn) df[column] = df[column].apply(check.isbn)
# check if column is a date column like dc.date.issued # Check: invalid date
match = re.match(r'^.*?date.*$', column) match = re.match(r'^.*?date.*$', column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.date) df[column] = df[column].apply(check.date)