1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-21 19:51:11 +02:00

Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
variable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.
This commit is contained in:
2021-03-16 16:04:19 +02:00
parent 9a5e3fd6ef
commit 330a7b7b9c
4 changed files with 32 additions and 32 deletions

@ -105,7 +105,7 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid and unnecessary multi-value separators
df[column] = df[column].apply(fix.separators, field_name=column)
@ -120,36 +120,36 @@ def run(argv):
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","):
if column == field:
df[column] = df[column].apply(check.agrovoc, field_name=column)
df[column].apply(check.agrovoc, field_name=column)
# Check: invalid language
match = re.match(r"^.*?language.*$", column)
if match is not None:
df[column] = df[column].apply(check.language)
df[column].apply(check.language)
# Check: invalid ISSN
match = re.match(r"^.*?issn.*$", column)
if match is not None:
df[column] = df[column].apply(check.issn)
df[column].apply(check.issn)
# Check: invalid ISBN
match = re.match(r"^.*?isbn.*$", column)
if match is not None:
df[column] = df[column].apply(check.isbn)
df[column].apply(check.isbn)
# Check: invalid date
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
if match is not None:
df[column] = df[column].apply(check.date, field_name=column)
df[column].apply(check.date, field_name=column)
# Check: filename extension
if column == "filename":
df[column] = df[column].apply(check.filename_extension)
df[column].apply(check.filename_extension)
# Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column)
if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier)
df[column].apply(check.spdx_license_identifier)
##
# Perform some checks on rows so we can consider items as a whole rather