1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-17 19:47:03 +01:00

Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.
This commit is contained in:
Alan Orth 2021-03-16 16:04:19 +02:00
parent 9a5e3fd6ef
commit 330a7b7b9c
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 32 additions and 32 deletions

View File

@ -105,7 +105,7 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid and unnecessary multi-value separators
df[column] = df[column].apply(fix.separators, field_name=column)
@ -120,36 +120,36 @@ def run(argv):
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","):
if column == field:
df[column] = df[column].apply(check.agrovoc, field_name=column)
df[column].apply(check.agrovoc, field_name=column)
# Check: invalid language
match = re.match(r"^.*?language.*$", column)
if match is not None:
df[column] = df[column].apply(check.language)
df[column].apply(check.language)
# Check: invalid ISSN
match = re.match(r"^.*?issn.*$", column)
if match is not None:
df[column] = df[column].apply(check.issn)
df[column].apply(check.issn)
# Check: invalid ISBN
match = re.match(r"^.*?isbn.*$", column)
if match is not None:
df[column] = df[column].apply(check.isbn)
df[column].apply(check.isbn)
# Check: invalid date
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
if match is not None:
df[column] = df[column].apply(check.date, field_name=column)
df[column].apply(check.date, field_name=column)
# Check: filename extension
if column == "filename":
df[column] = df[column].apply(check.filename_extension)
df[column].apply(check.filename_extension)
# Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column)
if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier)
df[column].apply(check.spdx_license_identifier)
##
# Perform some checks on rows so we can consider items as a whole rather

View File

@ -32,7 +32,7 @@ def issn(field):
if not stdnum_issn.is_valid(value):
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
return field
return
def isbn(field):
@ -55,7 +55,7 @@ def isbn(field):
if not stdnum_isbn.is_valid(value):
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
return field
return
def date(field, field_name):
@ -83,13 +83,13 @@ def date(field, field_name):
f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
)
return field
return
try:
# Check if date is valid YYYY format
datetime.strptime(field, "%Y")
return field
return
except ValueError:
pass
@ -97,7 +97,7 @@ def date(field, field_name):
# Check if date is valid YYYY-MM format
datetime.strptime(field, "%Y-%m")
return field
return
except ValueError:
pass
@ -105,7 +105,7 @@ def date(field, field_name):
# Check if date is valid YYYY-MM-DD format
datetime.strptime(field, "%Y-%m-%d")
return field
return
except ValueError:
pass
@ -113,11 +113,11 @@ def date(field, field_name):
# Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
return field
return
except ValueError:
print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
return field
return
def suspicious_characters(field, field_name):
@ -151,7 +151,7 @@ def suspicious_characters(field, field_name):
suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
print(f"{suspicious_character_msg:1.80}")
return field
return
def language(field):
@ -184,7 +184,7 @@ def language(field):
else:
print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
return field
return
def agrovoc(field, field_name):
@ -230,7 +230,7 @@ def agrovoc(field, field_name):
if len(data["results"]) == 0:
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
return field
return
def filename_extension(field):
@ -281,7 +281,7 @@ def filename_extension(field):
if filename_extension_match is False:
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field
return
def spdx_license_identifier(field):
@ -301,4 +301,4 @@ def spdx_license_identifier(field):
pass
return field
return

View File

@ -94,4 +94,4 @@ def correct_language(row):
)
else:
return language
return

View File

@ -23,7 +23,7 @@ def test_check_valid_issn():
result = check.issn(value)
assert result == value
assert result == None
def test_check_invalid_isbn(capsys):
@ -44,7 +44,7 @@ def test_check_valid_isbn():
result = check.isbn(value)
assert result == value
assert result == None
def test_check_missing_date(capsys):
@ -100,7 +100,7 @@ def test_check_valid_date():
result = check.date(value, field_name)
assert result == value
assert result == None
def test_check_suspicious_characters(capsys):
@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language():
result = check.language(value)
assert result == value
assert result == None
def test_check_valid_iso639_3_language():
@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language():
result = check.language(value)
assert result == value
assert result == None
def test_check_invalid_iso639_1_language(capsys):
@ -199,7 +199,7 @@ def test_check_valid_agrovoc():
result = check.agrovoc(value, field_name)
assert result == value
assert result == None
def test_check_uncommon_filename_extension(capsys):
@ -223,7 +223,7 @@ def test_check_common_filename_extension():
result = check.filename_extension(value)
assert result == value
assert result == None
def test_check_incorrect_iso_639_1_language(capsys):
@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language():
result = experimental.correct_language(series)
assert result == language
assert result == None
def test_check_correct_iso_639_3_language():
@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language():
result = experimental.correct_language(series)
assert result == language
assert result == None
def test_check_valid_spdx_license_identifier():
@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier():
result = check.spdx_license_identifier(license)
assert result == license
assert result == None
def test_check_invalid_spdx_license_identifier(capsys):