From 330a7b7b9c5cbbc5b4d981d01a4e72f76757dee8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 16 Mar 2021 16:04:19 +0200 Subject: [PATCH] Don't unnecessarily rewrite DataFrames for checks By using df[column] = df[column].apply(check...) we were re-writing the DataFrame every time we returned from a check. We don't actuall y need to return a value at all, as the point of checks is to print a warning to the screen. In Python a "return" statement without a v ariable returns None. I haven't measured the impact of this, but I assume it will mean we are faster and use less memory. --- csv_metadata_quality/app.py | 16 ++++++++-------- csv_metadata_quality/check.py | 26 +++++++++++++------------- csv_metadata_quality/experimental.py | 2 +- tests/test_check.py | 20 ++++++++++---------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 263ad7f..9e81de6 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -105,7 +105,7 @@ def run(argv): df[column] = df[column].apply(fix.unnecessary_unicode) # Check: suspicious characters - df[column] = df[column].apply(check.suspicious_characters, field_name=column) + df[column].apply(check.suspicious_characters, field_name=column) # Fix: invalid and unnecessary multi-value separators df[column] = df[column].apply(fix.separators, field_name=column) @@ -120,36 +120,36 @@ def run(argv): # Identify fields the user wants to validate against AGROVOC for field in args.agrovoc_fields.split(","): if column == field: - df[column] = df[column].apply(check.agrovoc, field_name=column) + df[column].apply(check.agrovoc, field_name=column) # Check: invalid language match = re.match(r"^.*?language.*$", column) if match is not None: - df[column] = df[column].apply(check.language) + df[column].apply(check.language) # Check: invalid ISSN match = re.match(r"^.*?issn.*$", column) if match is not None: - df[column] = df[column].apply(check.issn) + df[column].apply(check.issn) # Check: invalid ISBN match = re.match(r"^.*?isbn.*$", column) if match is not None: - df[column] = df[column].apply(check.isbn) + df[column].apply(check.isbn) # Check: invalid date match = re.match(r"^.*?(date|dcterms\.issued).*$", column) if match is not None: - df[column] = df[column].apply(check.date, field_name=column) + df[column].apply(check.date, field_name=column) # Check: filename extension if column == "filename": - df[column] = df[column].apply(check.filename_extension) + df[column].apply(check.filename_extension) # Check: SPDX license identifier match = re.match(r"dcterms\.license.*$", column) if match is not None: - df[column] = df[column].apply(check.spdx_license_identifier) + df[column].apply(check.spdx_license_identifier) ## # Perform some checks on rows so we can consider items as a whole rather diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 89b261c..0010edb 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -32,7 +32,7 @@ def issn(field): if not stdnum_issn.is_valid(value): print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") - return field + return def isbn(field): @@ -55,7 +55,7 @@ def isbn(field): if not stdnum_isbn.is_valid(value): print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") - return field + return def date(field, field_name): @@ -83,13 +83,13 @@ def date(field, field_name): f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" ) - return field + return try: # Check if date is valid YYYY format datetime.strptime(field, "%Y") - return field + return except ValueError: pass @@ -97,7 +97,7 @@ def date(field, field_name): # Check if date is valid YYYY-MM format datetime.strptime(field, "%Y-%m") - return field + return except ValueError: pass @@ -105,7 +105,7 @@ def date(field, field_name): # Check if date is valid YYYY-MM-DD format datetime.strptime(field, "%Y-%m-%d") - return field + return except ValueError: pass @@ -113,11 +113,11 @@ def date(field, field_name): # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ") - return field + return except ValueError: print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") - return field + return def suspicious_characters(field, field_name): @@ -151,7 +151,7 @@ def suspicious_characters(field, field_name): suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" print(f"{suspicious_character_msg:1.80}") - return field + return def language(field): @@ -184,7 +184,7 @@ def language(field): else: print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") - return field + return def agrovoc(field, field_name): @@ -230,7 +230,7 @@ def agrovoc(field, field_name): if len(data["results"]) == 0: print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") - return field + return def filename_extension(field): @@ -281,7 +281,7 @@ def filename_extension(field): if filename_extension_match is False: print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") - return field + return def spdx_license_identifier(field): @@ -301,4 +301,4 @@ def spdx_license_identifier(field): pass - return field + return diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py index ba45273..2633899 100644 --- a/csv_metadata_quality/experimental.py +++ b/csv_metadata_quality/experimental.py @@ -94,4 +94,4 @@ def correct_language(row): ) else: - return language + return diff --git a/tests/test_check.py b/tests/test_check.py index db20485..0e3c4b9 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -23,7 +23,7 @@ def test_check_valid_issn(): result = check.issn(value) - assert result == value + assert result == None def test_check_invalid_isbn(capsys): @@ -44,7 +44,7 @@ def test_check_valid_isbn(): result = check.isbn(value) - assert result == value + assert result == None def test_check_missing_date(capsys): @@ -100,7 +100,7 @@ def test_check_valid_date(): result = check.date(value, field_name) - assert result == value + assert result == None def test_check_suspicious_characters(capsys): @@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language(): result = check.language(value) - assert result == value + assert result == None def test_check_valid_iso639_3_language(): @@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language(): result = check.language(value) - assert result == value + assert result == None def test_check_invalid_iso639_1_language(capsys): @@ -199,7 +199,7 @@ def test_check_valid_agrovoc(): result = check.agrovoc(value, field_name) - assert result == value + assert result == None def test_check_uncommon_filename_extension(capsys): @@ -223,7 +223,7 @@ def test_check_common_filename_extension(): result = check.filename_extension(value) - assert result == value + assert result == None def test_check_incorrect_iso_639_1_language(capsys): @@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language(): result = experimental.correct_language(series) - assert result == language + assert result == None def test_check_correct_iso_639_3_language(): @@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language(): result = experimental.correct_language(series) - assert result == language + assert result == None def test_check_valid_spdx_license_identifier(): @@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier(): result = check.spdx_license_identifier(license) - assert result == license + assert result == None def test_check_invalid_spdx_license_identifier(capsys):