1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-07-04 21:43:25 +02:00

5 Commits

Author SHA1 Message Date
14010896a5 csv_metadata_quality/experimental.py: Move all imports to top of file
All checks were successful
continuous-integration/drone/push Build is passing
PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports
2021-03-16 16:13:34 +02:00
ab3af2ec62 csv_metadata_quality/check.py: Reformat with black 2021-03-16 16:12:33 +02:00
1aa2084230 CHANGELOG.md: Add note about checks 2021-03-16 16:11:24 +02:00
330a7b7b9c Don't unnecessarily rewrite DataFrames for checks
By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print a
warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.
2021-03-16 16:04:19 +02:00
9a5e3fd6ef README.md: Add TODO about detecting duplicates 2021-03-16 14:03:26 +02:00
6 changed files with 50 additions and 42 deletions

View File

@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### Changed
- Fixing invalid multi-value separators like `|` and `|||` is no longer class- - Fixing invalid multi-value separators like `|` and `|||` is no longer class-
ified as "unsafe" as I have yet to see a case where this was intentional ified as "unsafe" as I have yet to see a case where this was intentional
- Not user visible, but now checks only print a warning to the screen instead
of returning a value and re-writing the DataFrame, which should be faster and
use less memory
### Added ### Added
- Configurable directory for AGROVOC requests cache (to allow running the web - Configurable directory for AGROVOC requests cache (to allow running the web

View File

@ -116,6 +116,10 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Warn if item is Open Access, but missing a license - Warn if item is Open Access, but missing a license
- Warn if item has an ISSN but no journal title - Warn if item has an ISSN but no journal title
- Update journal titles from ISSN - Update journal titles from ISSN
- Check for duplicates
- If I check titles only, then I might miss if one is a Report and another is a Presentation
- I could just check each item against each other item, but that sounds slow...
- Perhaps I could check for the number of unique values in a few columns, like title and doi, and see if it is the same as the total number of items
## License ## License
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html). This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).

View File

@ -105,7 +105,7 @@ def run(argv):
df[column] = df[column].apply(fix.unnecessary_unicode) df[column] = df[column].apply(fix.unnecessary_unicode)
# Check: suspicious characters # Check: suspicious characters
df[column] = df[column].apply(check.suspicious_characters, field_name=column) df[column].apply(check.suspicious_characters, field_name=column)
# Fix: invalid and unnecessary multi-value separators # Fix: invalid and unnecessary multi-value separators
df[column] = df[column].apply(fix.separators, field_name=column) df[column] = df[column].apply(fix.separators, field_name=column)
@ -120,36 +120,36 @@ def run(argv):
# Identify fields the user wants to validate against AGROVOC # Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","): for field in args.agrovoc_fields.split(","):
if column == field: if column == field:
df[column] = df[column].apply(check.agrovoc, field_name=column) df[column].apply(check.agrovoc, field_name=column)
# Check: invalid language # Check: invalid language
match = re.match(r"^.*?language.*$", column) match = re.match(r"^.*?language.*$", column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.language) df[column].apply(check.language)
# Check: invalid ISSN # Check: invalid ISSN
match = re.match(r"^.*?issn.*$", column) match = re.match(r"^.*?issn.*$", column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.issn) df[column].apply(check.issn)
# Check: invalid ISBN # Check: invalid ISBN
match = re.match(r"^.*?isbn.*$", column) match = re.match(r"^.*?isbn.*$", column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.isbn) df[column].apply(check.isbn)
# Check: invalid date # Check: invalid date
match = re.match(r"^.*?(date|dcterms\.issued).*$", column) match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.date, field_name=column) df[column].apply(check.date, field_name=column)
# Check: filename extension # Check: filename extension
if column == "filename": if column == "filename":
df[column] = df[column].apply(check.filename_extension) df[column].apply(check.filename_extension)
# Check: SPDX license identifier # Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column) match = re.match(r"dcterms\.license.*$", column)
if match is not None: if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier) df[column].apply(check.spdx_license_identifier)
## ##
# Perform some checks on rows so we can consider items as a whole rather # Perform some checks on rows so we can consider items as a whole rather

View File

@ -32,7 +32,7 @@ def issn(field):
if not stdnum_issn.is_valid(value): if not stdnum_issn.is_valid(value):
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
return field return
def isbn(field): def isbn(field):
@ -55,7 +55,7 @@ def isbn(field):
if not stdnum_isbn.is_valid(value): if not stdnum_isbn.is_valid(value):
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
return field return
def date(field, field_name): def date(field, field_name):
@ -83,13 +83,13 @@ def date(field, field_name):
f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
) )
return field return
try: try:
# Check if date is valid YYYY format # Check if date is valid YYYY format
datetime.strptime(field, "%Y") datetime.strptime(field, "%Y")
return field return
except ValueError: except ValueError:
pass pass
@ -97,7 +97,7 @@ def date(field, field_name):
# Check if date is valid YYYY-MM format # Check if date is valid YYYY-MM format
datetime.strptime(field, "%Y-%m") datetime.strptime(field, "%Y-%m")
return field return
except ValueError: except ValueError:
pass pass
@ -105,7 +105,7 @@ def date(field, field_name):
# Check if date is valid YYYY-MM-DD format # Check if date is valid YYYY-MM-DD format
datetime.strptime(field, "%Y-%m-%d") datetime.strptime(field, "%Y-%m-%d")
return field return
except ValueError: except ValueError:
pass pass
@ -113,11 +113,11 @@ def date(field, field_name):
# Check if date is valid YYYY-MM-DDTHH:MM:SSZ format # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ") datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
return field return
except ValueError: except ValueError:
print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
return field return
def suspicious_characters(field, field_name): def suspicious_characters(field, field_name):
@ -151,7 +151,7 @@ def suspicious_characters(field, field_name):
suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
print(f"{suspicious_character_msg:1.80}") print(f"{suspicious_character_msg:1.80}")
return field return
def language(field): def language(field):
@ -184,7 +184,7 @@ def language(field):
else: else:
print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
return field return
def agrovoc(field, field_name): def agrovoc(field, field_name):
@ -211,7 +211,9 @@ def agrovoc(field, field_name):
# running in an environment where we can't write to the current working di- # running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web). # rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after) requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries # prune old cache entries
requests_cache.core.remove_expired_responses() requests_cache.core.remove_expired_responses()
@ -230,7 +232,7 @@ def agrovoc(field, field_name):
if len(data["results"]) == 0: if len(data["results"]) == 0:
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
return field return
def filename_extension(field): def filename_extension(field):
@ -281,7 +283,7 @@ def filename_extension(field):
if filename_extension_match is False: if filename_extension_match is False:
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field return
def spdx_license_identifier(field): def spdx_license_identifier(field):
@ -301,4 +303,4 @@ def spdx_license_identifier(field):
pass pass
return field return

View File

@ -1,5 +1,9 @@
import re
import langid
import pandas as pd import pandas as pd
from colorama import Fore from colorama import Fore
from pycountry import languages
def correct_language(row): def correct_language(row):
@ -11,11 +15,6 @@ def correct_language(row):
language and returns the value in the language field if it does match. language and returns the value in the language field if it does match.
""" """
import re
import langid
from pycountry import languages
# Initialize some variables at global scope so that we can set them in the # Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards. # loop scope below and still be able to access them afterwards.
language = "" language = ""
@ -94,4 +93,4 @@ def correct_language(row):
) )
else: else:
return language return

View File

@ -23,7 +23,7 @@ def test_check_valid_issn():
result = check.issn(value) result = check.issn(value)
assert result == value assert result == None
def test_check_invalid_isbn(capsys): def test_check_invalid_isbn(capsys):
@ -44,7 +44,7 @@ def test_check_valid_isbn():
result = check.isbn(value) result = check.isbn(value)
assert result == value assert result == None
def test_check_missing_date(capsys): def test_check_missing_date(capsys):
@ -100,7 +100,7 @@ def test_check_valid_date():
result = check.date(value, field_name) result = check.date(value, field_name)
assert result == value assert result == None
def test_check_suspicious_characters(capsys): def test_check_suspicious_characters(capsys):
@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language():
result = check.language(value) result = check.language(value)
assert result == value assert result == None
def test_check_valid_iso639_3_language(): def test_check_valid_iso639_3_language():
@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language():
result = check.language(value) result = check.language(value)
assert result == value assert result == None
def test_check_invalid_iso639_1_language(capsys): def test_check_invalid_iso639_1_language(capsys):
@ -199,7 +199,7 @@ def test_check_valid_agrovoc():
result = check.agrovoc(value, field_name) result = check.agrovoc(value, field_name)
assert result == value assert result == None
def test_check_uncommon_filename_extension(capsys): def test_check_uncommon_filename_extension(capsys):
@ -223,7 +223,7 @@ def test_check_common_filename_extension():
result = check.filename_extension(value) result = check.filename_extension(value)
assert result == value assert result == None
def test_check_incorrect_iso_639_1_language(capsys): def test_check_incorrect_iso_639_1_language(capsys):
@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language():
result = experimental.correct_language(series) result = experimental.correct_language(series)
assert result == language assert result == None
def test_check_correct_iso_639_3_language(): def test_check_correct_iso_639_3_language():
@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language():
result = experimental.correct_language(series) result = experimental.correct_language(series)
assert result == language assert result == None
def test_check_valid_spdx_license_identifier(): def test_check_valid_spdx_license_identifier():
@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier():
result = check.spdx_license_identifier(license) result = check.spdx_license_identifier(license)
assert result == license assert result == None
def test_check_invalid_spdx_license_identifier(capsys): def test_check_invalid_spdx_license_identifier(capsys):