1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 04:02:19 +01:00
csv-metadata-quality/tests/test_check.py

505 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# SPDX-License-Identifier: GPL-3.0-only
import pandas as pd
from colorama import Fore
import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental
def test_check_invalid_issn(capsys):
"""Test checking invalid ISSN."""
value = "2321-2302"
check.issn(value)
captured = capsys.readouterr()
assert captured.out == f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}\n"
def test_check_valid_issn():
"""Test checking valid ISSN."""
value = "0024-9319"
result = check.issn(value)
assert result == None
def test_check_invalid_isbn(capsys):
"""Test checking invalid ISBN."""
value = "99921-58-10-6"
check.isbn(value)
captured = capsys.readouterr()
assert captured.out == f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}\n"
def test_check_valid_isbn():
"""Test checking valid ISBN."""
value = "99921-58-10-7"
result = check.isbn(value)
assert result == None
def test_check_missing_date(capsys):
"""Test checking missing date."""
value = None
field_name = "dc.date.issued"
check.date(value, field_name)
captured = capsys.readouterr()
assert captured.out == f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}\n"
def test_check_multiple_dates(capsys):
"""Test checking multiple dates."""
value = "1990||1991"
field_name = "dc.date.issued"
check.date(value, field_name)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_invalid_date(capsys):
"""Test checking invalid ISO8601 date."""
value = "1990-0"
field_name = "dc.date.issued"
check.date(value, field_name)
captured = capsys.readouterr()
assert (
captured.out == f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_valid_date():
"""Test checking valid ISO8601 date."""
value = "1990"
field_name = "dc.date.issued"
result = check.date(value, field_name)
assert result == None
def test_check_suspicious_characters(capsys):
"""Test checking for suspicious characters."""
value = "foreˆt"
field_name = "dc.contributor.author"
check.suspicious_characters(value, field_name)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}ˆt\n"
)
def test_check_valid_iso639_1_language():
"""Test valid ISO 639-1 (alpha 2) language."""
value = "ja"
result = check.language(value)
assert result == None
def test_check_valid_iso639_3_language():
"""Test valid ISO 639-3 (alpha 3) language."""
value = "eng"
result = check.language(value)
assert result == None
def test_check_invalid_iso639_1_language(capsys):
"""Test invalid ISO 639-1 (alpha 2) language."""
value = "jp"
check.language(value)
captured = capsys.readouterr()
assert (
captured.out == f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}\n"
)
def test_check_invalid_iso639_3_language(capsys):
"""Test invalid ISO 639-3 (alpha 3) language."""
value = "chi"
check.language(value)
captured = capsys.readouterr()
assert (
captured.out == f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}\n"
)
def test_check_invalid_language(capsys):
"""Test invalid language."""
value = "Span"
check.language(value)
captured = capsys.readouterr()
assert captured.out == f"{Fore.RED}Invalid language: {Fore.RESET}{value}\n"
def test_check_invalid_agrovoc(capsys):
"""Test invalid AGROVOC subject. Invalid values *will not* be dropped."""
valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
field_name = "dcterms.subject"
drop = False
new_value = check.agrovoc(value, field_name, drop)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
)
assert new_value == value
def test_check_invalid_agrovoc_dropped(capsys):
"""Test invalid AGROVOC subjects. Invalid values *will* be dropped."""
valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
field_name = "dcterms.subject"
drop = True
new_value = check.agrovoc(value, field_name, drop)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
)
assert new_value == valid_agrovoc
def test_check_valid_agrovoc():
"""Test valid AGROVOC subject."""
value = "FORESTS"
field_name = "dcterms.subject"
drop = False
result = check.agrovoc(value, field_name, drop)
assert result == "FORESTS"
def test_check_uncommon_filename_extension(capsys):
"""Test uncommon filename extension."""
value = "file.pdf.lck"
check.filename_extension(value)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}\n"
)
def test_check_common_filename_extension():
"""Test common filename extension."""
value = "file.pdf"
result = check.filename_extension(value)
assert result == None
def test_check_incorrect_iso_639_1_language(capsys):
"""Test incorrect ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "es"
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
experimental.correct_language(series)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Possibly incorrect language {language} (detected en): {Fore.RESET}{title}\n"
)
def test_check_incorrect_iso_639_3_language(capsys):
"""Test incorrect ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "spa"
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
experimental.correct_language(series)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Possibly incorrect language {language} (detected eng): {Fore.RESET}{title}\n"
)
def test_check_correct_iso_639_1_language():
"""Test correct ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "en"
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
result = experimental.correct_language(series)
assert result == None
def test_check_correct_iso_639_3_language():
"""Test correct ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "eng"
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
result = experimental.correct_language(series)
assert result == None
def test_check_valid_spdx_license_identifier():
"""Test valid SPDX license identifier."""
license = "CC-BY-SA-4.0"
result = check.spdx_license_identifier(license)
assert result == None
def test_check_invalid_spdx_license_identifier(capsys):
"""Test invalid SPDX license identifier."""
license = "CC-BY-SA"
result = check.spdx_license_identifier(license)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license}\n"
)
def test_check_duplicate_item(capsys):
"""Test item with duplicate title, type, and date."""
item_title = "Title"
item_type = "Report"
item_date = "2021-03-17"
d = {
"dc.title": [item_title, item_title],
"dcterms.type": [item_type, item_type],
"dcterms.issued": [item_date, item_date],
}
df = pd.DataFrame(data=d)
result = check.duplicate_items(df)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Possible duplicate (dc.title): {Fore.RESET}{item_title}\n"
)
def test_check_no_mojibake():
"""Test string with no mojibake."""
field = "CIAT Publicaçao"
field_name = "dcterms.isPartOf"
result = check.mojibake(field, field_name)
assert result == None
def test_check_mojibake(capsys):
"""Test string with mojibake."""
field = "CIAT Publicaçao"
field_name = "dcterms.isPartOf"
result = check.mojibake(field, field_name)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
)
def test_check_doi_field():
"""Test an item with a DOI field."""
doi = "https://doi.org/10.1186/1743-422X-9-218"
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
# Emulate a column in a transposed dataframe (which is just a series), with
# the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
result = check.citation_doi(series)
assert result == None
def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field."""
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
# Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI.
d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.citation_doi(series)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"
)
def test_title_in_citation():
"""Test an item with its title in the citation."""
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things."
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
result = check.title_in_citation(series)
assert result == None
def test_title_not_in_citation(capsys):
"""Test an item with its title missing from the citation."""
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things."
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.title_in_citation(series)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}\n"
)
def test_country_matches_region():
"""Test an item with regions matching its country list."""
country = "Kenya"
region = "Eastern Africa"
# Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region}
series = pd.Series(data=d)
result = check.countries_match_regions(series)
assert result == None
def test_country_not_matching_region(capsys):
"""Test an item with regions not matching its country list."""
title = "Testing an item with no matching region."
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
# Emulate a column in a transposed dataframe (which is just a series)
d = {
"dc.title": title,
"cg.coverage.country": country,
"cg.coverage.region": region,
}
series = pd.Series(data=d)
check.countries_match_regions(series)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{title}\n"
)