1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-09 14:46:00 +02:00

Improve exclude function

When a user explicitly requests that a field be excluded with -x, we
skip that field in most checks. Up until now, that did not include
the item-based checks using a transposed dataframe, because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.
This commit is contained in:
2022-09-02 15:59:22 +03:00
parent 1f76247353
commit 040e56fc76
6 changed files with 54 additions and 24 deletions

View File

@ -403,8 +403,9 @@ def test_check_doi_field():
# the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
exclude = list()
result = check.citation_doi(series)
result = check.citation_doi(series, exclude)
assert result == None
@ -413,13 +414,14 @@ def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field."""
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI.
d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.citation_doi(series)
check.citation_doi(series, exclude)
captured = capsys.readouterr()
assert (
@ -433,13 +435,14 @@ def test_title_in_citation():
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
result = check.title_in_citation(series)
result = check.title_in_citation(series, exclude)
assert result == None
@ -449,13 +452,14 @@ def test_title_not_in_citation(capsys):
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.title_in_citation(series)
check.title_in_citation(series, exclude)
captured = capsys.readouterr()
assert (
@ -469,12 +473,13 @@ def test_country_matches_region():
country = "Kenya"
region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region}
series = pd.Series(data=d)
result = check.countries_match_regions(series)
result = check.countries_match_regions(series, exclude)
assert result == None
@ -486,6 +491,7 @@ def test_country_not_matching_region(capsys):
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {
@ -495,7 +501,7 @@ def test_country_not_matching_region(capsys):
}
series = pd.Series(data=d)
check.countries_match_regions(series)
check.countries_match_regions(series, exclude)
captured = capsys.readouterr()
assert (

View File

@ -131,6 +131,7 @@ def test_fix_country_not_matching_region():
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {
@ -140,7 +141,7 @@ def test_fix_country_not_matching_region():
}
series = pd.Series(data=d)
result = fix.countries_match_regions(series)
result = fix.countries_match_regions(series, exclude)
# Emulate the correct series we are expecting
d_correct = {