mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 11:42:20 +01:00
Improve exclude function
When a user explicitly requests that a field be excluded with -x, we skip that field in most checks. Until now, that did not include the item-based checks that use a transposed dataframe, because we don't know the metadata field names (labels) until we iterate over them. Now the excludes are respected for item-based checks as well.
This commit is contained in:
parent
1f76247353
commit
040e56fc76
@ -200,20 +200,22 @@ def run(argv):
|
||||
# should rename column in this for loop...
|
||||
for column in df_transposed.columns:
|
||||
# Check: citation DOI
|
||||
check.citation_doi(df_transposed[column])
|
||||
check.citation_doi(df_transposed[column], exclude)
|
||||
|
||||
# Check: title in citation
|
||||
check.title_in_citation(df_transposed[column])
|
||||
check.title_in_citation(df_transposed[column], exclude)
|
||||
|
||||
if args.unsafe_fixes:
|
||||
# Fix: countries match regions
|
||||
df_transposed[column] = fix.countries_match_regions(df_transposed[column])
|
||||
df_transposed[column] = fix.countries_match_regions(
|
||||
df_transposed[column], exclude
|
||||
)
|
||||
else:
|
||||
# Check: countries match regions
|
||||
check.countries_match_regions(df_transposed[column])
|
||||
check.countries_match_regions(df_transposed[column], exclude)
|
||||
|
||||
if args.experimental_checks:
|
||||
experimental.correct_language(df_transposed[column])
|
||||
experimental.correct_language(df_transposed[column], exclude)
|
||||
|
||||
# Transpose the DataFrame back before writing. This is probably wasteful to
|
||||
# do every time since we technically only need to do it if we've done the
|
||||
|
@ -391,13 +391,20 @@ def mojibake(field, field_name):
|
||||
return
|
||||
|
||||
|
||||
def citation_doi(row):
|
||||
def citation_doi(row, exclude):
|
||||
"""Check for the scenario where an item has a DOI listed in its citation,
|
||||
but does not have a cg.identifier.doi field.
|
||||
|
||||
Function prints a warning if the DOI field is missing, but there is a DOI
|
||||
in the citation.
|
||||
"""
|
||||
# Check if the user requested us to skip any DOI fields so we can
|
||||
# just return before going any further.
|
||||
for field in exclude:
|
||||
match = re.match(r"^.*?doi.*$", field)
|
||||
if match is not None:
|
||||
return
|
||||
|
||||
# Initialize some variables at global scope so that we can set them in the
|
||||
# loop scope below and still be able to access them afterwards.
|
||||
citation = ""
|
||||
@ -415,9 +422,10 @@ def citation_doi(row):
|
||||
if match is not None:
|
||||
return
|
||||
|
||||
# Get the name of the citation field
|
||||
# Check if the current label is a citation field and make sure the user
|
||||
# hasn't asked to skip it. If not, then set the citation.
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
citation = row[label]
|
||||
|
||||
if citation != "":
|
||||
@ -433,7 +441,7 @@ def citation_doi(row):
|
||||
return
|
||||
|
||||
|
||||
def title_in_citation(row):
|
||||
def title_in_citation(row, exclude):
|
||||
"""Check for the scenario where an item's title is missing from its cita-
|
||||
tion. This could mean that it is missing entirely, or perhaps just exists
|
||||
in a different format (whitespace, accents, etc).
|
||||
@ -455,12 +463,12 @@ def title_in_citation(row):
|
||||
|
||||
# Find the name of the title column
|
||||
match = re.match(r"^(dc|dcterms)\.title.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
title = row[label]
|
||||
|
||||
# Find the name of the citation column
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
citation = row[label]
|
||||
|
||||
if citation != "":
|
||||
@ -470,7 +478,7 @@ def title_in_citation(row):
|
||||
return
|
||||
|
||||
|
||||
def countries_match_regions(row):
|
||||
def countries_match_regions(row, exclude):
|
||||
"""Check for the scenario where an item has country coverage metadata, but
|
||||
does not have the corresponding region metadata. For example, an item that
|
||||
has country coverage "Kenya" should also have region "Eastern Africa" acc-
|
||||
@ -514,6 +522,12 @@ def countries_match_regions(row):
|
||||
if match is not None:
|
||||
title_column_name = label
|
||||
|
||||
# Make sure the user has not asked to exclude any metadata fields. If so, we
|
||||
# should return immediately.
|
||||
column_names = [country_column_name, region_column_name, title_column_name]
|
||||
if any(field in column_names for field in exclude):
|
||||
return
|
||||
|
||||
# Make sure we found the country and region columns
|
||||
if country_column_name != "" and region_column_name != "":
|
||||
# If we don't have any countries then we should return early before
|
||||
|
@ -8,7 +8,7 @@ from colorama import Fore
|
||||
from pycountry import languages
|
||||
|
||||
|
||||
def correct_language(row):
|
||||
def correct_language(row, exclude):
|
||||
"""Analyze the text used in the title, abstract, and citation fields to pre-
|
||||
dict the language being used and compare it with the item's dc.language.iso
|
||||
field.
|
||||
@ -39,7 +39,8 @@ def correct_language(row):
|
||||
|
||||
language = row[label]
|
||||
|
||||
# Extract title if it is present
|
||||
# Extract title if it is present (note that we don't allow excluding
|
||||
# the title here because it complicates things).
|
||||
match = re.match(r"^.*?title.*$", label)
|
||||
if match is not None:
|
||||
title = row[label]
|
||||
@ -48,12 +49,12 @@ def correct_language(row):
|
||||
|
||||
# Extract abstract if it is present
|
||||
match = re.match(r"^.*?abstract.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
sample_strings.append(row[label])
|
||||
|
||||
# Extract citation if it is present
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
sample_strings.append(row[label])
|
||||
|
||||
# Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction
|
||||
|
@ -293,7 +293,7 @@ def mojibake(field, field_name):
|
||||
return field
|
||||
|
||||
|
||||
def countries_match_regions(row):
|
||||
def countries_match_regions(row, exclude):
|
||||
"""Check for the scenario where an item has country coverage metadata, but
|
||||
does not have the corresponding region metadata. For example, an item that
|
||||
has country coverage "Kenya" should also have region "Eastern Africa" acc-
|
||||
@ -337,6 +337,12 @@ def countries_match_regions(row):
|
||||
if match is not None:
|
||||
title_column_name = label
|
||||
|
||||
# Make sure the user has not asked to exclude any metadata fields. If so, we
|
||||
# should return immediately.
|
||||
column_names = [country_column_name, region_column_name, title_column_name]
|
||||
if any(field in column_names for field in exclude):
|
||||
return row
|
||||
|
||||
# Make sure we found the country and region columns
|
||||
if country_column_name != "" and region_column_name != "":
|
||||
# If we don't have any countries then we should return early before
|
||||
|
@ -403,8 +403,9 @@ def test_check_doi_field():
|
||||
# the citation and a DOI field.
|
||||
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
||||
series = pd.Series(data=d)
|
||||
exclude = list()
|
||||
|
||||
result = check.citation_doi(series)
|
||||
result = check.citation_doi(series, exclude)
|
||||
|
||||
assert result == None
|
||||
|
||||
@ -413,13 +414,14 @@ def test_check_doi_only_in_citation(capsys):
|
||||
"""Test an item with a DOI in its citation, but no DOI field."""
|
||||
|
||||
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# an empty DOI field and a citation containing a DOI.
|
||||
d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
check.citation_doi(series)
|
||||
check.citation_doi(series, exclude)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert (
|
||||
@ -433,13 +435,14 @@ def test_title_in_citation():
|
||||
|
||||
title = "Testing all the things"
|
||||
citation = "Orth, A. 2021. Testing all the things."
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# the title and citation.
|
||||
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
result = check.title_in_citation(series)
|
||||
result = check.title_in_citation(series, exclude)
|
||||
|
||||
assert result == None
|
||||
|
||||
@ -449,13 +452,14 @@ def test_title_not_in_citation(capsys):
|
||||
|
||||
title = "Testing all the things"
|
||||
citation = "Orth, A. 2021. Testing all teh things."
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# the title and citation.
|
||||
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
check.title_in_citation(series)
|
||||
check.title_in_citation(series, exclude)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert (
|
||||
@ -469,12 +473,13 @@ def test_country_matches_region():
|
||||
|
||||
country = "Kenya"
|
||||
region = "Eastern Africa"
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
result = check.countries_match_regions(series)
|
||||
result = check.countries_match_regions(series, exclude)
|
||||
|
||||
assert result == None
|
||||
|
||||
@ -486,6 +491,7 @@ def test_country_not_matching_region(capsys):
|
||||
country = "Kenya"
|
||||
region = ""
|
||||
missing_region = "Eastern Africa"
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {
|
||||
@ -495,7 +501,7 @@ def test_country_not_matching_region(capsys):
|
||||
}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
check.countries_match_regions(series)
|
||||
check.countries_match_regions(series, exclude)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert (
|
||||
|
@ -131,6 +131,7 @@ def test_fix_country_not_matching_region():
|
||||
country = "Kenya"
|
||||
region = ""
|
||||
missing_region = "Eastern Africa"
|
||||
exclude = list()
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {
|
||||
@ -140,7 +141,7 @@ def test_fix_country_not_matching_region():
|
||||
}
|
||||
series = pd.Series(data=d)
|
||||
|
||||
result = fix.countries_match_regions(series)
|
||||
result = fix.countries_match_regions(series, exclude)
|
||||
|
||||
# Emulate the correct series we are expecting
|
||||
d_correct = {
|
||||
|
Loading…
Reference in New Issue
Block a user