1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-04-10 17:12:31 +02:00

Improve exclude function

When a user explicitly requests that a field be excluded with -x, we
skip that field in most checks. Until now, that did not include
the item-based checks that use a transposed dataframe, because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.
This commit is contained in:
Alan Orth 2022-09-02 15:59:22 +03:00
parent 1f76247353
commit 040e56fc76
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
6 changed files with 54 additions and 24 deletions

View File

@ -200,20 +200,22 @@ def run(argv):
# should rename column in this for loop... # should rename column in this for loop...
for column in df_transposed.columns: for column in df_transposed.columns:
# Check: citation DOI # Check: citation DOI
check.citation_doi(df_transposed[column]) check.citation_doi(df_transposed[column], exclude)
# Check: title in citation # Check: title in citation
check.title_in_citation(df_transposed[column]) check.title_in_citation(df_transposed[column], exclude)
if args.unsafe_fixes: if args.unsafe_fixes:
# Fix: countries match regions # Fix: countries match regions
df_transposed[column] = fix.countries_match_regions(df_transposed[column]) df_transposed[column] = fix.countries_match_regions(
df_transposed[column], exclude
)
else: else:
# Check: countries match regions # Check: countries match regions
check.countries_match_regions(df_transposed[column]) check.countries_match_regions(df_transposed[column], exclude)
if args.experimental_checks: if args.experimental_checks:
experimental.correct_language(df_transposed[column]) experimental.correct_language(df_transposed[column], exclude)
# Transpose the DataFrame back before writing. This is probably wasteful to # Transpose the DataFrame back before writing. This is probably wasteful to
# do every time since we technically only need to do it if we've done the # do every time since we technically only need to do it if we've done the

View File

@ -391,13 +391,20 @@ def mojibake(field, field_name):
return return
def citation_doi(row): def citation_doi(row, exclude):
"""Check for the scenario where an item has a DOI listed in its citation, """Check for the scenario where an item has a DOI listed in its citation,
but does not have a cg.identifier.doi field. but does not have a cg.identifier.doi field.
Function prints a warning if the DOI field is missing, but there is a DOI Function prints a warning if the DOI field is missing, but there is a DOI
in the citation. in the citation.
""" """
# Check if the user requested us to skip any DOI fields so we can
# just return before going any further.
for field in exclude:
match = re.match(r"^.*?doi.*$", field)
if match is not None:
return
# Initialize some variables at global scope so that we can set them in the # Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards. # loop scope below and still be able to access them afterwards.
citation = "" citation = ""
@ -415,9 +422,10 @@ def citation_doi(row):
if match is not None: if match is not None:
return return
# Get the name of the citation field # Check if the current label is a citation field and make sure the user
# hasn't asked to skip it. If not, then set the citation.
match = re.match(r"^.*?[cC]itation.*$", label) match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None: if match is not None and label not in exclude:
citation = row[label] citation = row[label]
if citation != "": if citation != "":
@ -433,7 +441,7 @@ def citation_doi(row):
return return
def title_in_citation(row): def title_in_citation(row, exclude):
"""Check for the scenario where an item's title is missing from its cita- """Check for the scenario where an item's title is missing from its cita-
tion. This could mean that it is missing entirely, or perhaps just exists tion. This could mean that it is missing entirely, or perhaps just exists
in a different format (whitespace, accents, etc). in a different format (whitespace, accents, etc).
@ -455,12 +463,12 @@ def title_in_citation(row):
# Find the name of the title column # Find the name of the title column
match = re.match(r"^(dc|dcterms)\.title.*$", label) match = re.match(r"^(dc|dcterms)\.title.*$", label)
if match is not None: if match is not None and label not in exclude:
title = row[label] title = row[label]
# Find the name of the citation column # Find the name of the citation column
match = re.match(r"^.*?[cC]itation.*$", label) match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None: if match is not None and label not in exclude:
citation = row[label] citation = row[label]
if citation != "": if citation != "":
@ -470,7 +478,7 @@ def title_in_citation(row):
return return
def countries_match_regions(row): def countries_match_regions(row, exclude):
"""Check for the scenario where an item has country coverage metadata, but """Check for the scenario where an item has country coverage metadata, but
does not have the corresponding region metadata. For example, an item that does not have the corresponding region metadata. For example, an item that
has country coverage "Kenya" should also have region "Eastern Africa" acc- has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -514,6 +522,12 @@ def countries_match_regions(row):
if match is not None: if match is not None:
title_column_name = label title_column_name = label
# Make sure the user has not asked to exclude any metadata fields. If so, we
# should return immediately.
column_names = [country_column_name, region_column_name, title_column_name]
if any(field in column_names for field in exclude):
return
# Make sure we found the country and region columns # Make sure we found the country and region columns
if country_column_name != "" and region_column_name != "": if country_column_name != "" and region_column_name != "":
# If we don't have any countries then we should return early before # If we don't have any countries then we should return early before

View File

@ -8,7 +8,7 @@ from colorama import Fore
from pycountry import languages from pycountry import languages
def correct_language(row): def correct_language(row, exclude):
"""Analyze the text used in the title, abstract, and citation fields to pre- """Analyze the text used in the title, abstract, and citation fields to pre-
dict the language being used and compare it with the item's dc.language.iso dict the language being used and compare it with the item's dc.language.iso
field. field.
@ -39,7 +39,8 @@ def correct_language(row):
language = row[label] language = row[label]
# Extract title if it is present # Extract title if it is present (note that we don't allow excluding
# the title here because it complicates things).
match = re.match(r"^.*?title.*$", label) match = re.match(r"^.*?title.*$", label)
if match is not None: if match is not None:
title = row[label] title = row[label]
@ -48,12 +49,12 @@ def correct_language(row):
# Extract abstract if it is present # Extract abstract if it is present
match = re.match(r"^.*?abstract.*$", label) match = re.match(r"^.*?abstract.*$", label)
if match is not None: if match is not None and label not in exclude:
sample_strings.append(row[label]) sample_strings.append(row[label])
# Extract citation if it is present # Extract citation if it is present
match = re.match(r"^.*?[cC]itation.*$", label) match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None: if match is not None and label not in exclude:
sample_strings.append(row[label]) sample_strings.append(row[label])
# Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction

View File

@ -293,7 +293,7 @@ def mojibake(field, field_name):
return field return field
def countries_match_regions(row): def countries_match_regions(row, exclude):
"""Check for the scenario where an item has country coverage metadata, but """Check for the scenario where an item has country coverage metadata, but
does not have the corresponding region metadata. For example, an item that does not have the corresponding region metadata. For example, an item that
has country coverage "Kenya" should also have region "Eastern Africa" acc- has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -337,6 +337,12 @@ def countries_match_regions(row):
if match is not None: if match is not None:
title_column_name = label title_column_name = label
# Make sure the user has not asked to exclude any metadata fields. If so, we
# should return immediately.
column_names = [country_column_name, region_column_name, title_column_name]
if any(field in column_names for field in exclude):
return row
# Make sure we found the country and region columns # Make sure we found the country and region columns
if country_column_name != "" and region_column_name != "": if country_column_name != "" and region_column_name != "":
# If we don't have any countries then we should return early before # If we don't have any countries then we should return early before

View File

@ -403,8 +403,9 @@ def test_check_doi_field():
# the citation and a DOI field. # the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation} d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d) series = pd.Series(data=d)
exclude = list()
result = check.citation_doi(series) result = check.citation_doi(series, exclude)
assert result == None assert result == None
@ -413,13 +414,14 @@ def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field.""" """Test an item with a DOI in its citation, but no DOI field."""
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218" citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with # Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI. # an empty DOI field and a citation containing a DOI.
d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation} d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d) series = pd.Series(data=d)
check.citation_doi(series) check.citation_doi(series, exclude)
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
@ -433,13 +435,14 @@ def test_title_in_citation():
title = "Testing all the things" title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things." citation = "Orth, A. 2021. Testing all the things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with # Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation. # the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation} d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d) series = pd.Series(data=d)
result = check.title_in_citation(series) result = check.title_in_citation(series, exclude)
assert result == None assert result == None
@ -449,13 +452,14 @@ def test_title_not_in_citation(capsys):
title = "Testing all the things" title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things." citation = "Orth, A. 2021. Testing all teh things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with # Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation. # the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation} d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d) series = pd.Series(data=d)
check.title_in_citation(series) check.title_in_citation(series, exclude)
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
@ -469,12 +473,13 @@ def test_country_matches_region():
country = "Kenya" country = "Kenya"
region = "Eastern Africa" region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series) # Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region} d = {"cg.coverage.country": country, "cg.coverage.region": region}
series = pd.Series(data=d) series = pd.Series(data=d)
result = check.countries_match_regions(series) result = check.countries_match_regions(series, exclude)
assert result == None assert result == None
@ -486,6 +491,7 @@ def test_country_not_matching_region(capsys):
country = "Kenya" country = "Kenya"
region = "" region = ""
missing_region = "Eastern Africa" missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series) # Emulate a column in a transposed dataframe (which is just a series)
d = { d = {
@ -495,7 +501,7 @@ def test_country_not_matching_region(capsys):
} }
series = pd.Series(data=d) series = pd.Series(data=d)
check.countries_match_regions(series) check.countries_match_regions(series, exclude)
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (

View File

@ -131,6 +131,7 @@ def test_fix_country_not_matching_region():
country = "Kenya" country = "Kenya"
region = "" region = ""
missing_region = "Eastern Africa" missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series) # Emulate a column in a transposed dataframe (which is just a series)
d = { d = {
@ -140,7 +141,7 @@ def test_fix_country_not_matching_region():
} }
series = pd.Series(data=d) series = pd.Series(data=d)
result = fix.countries_match_regions(series) result = fix.countries_match_regions(series, exclude)
# Emulate the correct series we are expecting # Emulate the correct series we are expecting
d_correct = { d_correct = {