mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-09 14:46:00 +02:00
Improve exclude function
When a user explicitly requests that a field be excluded with -x, we skip that field in most checks. Until now, that did not include the item-based checks that use a transposed DataFrame, because we don't know the metadata field names (labels) until we iterate over them. Now the excludes are respected for the item-based checks as well.
This commit is contained in:
@ -200,20 +200,22 @@ def run(argv):
|
||||
# should rename column in this for loop...
|
||||
for column in df_transposed.columns:
|
||||
# Check: citation DOI
|
||||
check.citation_doi(df_transposed[column])
|
||||
check.citation_doi(df_transposed[column], exclude)
|
||||
|
||||
# Check: title in citation
|
||||
check.title_in_citation(df_transposed[column])
|
||||
check.title_in_citation(df_transposed[column], exclude)
|
||||
|
||||
if args.unsafe_fixes:
|
||||
# Fix: countries match regions
|
||||
df_transposed[column] = fix.countries_match_regions(df_transposed[column])
|
||||
df_transposed[column] = fix.countries_match_regions(
|
||||
df_transposed[column], exclude
|
||||
)
|
||||
else:
|
||||
# Check: countries match regions
|
||||
check.countries_match_regions(df_transposed[column])
|
||||
check.countries_match_regions(df_transposed[column], exclude)
|
||||
|
||||
if args.experimental_checks:
|
||||
experimental.correct_language(df_transposed[column])
|
||||
experimental.correct_language(df_transposed[column], exclude)
|
||||
|
||||
# Transpose the DataFrame back before writing. This is probably wasteful to
|
||||
# do every time since we technically only need to do it if we've done the
|
||||
|
@ -391,13 +391,20 @@ def mojibake(field, field_name):
|
||||
return
|
||||
|
||||
|
||||
def citation_doi(row):
|
||||
def citation_doi(row, exclude):
|
||||
"""Check for the scenario where an item has a DOI listed in its citation,
|
||||
but does not have a cg.identifier.doi field.
|
||||
|
||||
Function prints a warning if the DOI field is missing, but there is a DOI
|
||||
in the citation.
|
||||
"""
|
||||
# Check if the user requested us to skip any DOI fields so we can
|
||||
# just return before going any further.
|
||||
for field in exclude:
|
||||
match = re.match(r"^.*?doi.*$", field)
|
||||
if match is not None:
|
||||
return
|
||||
|
||||
# Initialize some variables at global scope so that we can set them in the
|
||||
# loop scope below and still be able to access them afterwards.
|
||||
citation = ""
|
||||
@ -415,9 +422,10 @@ def citation_doi(row):
|
||||
if match is not None:
|
||||
return
|
||||
|
||||
# Get the name of the citation field
|
||||
# Check if the current label is a citation field and make sure the user
|
||||
# hasn't asked to skip it. If not, then set the citation.
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
citation = row[label]
|
||||
|
||||
if citation != "":
|
||||
@ -433,7 +441,7 @@ def citation_doi(row):
|
||||
return
|
||||
|
||||
|
||||
def title_in_citation(row):
|
||||
def title_in_citation(row, exclude):
|
||||
"""Check for the scenario where an item's title is missing from its cita-
|
||||
tion. This could mean that it is missing entirely, or perhaps just exists
|
||||
in a different format (whitespace, accents, etc).
|
||||
@ -455,12 +463,12 @@ def title_in_citation(row):
|
||||
|
||||
# Find the name of the title column
|
||||
match = re.match(r"^(dc|dcterms)\.title.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
title = row[label]
|
||||
|
||||
# Find the name of the citation column
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
citation = row[label]
|
||||
|
||||
if citation != "":
|
||||
@ -470,7 +478,7 @@ def title_in_citation(row):
|
||||
return
|
||||
|
||||
|
||||
def countries_match_regions(row):
|
||||
def countries_match_regions(row, exclude):
|
||||
"""Check for the scenario where an item has country coverage metadata, but
|
||||
does not have the corresponding region metadata. For example, an item that
|
||||
has country coverage "Kenya" should also have region "Eastern Africa" acc-
|
||||
@ -514,6 +522,12 @@ def countries_match_regions(row):
|
||||
if match is not None:
|
||||
title_column_name = label
|
||||
|
||||
# Make sure the user has not asked to exclude any metadata fields. If so, we
|
||||
# should return immediately.
|
||||
column_names = [country_column_name, region_column_name, title_column_name]
|
||||
if any(field in column_names for field in exclude):
|
||||
return
|
||||
|
||||
# Make sure we found the country and region columns
|
||||
if country_column_name != "" and region_column_name != "":
|
||||
# If we don't have any countries then we should return early before
|
||||
|
@ -8,7 +8,7 @@ from colorama import Fore
|
||||
from pycountry import languages
|
||||
|
||||
|
||||
def correct_language(row):
|
||||
def correct_language(row, exclude):
|
||||
"""Analyze the text used in the title, abstract, and citation fields to pre-
|
||||
dict the language being used and compare it with the item's dc.language.iso
|
||||
field.
|
||||
@ -39,7 +39,8 @@ def correct_language(row):
|
||||
|
||||
language = row[label]
|
||||
|
||||
# Extract title if it is present
|
||||
# Extract title if it is present (note that we don't allow excluding
|
||||
# the title here because it complicates things).
|
||||
match = re.match(r"^.*?title.*$", label)
|
||||
if match is not None:
|
||||
title = row[label]
|
||||
@ -48,12 +49,12 @@ def correct_language(row):
|
||||
|
||||
# Extract abstract if it is present
|
||||
match = re.match(r"^.*?abstract.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
sample_strings.append(row[label])
|
||||
|
||||
# Extract citation if it is present
|
||||
match = re.match(r"^.*?[cC]itation.*$", label)
|
||||
if match is not None:
|
||||
if match is not None and label not in exclude:
|
||||
sample_strings.append(row[label])
|
||||
|
||||
# Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction
|
||||
|
@ -293,7 +293,7 @@ def mojibake(field, field_name):
|
||||
return field
|
||||
|
||||
|
||||
def countries_match_regions(row):
|
||||
def countries_match_regions(row, exclude):
|
||||
"""Check for the scenario where an item has country coverage metadata, but
|
||||
does not have the corresponding region metadata. For example, an item that
|
||||
has country coverage "Kenya" should also have region "Eastern Africa" acc-
|
||||
@ -337,6 +337,12 @@ def countries_match_regions(row):
|
||||
if match is not None:
|
||||
title_column_name = label
|
||||
|
||||
# Make sure the user has not asked to exclude any metadata fields. If so, we
|
||||
# should return immediately.
|
||||
column_names = [country_column_name, region_column_name, title_column_name]
|
||||
if any(field in column_names for field in exclude):
|
||||
return row
|
||||
|
||||
# Make sure we found the country and region columns
|
||||
if country_column_name != "" and region_column_name != "":
|
||||
# If we don't have any countries then we should return early before
|
||||
|
Reference in New Issue
Block a user