From 040e56fc76c5e41a5cb521d6b79257ded02cb78e Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 2 Sep 2022 15:59:22 +0300 Subject: [PATCH] Improve exclude function When a user explicitly requests that a field be excluded with -x we skip that field in most checks. Up until now that did not include the item-based checks using a transposed dataframe because we don't know the metadata field names (labels) until we iterate over them. Now the excludes are respected for item-based checks. --- csv_metadata_quality/app.py | 12 +++++++----- csv_metadata_quality/check.py | 28 +++++++++++++++++++++------- csv_metadata_quality/experimental.py | 9 +++++---- csv_metadata_quality/fix.py | 8 +++++++- tests/test_check.py | 18 ++++++++++++------ tests/test_fix.py | 3 ++- 6 files changed, 54 insertions(+), 24 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 37c2c4c..9be5b5a 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -200,20 +200,22 @@ def run(argv): # should rename column in this for loop... for column in df_transposed.columns: # Check: citation DOI - check.citation_doi(df_transposed[column]) + check.citation_doi(df_transposed[column], exclude) # Check: title in citation - check.title_in_citation(df_transposed[column]) + check.title_in_citation(df_transposed[column], exclude) if args.unsafe_fixes: # Fix: countries match regions - df_transposed[column] = fix.countries_match_regions(df_transposed[column]) + df_transposed[column] = fix.countries_match_regions( + df_transposed[column], exclude + ) else: # Check: countries match regions - check.countries_match_regions(df_transposed[column]) + check.countries_match_regions(df_transposed[column], exclude) if args.experimental_checks: - experimental.correct_language(df_transposed[column]) + experimental.correct_language(df_transposed[column], exclude) # Transpose the DataFrame back before writing. This is probably wasteful to # do every time since we technically only need to do it if we've done the diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index c327c33..188b1ed 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -391,13 +391,20 @@ def mojibake(field, field_name): return -def citation_doi(row): +def citation_doi(row, exclude): """Check for the scenario where an item has a DOI listed in its citation, but does not have a cg.identifier.doi field. Function prints a warning if the DOI field is missing, but there is a DOI in the citation. """ + # Check if the user requested us to skip any DOI fields so we can + # just return before going any further. + for field in exclude: + match = re.match(r"^.*?doi.*$", field) + if match is not None: + return + # Initialize some variables at global scope so that we can set them in the # loop scope below and still be able to access them afterwards. citation = "" @@ -415,9 +422,10 @@ def citation_doi(row): if match is not None: return - # Get the name of the citation field + # Check if the current label is a citation field and make sure the user + # hasn't asked to skip it. If not, then set the citation. match = re.match(r"^.*?[cC]itation.*$", label) - if match is not None: + if match is not None and label not in exclude: citation = row[label] if citation != "": @@ -433,7 +441,7 @@ def citation_doi(row): return -def title_in_citation(row): +def title_in_citation(row, exclude): """Check for the scenario where an item's title is missing from its cita- tion. This could mean that it is missing entirely, or perhaps just exists in a different format (whitespace, accents, etc). @@ -455,12 +463,12 @@ def title_in_citation(row): # Find the name of the title column match = re.match(r"^(dc|dcterms)\.title.*$", label) - if match is not None: + if match is not None and label not in exclude: title = row[label] # Find the name of the citation column match = re.match(r"^.*?[cC]itation.*$", label) - if match is not None: + if match is not None and label not in exclude: citation = row[label] if citation != "": @@ -470,7 +478,7 @@ def title_in_citation(row): return -def countries_match_regions(row): +def countries_match_regions(row, exclude): """Check for the scenario where an item has country coverage metadata, but does not have the corresponding region metadata. For example, an item that has country coverage "Kenya" should also have region "Eastern Africa" acc- @@ -514,6 +522,12 @@ def countries_match_regions(row): if match is not None: title_column_name = label + # Make sure the user has not asked to exclude any metadata fields. If so, we + # should return immediately. + column_names = [country_column_name, region_column_name, title_column_name] + if any(field in column_names for field in exclude): + return + # Make sure we found the country and region columns if country_column_name != "" and region_column_name != "": # If we don't have any countries then we should return early before diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py index eede102..269d5f2 100644 --- a/csv_metadata_quality/experimental.py +++ b/csv_metadata_quality/experimental.py @@ -8,7 +8,7 @@ from colorama import Fore from pycountry import languages -def correct_language(row): +def correct_language(row, exclude): """Analyze the text used in the title, abstract, and citation fields to pre- dict the language being used and compare it with the item's dc.language.iso field. @@ -39,7 +39,8 @@ def correct_language(row): language = row[label] - # Extract title if it is present + # Extract title if it is present (note that we don't allow excluding + # the title here because it complicates things). match = re.match(r"^.*?title.*$", label) if match is not None: title = row[label] @@ -48,12 +49,12 @@ def correct_language(row): # Extract abstract if it is present match = re.match(r"^.*?abstract.*$", label) - if match is not None: + if match is not None and label not in exclude: sample_strings.append(row[label]) # Extract citation if it is present match = re.match(r"^.*?[cC]itation.*$", label) - if match is not None: + if match is not None and label not in exclude: sample_strings.append(row[label]) # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 1bfb7d9..3437fa9 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -293,7 +293,7 @@ def mojibake(field, field_name): return field -def countries_match_regions(row): +def countries_match_regions(row, exclude): """Check for the scenario where an item has country coverage metadata, but does not have the corresponding region metadata. For example, an item that has country coverage "Kenya" should also have region "Eastern Africa" acc- @@ -337,6 +337,12 @@ def countries_match_regions(row): if match is not None: title_column_name = label + # Make sure the user has not asked to exclude any metadata fields. If so, we + # should return immediately. + column_names = [country_column_name, region_column_name, title_column_name] + if any(field in column_names for field in exclude): + return row + # Make sure we found the country and region columns if country_column_name != "" and region_column_name != "": # If we don't have any countries then we should return early before diff --git a/tests/test_check.py b/tests/test_check.py index f5359c8..758a7c6 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -403,8 +403,9 @@ def test_check_doi_field(): # the citation and a DOI field. d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation} series = pd.Series(data=d) + exclude = list() - result = check.citation_doi(series) + result = check.citation_doi(series, exclude) assert result == None @@ -413,13 +414,14 @@ def test_check_doi_only_in_citation(capsys): """Test an item with a DOI in its citation, but no DOI field.""" citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218" + exclude = list() # Emulate a column in a transposed dataframe (which is just a series), with # an empty DOI field and a citation containing a DOI. d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation} series = pd.Series(data=d) - check.citation_doi(series) + check.citation_doi(series, exclude) captured = capsys.readouterr() assert ( @@ -433,13 +435,14 @@ def test_title_in_citation(): title = "Testing all the things" citation = "Orth, A. 2021. Testing all the things." + exclude = list() # Emulate a column in a transposed dataframe (which is just a series), with # the title and citation. d = {"dc.title": title, "dcterms.bibliographicCitation": citation} series = pd.Series(data=d) - result = check.title_in_citation(series) + result = check.title_in_citation(series, exclude) assert result == None @@ -449,13 +452,14 @@ def test_title_not_in_citation(capsys): title = "Testing all the things" citation = "Orth, A. 2021. Testing all teh things." + exclude = list() # Emulate a column in a transposed dataframe (which is just a series), with # the title and citation. d = {"dc.title": title, "dcterms.bibliographicCitation": citation} series = pd.Series(data=d) - check.title_in_citation(series) + check.title_in_citation(series, exclude) captured = capsys.readouterr() assert ( @@ -469,12 +473,13 @@ def test_country_matches_region(): country = "Kenya" region = "Eastern Africa" + exclude = list() # Emulate a column in a transposed dataframe (which is just a series) d = {"cg.coverage.country": country, "cg.coverage.region": region} series = pd.Series(data=d) - result = check.countries_match_regions(series) + result = check.countries_match_regions(series, exclude) assert result == None @@ -486,6 +491,7 @@ def test_country_not_matching_region(capsys): country = "Kenya" region = "" missing_region = "Eastern Africa" + exclude = list() # Emulate a column in a transposed dataframe (which is just a series) d = { @@ -495,7 +501,7 @@ def test_country_not_matching_region(capsys): } series = pd.Series(data=d) - check.countries_match_regions(series) + check.countries_match_regions(series, exclude) captured = capsys.readouterr() assert ( diff --git a/tests/test_fix.py b/tests/test_fix.py index 5ff3e6c..4f88b16 100644 --- a/tests/test_fix.py +++ b/tests/test_fix.py @@ -131,6 +131,7 @@ def test_fix_country_not_matching_region(): country = "Kenya" region = "" missing_region = "Eastern Africa" + exclude = list() # Emulate a column in a transposed dataframe (which is just a series) d = { @@ -140,7 +141,7 @@ def test_fix_country_not_matching_region(): } series = pd.Series(data=d) - result = fix.countries_match_regions(series) + result = fix.countries_match_regions(series, exclude) # Emulate the correct series we are expecting d_correct = {