Improve exclude function

When a user explicitly requests that a field be excluded with -x, we skip that field in most checks. Until now, that did not cover the item-based checks that operate on the transposed dataframe, because we don't know the metadata field names (labels) until we iterate over them. Now the excludes are respected in the item-based checks as well.
2025-07-25 15:28:02 +02:00 · 2022-09-02 15:59:22 +03:00
parent 1f76247353
commit 040e56fc76
6 changed files with 54 additions and 24 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -200,20 +200,22 @@ def run(argv):
    # should rename column in this for loop...
    for column in df_transposed.columns:
        # Check: citation DOI
-        check.citation_doi(df_transposed[column])
+        check.citation_doi(df_transposed[column], exclude)

        # Check: title in citation
-        check.title_in_citation(df_transposed[column])
+        check.title_in_citation(df_transposed[column], exclude)

        if args.unsafe_fixes:
            # Fix: countries match regions
-            df_transposed[column] = fix.countries_match_regions(df_transposed[column])
+            df_transposed[column] = fix.countries_match_regions(
+                df_transposed[column], exclude
+            )
        else:
            # Check: countries match regions
-            check.countries_match_regions(df_transposed[column])
+            check.countries_match_regions(df_transposed[column], exclude)

        if args.experimental_checks:
-            experimental.correct_language(df_transposed[column])
+            experimental.correct_language(df_transposed[column], exclude)

    # Transpose the DataFrame back before writing. This is probably wasteful to
    # do every time since we technically only need to do it if we've done the
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -391,13 +391,20 @@ def mojibake(field, field_name):
    return


-def citation_doi(row):
+def citation_doi(row, exclude):
    """Check for the scenario where an item has a DOI listed in its citation,
    but does not have a cg.identifier.doi field.

    Function prints a warning if the DOI field is missing, but there is a DOI
    in the citation.
    """
+    # Check if the user requested us to skip any DOI fields so we can
+    # just return before going any further.
+    for field in exclude:
+        match = re.match(r"^.*?doi.*$", field)
+        if match is not None:
+            return
+
    # Initialize some variables at global scope so that we can set them in the
    # loop scope below and still be able to access them afterwards.
    citation = ""
@ -415,9 +422,10 @@ def citation_doi(row):
        if match is not None:
            return

-        # Get the name of the citation field
+        # Check if the current label is a citation field and make sure the user
+        # hasn't asked to skip it. If not, then set the citation.
        match = re.match(r"^.*?[cC]itation.*$", label)
-        if match is not None:
+        if match is not None and label not in exclude:
            citation = row[label]

    if citation != "":
@ -433,7 +441,7 @@ def citation_doi(row):
    return


-def title_in_citation(row):
+def title_in_citation(row, exclude):
    """Check for the scenario where an item's title is missing from its cita-
    tion. This could mean that it is missing entirely, or perhaps just exists
    in a different format (whitespace, accents, etc).
@ -455,12 +463,12 @@ def title_in_citation(row):

        # Find the name of the title column
        match = re.match(r"^(dc|dcterms)\.title.*$", label)
-        if match is not None:
+        if match is not None and label not in exclude:
            title = row[label]

        # Find the name of the citation column
        match = re.match(r"^.*?[cC]itation.*$", label)
-        if match is not None:
+        if match is not None and label not in exclude:
            citation = row[label]

    if citation != "":
@ -470,7 +478,7 @@ def title_in_citation(row):
    return


-def countries_match_regions(row):
+def countries_match_regions(row, exclude):
    """Check for the scenario where an item has country coverage metadata, but
    does not have the corresponding region metadata. For example, an item that
    has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -514,6 +522,12 @@ def countries_match_regions(row):
        if match is not None:
            title_column_name = label

+    # Make sure the user has not asked to exclude any metadata fields. If so, we
+    # should return immediately.
+    column_names = [country_column_name, region_column_name, title_column_name]
+    if any(field in column_names for field in exclude):
+        return
+
    # Make sure we found the country and region columns
    if country_column_name != "" and region_column_name != "":
        # If we don't have any countries then we should return early before
--- a/csv_metadata_quality/experimental.py
+++ b/csv_metadata_quality/experimental.py
@ -8,7 +8,7 @@ from colorama import Fore
 from pycountry import languages


-def correct_language(row):
+def correct_language(row, exclude):
    """Analyze the text used in the title, abstract, and citation fields to pre-
    dict the language being used and compare it with the item's dc.language.iso
    field.
@ -39,7 +39,8 @@ def correct_language(row):

            language = row[label]

-        # Extract title if it is present
+        # Extract title if it is present (note that we don't allow excluding
+        # the title here because it complicates things).
        match = re.match(r"^.*?title.*$", label)
        if match is not None:
            title = row[label]
@ -48,12 +49,12 @@ def correct_language(row):

        # Extract abstract if it is present
        match = re.match(r"^.*?abstract.*$", label)
-        if match is not None:
+        if match is not None and label not in exclude:
            sample_strings.append(row[label])

        # Extract citation if it is present
        match = re.match(r"^.*?[cC]itation.*$", label)
-        if match is not None:
+        if match is not None and label not in exclude:
            sample_strings.append(row[label])

    # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -293,7 +293,7 @@ def mojibake(field, field_name):
        return field


-def countries_match_regions(row):
+def countries_match_regions(row, exclude):
    """Check for the scenario where an item has country coverage metadata, but
    does not have the corresponding region metadata. For example, an item that
    has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -337,6 +337,12 @@ def countries_match_regions(row):
        if match is not None:
            title_column_name = label

+    # Make sure the user has not asked to exclude any metadata fields. If so, we
+    # should return immediately.
+    column_names = [country_column_name, region_column_name, title_column_name]
+    if any(field in column_names for field in exclude):
+        return row
+
    # Make sure we found the country and region columns
    if country_column_name != "" and region_column_name != "":
        # If we don't have any countries then we should return early before
--- a/tests/test_check.py
+++ b/tests/test_check.py
@ -403,8 +403,9 @@ def test_check_doi_field():
    # the citation and a DOI field.
    d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)
+    exclude = list()

-    result = check.citation_doi(series)
+    result = check.citation_doi(series, exclude)

    assert result == None

@ -413,13 +414,14 @@ def test_check_doi_only_in_citation(capsys):
    """Test an item with a DOI in its citation, but no DOI field."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series), with
    # an empty DOI field and a citation containing a DOI.
    d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

-    check.citation_doi(series)
+    check.citation_doi(series, exclude)

    captured = capsys.readouterr()
    assert (
@ -433,13 +435,14 @@ def test_title_in_citation():

    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all the things."
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
    d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

-    result = check.title_in_citation(series)
+    result = check.title_in_citation(series, exclude)

    assert result == None

@ -449,13 +452,14 @@ def test_title_not_in_citation(capsys):

    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all teh things."
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
    d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

-    check.title_in_citation(series)
+    check.title_in_citation(series, exclude)

    captured = capsys.readouterr()
    assert (
@ -469,12 +473,13 @@ def test_country_matches_region():

    country = "Kenya"
    region = "Eastern Africa"
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {"cg.coverage.country": country, "cg.coverage.region": region}
    series = pd.Series(data=d)

-    result = check.countries_match_regions(series)
+    result = check.countries_match_regions(series, exclude)

    assert result == None

@ -486,6 +491,7 @@ def test_country_not_matching_region(capsys):
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
@ -495,7 +501,7 @@ def test_country_not_matching_region(capsys):
    }
    series = pd.Series(data=d)

-    check.countries_match_regions(series)
+    check.countries_match_regions(series, exclude)

    captured = capsys.readouterr()
    assert (
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@ -131,6 +131,7 @@ def test_fix_country_not_matching_region():
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
+    exclude = list()

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
@ -140,7 +141,7 @@ def test_fix_country_not_matching_region():
    }
    series = pd.Series(data=d)

-    result = fix.countries_match_regions(series)
+    result = fix.countries_match_regions(series, exclude)

    # Emulate the correct series we are expecting
    d_correct = {