Add unsafe check to add missing regions

2025-08-05 12:35:39 +02:00 · 2022-07-28 16:52:43 +03:00
parent 344993370c
commit 689ee184f7
3 changed files with 124 additions and 3 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -205,14 +205,23 @@ def run(argv):
        # Check: title in citation
        check.title_in_citation(df_transposed[column])

+        if args.unsafe_fixes:
+            # Fix: countries match regions
+            df_transposed[column] = fix.countries_match_regions(df_transposed[column])
+        else:
            # Check: countries match regions
            check.countries_match_regions(df_transposed[column])

        if args.experimental_checks:
            experimental.correct_language(df_transposed[column])

+    # Transpose the DataFrame back before writing. This is probably wasteful to
+    # do every time since we technically only need to do it if we've done the
+    # countries/regions fix above, but I can't think of another way for now.
+    df_transposed_back = df_transposed.T
+
    # Write
-    df.to_csv(args.output_file, index=False)
+    df_transposed_back.to_csv(args.output_file, index=False)

    # Close the input and output files before exiting
    args.input_file.close()
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -3,6 +3,7 @@
 import re
 from unicodedata import normalize

+import country_converter as coco
 import pandas as pd
 from colorama import Fore
 from ftfy import TextFixerConfig, fix_text
@ -289,3 +290,83 @@ def mojibake(field, field_name):
        return fix_text(field, config)
    else:
        return field
+
+
+def countries_match_regions(row):
+    """Add regions that are implied by an item's countries but are missing.
+
+    Handles the scenario where an item has country coverage metadata, but does
+    not have the corresponding region metadata. For example, an item that has
+    country coverage "Kenya" should also have region "Eastern Africa" according
+    to the UN M.49 classification scheme.
+
+    See: https://unstats.un.org/unsd/methodology/m49/
+
+    The row is expected to be a pandas Series whose labels are the metadata
+    field names. Multiple values in a field are separated by "||".
+
+    The row is modified in place and is always returned, so callers can safely
+    assign the result back. It is returned unchanged when the country or region
+    column cannot be found, when the item has no countries, or when no regions
+    are missing.
+
+    Return fixed row.
+    """
+    country_column_name = ""
+    region_column_name = ""
+    title_column_name = ""
+
+    # Iterate over the labels of the current row's values to get the names of
+    # the country, region, and title columns. If several labels match the same
+    # pattern the last one wins.
+    for label in row.axes[0]:
+        # Find the name of the country column
+        if re.match(r"^.*?country.*$", label) is not None:
+            country_column_name = label
+
+        # Find the name of the region column
+        if re.match(r"^.*?region.*$", label) is not None:
+            region_column_name = label
+
+        # Find the name of the title column
+        if re.match(r"^(dc|dcterms)\.title.*$", label) is not None:
+            title_column_name = label
+
+    # We can't fix anything unless we found both the country and region columns
+    if country_column_name == "" or region_column_name == "":
+        return row
+
+    # If we don't have any countries then we return the row untouched before
+    # suggesting regions. Anything that is not a non-empty string (None, a
+    # float NaN, an empty string) counts as "no countries". We must return the
+    # row rather than None here, otherwise the caller would replace the item's
+    # metadata with None.
+    country_value = row[country_column_name]
+    if not isinstance(country_value, str) or country_value == "":
+        return row
+
+    countries = country_value.split("||")
+
+    # Same rule for the regions: only a non-empty string contains values
+    region_value = row[region_column_name]
+    if isinstance(region_value, str) and region_value != "":
+        regions = region_value.split("||")
+    else:
+        regions = []
+
+    # An empty list for our regions so we can keep track for all countries
+    missing_regions = []
+
+    for country in countries:
+        # Look up the UN M.49 regions for this country code. CoCo seems to
+        # only list the direct region, ie Western Africa, rather than all
+        # the parent regions ("Sub-Saharan Africa", "Africa", "World")
+        un_region = coco.convert(names=country, to="UNRegion")
+
+        # CoCo fills in the placeholder "not found" for names it does not
+        # recognize. Skip that, and anything else that isn't a single region
+        # name, so we never write a bogus region into the metadata.
+        if not isinstance(un_region, str) or un_region == "not found":
+            continue
+
+        if un_region not in regions and un_region not in missing_regions:
+            missing_regions.append(un_region)
+
+    # Nothing to add, so leave the row exactly as it was
+    if not missing_regions:
+        return row
+
+    # The title is only used for the message, so don't fail without one
+    title = row[title_column_name] if title_column_name != "" else ""
+
+    for missing_region in missing_regions:
+        print(
+            f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{title}"
+        )
+
+    # Add the missing regions after the existing ones. Joining the combined
+    # list puts the "||" separator between the last existing region and the
+    # first new one, and also handles the case where there were no regions.
+    row[region_column_name] = "||".join(regions + missing_regions)
+
+    return row
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-3.0-only

+import pandas as pd
+
 import csv_metadata_quality.fix as fix


@ -120,3 +122,32 @@ def test_fix_mojibake():
    field_name = "dcterms.isPartOf"

    assert fix.mojibake(field, field_name) == "CIAT Publicaçao"
+
+
+def test_fix_country_not_matching_region():
+    """Test that the region implied by an item's country gets filled in."""
+
+    # Fields shared by the input row and the expected row. Only the region
+    # differs between the two.
+    common_fields = {
+        "dc.title": "Testing an item with no matching region.",
+        "cg.coverage.country": "Kenya",
+    }
+
+    # A column of a transposed dataframe is just a series, so emulate the
+    # item with one. Its region starts out blank.
+    item = pd.Series(data={**common_fields, "cg.coverage.region": ""})
+
+    # The same item, but with the UN M.49 region for Kenya filled in
+    expected = pd.Series(
+        data={**common_fields, "cg.coverage.region": "Eastern Africa"}
+    )
+
+    fixed = fix.countries_match_regions(item)
+
+    pd.testing.assert_series_equal(fixed, expected)