Add check for countries without matching regions

If we have country "Kenya" we should have region "Eastern Africa" according to the UN M.49 geolocation scheme.
2025-09-16 08:26:41 +02:00 · 2021-12-08 15:02:20 +02:00
parent ad33195ba3
commit ccc2a73456
3 changed files with 76 additions and 0 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -197,6 +197,9 @@ def run(argv):
        # Check: title in citation
        check.title_in_citation(df_transposed[column])

+        # Check: countries match regions
+        check.countries_match_regions(df_transposed[column])
+
        if args.experimental_checks:
            experimental.correct_language(df_transposed[column])

--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -4,6 +4,7 @@ import os
 import re
 from datetime import datetime, timedelta

+import country_converter as coco
 import pandas as pd
 import requests
 import requests_cache
@@ -447,3 +448,74 @@ def title_in_citation(row):
            print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")

    return
+
+
+def countries_match_regions(row):
+    """Warn when an item has country coverage metadata but lacks the
+    corresponding region metadata. For example, an item with country
+    coverage "Kenya" should also have region "Eastern Africa" according to
+    the UN M.49 classification scheme.
+
+    See: https://unstats.un.org/unsd/methodology/m49/
+
+    Prints one warning per missing region for `row` and returns nothing.
+    """
+    # Initialize these at function scope so that we can set them in the loop
+    # below and still be able to access them afterwards.
+    country_column_name = ""
+    region_column_name = ""
+    title_column_name = ""
+
+    # Iterate over the labels of the current row's values to get the names of
+    # the country, region, and title columns. If several labels match the same
+    # pattern, the last one wins.
+    for label in row.axes[0]:
+        # Find the name of the country column
+        match = re.match(r"^.*?country.*$", label)
+        if match is not None:
+            country_column_name = label
+
+        # Find the name of the region column
+        match = re.match(r"^.*?region.*$", label)
+        if match is not None:
+            region_column_name = label
+
+        # Find the name of the title column
+        match = re.match(r"^(dc|dcterms)\.title.*$", label)
+        if match is not None:
+            title_column_name = label
+
+    # Make sure we found the country and region columns
+    if country_column_name != "" and region_column_name != "":
+        # If we don't have any countries then we should return early before
+        # suggesting regions. NOTE(review): only None is treated as empty.
+        if row[country_column_name] is not None:
+            countries = row[country_column_name].split("||")
+        else:
+            return
+
+        if row[region_column_name] is not None:
+            regions = row[region_column_name].split("||")
+        else:
+            regions = list()
+
+        # Missing regions collected across all countries, without duplicates
+        missing_regions = list()
+
+        for country in countries:
+            # Look up the UN M.49 region for this country. CoCo seems to only
+            # list the direct region, ie Western Africa, rather than all the
+            # parent regions ("Sub-Saharan Africa", "Africa", "World")
+            un_region = coco.convert(names=country, to="UNRegion")
+
+            if un_region not in regions:
+                if un_region not in missing_regions:
+                    missing_regions.append(un_region)
+
+        if len(missing_regions) > 0:
+            for missing_region in missing_regions:
+                print(
+                    f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
+                )
+
+    return
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ colorama = "^0.4.4"
 spdx-license-list = "^0.5.2"
 ftfy = "^5.9"
 SQLAlchemy = ">=1.3.3,<1.4.23"
+country-converter = "^0.7.4"

 [tool.poetry.dev-dependencies]
 pytest = "^6.1.1"