From ccc2a734569496edd425c7d13fc46ae3048763bd Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 8 Dec 2021 15:02:20 +0200 Subject: [PATCH] Add check for countries without matching regions If we have country "Kenya" we should have region "Eastern Africa" according to the UN M.49 geolocation scheme. --- csv_metadata_quality/app.py | 3 ++ csv_metadata_quality/check.py | 72 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 76 insertions(+) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 4b1d934..362958e 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -197,6 +197,9 @@ def run(argv): # Check: title in citation check.title_in_citation(df_transposed[column]) + # Check: countries match regions + check.countries_match_regions(df_transposed[column]) + if args.experimental_checks: experimental.correct_language(df_transposed[column]) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 69f9126..6a2f89f 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -4,6 +4,7 @@ import os import re from datetime import datetime, timedelta +import country_converter as coco import pandas as pd import requests import requests_cache @@ -447,3 +448,74 @@ def title_in_citation(row): print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}") return + + +def countries_match_regions(row): + """Check for the scenario where an item has country coverage metadata, but + does not have the corresponding region metadata. For example, an item that + has country coverage "Kenya" should also have region "Eastern Africa" + according to the UN M.49 classification scheme. + + See: https://unstats.un.org/unsd/methodology/m49/ + + Function prints a warning if the appropriate region is not present. + """ + # Initialize some variables at function scope so that we can set them in the + # loop scope below and still be able to access them afterwards. 
+ country_column_name = "" + region_column_name = "" + title_column_name = "" + + # Iterate over the labels of the current row's values to get the names of + # the country, region, and title columns. Then we check that each country's + # UN M.49 region is present. + for label in row.axes[0]: + # Find the name of the country column + match = re.match(r"^.*?country.*$", label) + if match is not None: + country_column_name = label + + # Find the name of the region column + match = re.match(r"^.*?region.*$", label) + if match is not None: + region_column_name = label + + # Find the name of the title column + match = re.match(r"^(dc|dcterms)\.title.*$", label) + if match is not None: + title_column_name = label + + # Make sure we found the country and region columns + if country_column_name != "" and region_column_name != "": + # If we don't have any countries then we should return early before + # suggesting regions. + if row[country_column_name] is not None: + countries = row[country_column_name].split("||") + else: + return + + if row[region_column_name] is not None: + regions = row[region_column_name].split("||") + else: + regions = list() + + # An empty list for our regions so we can keep track for all countries + missing_regions = list() + + for country in countries: + # Look up the UN M.49 regions for this country code. 
CoCo seems to + # only list the direct region, ie Western Africa, rather than all + # the parent regions ("Sub-Saharan Africa", "Africa", "World") + un_region = coco.convert(names=country, to="UNRegion") + + if un_region not in regions: + if un_region not in missing_regions: + missing_regions.append(un_region) + + if len(missing_regions) > 0: + for missing_region in missing_regions: + print( + f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}" + ) + + return diff --git a/pyproject.toml b/pyproject.toml index bd54fed..a76596c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ colorama = "^0.4.4" spdx-license-list = "^0.5.2" ftfy = "^5.9" SQLAlchemy = ">=1.3.3,<1.4.23" +country-converter = "^0.7.4" [tool.poetry.dev-dependencies] pytest = "^6.1.1"