mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
Add check for countries without matching regions
If we have country "Kenya" we should have region "Eastern Africa" according to the UN M.49 geolocation scheme.
This commit is contained in:
parent
ad33195ba3
commit
ccc2a73456
@ -197,6 +197,9 @@ def run(argv):
|
|||||||
# Check: title in citation
|
# Check: title in citation
|
||||||
check.title_in_citation(df_transposed[column])
|
check.title_in_citation(df_transposed[column])
|
||||||
|
|
||||||
|
# Check: countries match regions
|
||||||
|
check.countries_match_regions(df_transposed[column])
|
||||||
|
|
||||||
if args.experimental_checks:
|
if args.experimental_checks:
|
||||||
experimental.correct_language(df_transposed[column])
|
experimental.correct_language(df_transposed[column])
|
||||||
|
|
||||||
|
@ -4,6 +4,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
import country_converter as coco
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import requests_cache
|
import requests_cache
|
||||||
@ -447,3 +448,74 @@ def title_in_citation(row):
|
|||||||
print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")
|
print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def countries_match_regions(row):
    """Warn when an item has countries but lacks their UN M.49 regions.

    Check for the scenario where an item has country coverage metadata, but
    does not have the corresponding region metadata. For example, an item that
    has country coverage "Kenya" should also have region "Eastern Africa"
    according to the UN M.49 classification scheme.

    See: https://unstats.un.org/unsd/methodology/m49/

    Args:
        row: a pandas Series for one item, indexed by metadata column name.
            Multiple values within a cell are separated by "||".

    Returns:
        None. The function prints one warning per distinct region that is
        implied by the item's countries but is not present in its regions.
    """
    # Column names are discovered from the row's labels below; None means
    # "not found".
    country_column_name = None
    region_column_name = None
    title_column_name = None

    # Iterate over the labels of the current row's values to get the names of
    # the country, region, and title columns. If several labels match the same
    # pattern, the last one wins.
    for label in row.axes[0]:
        if re.match(r"^.*?country.*$", label) is not None:
            country_column_name = label

        if re.match(r"^.*?region.*$", label) is not None:
            region_column_name = label

        if re.match(r"^(dc|dcterms)\.title.*$", label) is not None:
            title_column_name = label

    # Without both a country and a region column there is nothing to compare.
    if country_column_name is None or region_column_name is None:
        return

    # Empty cells can be None or NaN (a float) depending on how the data was
    # read, so use pd.isna() rather than "is not None"; calling .split() on a
    # NaN would raise AttributeError.
    country_value = row[country_column_name]
    if pd.isna(country_value):
        return

    # Drop blank entries (e.g. from an empty cell or a trailing "||") so we
    # don't try to look up an empty country name.
    countries = [country for country in country_value.split("||") if country.strip()]
    if not countries:
        return

    region_value = row[region_column_name]
    regions = [] if pd.isna(region_value) else region_value.split("||")

    # The title is only used to identify the item in the warning, so a missing
    # title column must not crash the check.
    if title_column_name is not None:
        title = row[title_column_name]
    else:
        title = "(no title column)"

    # Keep track of missing regions across all countries, without duplicates
    # and in first-seen order.
    missing_regions = []

    for country in countries:
        # Look up the UN M.49 region for this country. CoCo seems to only
        # list the direct region, ie Western Africa, rather than all the
        # parent regions ("Sub-Saharan Africa", "Africa", "World").
        un_region = coco.convert(names=country, to="UNRegion")

        # CoCo returns the string "not found" for names it cannot resolve;
        # that is not a region, so don't report it as a missing one.
        if un_region == "not found":
            continue

        if un_region not in regions and un_region not in missing_regions:
            missing_regions.append(un_region)

    for missing_region in missing_regions:
        print(f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{title}")

    return
|
||||||
|
@ -23,6 +23,7 @@ colorama = "^0.4.4"
|
|||||||
spdx-license-list = "^0.5.2"
|
spdx-license-list = "^0.5.2"
|
||||||
ftfy = "^5.9"
|
ftfy = "^5.9"
|
||||||
SQLAlchemy = ">=1.3.3,<1.4.23"
|
SQLAlchemy = ">=1.3.3,<1.4.23"
|
||||||
|
country-converter = "^0.7.4"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
pytest = "^6.1.1"
|
pytest = "^6.1.1"
|
||||||
|
Loading…
Reference in New Issue
Block a user