1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 05:45:02 +01:00

csv_metadata_quality/check.py: missing region fixes

Port over the recent fixes and logic improvements to regions from
fix.py.
This commit is contained in:
Alan Orth 2022-09-01 16:38:35 +03:00
parent f49214fa2e
commit 117c6ca85d
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-3.0-only # SPDX-License-Identifier: GPL-3.0-only
import logging
import os import os
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -217,7 +218,7 @@ def agrovoc(field, field_name, drop):
) )
# prune old cache entries # prune old cache entries
requests_cache.remove_expired_responses() # requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values # Initialize an empty list to hold the validated AGROVOC values
values = list() values = list()
@ -485,6 +486,15 @@ def countries_match_regions(row):
region_column_name = "" region_column_name = ""
title_column_name = "" title_column_name = ""
# Instantiate a CountryConverter() object here. According to the docs it is
# more performant to do that as opposed to calling coco.convert() directly
# because we don't need to re-load the country data with each iteration.
cc = coco.CountryConverter()
# Set logging to ERROR so country_converter's convert() doesn't print the
# "not found in regex" warning message to the screen.
logging.basicConfig(level=logging.ERROR)
# Iterate over the labels of the current row's values to get the names of # Iterate over the labels of the current row's values to get the names of
# the country, region, and title columns. Then we check that each country's # the country, region, and title columns. Then we check that each country's
# UN M.49 region is present in the regions. # UN M.49 region is present in the regions.
@ -518,23 +528,15 @@ def countries_match_regions(row):
else: else:
regions = list() regions = list()
# An empty list for our regions so we can keep track for all countries
missing_regions = list()
for country in countries: for country in countries:
# Look up the UN M.49 regions for this country code. CoCo seems to # Look up the UN M.49 regions for this country code. CoCo seems to
# only list the direct region, ie Western Africa, rather than all # only list the direct region, ie Western Africa, rather than all
# the parent regions ("Sub-Saharan Africa", "Africa", "World") # the parent regions ("Sub-Saharan Africa", "Africa", "World")
un_region = coco.convert(names=country, to="UNRegion") un_region = cc.convert(names=country, to="UNRegion")
if un_region not in regions: if un_region != "not found" and un_region not in regions:
if un_region not in missing_regions:
missing_regions.append(un_region)
if len(missing_regions) > 0:
for missing_region in missing_regions:
print( print(
f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}" f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
) )
return return