mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
csv_metadata_quality/check.py: missing region fixes
Port over the recent fixes and logic improvements to regions from fix.py.
This commit is contained in:
parent
f49214fa2e
commit
117c6ca85d
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
@ -217,7 +218,7 @@ def agrovoc(field, field_name, drop):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# prune old cache entries
|
# prune old cache entries
|
||||||
requests_cache.remove_expired_responses()
|
# requests_cache.remove_expired_responses()
|
||||||
|
|
||||||
# Initialize an empty list to hold the validated AGROVOC values
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
values = list()
|
values = list()
|
||||||
@ -485,6 +486,15 @@ def countries_match_regions(row):
|
|||||||
region_column_name = ""
|
region_column_name = ""
|
||||||
title_column_name = ""
|
title_column_name = ""
|
||||||
|
|
||||||
|
# Instantiate a CountryConverter() object here. According to the docs it is
|
||||||
|
# more performant to do that as opposed to calling coco.convert() directly
|
||||||
|
# because we don't need to re-load the country data with each iteration.
|
||||||
|
cc = coco.CountryConverter()
|
||||||
|
|
||||||
|
# Set logging to ERROR so country_converter's convert() doesn't print the
|
||||||
|
# "not found in regex" warning message to the screen.
|
||||||
|
logging.basicConfig(level=logging.ERROR)
|
||||||
|
|
||||||
# Iterate over the labels of the current row's values to get the names of
|
# Iterate over the labels of the current row's values to get the names of
|
||||||
# the country, region, and title columns. Then we check that each country's
|
# the country, region, and title columns. Then we check that each country's
|
||||||
# UN M.49 region is present in the regions.
|
# UN M.49 region is present in the regions.
|
||||||
@ -518,23 +528,15 @@ def countries_match_regions(row):
|
|||||||
else:
|
else:
|
||||||
regions = list()
|
regions = list()
|
||||||
|
|
||||||
# An empty list for our regions so we can keep track for all countries
|
|
||||||
missing_regions = list()
|
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
# only list the direct region, ie Western Africa, rather than all
|
# only list the direct region, ie Western Africa, rather than all
|
||||||
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
||||||
un_region = coco.convert(names=country, to="UNRegion")
|
un_region = cc.convert(names=country, to="UNRegion")
|
||||||
|
|
||||||
if un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
if un_region not in missing_regions:
|
|
||||||
missing_regions.append(un_region)
|
|
||||||
|
|
||||||
if len(missing_regions) > 0:
|
|
||||||
for missing_region in missing_regions:
|
|
||||||
print(
|
print(
|
||||||
f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
|
f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user