Mirror of https://github.com/ilri/csv-metadata-quality.git (synced 2024-12-30 16:04:29 +01:00)

Commit 689ee184f7 (parent 344993370c): Add unsafe check to add missing regions
@@ -205,14 +205,23 @@ def run(argv):
         # Check: title in citation
         check.title_in_citation(df_transposed[column])
 
-        # Check: countries match regions
-        check.countries_match_regions(df_transposed[column])
+        if args.unsafe_fixes:
+            # Fix: countries match regions
+            df_transposed[column] = fix.countries_match_regions(df_transposed[column])
+        else:
+            # Check: countries match regions
+            check.countries_match_regions(df_transposed[column])
 
         if args.experimental_checks:
             experimental.correct_language(df_transposed[column])
 
+    # Transpose the DataFrame back before writing. This is probably wasteful to
+    # do every time since we technically only need to do it if we've done the
+    # countries/regions fix above, but I can't think of another way for now.
+    df_transposed_back = df_transposed.T
+
     # Write
-    df.to_csv(args.output_file, index=False)
+    df_transposed_back.to_csv(args.output_file, index=False)
 
     # Close the input and output files before exiting
    args.input_file.close()
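For context, here is a minimal standalone sketch (not part of the commit) of the transpose / fix / transpose-back pattern the hunk above relies on. The column names mirror the ones used in the test further down; the DataFrame contents and the local fix_row() stand-in are hypothetical.

import pandas as pd

# Hypothetical two-item frame; each row is one item, as in the input CSV.
df = pd.DataFrame(
    {
        "dc.title": ["Item one", "Item two"],
        "cg.coverage.country": ["Kenya", "Brazil"],
        "cg.coverage.region": ["", ""],
    }
)

# Transpose so that each column of df_transposed is one item (a pandas Series),
# which is the shape the per-item fix functions operate on.
df_transposed = df.T


def fix_row(row):
    # Stand-in for fix.countries_match_regions(); the real function returns the fixed row.
    return row


for column in df_transposed.columns:
    df_transposed[column] = fix_row(df_transposed[column])

# Transpose back before writing so that rows are items again.
df_transposed_back = df_transposed.T
df_transposed_back.to_csv("/tmp/items-fixed.csv", index=False)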
@@ -3,6 +3,7 @@
 import re
 from unicodedata import normalize
 
+import country_converter as coco
 import pandas as pd
 from colorama import Fore
 from ftfy import TextFixerConfig, fix_text
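As a side note (not part of the diff), the new coco import is what performs the country-to-region lookup. A minimal illustration of the call used below, with the expected UN M.49 regions shown as comments; the exact return values are assumptions based on how country_converter normally behaves.

import country_converter as coco

# Map a single country name to its direct UN M.49 region.
print(coco.convert(names="Kenya", to="UNRegion"))  # Eastern Africa

# Lists work too; by default unmatched names come back as "not found".
print(coco.convert(names=["Kenya", "Nigeria"], to="UNRegion"))  # ['Eastern Africa', 'Western Africa']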
@@ -289,3 +290,83 @@ def mojibake(field, field_name):
         return fix_text(field, config)
     else:
         return field
+
+
+def countries_match_regions(row):
+    """Check for the scenario where an item has country coverage metadata, but
+    does not have the corresponding region metadata. For example, an item that
+    has country coverage "Kenya" should also have region "Eastern Africa"
+    according to the UN M.49 classification scheme.
+
+    See: https://unstats.un.org/unsd/methodology/m49/
+
+    Return the fixed row.
+    """
+    # Initialize some variables at global scope so that we can set them in the
+    # loop scope below and still be able to access them afterwards.
+    country_column_name = ""
+    region_column_name = ""
+    title_column_name = ""
+
+    # Iterate over the labels of the current row's values to get the names of
+    # the title, country, and region columns. We need the title so that we
+    # can print a helpful message when we add a missing region below.
+    for label in row.axes[0]:
+        # Find the name of the country column
+        match = re.match(r"^.*?country.*$", label)
+        if match is not None:
+            country_column_name = label
+
+        # Find the name of the region column
+        match = re.match(r"^.*?region.*$", label)
+        if match is not None:
+            region_column_name = label
+
+        # Find the name of the title column
+        match = re.match(r"^(dc|dcterms)\.title.*$", label)
+        if match is not None:
+            title_column_name = label
+
+    # Make sure we found the country and region columns
+    if country_column_name != "" and region_column_name != "":
+        # If we don't have any countries then we should return early before
+        # suggesting regions.
+        if row[country_column_name] is not None:
+            countries = row[country_column_name].split("||")
+        else:
+            return row
+
+        if row[region_column_name] is not None:
+            regions = row[region_column_name].split("||")
+        else:
+            regions = list()
+
+        # An empty list to keep track of the regions missing across all countries
+        missing_regions = list()
+
+        for country in countries:
+            # Look up the UN M.49 region for this country. CoCo seems to only
+            # list the direct region, ie Western Africa, rather than all the
+            # parent regions ("Sub-Saharan Africa", "Africa", "World").
+            un_region = coco.convert(names=country, to="UNRegion")
+
+            if un_region not in regions:
+                if un_region not in missing_regions:
+                    missing_regions.append(un_region)
+
+        if len(missing_regions) > 0:
+            for missing_region in missing_regions:
+                print(
+                    f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
+                )
+
+            # Add the missing regions back to the row, paying attention to
+            # whether the row's existing regions are blank or not.
+            if row[region_column_name] is not None and row[region_column_name] != "":
+                row[region_column_name] = (
+                    row[region_column_name] + "||" + "||".join(missing_regions)
+                )
+            else:
+                row[region_column_name] = "||".join(missing_regions)
+
+    return row
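A quick usage sketch (not part of the commit) showing how the new fix behaves on a multi-value field. The item below is hypothetical, and the expected result assumes missing regions are appended with the "||" separator between values, as described in the comments above.

import pandas as pd

import csv_metadata_quality.fix as fix

# Hypothetical item covering two countries but only listing one region.
series = pd.Series(
    data={
        "dc.title": "Item covering Kenya and Nigeria",
        "cg.coverage.country": "Kenya||Nigeria",
        "cg.coverage.region": "Eastern Africa",
    }
)

fixed = fix.countries_match_regions(series)

# Western Africa is the only missing region, so it should be appended:
# Eastern Africa||Western Africa
print(fixed["cg.coverage.region"])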
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-3.0-only
 
+import pandas as pd
+
 import csv_metadata_quality.fix as fix
 
 
@@ -120,3 +122,32 @@ def test_fix_mojibake():
     field_name = "dcterms.isPartOf"
 
     assert fix.mojibake(field, field_name) == "CIAT Publicaçao"
+
+
+def test_fix_country_not_matching_region():
+    """Test an item with regions not matching its country list."""
+
+    title = "Testing an item with no matching region."
+    country = "Kenya"
+    region = ""
+    missing_region = "Eastern Africa"
+
+    # Emulate a column in a transposed dataframe (which is just a series)
+    d = {
+        "dc.title": title,
+        "cg.coverage.country": country,
+        "cg.coverage.region": region,
+    }
+    series = pd.Series(data=d)
+
+    result = fix.countries_match_regions(series)
+
+    # Emulate the correct series we are expecting
+    d_correct = {
+        "dc.title": title,
+        "cg.coverage.country": country,
+        "cg.coverage.region": missing_region,
+    }
+    series_correct = pd.Series(data=d_correct)
+
+    pd.testing.assert_series_equal(result, series_correct)
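To run only the new test locally, something like the following should work; the tests/test_fix.py path is an assumption about this repository's layout rather than something shown in the diff.

import pytest

# Select the new test by keyword; the file path is assumed from the repo layout.
pytest.main(["tests/test_fix.py", "-k", "country_not_matching_region"])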