1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-28 08:38:18 +01:00

Compare commits

..

No commits in common. "2e489fc9216aef8c71046f89e5a2f077b6eb5b1b" and "fdb7900cd0a4338b374d90fc1b466832d234902a" have entirely different histories.

4 changed files with 30 additions and 67 deletions

View File

@ -15,12 +15,8 @@ steps:
- python setup.py install
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
# Test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
@ -45,12 +41,8 @@ steps:
- python setup.py install
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
# Test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
@ -75,12 +67,8 @@ steps:
- python setup.py install
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
# Test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import logging
import os
import re
from datetime import datetime, timedelta
@ -218,7 +217,7 @@ def agrovoc(field, field_name, drop):
)
# prune old cache entries
# requests_cache.remove_expired_responses()
requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
@ -486,15 +485,6 @@ def countries_match_regions(row):
region_column_name = ""
title_column_name = ""
# Instantiate a CountryConverter() object here. According to the docs it is
# more performant to do that as opposed to calling coco.convert() directly
# because we don't need to re-load the country data with each iteration.
cc = coco.CountryConverter()
# Set logging to ERROR so country_converter's convert() doesn't print the
# "not found in regex" warning message to the screen.
logging.basicConfig(level=logging.ERROR)
# Iterate over the labels of the current row's values to get the names of
# the columns we need (country, region, and title). Then we check whether
# the UN M.49 region of each country is present in the row's regions.
@ -528,15 +518,23 @@ def countries_match_regions(row):
else:
regions = list()
# An empty list for our regions so we can keep track for all countries
missing_regions = list()
for country in countries:
# Look up the UN M.49 region for this country. CoCo seems to only list
# the direct region, e.g. "Western Africa", rather than all of the
# parent regions ("Sub-Saharan Africa", "Africa", "World").
un_region = cc.convert(names=country, to="UNRegion")
un_region = coco.convert(names=country, to="UNRegion")
if un_region != "not found" and un_region not in regions:
if un_region not in regions:
if un_region not in missing_regions:
missing_regions.append(un_region)
if len(missing_regions) > 0:
for missing_region in missing_regions:
print(
f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
)
return

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import logging
import re
from unicodedata import normalize
@ -309,15 +308,6 @@ def countries_match_regions(row):
region_column_name = ""
title_column_name = ""
# Instantiate a CountryConverter() object here. According to the docs it is
# more performant to do that as opposed to calling coco.convert() directly
# because we don't need to re-load the country data with each iteration.
cc = coco.CountryConverter()
# Set logging to ERROR so country_converter's convert() doesn't print the
# "not found in regex" warning message to the screen.
logging.basicConfig(level=logging.ERROR)
# Iterate over the labels of the current row's values to get the names of
# the columns we need (country, region, and title). Then we add any UN M.49
# regions that are missing for the row's countries to the region column.
@ -358,26 +348,26 @@ def countries_match_regions(row):
# Look up the UN M.49 region for this country. CoCo seems to only list
# the direct region, e.g. "Western Africa", rather than all of the
# parent regions ("Sub-Saharan Africa", "Africa", "World").
un_region = cc.convert(names=country, to="UNRegion")
un_region = coco.convert(names=country, to="UNRegion")
# Add the new un_region to regions if it is not "not found" and if
# it doesn't already exist in regions.
if un_region != "not found" and un_region not in regions:
if un_region not in regions:
if un_region not in missing_regions:
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
missing_regions.append(un_region)
if len(missing_regions) > 0:
# Add the missing regions back to the row, paying attention to whether
# or not the row's region column is None (aka null) or just an empty
# string (length would be 0).
if row[region_column_name] is not None and len(row[region_column_name]) > 0:
row[region_column_name] = (
row[region_column_name] + "||" + "||".join(missing_regions)
for missing_region in missing_regions:
print(
f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
)
else:
row[region_column_name] = "||".join(missing_regions)
# Add the missing regions back to the row, paying attention to whether
# or not the row's region column is None (aka null) or just an empty
# string (length would be 0).
if row[region_column_name] is not None and len(row[region_column_name]) > 0:
row[region_column_name] = (
row[region_column_name] + "||" + "||".join(missing_regions)
)
else:
row[region_column_name] = "||".join(missing_regions)
return row

View File

@ -1,13 +0,0 @@
dc.title,dcterms.issued,dcterms.type,dc.contributor.author,cg.coverage.country,cg.coverage.region
No country,2022-09-01,Report,"Orth, Alan",,
Matching country and region,2022-09-01,Report,"Orth, Alan",Kenya,Eastern Africa
Missing region,2022-09-01,Report,"Orth, Alan",Kenya,
Caribbean country with matching region,2022-09-01,Report,"Orth, Alan",Bahamas,Caribbean
Caribbean country with no region,2022-09-01,Report,"Orth, Alan",Bahamas,
Fake country with no region,2022-09-01,Report,"Orth, Alan",Yeah Baby,
SE Asian country with matching region,2022-09-01,Report,"Orth, Alan",Cambodia,South-eastern Asia
SE Asian country with no region,2022-09-01,Report,"Orth, Alan",Cambodia,
Duplicate countries with matching region,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,Eastern Africa
Duplicate countries with missing regions,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,
Multiple countries with no regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,
Multiple countries with mixed matching regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,Eastern Africa
1 dc.title dcterms.issued dcterms.type dc.contributor.author cg.coverage.country cg.coverage.region
2 No country 2022-09-01 Report Orth, Alan
3 Matching country and region 2022-09-01 Report Orth, Alan Kenya Eastern Africa
4 Missing region 2022-09-01 Report Orth, Alan Kenya
5 Caribbean country with matching region 2022-09-01 Report Orth, Alan Bahamas Caribbean
6 Caribbean country with no region 2022-09-01 Report Orth, Alan Bahamas
7 Fake country with no region 2022-09-01 Report Orth, Alan Yeah Baby
8 SE Asian country with matching region 2022-09-01 Report Orth, Alan Cambodia South-eastern Asia
9 SE Asian country with no region 2022-09-01 Report Orth, Alan Cambodia
10 Duplicate countries with matching region 2022-09-01 Report Orth, Alan Kenya||Kenya Eastern Africa
11 Duplicate countries with missing regions 2022-09-01 Report Orth, Alan Kenya||Kenya
12 Multiple countries with no regions 2022-09-01 Report Orth, Alan Kenya||Bahamas
13 Multiple countries with mixed matching regions 2022-09-01 Report Orth, Alan Kenya||Bahamas Eastern Africa