mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-28 08:38:18 +01:00
Compare commits
No commits in common. "2e489fc9216aef8c71046f89e5a2f077b6eb5b1b" and "fdb7900cd0a4338b374d90fc1b466832d234902a" have entirely different histories.
2e489fc921
...
fdb7900cd0
18
.drone.yml
18
.drone.yml
@ -15,12 +15,8 @@ steps:
|
||||
- python setup.py install
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
# Test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
@ -45,12 +41,8 @@ steps:
|
||||
- python setup.py install
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
# Test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
@ -75,12 +67,8 @@ steps:
|
||||
- python setup.py install
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
# Test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
@ -218,7 +217,7 @@ def agrovoc(field, field_name, drop):
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
# requests_cache.remove_expired_responses()
|
||||
requests_cache.remove_expired_responses()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
@ -486,15 +485,6 @@ def countries_match_regions(row):
|
||||
region_column_name = ""
|
||||
title_column_name = ""
|
||||
|
||||
# Instantiate a CountryConverter() object here. According to the docs it is
|
||||
# more performant to do that as opposed to calling coco.convert() directly
|
||||
# because we don't need to re-load the country data with each iteration.
|
||||
cc = coco.CountryConverter()
|
||||
|
||||
# Set logging to ERROR so country_converter's convert() doesn't print the
|
||||
# "not found in regex" warning message to the screen.
|
||||
logging.basicConfig(level=logging.ERROR)
|
||||
|
||||
# Iterate over the labels of the current row's values to get the names of
|
||||
# the title and citation columns. Then we check if the title is present in
|
||||
# the citation.
|
||||
@ -528,15 +518,23 @@ def countries_match_regions(row):
|
||||
else:
|
||||
regions = list()
|
||||
|
||||
# An empty list for our regions so we can keep track for all countries
|
||||
missing_regions = list()
|
||||
|
||||
for country in countries:
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
# only list the direct region, ie Western Africa, rather than all
|
||||
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
||||
un_region = cc.convert(names=country, to="UNRegion")
|
||||
un_region = coco.convert(names=country, to="UNRegion")
|
||||
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
if un_region not in regions:
|
||||
if un_region not in missing_regions:
|
||||
missing_regions.append(un_region)
|
||||
|
||||
if len(missing_regions) > 0:
|
||||
for missing_region in missing_regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
|
||||
return
|
||||
|
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import re
|
||||
from unicodedata import normalize
|
||||
|
||||
@ -309,15 +308,6 @@ def countries_match_regions(row):
|
||||
region_column_name = ""
|
||||
title_column_name = ""
|
||||
|
||||
# Instantiate a CountryConverter() object here. According to the docs it is
|
||||
# more performant to do that as opposed to calling coco.convert() directly
|
||||
# because we don't need to re-load the country data with each iteration.
|
||||
cc = coco.CountryConverter()
|
||||
|
||||
# Set logging to ERROR so country_converter's convert() doesn't print the
|
||||
# "not found in regex" warning message to the screen.
|
||||
logging.basicConfig(level=logging.ERROR)
|
||||
|
||||
# Iterate over the labels of the current row's values to get the names of
|
||||
# the title and citation columns. Then we check if the title is present in
|
||||
# the citation.
|
||||
@ -358,18 +348,18 @@ def countries_match_regions(row):
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
# only list the direct region, ie Western Africa, rather than all
|
||||
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
||||
un_region = cc.convert(names=country, to="UNRegion")
|
||||
un_region = coco.convert(names=country, to="UNRegion")
|
||||
|
||||
# Add the new un_region to regions if it is not "not found" and if
|
||||
# it doesn't already exist in regions.
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
if un_region not in regions:
|
||||
if un_region not in missing_regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
missing_regions.append(un_region)
|
||||
|
||||
if len(missing_regions) > 0:
|
||||
for missing_region in missing_regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
|
||||
# Add the missing regions back to the row, paying attention to whether
|
||||
# or not the row's region column is None (aka null) or just an empty
|
||||
# string (length would be 0).
|
||||
|
@ -1,13 +0,0 @@
|
||||
dc.title,dcterms.issued,dcterms.type,dc.contributor.author,cg.coverage.country,cg.coverage.region
|
||||
No country,2022-09-01,Report,"Orth, Alan",,
|
||||
Matching country and region,2022-09-01,Report,"Orth, Alan",Kenya,Eastern Africa
|
||||
Missing region,2022-09-01,Report,"Orth, Alan",Kenya,
|
||||
Caribbean country with matching region,2022-09-01,Report,"Orth, Alan",Bahamas,Caribbean
|
||||
Caribbean country with no region,2022-09-01,Report,"Orth, Alan",Bahamas,
|
||||
Fake country with no region,2022-09-01,Report,"Orth, Alan",Yeah Baby,
|
||||
SE Asian country with matching region,2022-09-01,Report,"Orth, Alan",Cambodia,South-eastern Asia
|
||||
SE Asian country with no region,2022-09-01,Report,"Orth, Alan",Cambodia,
|
||||
Duplicate countries with matching region,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,Eastern Africa
|
||||
Duplicate countries with missing regions,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,
|
||||
Multiple countries with no regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,
|
||||
Multiple countries with mixed matching regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,Eastern Africa
|
|
Loading…
Reference in New Issue
Block a user