mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-28 16:48:20 +01:00
Compare commits
7 Commits
fdb7900cd0
...
2e489fc921
Author | SHA1 | Date | |
---|---|---|---|
2e489fc921 | |||
117c6ca85d | |||
f49214fa2e | |||
7ce20726d0 | |||
473be5ac2f | |||
7c61cae417 | |||
ae16289637 |
18
.drone.yml
18
.drone.yml
@ -15,8 +15,12 @@ steps:
|
|||||||
- python setup.py install
|
- python setup.py install
|
||||||
# Basic test
|
# Basic test
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Test with unsafe fixes
|
# Basic test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
|
# Geography test
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
|
# Geography test with unsafe fixes
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
@ -41,8 +45,12 @@ steps:
|
|||||||
- python setup.py install
|
- python setup.py install
|
||||||
# Basic test
|
# Basic test
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Test with unsafe fixes
|
# Basic test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
|
# Geography test
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
|
# Geography test with unsafe fixes
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
@ -67,8 +75,12 @@ steps:
|
|||||||
- python setup.py install
|
- python setup.py install
|
||||||
# Basic test
|
# Basic test
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Test with unsafe fixes
|
# Basic test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
|
# Geography test
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
|
# Geography test with unsafe fixes
|
||||||
|
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
@ -217,7 +218,7 @@ def agrovoc(field, field_name, drop):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# prune old cache entries
|
# prune old cache entries
|
||||||
requests_cache.remove_expired_responses()
|
# requests_cache.remove_expired_responses()
|
||||||
|
|
||||||
# Initialize an empty list to hold the validated AGROVOC values
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
values = list()
|
values = list()
|
||||||
@ -485,6 +486,15 @@ def countries_match_regions(row):
|
|||||||
region_column_name = ""
|
region_column_name = ""
|
||||||
title_column_name = ""
|
title_column_name = ""
|
||||||
|
|
||||||
|
# Instantiate a CountryConverter() object here. According to the docs it is
|
||||||
|
# more performant to do that as opposed to calling coco.convert() directly
|
||||||
|
# because we don't need to re-load the country data with each iteration.
|
||||||
|
cc = coco.CountryConverter()
|
||||||
|
|
||||||
|
# Set logging to ERROR so country_converter's convert() doesn't print the
|
||||||
|
# "not found in regex" warning message to the screen.
|
||||||
|
logging.basicConfig(level=logging.ERROR)
|
||||||
|
|
||||||
# Iterate over the labels of the current row's values to get the names of
|
# Iterate over the labels of the current row's values to get the names of
|
||||||
# the title and citation columns. Then we check if the title is present in
|
# the title and citation columns. Then we check if the title is present in
|
||||||
# the citation.
|
# the citation.
|
||||||
@ -518,23 +528,15 @@ def countries_match_regions(row):
|
|||||||
else:
|
else:
|
||||||
regions = list()
|
regions = list()
|
||||||
|
|
||||||
# An empty list for our regions so we can keep track for all countries
|
|
||||||
missing_regions = list()
|
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
# only list the direct region, ie Western Africa, rather than all
|
# only list the direct region, ie Western Africa, rather than all
|
||||||
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
||||||
un_region = coco.convert(names=country, to="UNRegion")
|
un_region = cc.convert(names=country, to="UNRegion")
|
||||||
|
|
||||||
if un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
if un_region not in missing_regions:
|
|
||||||
missing_regions.append(un_region)
|
|
||||||
|
|
||||||
if len(missing_regions) > 0:
|
|
||||||
for missing_region in missing_regions:
|
|
||||||
print(
|
print(
|
||||||
f"{Fore.YELLOW}Missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
|
f"{Fore.YELLOW}Missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from unicodedata import normalize
|
from unicodedata import normalize
|
||||||
|
|
||||||
@ -308,6 +309,15 @@ def countries_match_regions(row):
|
|||||||
region_column_name = ""
|
region_column_name = ""
|
||||||
title_column_name = ""
|
title_column_name = ""
|
||||||
|
|
||||||
|
# Instantiate a CountryConverter() object here. According to the docs it is
|
||||||
|
# more performant to do that as opposed to calling coco.convert() directly
|
||||||
|
# because we don't need to re-load the country data with each iteration.
|
||||||
|
cc = coco.CountryConverter()
|
||||||
|
|
||||||
|
# Set logging to ERROR so country_converter's convert() doesn't print the
|
||||||
|
# "not found in regex" warning message to the screen.
|
||||||
|
logging.basicConfig(level=logging.ERROR)
|
||||||
|
|
||||||
# Iterate over the labels of the current row's values to get the names of
|
# Iterate over the labels of the current row's values to get the names of
|
||||||
# the title and citation columns. Then we check if the title is present in
|
# the title and citation columns. Then we check if the title is present in
|
||||||
# the citation.
|
# the citation.
|
||||||
@ -348,18 +358,18 @@ def countries_match_regions(row):
|
|||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
# only list the direct region, ie Western Africa, rather than all
|
# only list the direct region, ie Western Africa, rather than all
|
||||||
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
# the parent regions ("Sub-Saharan Africa", "Africa", "World")
|
||||||
un_region = coco.convert(names=country, to="UNRegion")
|
un_region = cc.convert(names=country, to="UNRegion")
|
||||||
|
|
||||||
if un_region not in regions:
|
# Add the new un_region to regions if it is not "not found" and if
|
||||||
|
# it doesn't already exist in regions.
|
||||||
|
if un_region != "not found" and un_region not in regions:
|
||||||
if un_region not in missing_regions:
|
if un_region not in missing_regions:
|
||||||
|
print(
|
||||||
|
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
|
)
|
||||||
missing_regions.append(un_region)
|
missing_regions.append(un_region)
|
||||||
|
|
||||||
if len(missing_regions) > 0:
|
if len(missing_regions) > 0:
|
||||||
for missing_region in missing_regions:
|
|
||||||
print(
|
|
||||||
f"{Fore.YELLOW}Adding missing region ({missing_region}): {Fore.RESET}{row[title_column_name]}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add the missing regions back to the row, paying attention to whether
|
# Add the missing regions back to the row, paying attention to whether
|
||||||
# or not the row's region column is None (aka null) or just an empty
|
# or not the row's region column is None (aka null) or just an empty
|
||||||
# string (length would be 0).
|
# string (length would be 0).
|
||||||
|
13
data/test-geography.csv
Normal file
13
data/test-geography.csv
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
dc.title,dcterms.issued,dcterms.type,dc.contributor.author,cg.coverage.country,cg.coverage.region
|
||||||
|
No country,2022-09-01,Report,"Orth, Alan",,
|
||||||
|
Matching country and region,2022-09-01,Report,"Orth, Alan",Kenya,Eastern Africa
|
||||||
|
Missing region,2022-09-01,Report,"Orth, Alan",Kenya,
|
||||||
|
Caribbean country with matching region,2022-09-01,Report,"Orth, Alan",Bahamas,Caribbean
|
||||||
|
Caribbean country with no region,2022-09-01,Report,"Orth, Alan",Bahamas,
|
||||||
|
Fake country with no region,2022-09-01,Report,"Orth, Alan",Yeah Baby,
|
||||||
|
SE Asian country with matching region,2022-09-01,Report,"Orth, Alan",Cambodia,South-eastern Asia
|
||||||
|
SE Asian country with no region,2022-09-01,Report,"Orth, Alan",Cambodia,
|
||||||
|
Duplicate countries with matching region,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,Eastern Africa
|
||||||
|
Duplicate countries with missing regions,2022-09-01,Report,"Orth, Alan",Kenya||Kenya,
|
||||||
|
Multiple countries with no regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,
|
||||||
|
Multiple countries with mixed matching regions,2022-09-01,Report,"Orth, Alan",Kenya||Bahamas,Eastern Africa
|
|
Loading…
Reference in New Issue
Block a user