1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-17 11:37:03 +01:00

Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
This commit is contained in:
Alan Orth 2019-07-30 00:30:31 +03:00
parent bb882315f1
commit 1f65a28307
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
7 changed files with 129 additions and 1 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
__pycache__
*.sqlite

View File

@ -13,6 +13,8 @@ pandas = "*"
python-stdnum = "*"
xlrd = "*"
iso-639 = "*"
requests = "*"
requests-cache = "*"
[requires]
python_version = "3.7"

46
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d5c625fc9bd915e3199fc4f617c157ee002795fa44d9fa6f0a298749acc134e6"
"sha256": "c5c86b4dae011bcbf6705514d97aa55e0a59dd8b7927c38e34103d77eca13cc7"
},
"pipfile-spec": 6,
"requires": {
@ -16,6 +16,27 @@
]
},
"default": {
"certifi": {
"hashes": [
"sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
"sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695"
],
"version": "==2019.6.16"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"idna": {
"hashes": [
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
],
"version": "==2.8"
},
"iso-639": {
"hashes": [
"sha256:dc9cd4b880b898d774c47fe9775167404af8a85dd889d58f9008035109acce49"
@ -86,6 +107,22 @@
],
"version": "==2019.1"
},
"requests": {
"hashes": [
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
],
"index": "pypi",
"version": "==2.22.0"
},
"requests-cache": {
"hashes": [
"sha256:6822f788c5ee248995c4bfbd725de2002ad710182ba26a666e85b64981866060",
"sha256:73a7211870f7d67af5fd81cad2f67cfe1cd3eb4ee6a85155e07613968cc72dfc"
],
"index": "pypi",
"version": "==0.5.0"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
@ -93,6 +130,13 @@
],
"version": "==1.12.0"
},
"urllib3": {
"hashes": [
"sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
"sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
],
"version": "==1.25.3"
},
"xlrd": {
"hashes": [
"sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2",

View File

@ -9,6 +9,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
- Read Excel files
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
- Validate languages against ISO 639-2 and ISO 639-3
- Validate subjects against the AGROVOC REST API
- Fix leading, trailing, and excessive whitespace
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc

View File

@ -43,6 +43,11 @@ def main(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates)
# Check: invalid AGROVOC subject
match = re.match(r'.*?dc\.subject.*$', column)
if match is not None:
df[column] = df[column].apply(check.agrovoc)
# Check: invalid language
match = re.match(r'^.*?language.*$', column)
if match is not None:

View File

@ -189,3 +189,57 @@ def language(field):
print(f'Invalid language: {value}')
return field
def agrovoc(field):
    """Check subject terms against the AGROVOC REST API.

    Logic copied from agrovoc-lookup.py.

    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

    The field is split on the "||" multi-value separator and each value that
    looks like a subject term is searched for in AGROVOC. Responses go through
    the 'agrovoc-response-cache' request cache, which expires entries after
    thirty days. Values that do not look like subject terms, and lookups that
    do not answer with HTTP 200, are skipped without printing anything.

    Prints a warning if the value is invalid, i.e. the search does not return
    exactly one result.

    Returns the field unchanged, or None when the field is missing.
    """

    from datetime import timedelta
    import re

    import requests
    import requests_cache

    # Skip fields with missing values
    if pd.isna(field):
        return

    search_url = 'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search'

    # Seconds to wait for AGROVOC before giving up; without a timeout a
    # stalled server would block the whole run indefinitely.
    request_timeout = 30

    # match lines beginning with words, paying attention to subjects with
    # special characters like spaces, quotes, dashes, parentheses, etc:
    # SUBJECT
    # ANOTHER SUBJECT
    # XANTHOMONAS CAMPESTRIS PV. MANIHOTIS
    # WOMEN'S PARTICIPATION
    # COMMUNITY-BASED FOREST MANAGEMENT
    # INTERACCIÓN GENOTIPO AMBIENTE
    # COCOA (PLANT)
    pattern = re.compile(r'^[\w\-\.\'\(\)]+?[\w\s\-\.\'\(\)]+$')

    # Try to split multi-value field on "||" separator, keeping only the
    # values worth looking up
    subjects = [value for value in field.split('||') if pattern.match(value)]

    # Nothing to look up, so leave the request cache untouched
    if not subjects:
        return field

    # enable transparent request cache with thirty days expiry; this patches
    # requests globally, so do it once per field rather than once per value
    requests_cache.install_cache(
        'agrovoc-response-cache', expire_after=timedelta(days=30)
    )

    # prune old cache entries
    requests_cache.core.remove_expired_responses()

    for value in subjects:
        # let requests build and encode the query string
        response = requests.get(
            search_url, params={'query': value}, timeout=request_timeout
        )

        if response.status_code == requests.codes.ok:
            data = response.json()

            # check if there is 1 result, ie an exact subject term match
            if len(data['results']) != 1:
                print(f'Invalid AGROVOC subject: {value}')

    return field

View File

@ -169,3 +169,24 @@ def test_check_invalid_language(capsys):
captured = capsys.readouterr()
assert captured.out == f'Invalid language: {value}\n'
def test_check_invalid_agrovoc(capsys):
    """Verify that a subject unknown to AGROVOC triggers the warning message."""

    subject = 'FOREST'
    expected_output = f'Invalid AGROVOC subject: {subject}\n'

    check.agrovoc(subject)

    assert capsys.readouterr().out == expected_output
def test_check_valid_agrovoc():
    """Verify that a subject known to AGROVOC is handed back unchanged."""

    subject = 'FORESTS'

    assert check.agrovoc(subject) == subject