mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-17 11:37:03 +01:00
Add support for validating subjects against AGROVOC
Checks values in the dc.subject or dcterms.subject field against the AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py. See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/ See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
This commit is contained in:
parent
bb882315f1
commit
1f65a28307
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
__pycache__
|
||||
*.sqlite
|
||||
|
2
Pipfile
2
Pipfile
@ -13,6 +13,8 @@ pandas = "*"
|
||||
python-stdnum = "*"
|
||||
xlrd = "*"
|
||||
iso-639 = "*"
|
||||
requests = "*"
|
||||
requests-cache = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.7"
|
||||
|
46
Pipfile.lock
generated
46
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "d5c625fc9bd915e3199fc4f617c157ee002795fa44d9fa6f0a298749acc134e6"
|
||||
"sha256": "c5c86b4dae011bcbf6705514d97aa55e0a59dd8b7927c38e34103d77eca13cc7"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@ -16,6 +16,27 @@
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
|
||||
"sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695"
|
||||
],
|
||||
"version": "==2019.6.16"
|
||||
},
|
||||
"chardet": {
|
||||
"hashes": [
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
|
||||
"sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"
|
||||
],
|
||||
"version": "==2.8"
|
||||
},
|
||||
"iso-639": {
|
||||
"hashes": [
|
||||
"sha256:dc9cd4b880b898d774c47fe9775167404af8a85dd889d58f9008035109acce49"
|
||||
@ -86,6 +107,22 @@
|
||||
],
|
||||
"version": "==2019.1"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
|
||||
"sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.22.0"
|
||||
},
|
||||
"requests-cache": {
|
||||
"hashes": [
|
||||
"sha256:6822f788c5ee248995c4bfbd725de2002ad710182ba26a666e85b64981866060",
|
||||
"sha256:73a7211870f7d67af5fd81cad2f67cfe1cd3eb4ee6a85155e07613968cc72dfc"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.5.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
|
||||
@ -93,6 +130,13 @@
|
||||
],
|
||||
"version": "==1.12.0"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
|
||||
"sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
|
||||
],
|
||||
"version": "==1.25.3"
|
||||
},
|
||||
"xlrd": {
|
||||
"hashes": [
|
||||
"sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2",
|
||||
|
@ -9,6 +9,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
|
||||
- Read Excel files
|
||||
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
|
||||
- Validate languages against ISO 639-2 and ISO 639-3
|
||||
- Validate subjects against the AGROVOC REST API
|
||||
- Fix leading, trailing, and excessive whitespace
|
||||
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
|
||||
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc
|
||||
|
@ -43,6 +43,11 @@ def main(argv):
|
||||
# Fix: duplicate metadata values
|
||||
df[column] = df[column].apply(fix.duplicates)
|
||||
|
||||
# Check: invalid AGROVOC subject
|
||||
match = re.match(r'.*?dc\.subject.*$', column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.agrovoc)
|
||||
|
||||
# Check: invalid language
|
||||
match = re.match(r'^.*?language.*$', column)
|
||||
if match is not None:
|
||||
|
@ -189,3 +189,57 @@ def language(field):
|
||||
print(f'Invalid language: {value}')
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def agrovoc(field):
    """Check subject terms against AGROVOC REST API.

    Logic copied from agrovoc-lookup.py.

    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

    The field is split on the "||" multi-value separator. Every value that
    looks like a subject term is searched for in AGROVOC, and a warning is
    printed for each value whose search does not return exactly one result.
    Values that do not look like subject terms are skipped without a lookup.
    Responses are cached on disk for thirty days via requests_cache.

    Args:
        field: a single metadata cell, possibly holding several values
            joined with "||", or a missing value (NaN/None).

    Returns:
        None when the field is missing, otherwise the field unchanged.
    """

    from datetime import timedelta
    import re

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Match values beginning with word characters, paying attention to
    # subjects with special characters like spaces, quotes, dashes,
    # parentheses, etc:
    # SUBJECT
    # ANOTHER SUBJECT
    # XANTHOMONAS CAMPESTRIS PV. MANIHOTIS
    # WOMEN'S PARTICIPATION
    # COMMUNITY-BASED FOREST MANAGEMENT
    # INTERACCIÓN GENOTIPO AMBIENTE
    # COCOA (PLANT)
    # Compiled once per call instead of once per value.
    pattern = re.compile(r'^[\w\-\.\'\(\)]+?[\w\s\-\.\'\(\)]+$')

    # Try to split multi-value field on "||" separator and keep only the
    # values that are worth looking up
    subjects = [value for value in field.split('||') if pattern.match(value)]

    # Nothing to look up, so don't touch the cache or the network at all
    if not subjects:
        return field

    # Imported only once we know a lookup is needed
    import requests
    import requests_cache

    # enable transparent request cache with thirty days expiry (installed
    # once per call rather than once per value)
    expire_after = timedelta(days=30)
    requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after)

    request_url = 'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search'

    for value in subjects:
        # let requests build and percent-encode the query string
        request = requests.get(request_url, params={'query': value})

        if request.status_code == requests.codes.ok:
            data = request.json()

            # check if there is 1 result, ie an exact subject term match
            if len(data['results']) != 1:
                print(f'Invalid AGROVOC subject: {value}')

    # prune old cache entries once, after all lookups for this field
    requests_cache.core.remove_expired_responses()

    return field
|
||||
|
@ -169,3 +169,24 @@ def test_check_invalid_language(capsys):
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert captured.out == f'Invalid language: {value}\n'
|
||||
|
||||
|
||||
def test_check_invalid_agrovoc(capsys):
    '''The subject "FOREST" should be reported as invalid on stdout.'''

    value = 'FOREST'

    check.agrovoc(value)

    # Only the warning line is expected on standard output
    stdout = capsys.readouterr().out
    assert stdout == f'Invalid AGROVOC subject: {value}\n'
|
||||
|
||||
|
||||
def test_check_valid_agrovoc():
    '''The subject "FORESTS" should come back from the check unchanged.'''

    value = 'FORESTS'

    assert check.agrovoc(value) == value
|
||||
|
Loading…
Reference in New Issue
Block a user