1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 05:45:02 +01:00

csv_metadata_quality/check.py: Simplify AGROVOC check

I recycled this code from a separate agrovoc-lookup.py script that
checks lines in a text file to see if they are valid AGROVOC terms
or not. There, I think I was concerned about skipping comments or
something similar, but we don't need to check for that here. We simply
check the term that is in the field and inform the user if it is invalid.
This commit is contained in:
Alan Orth 2019-08-21 16:35:29 +03:00
parent 6d02f5026a
commit 884e8f970d
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@@ -212,7 +212,6 @@ def agrovoc(field, field_name):
""" """
from datetime import timedelta from datetime import timedelta
import re
import requests import requests
import requests_cache import requests_cache
@@ -222,35 +221,23 @@ def agrovoc(field, field_name):
# Try to split multi-value field on "||" separator # Try to split multi-value field on "||" separator
for value in field.split('||'): for value in field.split('||'):
# match lines beginning with words, paying attention to subjects with request_url = f'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}'
# special characters like spaces, quotes, dashes, parentheses, etc:
# SUBJECT
# ANOTHER SUBJECT
# XANTHOMONAS CAMPESTRIS PV. MANIHOTIS
# WOMEN'S PARTICIPATION
# COMMUNITY-BASED FOREST MANAGEMENT
# INTERACCIÓN GENOTIPO AMBIENTE
# COCOA (PLANT)
pattern = re.compile(r'^[\w\-\.\'\(\)]+?[\w\s\-\.\'\(\)]+$')
if pattern.match(value): # enable transparent request cache with thirty days expiry
request_url = f'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}' expire_after = timedelta(days=30)
requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after)
# enable transparent request cache with thirty days expiry request = requests.get(request_url)
expire_after = timedelta(days=30)
requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after)
request = requests.get(request_url) # prune old cache entries
requests_cache.core.remove_expired_responses()
# prune old cache entries if request.status_code == requests.codes.ok:
requests_cache.core.remove_expired_responses() data = request.json()
if request.status_code == requests.codes.ok: # check if there are any results
data = request.json() if len(data['results']) == 0:
print(f'Invalid AGROVOC ({field_name}): {value}')
# check if there are any results
if len(data['results']) == 0:
print(f'Invalid AGROVOC ({field_name}): {value}')
return field return field