1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2024-11-25 15:48:20 +01:00

Refactor to use vanilla requests library

The SolrClient library is unmaintained, which is starting to cause
problems due to the moving Python ecosystem. Switching to requests
does not change my code in any meaningful way and makes maintenance
easier.
This commit is contained in:
Alan Orth 2019-04-15 10:19:50 +03:00
parent 18e1e1a227
commit 8f46ceb8d8
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 67 additions and 46 deletions

View File

@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Run pipenv update, bringing pytest version 4.3.1, psycopg-binary 2.7.7, etc - Run pipenv update, bringing pytest version 4.3.1, psycopg-binary 2.7.7, etc
- sr.ht and TravisCI configuration to disable emojis and animation to keep logs clean - sr.ht and TravisCI configuration to disable emojis and animation to keep logs clean
### Changed
- Use vanilla requests library instead of SolrClient
## [0.9.0] - 2019-01-22 ## [0.9.0] - 2019-01-22
### Updated ### Updated
- pytest version 4.0.0 - pytest version 4.0.0

View File

@ -7,7 +7,7 @@ name = "pypi"
gunicorn = "*" gunicorn = "*"
falcon = "*" falcon = "*"
"psycopg2-binary" = "*" "psycopg2-binary" = "*"
solrclient = {ref = "kazoo-2.5.0", git = "https://github.com/alanorth/SolrClient.git"} requests = "*"
[dev-packages] [dev-packages]
ipython = "*" ipython = "*"

View File

@ -29,12 +29,12 @@
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html # See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
# See: https://wiki.duraspace.org/display/DSPACE/Solr # See: https://wiki.duraspace.org/display/DSPACE/Solr
from .config import SOLR_SERVER
from .database import DatabaseManager from .database import DatabaseManager
import json import json
import psycopg2.extras import psycopg2.extras
import re import re
import requests import requests
from .solr import solr_connection
# Enumerate the cores in Solr to determine if statistics have been sharded into # Enumerate the cores in Solr to determine if statistics have been sharded into
@ -44,8 +44,12 @@ def get_statistics_shards():
statistics_core_years = [] statistics_core_years = []
# URL for Solr status to check active cores # URL for Solr status to check active cores
solr_url = solr.host + '/admin/cores?action=STATUS&wt=json' solr_query_params = {
res = requests.get(solr_url) 'action': 'STATUS',
'wt': 'json'
}
solr_url = SOLR_SERVER + '/admin/cores'
res = requests.get(solr_url, params=solr_query_params)
if res.status_code == requests.codes.ok: if res.status_code == requests.codes.ok:
data = res.json() data = res.json()
@ -68,13 +72,13 @@ def get_statistics_shards():
if len(statistics_core_years) > 0: if len(statistics_core_years) > 0:
# Begin building a string of shards starting with the default one # Begin building a string of shards starting with the default one
shards = '{}/statistics'.format(solr.host) shards = '{}/statistics'.format(SOLR_SERVER)
for core in statistics_core_years: for core in statistics_core_years:
# Create a comma-separated list of shards to pass to our Solr query # Create a comma-separated list of shards to pass to our Solr query
# #
# See: https://wiki.apache.org/solr/DistributedSearch # See: https://wiki.apache.org/solr/DistributedSearch
shards += ',{}/{}'.format(solr.host, core) shards += ',{}/{}'.format(SOLR_SERVER, core)
# Return the string of shards, which may actually be empty. Solr doesn't # Return the string of shards, which may actually be empty. Solr doesn't
# seem to mind if the shards query parameter is empty and I haven't seen # seem to mind if the shards query parameter is empty and I haven't seen
@ -89,23 +93,29 @@ def index_views():
# so we can get the countDistinct summary. # so we can get the countDistinct summary.
# #
# see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html # see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
res = solr.query('statistics', { solr_query_params = {
'q': 'type:2', 'q': 'type:2',
'fq': 'isBot:false AND statistics_type:view', 'fq': 'isBot:false AND statistics_type:view',
'facet': True, 'facet': 'true',
'facet.field': 'id', 'facet.field': 'id',
'facet.mincount': 1, 'facet.mincount': 1,
'facet.limit': 1, 'facet.limit': 1,
'facet.offset': 0, 'facet.offset': 0,
'stats': True, 'stats': 'true',
'stats.field': 'id', 'stats.field': 'id',
'stats.calcdistinct': True, 'stats.calcdistinct': 'true',
'shards': shards 'shards': shards,
}, rows=0) 'rows': 0,
'wt': 'json'
}
solr_url = SOLR_SERVER + '/statistics/select'
res = requests.get(solr_url, params=solr_query_params)
try: try:
# get total number of distinct facets (countDistinct) # get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct'] results_totalNumFacets = res.json()['stats']['stats_fields']['id']['countDistinct']
except TypeError: except TypeError:
print('No item views to index, exiting.') print('No item views to index, exiting.')
@ -124,20 +134,27 @@ def index_views():
while results_current_page <= results_num_pages: while results_current_page <= results_num_pages:
print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages)) print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
res = solr.query('statistics', { solr_query_params = {
'q': 'type:2', 'q': 'type:2',
'fq': 'isBot:false AND statistics_type:view', 'fq': 'isBot:false AND statistics_type:view',
'facet': True, 'facet': 'true',
'facet.field': 'id', 'facet.field': 'id',
'facet.mincount': 1, 'facet.mincount': 1,
'facet.limit': results_per_page, 'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page, 'facet.offset': results_current_page * results_per_page,
'shards': shards 'shards': shards,
}, rows=0) 'rows': 0,
'wt': 'json',
'json.nl': 'map' # return facets as a dict instead of a flat list
}
# SolrClient's get_facets() returns a dict of dicts solr_url = SOLR_SERVER + '/statistics/select'
views = res.get_facets()
# in this case iterate over the 'id' dict and get the item ids and views res = requests.get(solr_url, params=solr_query_params)
# Solr returns facets as a dict of dicts (see json.nl parameter)
views = res.json()['facet_counts']['facet_fields']
# iterate over the 'id' dict and get the item ids and views
for item_id, item_views in views['id'].items(): for item_id, item_views in views['id'].items():
data.append((item_id, item_views)) data.append((item_id, item_views))
@ -154,23 +171,29 @@ def index_views():
def index_downloads(): def index_downloads():
# get the total number of distinct facets for items with at least 1 download # get the total number of distinct facets for items with at least 1 download
res = solr.query('statistics', { solr_query_params= {
'q': 'type:0', 'q': 'type:0',
'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL', 'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
'facet': True, 'facet': 'true',
'facet.field': 'owningItem', 'facet.field': 'owningItem',
'facet.mincount': 1, 'facet.mincount': 1,
'facet.limit': 1, 'facet.limit': 1,
'facet.offset': 0, 'facet.offset': 0,
'stats': True, 'stats': 'true',
'stats.field': 'owningItem', 'stats.field': 'owningItem',
'stats.calcdistinct': True, 'stats.calcdistinct': 'true',
'shards': shards 'shards': shards,
}, rows=0) 'rows': 0,
'wt': 'json'
}
solr_url = SOLR_SERVER + '/statistics/select'
res = requests.get(solr_url, params=solr_query_params)
try: try:
# get total number of distinct facets (countDistinct) # get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct'] results_totalNumFacets = res.json()['stats']['stats_fields']['owningItem']['countDistinct']
except TypeError: except TypeError:
print('No item downloads to index, exiting.') print('No item downloads to index, exiting.')
@ -189,20 +212,27 @@ def index_downloads():
while results_current_page <= results_num_pages: while results_current_page <= results_num_pages:
print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages)) print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
res = solr.query('statistics', { solr_query_params = {
'q': 'type:0', 'q': 'type:0',
'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL', 'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
'facet': True, 'facet': 'true',
'facet.field': 'owningItem', 'facet.field': 'owningItem',
'facet.mincount': 1, 'facet.mincount': 1,
'facet.limit': results_per_page, 'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page, 'facet.offset': results_current_page * results_per_page,
'shards': shards 'shards': shards,
}, rows=0) 'rows': 0,
'wt': 'json',
'json.nl': 'map' # return facets as a dict instead of a flat list
}
# SolrClient's get_facets() returns a dict of dicts solr_url = SOLR_SERVER + '/statistics/select'
downloads = res.get_facets()
# in this case iterate over the 'owningItem' dict and get the item ids and downloads res = requests.get(solr_url, params=solr_query_params)
# Solr returns facets as a dict of dicts (see json.nl parameter)
downloads = res.json()['facet_counts']['facet_fields']
# iterate over the 'owningItem' dict and get the item ids and downloads
for item_id, item_downloads in downloads['owningItem'].items(): for item_id, item_downloads in downloads['owningItem'].items():
data.append((item_id, item_downloads)) data.append((item_id, item_downloads))
@ -217,8 +247,6 @@ def index_downloads():
results_current_page += 1 results_current_page += 1
solr = solr_connection()
with DatabaseManager() as db: with DatabaseManager() as db:
with db.cursor() as cursor: with db.cursor() as cursor:
# create table to store item views and downloads # create table to store item views and downloads

View File

@ -1,10 +0,0 @@
from .config import SOLR_SERVER
from SolrClient import SolrClient
def solr_connection():
    """Return a new SolrClient built from the configured SOLR_SERVER."""
    # Construct the client and hand it straight back to the caller.
    return SolrClient(SOLR_SERVER)
# vim: set sw=4 ts=4 expandtab: