mirror of
https://github.com/ilri/dspace-statistics-api.git
synced 2024-12-25 22:14:29 +01:00
Refactor to use vanilla requests library
The SolrClient library is unmaintained, which is starting to cause problems due to the moving Python ecosystem. Switching to requests does not change my code in any meaningful way and makes maintenance easier.
This commit is contained in:
parent
18e1e1a227
commit
8f46ceb8d8
@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Run pipenv update, bringing pytest version 4.3.1, psycopg-binary 2.7.7, etc
|
||||
- sr.ht and TravisCI configuration to disable emojis and animation to keep logs clean
|
||||
|
||||
### Changed
|
||||
- Use vanilla requests library instead of SolrClient
|
||||
|
||||
## [0.9.0] - 2019-01-22
|
||||
### Updated
|
||||
- pytest version 4.0.0
|
||||
|
2
Pipfile
2
Pipfile
@ -7,7 +7,7 @@ name = "pypi"
|
||||
gunicorn = "*"
|
||||
falcon = "*"
|
||||
"psycopg2-binary" = "*"
|
||||
solrclient = {ref = "kazoo-2.5.0", git = "https://github.com/alanorth/SolrClient.git"}
|
||||
requests = "*"
|
||||
|
||||
[dev-packages]
|
||||
ipython = "*"
|
||||
|
@ -29,12 +29,12 @@
|
||||
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
|
||||
# See: https://wiki.duraspace.org/display/DSPACE/Solr
|
||||
|
||||
from .config import SOLR_SERVER
|
||||
from .database import DatabaseManager
|
||||
import json
|
||||
import psycopg2.extras
|
||||
import re
|
||||
import requests
|
||||
from .solr import solr_connection
|
||||
|
||||
|
||||
# Enumerate the cores in Solr to determine if statistics have been sharded into
|
||||
@ -44,8 +44,12 @@ def get_statistics_shards():
|
||||
statistics_core_years = []
|
||||
|
||||
# URL for Solr status to check active cores
|
||||
solr_url = solr.host + '/admin/cores?action=STATUS&wt=json'
|
||||
res = requests.get(solr_url)
|
||||
solr_query_params = {
|
||||
'action': 'STATUS',
|
||||
'wt': 'json'
|
||||
}
|
||||
solr_url = SOLR_SERVER + '/admin/cores'
|
||||
res = requests.get(solr_url, params=solr_query_params)
|
||||
|
||||
if res.status_code == requests.codes.ok:
|
||||
data = res.json()
|
||||
@ -68,13 +72,13 @@ def get_statistics_shards():
|
||||
|
||||
if len(statistics_core_years) > 0:
|
||||
# Begin building a string of shards starting with the default one
|
||||
shards = '{}/statistics'.format(solr.host)
|
||||
shards = '{}/statistics'.format(SOLR_SERVER)
|
||||
|
||||
for core in statistics_core_years:
|
||||
# Create a comma-separated list of shards to pass to our Solr query
|
||||
#
|
||||
# See: https://wiki.apache.org/solr/DistributedSearch
|
||||
shards += ',{}/{}'.format(solr.host, core)
|
||||
shards += ',{}/{}'.format(SOLR_SERVER, core)
|
||||
|
||||
# Return the string of shards, which may actually be empty. Solr doesn't
|
||||
# seem to mind if the shards query parameter is empty and I haven't seen
|
||||
@ -89,23 +93,29 @@ def index_views():
|
||||
# so we can get the countDistinct summary.
|
||||
#
|
||||
# see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
|
||||
res = solr.query('statistics', {
|
||||
solr_query_params = {
|
||||
'q': 'type:2',
|
||||
'fq': 'isBot:false AND statistics_type:view',
|
||||
'facet': True,
|
||||
'facet': 'true',
|
||||
'facet.field': 'id',
|
||||
'facet.mincount': 1,
|
||||
'facet.limit': 1,
|
||||
'facet.offset': 0,
|
||||
'stats': True,
|
||||
'stats': 'true',
|
||||
'stats.field': 'id',
|
||||
'stats.calcdistinct': True,
|
||||
'shards': shards
|
||||
}, rows=0)
|
||||
'stats.calcdistinct': 'true',
|
||||
'shards': shards,
|
||||
'rows': 0,
|
||||
'wt': 'json'
|
||||
}
|
||||
|
||||
solr_url = SOLR_SERVER + '/statistics/select'
|
||||
|
||||
res = requests.get(solr_url, params=solr_query_params)
|
||||
|
||||
try:
|
||||
# get total number of distinct facets (countDistinct)
|
||||
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']
|
||||
results_totalNumFacets = res.json()['stats']['stats_fields']['id']['countDistinct']
|
||||
except TypeError:
|
||||
print('No item views to index, exiting.')
|
||||
|
||||
@ -124,20 +134,27 @@ def index_views():
|
||||
while results_current_page <= results_num_pages:
|
||||
print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
|
||||
|
||||
res = solr.query('statistics', {
|
||||
solr_query_params = {
|
||||
'q': 'type:2',
|
||||
'fq': 'isBot:false AND statistics_type:view',
|
||||
'facet': True,
|
||||
'facet': 'true',
|
||||
'facet.field': 'id',
|
||||
'facet.mincount': 1,
|
||||
'facet.limit': results_per_page,
|
||||
'facet.offset': results_current_page * results_per_page,
|
||||
'shards': shards
|
||||
}, rows=0)
|
||||
'shards': shards,
|
||||
'rows': 0,
|
||||
'wt': 'json',
|
||||
'json.nl': 'map' # return facets as a dict instead of a flat list
|
||||
}
|
||||
|
||||
# SolrClient's get_facets() returns a dict of dicts
|
||||
views = res.get_facets()
|
||||
# in this case iterate over the 'id' dict and get the item ids and views
|
||||
solr_url = SOLR_SERVER + '/statistics/select'
|
||||
|
||||
res = requests.get(solr_url, params=solr_query_params)
|
||||
|
||||
# Solr returns facets as a dict of dicts (see json.nl parameter)
|
||||
views = res.json()['facet_counts']['facet_fields']
|
||||
# iterate over the 'id' dict and get the item ids and views
|
||||
for item_id, item_views in views['id'].items():
|
||||
data.append((item_id, item_views))
|
||||
|
||||
@ -154,23 +171,29 @@ def index_views():
|
||||
|
||||
def index_downloads():
|
||||
# get the total number of distinct facets for items with at least 1 download
|
||||
res = solr.query('statistics', {
|
||||
solr_query_params= {
|
||||
'q': 'type:0',
|
||||
'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
||||
'facet': True,
|
||||
'facet': 'true',
|
||||
'facet.field': 'owningItem',
|
||||
'facet.mincount': 1,
|
||||
'facet.limit': 1,
|
||||
'facet.offset': 0,
|
||||
'stats': True,
|
||||
'stats': 'true',
|
||||
'stats.field': 'owningItem',
|
||||
'stats.calcdistinct': True,
|
||||
'shards': shards
|
||||
}, rows=0)
|
||||
'stats.calcdistinct': 'true',
|
||||
'shards': shards,
|
||||
'rows': 0,
|
||||
'wt': 'json'
|
||||
}
|
||||
|
||||
solr_url = SOLR_SERVER + '/statistics/select'
|
||||
|
||||
res = requests.get(solr_url, params=solr_query_params)
|
||||
|
||||
try:
|
||||
# get total number of distinct facets (countDistinct)
|
||||
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']
|
||||
results_totalNumFacets = res.json()['stats']['stats_fields']['owningItem']['countDistinct']
|
||||
except TypeError:
|
||||
print('No item downloads to index, exiting.')
|
||||
|
||||
@ -189,20 +212,27 @@ def index_downloads():
|
||||
while results_current_page <= results_num_pages:
|
||||
print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
|
||||
|
||||
res = solr.query('statistics', {
|
||||
solr_query_params = {
|
||||
'q': 'type:0',
|
||||
'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
||||
'facet': True,
|
||||
'facet': 'true',
|
||||
'facet.field': 'owningItem',
|
||||
'facet.mincount': 1,
|
||||
'facet.limit': results_per_page,
|
||||
'facet.offset': results_current_page * results_per_page,
|
||||
'shards': shards
|
||||
}, rows=0)
|
||||
'shards': shards,
|
||||
'rows': 0,
|
||||
'wt': 'json',
|
||||
'json.nl': 'map' # return facets as a dict instead of a flat list
|
||||
}
|
||||
|
||||
# SolrClient's get_facets() returns a dict of dicts
|
||||
downloads = res.get_facets()
|
||||
# in this case iterate over the 'owningItem' dict and get the item ids and downloads
|
||||
solr_url = SOLR_SERVER + '/statistics/select'
|
||||
|
||||
res = requests.get(solr_url, params=solr_query_params)
|
||||
|
||||
# Solr returns facets as a dict of dicts (see json.nl parameter)
|
||||
downloads = res.json()['facet_counts']['facet_fields']
|
||||
# iterate over the 'owningItem' dict and get the item ids and downloads
|
||||
for item_id, item_downloads in downloads['owningItem'].items():
|
||||
data.append((item_id, item_downloads))
|
||||
|
||||
@ -217,8 +247,6 @@ def index_downloads():
|
||||
results_current_page += 1
|
||||
|
||||
|
||||
solr = solr_connection()
|
||||
|
||||
with DatabaseManager() as db:
|
||||
with db.cursor() as cursor:
|
||||
# create table to store item views and downloads
|
||||
|
@ -1,10 +0,0 @@
|
||||
from .config import SOLR_SERVER
|
||||
from SolrClient import SolrClient
|
||||
|
||||
|
||||
def solr_connection():
|
||||
connection = SolrClient(SOLR_SERVER)
|
||||
|
||||
return connection
|
||||
|
||||
# vim: set sw=4 ts=4 expandtab:
|
Loading…
Reference in New Issue
Block a user