1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2024-11-22 14:25:01 +01:00

dspace_statistics_api/indexer.py: Query multiple shards

DSpace's stats-util script splits the Solr statistics core into yearly
shards. We need to use Solr's `shards` query parameter in order to get
the statistics for previous years. This commit adds a helper function
that enumerates the active Solr cores to find yearly shards matching the
statistics-YYYY pattern, and passes the resulting shards to each query.
This commit is contained in:
Alan Orth 2019-01-22 08:39:36 +02:00
parent 934fa9db9b
commit 40e284dac0
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -32,9 +32,56 @@
from .database import DatabaseManager
import json
import psycopg2.extras
import re
import requests
from .solr import solr_connection
def get_statistics_shards():
    """Build the value for Solr's ``shards`` query parameter.

    Enumerates the cores in Solr to determine if statistics have been
    sharded into yearly shards by DSpace's stats-util or not (for example:
    statistics-2018). The Solr base URL is read from ``solr.host``, which is
    defined outside this function.

    Returns:
        A comma-separated string of shard URLs: the default ``statistics``
        core first, followed by every core whose name matches
        ``statistics-YYYY``. The string is empty when no such core is found
        or when the STATUS request does not come back with an OK status
        code.
    """
    # Pattern to match, for example: statistics-2018. Compiled once up front
    # instead of once per core inside the loop.
    yearly_core_pattern = re.compile(r'^statistics-[0-9]{4}$')

    # URL for Solr status to check active cores
    solr_url = solr.host + '/admin/cores?action=STATUS&wt=json'
    # NOTE(review): no timeout is passed to requests.get here -- consider
    # whether an unresponsive Solr should be allowed to block the indexer.
    res = requests.get(solr_url)

    statistics_core_years = []
    if res.status_code == requests.codes.ok:
        # Iterating ``status`` in Solr's STATUS response yields the core
        # names; keep only the ones that look like yearly statistics shards.
        statistics_core_years = [
            core for core in res.json()['status']
            if yearly_core_pattern.match(core)
        ]

    # The result may actually be empty if the Solr core has not been
    # processed by stats-util. Solr doesn't seem to mind if the shards query
    # parameter is empty and I haven't seen any negative performance impact
    # so this should be fine.
    if not statistics_core_years:
        return ''

    # Create a comma-separated list of shards to pass to our Solr query,
    # starting with the default core.
    #
    # See: https://wiki.apache.org/solr/DistributedSearch
    return ','.join(
        '{}/{}'.format(solr.host, core)
        for core in ['statistics'] + statistics_core_years
    )
def index_views():
# get total number of distinct facets for items with a minimum of 1 view,
# otherwise Solr returns all kinds of weird ids that are actually not in
@ -52,7 +99,8 @@ def index_views():
'facet.offset': 0,
'stats': True,
'stats.field': 'id',
'stats.calcdistinct': True
'stats.calcdistinct': True,
'shards': shards
}, rows=0)
# get total number of distinct facets (countDistinct)
@ -78,7 +126,8 @@ def index_views():
'facet.field': 'id',
'facet.mincount': 1,
'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page
'facet.offset': results_current_page * results_per_page,
'shards': shards
}, rows=0)
# SolrClient's get_facets() returns a dict of dicts
@ -110,7 +159,8 @@ def index_downloads():
'facet.offset': 0,
'stats': True,
'stats.field': 'owningItem',
'stats.calcdistinct': True
'stats.calcdistinct': True,
'shards': shards
}, rows=0)
# get total number of distinct facets (countDistinct)
@ -136,7 +186,8 @@ def index_downloads():
'facet.field': 'owningItem',
'facet.mincount': 1,
'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page
'facet.offset': results_current_page * results_per_page,
'shards': shards
}, rows=0)
# SolrClient's get_facets() returns a dict of dicts
@ -167,6 +218,8 @@ with DatabaseManager() as db:
# commit the table creation before closing the database connection
db.commit()
shards = get_statistics_shards()
index_views()
index_downloads()