From 40e284dac05bb3832b550bec9ef4d636c032b7dd Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Tue, 22 Jan 2019 08:39:36 +0200
Subject: [PATCH] dspace_statistics_api/indexer.py: Query multiple shards

DSpace's stats-util script splits the Solr statistics core into yearly
shards. We need to use Solr's `shards` query parameter in order to get
the statistics for previous years. This commit adds a helper function
that enumerates the active Solr cores, finds the yearly shards matching
the statistics-YYYY pattern, and adds them to the query.
---
 dspace_statistics_api/indexer.py | 61 +++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/dspace_statistics_api/indexer.py b/dspace_statistics_api/indexer.py
index e964de7..23abf30 100644
--- a/dspace_statistics_api/indexer.py
+++ b/dspace_statistics_api/indexer.py
@@ -32,9 +32,56 @@ from .database import DatabaseManager
 import json
 import psycopg2.extras
+import re
+import requests
 
 from .solr import solr_connection
 
 
+# Enumerate the cores in Solr to determine if statistics have been sharded into
+# yearly shards by DSpace's stats-util or not (for example: statistics-2018).
+def get_statistics_shards():
+    # Initialize an empty list for statistics core years
+    statistics_core_years = []
+
+    # URL for Solr status to check active cores
+    solr_url = solr.host + '/admin/cores?action=STATUS&wt=json'
+    res = requests.get(solr_url)
+
+    if res.status_code == requests.codes.ok:
+        data = res.json()
+
+        # Iterate over active cores from Solr's STATUS response (cores are in
+        # the status array of this response).
+        for core in data['status']:
+            # Pattern to match, for example: statistics-2018
+            pattern = re.compile('^statistics-[0-9]{4}$')
+
+            if not pattern.match(core):
+                continue
+
+            # Append current core to list
+            statistics_core_years.append(core)
+
+    # Initialize a string to hold our shards (may end up being empty if the Solr
+    # core has not been processed by stats-util).
+    shards = str()
+
+    if len(statistics_core_years) > 0:
+        # Begin building a string of shards starting with the default one
+        shards = '{}/statistics'.format(solr.host)
+
+        for core in statistics_core_years:
+            # Create a comma-separated list of shards to pass to our Solr query
+            #
+            # See: https://wiki.apache.org/solr/DistributedSearch
+            shards += ',{}/{}'.format(solr.host, core)
+
+    # Return the string of shards, which may actually be empty. Solr doesn't
+    # seem to mind if the shards query parameter is empty and I haven't seen
+    # any negative performance impact so this should be fine.
+    return shards
+
+
 def index_views():
     # get total number of distinct facets for items with a minimum of 1 view,
     # otherwise Solr returns all kinds of weird ids that are actually not in
@@ -52,7 +99,8 @@ def index_views():
         'facet.offset': 0,
         'stats': True,
         'stats.field': 'id',
-        'stats.calcdistinct': True
+        'stats.calcdistinct': True,
+        'shards': shards
     }, rows=0)
 
     # get total number of distinct facets (countDistinct)
@@ -78,7+126,8 @@ def index_views():
         'facet.field': 'id',
         'facet.mincount': 1,
         'facet.limit': results_per_page,
-        'facet.offset': results_current_page * results_per_page
+        'facet.offset': results_current_page * results_per_page,
+        'shards': shards
     }, rows=0)
 
     # SolrClient's get_facets() returns a dict of dicts
@@ -110,7 +159,8 @@ def index_downloads():
         'facet.offset': 0,
         'stats': True,
         'stats.field': 'owningItem',
-        'stats.calcdistinct': True
+        'stats.calcdistinct': True,
+        'shards': shards
     }, rows=0)
 
     # get total number of distinct facets (countDistinct)
@@ -136,7 +186,8 @@ def index_downloads():
         'facet.field': 'owningItem',
         'facet.mincount': 1,
         'facet.limit': results_per_page,
-        'facet.offset': results_current_page * results_per_page
+        'facet.offset': results_current_page * results_per_page,
+        'shards': shards
     }, rows=0)
 
     # SolrClient's get_facets() returns a dict of dicts
@@ -167,6 +218,8 @@ with DatabaseManager() as db:
     # commit the table creation before closing the database connection
     db.commit()
 
+shards = get_statistics_shards()
+
 index_views()
 index_downloads()
 
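
Note: the standalone sketch below is not part of the patch. It shows one way to
check, against a running Solr instance, which yearly statistics cores the new
get_statistics_shards() helper would discover and what the resulting shards
string looks like. The base URL is a hypothetical placeholder; point it at the
same host that the indexer's solr.host refers to.

    # Standalone sketch (not part of the patch): list the yearly statistics
    # cores visible to Solr and print the comma-separated shards string in the
    # same format the helper builds. The base URL below is an assumption.
    import re
    import requests

    solr_host = 'http://localhost:8080/solr'  # hypothetical; adjust to your Solr

    res = requests.get(solr_host + '/admin/cores?action=STATUS&wt=json')

    if res.status_code == requests.codes.ok:
        # Core names are the keys of the "status" object in the STATUS response
        pattern = re.compile('^statistics-[0-9]{4}$')
        cores = [core for core in res.json()['status'] if pattern.match(core)]

        # For example, with a statistics-2018 core present this prints:
        # http://localhost:8080/solr/statistics,http://localhost:8080/solr/statistics-2018
        shards = ','.join(['{}/statistics'.format(solr_host)]
                          + ['{}/{}'.format(solr_host, core) for core in cores])
        print(shards)

If no yearly cores exist (stats-util has never sharded the statistics core),
the helper returns an empty string and the queries behave as before.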