Refactor indexer

Move the get_statistics_shards() function to a utility module so that it can be reused by other modules.
2025-08-07 05:25:38 +02:00 · 2020-09-24 12:03:12 +03:00
parent 8e87f80e9a
commit 495386856b
2 changed files with 50 additions and 48 deletions
--- a/dspace_statistics_api/indexer.py
+++ b/dspace_statistics_api/indexer.py
@ -28,59 +28,12 @@
 #
 # See: https://wiki.duraspace.org/display/DSPACE/Solr
 import re
 import psycopg2.extras
 import requests
 from .config import SOLR_SERVER
 from .database import DatabaseManager
-
+from .util import get_statistics_shards
 # Enumerate the cores in Solr to determine if statistics have been sharded into
 # yearly shards by DSpace's stats-util or not (for example: statistics-2018).
 def get_statistics_shards() -> str:
    """Build the value for Solr's ``shards`` query parameter.

    Asks the Solr core admin endpoint (``SOLR_SERVER + "/admin/cores"``,
    ``action=STATUS``) for its cores and keeps those whose name looks like
    a yearly statistics shard, for example ``statistics-2018``.

    Returns:
        A comma-separated string of shard URLs, starting with the default
        ``statistics`` core and followed by one entry per yearly core, or
        an empty string when no yearly cores were found or Solr did not
        answer with an OK status.

    Exceptions raised by ``requests.get`` or ``res.json`` are not caught
    here and propagate to the caller.
    """
    # Compiled once, outside the loop. Matches, for example: statistics-2018
    yearly_core_pattern = re.compile("^statistics-[0-9]{4}$")

    # URL and parameters for Solr's core STATUS request.
    solr_query_params = {"action": "STATUS", "wt": "json"}
    solr_url = SOLR_SERVER + "/admin/cores"

    # NOTE(review): no timeout is passed, so this call can block for as long
    # as the server keeps the connection open -- consider adding one.
    res = requests.get(solr_url, params=solr_query_params)

    statistics_core_years = []
    if res.status_code == requests.codes.ok:
        data = res.json()
        # Every entry yielded by iterating data["status"] is treated as a
        # core name; only the yearly statistics cores are kept.
        statistics_core_years = [
            core for core in data["status"] if yearly_core_pattern.match(core)
        ]

    # Without yearly cores there is nothing to shard over. Solr doesn't seem
    # to mind an empty shards parameter, so return an empty string.
    if not statistics_core_years:
        return ""

    # Comma-separated list of shards, beginning with the default core.
    #
    # See: https://wiki.apache.org/solr/DistributedSearch
    shard_urls = [f"{SOLR_SERVER}/statistics"]
    shard_urls.extend(f"{SOLR_SERVER}/{core}" for core in statistics_core_years)
    return ",".join(shard_urls)
 def index_views():
--- a/dspace_statistics_api/util.py
+++ b/dspace_statistics_api/util.py
@ -0,0 +1,49 @@
 # Enumerate the cores in Solr to determine if statistics have been sharded into
 # yearly shards by DSpace's stats-util or not (for example: statistics-2018).
 def get_statistics_shards() -> str:
    """Return the Solr ``shards`` query parameter for the statistics cores.

    Sends a core ``STATUS`` request to ``SOLR_SERVER + "/admin/cores"`` and
    collects every core whose name matches ``^statistics-[0-9]{4}$`` (for
    example ``statistics-2018``).

    Returns:
        An empty string if the response status is not OK or no yearly core
        matched; otherwise the default ``statistics`` core URL followed by
        the URL of each yearly core, joined with commas.

    Nothing in this function catches exceptions, so errors from
    ``requests.get`` or ``res.json`` reach the caller.
    """
    # Imports are kept local to the function, as in the original.
    import re

    import requests

    from .config import SOLR_SERVER

    # Build the pattern a single time instead of once per core.
    # Pattern to match, for example: statistics-2018
    shard_name = re.compile("^statistics-[0-9]{4}$")

    # Solr status request used to check the active cores.
    solr_query_params = {"action": "STATUS", "wt": "json"}
    solr_url = SOLR_SERVER + "/admin/cores"

    # NOTE(review): the request has no timeout and may wait indefinitely on
    # an unresponsive server -- consider passing one.
    res = requests.get(solr_url, params=solr_query_params)

    yearly_cores = []
    if res.status_code == requests.codes.ok:
        # Iterating data["status"] yields the names that are checked against
        # the yearly shard pattern.
        data = res.json()
        yearly_cores = [core for core in data["status"] if shard_name.match(core)]

    if not yearly_cores:
        # May happen if the Solr core has not been processed by stats-util.
        # Solr doesn't seem to mind if the shards parameter is empty.
        return ""

    # Start with the default core, then add one URL per yearly core.
    #
    # See: https://wiki.apache.org/solr/DistributedSearch
    core_names = ["statistics", *yearly_cores]
    return ",".join(f"{SOLR_SERVER}/{name}" for name in core_names)