diff --git a/CHANGELOG.md b/CHANGELOG.md index 64299a3..23b6f70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Run pipenv update, bringing pytest version 4.3.1, psycopg-binary 2.7.7, etc - sr.ht and TravisCI configuration to disable emojis and animation to keep logs clean +### Changed +- Use vanilla requests library instead of SolrClient + ## [0.9.0] - 2019-01-22 ### Updated - pytest version 4.0.0 diff --git a/Pipfile b/Pipfile index b66a5ad..f64301c 100644 --- a/Pipfile +++ b/Pipfile @@ -7,7 +7,7 @@ name = "pypi" gunicorn = "*" falcon = "*" "psycopg2-binary" = "*" -solrclient = {ref = "kazoo-2.5.0", git = "https://github.com/alanorth/SolrClient.git"} +requests = "*" [dev-packages] ipython = "*" diff --git a/dspace_statistics_api/indexer.py b/dspace_statistics_api/indexer.py index da4e690..b65a909 100644 --- a/dspace_statistics_api/indexer.py +++ b/dspace_statistics_api/indexer.py @@ -29,12 +29,12 @@ # See: https://solrclient.readthedocs.io/en/latest/SolrClient.html # See: https://wiki.duraspace.org/display/DSPACE/Solr +from .config import SOLR_SERVER from .database import DatabaseManager import json import psycopg2.extras import re import requests -from .solr import solr_connection # Enumerate the cores in Solr to determine if statistics have been sharded into @@ -44,8 +44,12 @@ def get_statistics_shards(): statistics_core_years = [] # URL for Solr status to check active cores - solr_url = solr.host + '/admin/cores?action=STATUS&wt=json' - res = requests.get(solr_url) + solr_query_params = { + 'action': 'STATUS', + 'wt': 'json' + } + solr_url = SOLR_SERVER + '/admin/cores' + res = requests.get(solr_url, params=solr_query_params) if res.status_code == requests.codes.ok: data = res.json() @@ -68,13 +72,13 @@ def get_statistics_shards(): if len(statistics_core_years) > 0: # Begin building a string of shards starting with the default one - shards = '{}/statistics'.format(solr.host) + shards = '{}/statistics'.format(SOLR_SERVER) for core in statistics_core_years: # Create a comma-separated list of shards to pass to our Solr query # # See: https://wiki.apache.org/solr/DistributedSearch - shards += ',{}/{}'.format(solr.host, core) + shards += ',{}/{}'.format(SOLR_SERVER, core) # Return the string of shards, which may actually be empty. Solr doesn't # seem to mind if the shards query parameter is empty and I haven't seen @@ -89,23 +93,29 @@ def index_views(): # so we can get the countDistinct summary. # # see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html - res = solr.query('statistics', { + solr_query_params = { 'q': 'type:2', 'fq': 'isBot:false AND statistics_type:view', - 'facet': True, + 'facet': 'true', 'facet.field': 'id', 'facet.mincount': 1, 'facet.limit': 1, 'facet.offset': 0, - 'stats': True, + 'stats': 'true', 'stats.field': 'id', - 'stats.calcdistinct': True, - 'shards': shards - }, rows=0) + 'stats.calcdistinct': 'true', + 'shards': shards, + 'rows': 0, + 'wt': 'json' + } + + solr_url = SOLR_SERVER + '/statistics/select' + + res = requests.get(solr_url, params=solr_query_params) try: # get total number of distinct facets (countDistinct) - results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct'] + results_totalNumFacets = res.json()['stats']['stats_fields']['id']['countDistinct'] except TypeError: print('No item views to index, exiting.') @@ -124,20 +134,27 @@ def index_views(): while results_current_page <= results_num_pages: print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages)) - res = solr.query('statistics', { + solr_query_params = { 'q': 'type:2', 'fq': 'isBot:false AND statistics_type:view', - 'facet': True, + 'facet': 'true', 'facet.field': 'id', 'facet.mincount': 1, 'facet.limit': results_per_page, 'facet.offset': results_current_page * results_per_page, - 'shards': shards - }, rows=0) + 'shards': shards, + 'rows': 0, + 'wt': 'json', + 'json.nl': 'map' # return facets as a dict instead of a flat list + } - # SolrClient's get_facets() returns a dict of dicts - views = res.get_facets() - # in this case iterate over the 'id' dict and get the item ids and views + solr_url = SOLR_SERVER + '/statistics/select' + + res = requests.get(solr_url, params=solr_query_params) + + # Solr returns facets as a dict of dicts (see json.nl parameter) + views = res.json()['facet_counts']['facet_fields'] + # iterate over the 'id' dict and get the item ids and views for item_id, item_views in views['id'].items(): data.append((item_id, item_views)) @@ -154,23 +171,29 @@ def index_views(): def index_downloads(): # get the total number of distinct facets for items with at least 1 download - res = solr.query('statistics', { + solr_query_params= { 'q': 'type:0', 'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL', - 'facet': True, + 'facet': 'true', 'facet.field': 'owningItem', 'facet.mincount': 1, 'facet.limit': 1, 'facet.offset': 0, - 'stats': True, + 'stats': 'true', 'stats.field': 'owningItem', - 'stats.calcdistinct': True, - 'shards': shards - }, rows=0) + 'stats.calcdistinct': 'true', + 'shards': shards, + 'rows': 0, + 'wt': 'json' + } + + solr_url = SOLR_SERVER + '/statistics/select' + + res = requests.get(solr_url, params=solr_query_params) try: # get total number of distinct facets (countDistinct) - results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct'] + results_totalNumFacets = res.json()['stats']['stats_fields']['owningItem']['countDistinct'] except TypeError: print('No item downloads to index, exiting.') @@ -189,20 +212,27 @@ def index_downloads(): while results_current_page <= results_num_pages: print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages)) - res = solr.query('statistics', { + solr_query_params = { 'q': 'type:0', 'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL', - 'facet': True, + 'facet': 'true', 'facet.field': 'owningItem', 'facet.mincount': 1, 'facet.limit': results_per_page, 'facet.offset': results_current_page * results_per_page, - 'shards': shards - }, rows=0) + 'shards': shards, + 'rows': 0, + 'wt': 'json', + 'json.nl': 'map' # return facets as a dict instead of a flat list + } - # SolrClient's get_facets() returns a dict of dicts - downloads = res.get_facets() - # in this case iterate over the 'owningItem' dict and get the item ids and downloads + solr_url = SOLR_SERVER + '/statistics/select' + + res = requests.get(solr_url, params=solr_query_params) + + # Solr returns facets as a dict of dicts (see json.nl parameter) + downloads = res.json()['facet_counts']['facet_fields'] + # iterate over the 'owningItem' dict and get the item ids and downloads for item_id, item_downloads in downloads['owningItem'].items(): data.append((item_id, item_downloads)) @@ -217,8 +247,6 @@ def index_downloads(): results_current_page += 1 -solr = solr_connection() - with DatabaseManager() as db: with db.cursor() as cursor: # create table to store item views and downloads diff --git a/dspace_statistics_api/solr.py b/dspace_statistics_api/solr.py deleted file mode 100644 index 322e21c..0000000 --- a/dspace_statistics_api/solr.py +++ /dev/null @@ -1,10 +0,0 @@ -from .config import SOLR_SERVER -from SolrClient import SolrClient - - -def solr_connection(): - connection = SolrClient(SOLR_SERVER) - - return connection - -# vim: set sw=4 ts=4 expandtab: