1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2025-05-10 15:16:02 +02:00

Compare commits

...

14 Commits

Author SHA1 Message Date
914ec52fbb CHANGELOG.md: Move unrelease changes to 0.9.0 2019-01-22 09:02:29 +02:00
5524066656 CHANGELOG.md: Add note about catching errors 2019-01-22 09:01:54 +02:00
043d897cef dspace_statistics_api/indexer.py: Catch case of no views/downloads
Don't fail with an exception when there are no views or downloads,
for example on a new DSpace installation.
2019-01-22 09:00:22 +02:00
bd28353cda README.md: Remove TODO for fixing querying of shards 2019-01-22 08:41:39 +02:00
e23d66c2a2 CHANGELOG.md: Add note about fixing querying of sharded cores 2019-01-22 08:41:31 +02:00
40e284dac0 dspace_statistics_api/indexer.py: Query multiple shards
DSpace's stats-util script splits the Solr statistics core into yearly
shards. We need to use Solr's `shards` query parameter in order to get
the statistics for previous years. This commit adds a helper function
to enumerate the active Solr cores to find yearly shards matching the
statistics-YYYY pattern and add them to the query.
2019-01-22 08:39:36 +02:00
934fa9db9b README.md: Add TODO about sharded statistics cores 2019-01-21 12:55:43 +02:00
1fabb72b58 Update requirements
Generated from pipenv with:

  $ pipenv lock -r > requirements.txt
  $ pipenv lock -r -d > requirements-dev.txt
2019-01-16 12:34:50 +02:00
c7f95f0b60 README.md: Update TODO
I think it might be possible to compute community and collection
statistics from Solr and make them available at new endpoints:

  - /communities
  - /community/id
  - /collections
  - /collection/id
2019-01-16 09:59:29 +02:00
c95a98dd2d Pipfile.lock: update dependencies
Updated with `pipenv update`.
2019-01-15 10:22:46 +02:00
3f70f94a10 Pipfile.lock: Run pipenv update 2018-11-26 11:53:37 +02:00
9b8ad9defd Merge pull request #9 from ilri/pipenv-update
Pipenv update
2018-11-19 23:50:44 +02:00
d69ab20220 CHANGELOG.md: pytest version 4.0.0 2018-11-19 23:46:03 +02:00
378f56ddc2 Pipfile.lock: Run pipenv update 2018-11-19 23:34:34 +02:00
6 changed files with 111 additions and 41 deletions

View File

@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.9.0] - 2019-01-22
### Updated
- pytest version 4.0.0
- Fix indexing of sharded statistics cores ([#10))
- Handle case of missing views/downloads gracefully
## [0.8.1] - 2018-11-14
### Changed
- README.md to recommend using vanilla Python virtual environments and pip instead of pipenv

50
Pipfile.lock generated
View File

@ -77,10 +77,10 @@
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.11.0"
"version": "==1.12.0"
},
"solrclient": {
"git": "https://github.com/alanorth/SolrClient.git",
@ -126,11 +126,11 @@
},
"ipython": {
"hashes": [
"sha256:a5781d6934a3341a1f9acb4ea5acdc7ea0a0855e689dbe755d070ca51e995435",
"sha256:b10a7ddd03657c761fc503495bc36471c8158e3fc948573fb9fe82a7029d8efd"
"sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12",
"sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742"
],
"index": "pypi",
"version": "==7.1.1"
"version": "==7.2.0"
},
"ipython-genutils": {
"hashes": [
@ -141,10 +141,10 @@
},
"jedi": {
"hashes": [
"sha256:0191c447165f798e6a730285f2eee783fff81b0d3df261945ecb80983b5c3ca7",
"sha256:b7493f73a2febe0dc33d51c99b474547f7f6c0b2c8fb2b21f453eef204c12148"
"sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd",
"sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191"
],
"version": "==0.13.1"
"version": "==0.13.2"
},
"mccabe": {
"hashes": [
@ -155,11 +155,11 @@
},
"more-itertools": {
"hashes": [
"sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092",
"sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e",
"sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d"
"sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
"sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
"sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
],
"version": "==4.3.0"
"version": "==5.0.0"
},
"parso": {
"hashes": [
@ -185,10 +185,10 @@
},
"pluggy": {
"hashes": [
"sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095",
"sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f"
"sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
"sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
],
"version": "==0.8.0"
"version": "==0.8.1"
},
"prompt-toolkit": {
"hashes": [
@ -228,25 +228,25 @@
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
"sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
"sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
],
"version": "==2.2.0"
"version": "==2.3.1"
},
"pytest": {
"hashes": [
"sha256:3f193df1cfe1d1609d4c583838bea3d532b18d6160fd3f55c9447fdca30848ec",
"sha256:e246cf173c01169b9617fc07264b7b1316e78d7a650055235d6d897bc80d9660"
"sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2",
"sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be"
],
"index": "pypi",
"version": "==3.10.1"
"version": "==4.1.1"
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.11.0"
"version": "==1.12.0"
},
"traitlets": {
"hashes": [

View File

@ -85,6 +85,7 @@ The item id is the *internal* id for an item. You can get these from the standar
- Better logging
- Version API
- Use JSON in PostgreSQL
- Make community and collection stats available
- Switch to [Python 3.6+ f-string syntax](https://realpython.com/python-f-strings/)
## License

View File

@ -32,9 +32,56 @@
from .database import DatabaseManager
import json
import psycopg2.extras
import re
import requests
from .solr import solr_connection
# Enumerate the cores in Solr to determine if statistics have been sharded into
# yearly shards by DSpace's stats-util or not (for example: statistics-2018).
def get_statistics_shards():
# Initialize an empty list for statistics core years
statistics_core_years = []
# URL for Solr status to check active cores
solr_url = solr.host + '/admin/cores?action=STATUS&wt=json'
res = requests.get(solr_url)
if res.status_code == requests.codes.ok:
data = res.json()
# Iterate over active cores from Solr's STATUS response (cores are in
# the status array of this response).
for core in data['status']:
# Pattern to match, for example: statistics-2018
pattern = re.compile('^statistics-[0-9]{4}$')
if not pattern.match(core):
continue
# Append current core to list
statistics_core_years.append(core)
# Initialize a string to hold our shards (may end up being empty if the Solr
# core has not been processed by stats-util).
shards = str()
if len(statistics_core_years) > 0:
# Begin building a string of shards starting with the default one
shards = '{}/statistics'.format(solr.host)
for core in statistics_core_years:
# Create a comma-separated list of shards to pass to our Solr query
#
# See: https://wiki.apache.org/solr/DistributedSearch
shards += ',{}/{}'.format(solr.host, core)
# Return the string of shards, which may actually be empty. Solr doesn't
# seem to mind if the shards query parameter is empty and I haven't seen
# any negative performance impact so this should be fine.
return shards
def index_views():
# get total number of distinct facets for items with a minimum of 1 view,
# otherwise Solr returns all kinds of weird ids that are actually not in
@ -52,11 +99,17 @@ def index_views():
'facet.offset': 0,
'stats': True,
'stats.field': 'id',
'stats.calcdistinct': True
'stats.calcdistinct': True,
'shards': shards
}, rows=0)
# get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']
try:
# get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']
except TypeError:
print('No item views to index, exiting.')
exit(0)
# divide results into "pages" (cast to int to effectively round down)
results_per_page = 100
@ -78,7 +131,8 @@ def index_views():
'facet.field': 'id',
'facet.mincount': 1,
'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page
'facet.offset': results_current_page * results_per_page,
'shards': shards
}, rows=0)
# SolrClient's get_facets() returns a dict of dicts
@ -110,11 +164,17 @@ def index_downloads():
'facet.offset': 0,
'stats': True,
'stats.field': 'owningItem',
'stats.calcdistinct': True
'stats.calcdistinct': True,
'shards': shards
}, rows=0)
# get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']
try:
# get total number of distinct facets (countDistinct)
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']
except TypeError:
print('No item downloads to index, exiting.')
exit(0)
# divide results into "pages" (cast to int to effectively round down)
results_per_page = 100
@ -136,7 +196,8 @@ def index_downloads():
'facet.field': 'owningItem',
'facet.mincount': 1,
'facet.limit': results_per_page,
'facet.offset': results_current_page * results_per_page
'facet.offset': results_current_page * results_per_page,
'shards': shards
}, rows=0)
# SolrClient's get_facets() returns a dict of dicts
@ -167,6 +228,8 @@ with DatabaseManager() as db:
# commit the table creation before closing the database connection
db.commit()
shards = get_statistics_shards()
index_views()
index_downloads()

View File

@ -5,21 +5,21 @@ backcall==0.1.0
decorator==4.3.0
flake8==3.6.0
ipython-genutils==0.2.0
ipython==7.1.1
jedi==0.13.1
ipython==7.2.0
jedi==0.13.2
mccabe==0.6.1
more-itertools==4.3.0
more-itertools==5.0.0
parso==0.3.1
pexpect==4.6.0 ; sys_platform != 'win32'
pickleshare==0.7.5
pluggy==0.8.0
pluggy==0.8.1
prompt-toolkit==2.0.7
ptyprocess==0.6.0
py==1.7.0
pycodestyle==2.4.0
pyflakes==2.0.0
pygments==2.2.0
pytest==3.10.1
six==1.11.0
pygments==2.3.1
pytest==4.1.1
six==1.12.0
traitlets==4.3.2
wcwidth==0.1.7

View File

@ -4,4 +4,4 @@ git+https://github.com/alanorth/SolrClient.git@c629e3475be37c82770b2be61748be7e2
gunicorn==19.9.0
psycopg2-binary==2.7.6.1
python-mimeparse==1.6.0
six==1.11.0
six==1.12.0