mirror of
https://github.com/ilri/dspace-statistics-api.git
synced 2025-05-11 07:36:02 +02:00
Compare commits
31 Commits
Author | SHA1 | Date | |
---|---|---|---|
7499b89d99
|
|||
2c1e4952b1
|
|||
379f202c3f
|
|||
560fa6056d
|
|||
385a34e5d0
|
|||
d0ea62d2bd
|
|||
366ae25b8e
|
|||
0f3054ae03
|
|||
6bf34235d4
|
|||
e604d8ca81
|
|||
fc35b816f3
|
|||
9e6a2f7559
|
|||
46cfc3ffbc
|
|||
2850035a4c
|
|||
c0b550109a
|
|||
bfceffd84d
|
|||
d0552f5047
|
|||
c3a0bf7f44
|
|||
6e47e9c9ee
|
|||
cd90d618d6
|
|||
280d211d56
|
|||
806d63137f
|
|||
f7c7390e4f
|
|||
702724e8a4
|
|||
36818d03ef
|
|||
4cf8656b35
|
|||
f30a464cd1
|
|||
93ae12e313
|
|||
dc978e9333
|
|||
295436fea0
|
|||
46a1476ab0
|
9
.travis.yml
Normal file
9
.travis.yml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
language: python
|
||||||
|
python:
|
||||||
|
- "3.5"
|
||||||
|
- "3.6"
|
||||||
|
- "3.7"
|
||||||
|
install:
|
||||||
|
- pip install -r requirements.txt
|
||||||
|
|
||||||
|
# vim: ts=2 sw=2 et
|
27
CHANGELOG.md
27
CHANGELOG.md
@ -4,6 +4,33 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [0.4.1] - 2018-09-26
|
||||||
|
### Changed
|
||||||
|
- Use execute_values() to batch insert records to PostgreSQL
|
||||||
|
|
||||||
|
## [0.4.0] - 2018-09-25
|
||||||
|
### Fixed
|
||||||
|
- Invalid OnCalendar syntax in dspace-statistics-indexer.timer
|
||||||
|
- Major logic error in indexer.py
|
||||||
|
|
||||||
|
## [0.3.2] - 2018-09-25
|
||||||
|
### Changed
|
||||||
|
- /item/id route now returns HTTP 404 if an item is not found
|
||||||
|
|
||||||
|
## [0.3.1] - 2018-09-25
|
||||||
|
### Changed
|
||||||
|
- Force SolrClient's kazoo dependency to version 2.5.0 to work with Python 3.7
|
||||||
|
- Add Python 3.7 to Travis CI configuration
|
||||||
|
|
||||||
|
## [0.3.0] - 2018-09-25
|
||||||
|
### Added
|
||||||
|
- requirements.txt for pip
|
||||||
|
- Travis CI build configuration for Python 3.5 and 3.6
|
||||||
|
- Documentation on using the API
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- The "all items" route from / to /items
|
||||||
|
|
||||||
## [0.2.1] - 2018-09-24
|
## [0.2.1] - 2018-09-24
|
||||||
### Changed
|
### Changed
|
||||||
- Environment settings in example systemd unit files
|
- Environment settings in example systemd unit files
|
||||||
|
16
README.md
16
README.md
@ -1,22 +1,30 @@
|
|||||||
# DSpace Statistics API
|
# DSpace Statistics API
|
||||||
A quick and dirty REST API to expose Solr view and download statistics for items in a DSpace repository.
|
A quick and dirty REST API to expose Solr view and download statistics for items in a DSpace repository.
|
||||||
|
|
||||||
Written and tested in Python 3.6. SolrClient (0.2.1) does not currently run in Python 3.7.0. Requires PostgreSQL version 9.5 or greater for [`UPSERT` support](https://wiki.postgresql.org/wiki/UPSERT).
|
Written and tested in Python 3.5, 3.6, and 3.7. Requires PostgreSQL version 9.5 or greater for [`UPSERT` support](https://wiki.postgresql.org/wiki/UPSERT).
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
Create a virtual environment and run it:
|
Create a virtual environment and run it:
|
||||||
|
|
||||||
$ virtualenv -p /usr/bin/python3.6 venv
|
$ python -m venv venv
|
||||||
$ . venv/bin/activate
|
$ . venv/bin/activate
|
||||||
$ pip install falcon gunicorn SolrClient psycopg2-binary
|
$ pip install -r requirements.txt
|
||||||
$ gunicorn app:api
|
$ gunicorn app:api
|
||||||
|
|
||||||
|
## Using the API
|
||||||
|
The API exposes the following endpoints:
|
||||||
|
|
||||||
|
- GET `/items` — return views and downloads for all items that Solr knows about¹. Accepts `limit` and `page` query parameters for pagination of results.
|
||||||
|
- GET `/item/id` — return views and downloads for a single item (*id* must be a positive integer). Returns HTTP 404 if an item id is not found.
|
||||||
|
|
||||||
|
¹ We are querying the Solr statistics core, which technically only knows about items that have either views or downloads.
|
||||||
|
|
||||||
## Todo
|
## Todo
|
||||||
|
|
||||||
- Add API documentation
|
- Add API documentation
|
||||||
- Close up DB connection when gunicorn shuts down gracefully
|
- Close up DB connection when gunicorn shuts down gracefully
|
||||||
- Better logging
|
- Better logging
|
||||||
- Return HTTP 404 when item_id is nonexistent
|
- Tests
|
||||||
|
|
||||||
## License
|
## License
|
||||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||||
|
11
app.py
11
app.py
@ -47,8 +47,13 @@ class ItemResource:
|
|||||||
|
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
cursor.execute('SELECT views, downloads FROM items WHERE id={}'.format(item_id))
|
cursor.execute('SELECT views, downloads FROM items WHERE id={}'.format(item_id))
|
||||||
|
if cursor.rowcount == 0:
|
||||||
|
raise falcon.HTTPNotFound(
|
||||||
|
title='Item not found',
|
||||||
|
description='The item with id "{}" was not found.'.format(item_id)
|
||||||
|
)
|
||||||
|
else:
|
||||||
results = cursor.fetchone()
|
results = cursor.fetchone()
|
||||||
cursor.close()
|
|
||||||
|
|
||||||
statistics = {
|
statistics = {
|
||||||
'id': item_id,
|
'id': item_id,
|
||||||
@ -58,8 +63,10 @@ class ItemResource:
|
|||||||
|
|
||||||
resp.media = statistics
|
resp.media = statistics
|
||||||
|
|
||||||
|
cursor.close()
|
||||||
|
|
||||||
api = falcon.API()
|
api = falcon.API()
|
||||||
api.add_route('/', AllItemsResource())
|
api.add_route('/items', AllItemsResource())
|
||||||
api.add_route('/item/{item_id:int}', ItemResource())
|
api.add_route('/item/{item_id:int}', ItemResource())
|
||||||
|
|
||||||
# vim: set sw=4 ts=4 expandtab:
|
# vim: set sw=4 ts=4 expandtab:
|
||||||
|
@ -3,7 +3,7 @@ Description=DSpace Statistics Indexer
|
|||||||
|
|
||||||
[Timer]
|
[Timer]
|
||||||
# twice a day, at 6AM and 6PM
|
# twice a day, at 6AM and 6PM
|
||||||
OnCalendar=*-*-* 06:00:00,18:00:00
|
OnCalendar=*-*-* 06,18:00:00
|
||||||
# Add a random delay of 0–3600 seconds
|
# Add a random delay of 0–3600 seconds
|
||||||
RandomizedDelaySec=3600
|
RandomizedDelaySec=3600
|
||||||
Persistent=true
|
Persistent=true
|
||||||
|
@ -2,8 +2,7 @@ from config import DATABASE_NAME
|
|||||||
from config import DATABASE_USER
|
from config import DATABASE_USER
|
||||||
from config import DATABASE_PASS
|
from config import DATABASE_PASS
|
||||||
from config import DATABASE_HOST
|
from config import DATABASE_HOST
|
||||||
import psycopg2
|
import psycopg2, psycopg2.extras
|
||||||
import psycopg2.extras
|
|
||||||
|
|
||||||
def database_connection():
|
def database_connection():
|
||||||
connection = psycopg2.connect("dbname={} user={} password={} host='{}'".format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST), cursor_factory=psycopg2.extras.DictCursor)
|
connection = psycopg2.connect("dbname={} user={} password={} host='{}'".format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST), cursor_factory=psycopg2.extras.DictCursor)
|
||||||
|
93
indexer.py
93
indexer.py
@ -20,111 +20,140 @@
|
|||||||
# ---
|
# ---
|
||||||
#
|
#
|
||||||
# Connects to a DSpace Solr statistics core and ingests item views and downloads
|
# Connects to a DSpace Solr statistics core and ingests item views and downloads
|
||||||
# into a Postgres database for use with other applications (an API, for example).
|
# into a PostgreSQL database for use by other applications (like an API).
|
||||||
#
|
#
|
||||||
# This script is written for Python 3 and requires several modules that you can
|
# This script is written for Python 3.5+ and requires several modules that you
|
||||||
# install with pip (I recommend setting up a Python virtual environment first):
|
# can install with pip (I recommend using a Python virtual environment):
|
||||||
#
|
#
|
||||||
# $ pip install SolrClient
|
# $ pip install SolrClient psycopg2-binary
|
||||||
#
|
#
|
||||||
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
|
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
|
||||||
# See: https://wiki.duraspace.org/display/DSPACE/Solr
|
# See: https://wiki.duraspace.org/display/DSPACE/Solr
|
||||||
#
|
|
||||||
# Tested with Python 3.5 and 3.6.
|
|
||||||
|
|
||||||
from database import database_connection
|
from database import database_connection
|
||||||
|
import json
|
||||||
|
import psycopg2.extras
|
||||||
from solr import solr_connection
|
from solr import solr_connection
|
||||||
|
|
||||||
def index_views():
|
def index_views():
|
||||||
print("Populating database with item views.")
|
# get total number of distinct facets for items with a minimum of 1 view,
|
||||||
|
# otherwise Solr returns all kinds of weird ids that are actually not in
|
||||||
# determine the total number of items with views (aka Solr's numFound)
|
# the database. Also, stats are expensive, but we need stats.calcdistinct
|
||||||
|
# so we can get the countDistinct summary.
|
||||||
|
#
|
||||||
|
# see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
|
||||||
res = solr.query('statistics', {
|
res = solr.query('statistics', {
|
||||||
'q':'type:2',
|
'q':'type:2',
|
||||||
'fq':'isBot:false AND statistics_type:view',
|
'fq':'isBot:false AND statistics_type:view',
|
||||||
'facet':True,
|
'facet':True,
|
||||||
'facet.field':'id',
|
'facet.field':'id',
|
||||||
|
'facet.mincount':1,
|
||||||
|
'facet.limit':1,
|
||||||
|
'facet.offset':0,
|
||||||
|
'stats':True,
|
||||||
|
'stats.field':'id',
|
||||||
|
'stats.calcdistinct':True
|
||||||
}, rows=0)
|
}, rows=0)
|
||||||
|
|
||||||
# divide results into "pages" (numFound / 100)
|
# get total number of distinct facets (countDistinct)
|
||||||
results_numFound = res.get_num_found()
|
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']
|
||||||
|
|
||||||
|
# divide results into "pages" (cast to int to effectively round down)
|
||||||
results_per_page = 100
|
results_per_page = 100
|
||||||
results_num_pages = round(results_numFound / results_per_page)
|
results_num_pages = int(results_totalNumFacets / results_per_page)
|
||||||
results_current_page = 0
|
results_current_page = 0
|
||||||
|
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
|
|
||||||
|
# create an empty list to store values for batch insertion
|
||||||
|
data = []
|
||||||
|
|
||||||
while results_current_page <= results_num_pages:
|
while results_current_page <= results_num_pages:
|
||||||
print('Page {} of {}.'.format(results_current_page, results_num_pages))
|
print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
|
||||||
|
|
||||||
res = solr.query('statistics', {
|
res = solr.query('statistics', {
|
||||||
'q':'type:2',
|
'q':'type:2',
|
||||||
'fq':'isBot:false AND statistics_type:view',
|
'fq':'isBot:false AND statistics_type:view',
|
||||||
'facet':True,
|
'facet':True,
|
||||||
'facet.field':'id',
|
'facet.field':'id',
|
||||||
|
'facet.mincount':1,
|
||||||
'facet.limit':results_per_page,
|
'facet.limit':results_per_page,
|
||||||
'facet.offset':results_current_page * results_per_page
|
'facet.offset':results_current_page * results_per_page
|
||||||
})
|
}, rows=0)
|
||||||
|
|
||||||
# make sure total number of results > 0
|
|
||||||
if res.get_num_found() > 0:
|
|
||||||
# SolrClient's get_facets() returns a dict of dicts
|
# SolrClient's get_facets() returns a dict of dicts
|
||||||
views = res.get_facets()
|
views = res.get_facets()
|
||||||
# in this case iterate over the 'id' dict and get the item ids and views
|
# in this case iterate over the 'id' dict and get the item ids and views
|
||||||
for item_id, item_views in views['id'].items():
|
for item_id, item_views in views['id'].items():
|
||||||
cursor.execute('''INSERT INTO items(id, views) VALUES(%s, %s)
|
data.append((item_id, item_views))
|
||||||
ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
|
|
||||||
(item_id, item_views))
|
|
||||||
|
|
||||||
|
# do a batch insert of values from the current "page" of results
|
||||||
|
sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.views'
|
||||||
|
psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
# clear all items from the list so we can populate it with the next batch
|
||||||
|
data.clear()
|
||||||
|
|
||||||
results_current_page += 1
|
results_current_page += 1
|
||||||
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def index_downloads():
|
def index_downloads():
|
||||||
print("Populating database with item downloads.")
|
# get the total number of distinct facets for items with at least 1 download
|
||||||
|
|
||||||
# determine the total number of items with downloads (aka Solr's numFound)
|
|
||||||
res = solr.query('statistics', {
|
res = solr.query('statistics', {
|
||||||
'q':'type:0',
|
'q':'type:0',
|
||||||
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
||||||
'facet':True,
|
'facet':True,
|
||||||
'facet.field':'owningItem',
|
'facet.field':'owningItem',
|
||||||
|
'facet.mincount':1,
|
||||||
|
'facet.limit':1,
|
||||||
|
'facet.offset':0,
|
||||||
|
'stats':True,
|
||||||
|
'stats.field':'owningItem',
|
||||||
|
'stats.calcdistinct':True
|
||||||
}, rows=0)
|
}, rows=0)
|
||||||
|
|
||||||
# divide results into "pages" (numFound / 100)
|
# get total number of distinct facets (countDistinct)
|
||||||
results_numFound = res.get_num_found()
|
results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']
|
||||||
|
|
||||||
|
# divide results into "pages" (cast to int to effectively round down)
|
||||||
results_per_page = 100
|
results_per_page = 100
|
||||||
results_num_pages = round(results_numFound / results_per_page)
|
results_num_pages = int(results_totalNumFacets / results_per_page)
|
||||||
results_current_page = 0
|
results_current_page = 0
|
||||||
|
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
|
|
||||||
|
# create an empty list to store values for batch insertion
|
||||||
|
data = []
|
||||||
|
|
||||||
while results_current_page <= results_num_pages:
|
while results_current_page <= results_num_pages:
|
||||||
print('Page {} of {}.'.format(results_current_page, results_num_pages))
|
print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
|
||||||
|
|
||||||
res = solr.query('statistics', {
|
res = solr.query('statistics', {
|
||||||
'q':'type:0',
|
'q':'type:0',
|
||||||
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
|
||||||
'facet':True,
|
'facet':True,
|
||||||
'facet.field':'owningItem',
|
'facet.field':'owningItem',
|
||||||
|
'facet.mincount':1,
|
||||||
'facet.limit':results_per_page,
|
'facet.limit':results_per_page,
|
||||||
'facet.offset':results_current_page * results_per_page
|
'facet.offset':results_current_page * results_per_page
|
||||||
})
|
}, rows=0)
|
||||||
|
|
||||||
# make sure total number of results > 0
|
|
||||||
if res.get_num_found() > 0:
|
|
||||||
# SolrClient's get_facets() returns a dict of dicts
|
# SolrClient's get_facets() returns a dict of dicts
|
||||||
downloads = res.get_facets()
|
downloads = res.get_facets()
|
||||||
# in this case iterate over the 'owningItem' dict and get the item ids and downloads
|
# in this case iterate over the 'owningItem' dict and get the item ids and downloads
|
||||||
for item_id, item_downloads in downloads['owningItem'].items():
|
for item_id, item_downloads in downloads['owningItem'].items():
|
||||||
cursor.execute('''INSERT INTO items(id, downloads) VALUES(%s, %s)
|
data.append((item_id, item_downloads))
|
||||||
ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
|
|
||||||
(item_id, item_downloads))
|
|
||||||
|
|
||||||
|
# do a batch insert of values from the current "page" of results
|
||||||
|
sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
|
||||||
|
psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
# clear all items from the list so we can populate it with the next batch
|
||||||
|
data.clear()
|
||||||
|
|
||||||
results_current_page += 1
|
results_current_page += 1
|
||||||
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
12
requirements.txt
Normal file
12
requirements.txt
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
certifi==2018.8.24
|
||||||
|
chardet==3.0.4
|
||||||
|
falcon==1.4.1
|
||||||
|
gunicorn==19.9.0
|
||||||
|
idna==2.7
|
||||||
|
kazoo==2.5.0
|
||||||
|
psycopg2-binary==2.7.5
|
||||||
|
python-mimeparse==1.6.0
|
||||||
|
requests==2.19.1
|
||||||
|
six==1.11.0
|
||||||
|
SolrClient==0.2.1
|
||||||
|
urllib3==1.23
|
Reference in New Issue
Block a user