1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2025-05-10 23:26:02 +02:00

Compare commits

...

18 Commits

Author SHA1 Message Date
cbc98991b4 CHANGELOG.md: Move unreleased notes to version 0.1.0 2018-09-24 16:14:14 +03:00
6c28be0463 README.md: Add note about route for all items 2018-09-24 16:13:26 +03:00
42e8f17305 CHANGELOG.md: Add note about route for all items 2018-09-24 16:13:05 +03:00
19a45f3f6f app.py: Add route to page through all item statistics
This route exposes all item statistics and uses the limit and offset
parameters to control paging throug the result set. The logic here
is extremely easy thanks to the brilliant LIMIT and OFFSET features
of SQLite (of course the SQL query sorts the results by some unique
field to ensure the order is already the same).
2018-09-24 16:07:26 +03:00
505ef31101 CHANGELOG.md: Add note about UPSERT 2018-09-24 14:31:05 +03:00
1543cacc54 app.py: Update SQL logic to use single table
The indexer.py script was updated to use a single table because I
learned about UPSERT. This simplifies the database schema and the
Python logic, and makes it easier to page all views and downloads
at once without complicated JOIN queries.
2018-09-24 14:28:00 +03:00
2cab456f16 indexer.py: Use single items table with UPSERT
I was using two separate tables for item views and downloads without
realizing that SQLite didn't support FULL OUTER JOIN, which would be
needed to get views and downloads for a given item in a single query.

Instead I can use one table with a default value of 0 for both views
and downloads, and then use "UPSERT" to populate the statistics. This
is a newish SQL concept that allows you to attempt an INSERT and then
specify an action to perform in case of conflict. This works well in
SQLite and actually simplifies my Python logic greatly!

Note that the "excluded" table qualifier is a special keyword that
allows you to reference the value that would have been inserted.

See: https://www.sqlite.org/lang_UPSERT.html
2018-09-24 14:19:50 +03:00
53615dea2d indexer.py: Add license and documentation 2018-09-24 09:18:50 +03:00
2d8d1e6833 README.md: Add TODO for nonexistent items 2018-09-24 00:48:02 +03:00
e26e595ea1 README.md: Add more TODOs 2018-09-24 00:35:00 +03:00
a9151b5bbf CHANGELOG.md: Update unreleased notes 2018-09-24 00:30:58 +03:00
76833d6f5f contrib: Update some old CGSpace references to DSpace 2018-09-24 00:30:26 +03:00
a51422273c Remove SOLR_CORE configuration variable
This parameter is not customizable. All DSpace instances use this
name for the Solr statistics core.
2018-09-24 00:20:54 +03:00
89621af85d Split database access into RW and RO
The indexer need to be able to write to the database, but the API only
needs to read it.
2018-09-24 00:00:05 +03:00
c554404d7f CHANGELOG.md: Add systemd units for indexer 2018-09-23 23:15:27 +03:00
90d7a452bd contrib: Add systemd units for indexer
An example systemd service unit for the indexer and an accompanying
timer unit.
2018-09-23 23:13:43 +03:00
431a1c9d64 CHANGELOG.md: Add unreleased changes 2018-09-23 23:04:01 +03:00
e1b9d1284f Rename project to DSpace Statistics API
At first I called it "CGSpace" because I was making it specifically
for our CGSpace DSpace repository, but the potential here is bigger
than that!
2018-09-23 23:02:21 +03:00
9 changed files with 143 additions and 45 deletions

View File

@ -4,6 +4,20 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.1.0] - 2018-09-24
### Changed
- Rename project to "DSpace Statistics API"
- Use read-only database connection in API
- Update systemd units for CGSpace→DSpace rename
- Use UPSERT to simplify database schema and Python logic
### Added
- Example systemd service and timer unit for indexer service
- Add top-level route to expose all item statistics
### Removed
- Ability to customize SOLR_CORE variable
## [0.0.4] - 2018-09-23
### Added
- Added example systemd unit file for API

View File

@ -1,4 +1,4 @@
# CGSpace Statistics API
# DSpace Statistics API
A quick and dirty REST API to expose Solr view and download statistics for items in a DSpace repository.
Written and tested in Python 3.6. SolrClient (0.2.1) does not currently run in Python 3.7.0.
@ -13,8 +13,10 @@ Create a virtual environment and run it:
## Todo
- Ability to return a paginated list of items (on a different route?)
- Add API documentation
- Close up DB connection when gunicorn shuts down gracefully
- Better logging
- Return HTTP 404 when item_id is nonexistent
## License
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).

61
app.py
View File

@ -2,44 +2,67 @@
# See DSpace Solr docs for tips about parameters
# https://wiki.duraspace.org/display/DSPACE/Solr
from config import SOLR_CORE
from database import database_connection
from database import database_connection_ro
import falcon
from solr import solr_connection
db = database_connection()
db = database_connection_ro()
solr = solr_connection()
class AllItemsResource:
def on_get(self, req, resp):
"""Handles GET requests"""
# Return HTTPBadRequest if id parameter is not present and valid
limit = req.get_param_as_int("limit", min=0, max=100) or 100
page = req.get_param_as_int("page", min=0) or 0
offset = limit * page
cursor = db.cursor()
# get total number of items so we can estimate the pages
cursor.execute('SELECT COUNT(id) FROM items')
pages = round(cursor.fetchone()[0] / limit)
# get statistics, ordered by id, and use limit and offset to page through results
cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT {0} OFFSET {1}'.format(limit, offset))
results = cursor.fetchmany(limit)
cursor.close()
# create a list to hold dicts of item stats
statistics = list()
# iterate over results and build statistics object
for item in results:
statistics.append({ 'id': item['id'], 'views': item['views'], 'downloads': item['downloads'] })
message = {
'currentPage': page,
'totalPages': pages,
'limit': limit,
'statistics': statistics
}
resp.media = message
class ItemResource:
def on_get(self, req, resp, item_id):
"""Handles GET requests"""
cursor = db.cursor()
# get item views (and catch the TypeError if item doesn't have any views)
cursor.execute('SELECT views FROM itemviews WHERE id={0}'.format(item_id))
try:
views = cursor.fetchone()['views']
except:
views = 0
# get item downloads (and catch the TypeError if item doesn't have any downloads)
cursor.execute('SELECT downloads FROM itemdownloads WHERE id={0}'.format(item_id))
try:
downloads = cursor.fetchone()['downloads']
except:
downloads = 0
cursor.execute('SELECT views, downloads FROM items WHERE id={0}'.format(item_id))
results = cursor.fetchone()
cursor.close()
statistics = {
'id': item_id,
'views': views,
'downloads': downloads
'views': results['views'],
'downloads': results['downloads']
}
resp.media = statistics
api = falcon.API()
api.add_route('/', AllItemsResource())
api.add_route('/item/{item_id:int}', ItemResource())
# vim: set sw=4 ts=4 expandtab:

View File

@ -2,7 +2,6 @@ import os
# Check if Solr connection information was provided in the environment
SOLR_SERVER = os.environ.get('SOLR_SERVER', 'http://localhost:8080/solr')
SOLR_CORE = os.environ.get('SOLR_CORE', 'statistics')
SQLITE_DB = os.environ.get('SQLITE_DB', 'statistics.db')

View File

@ -1,15 +1,14 @@
[Unit]
Description=CGSpace Statistics API
Description=DSpace Statistics API
After=network.target
[Service]
Environment=SOLR_SERVER=http://localhost:8081/solr
Environment=SOLR_CORE=statistics
User=nobody
Group=nogroup
WorkingDirectory=/opt/ilri/cgspace-statistics-api
ExecStart=/opt/ilri/cgspace-statistics-api/venv/bin/gunicorn \
--bind 127.0.0.1:5000 \
WorkingDirectory=/opt/ilri/dspace-statistics-api
ExecStart=/opt/ilri/dspace-statistics-api/venv/bin/gunicorn \
--bind 127.0.0.1:5000 \
app:api
ExecReload=/bin/kill -s HUP $MAINPID
ExecStop=/bin/kill -s TERM $MAINPID

View File

@ -0,0 +1,13 @@
[Unit]
Description=DSpace Statistics Indexer
After=tomcat7.target
[Service]
Environment=SOLR_SERVER=http://localhost:8081/solr
User=nobody
Group=nogroup
WorkingDirectory=/opt/ilri/dspace-statistics-api
ExecStart=/opt/ilri/dspace-statistics-api/venv/bin/python indexer.py
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,12 @@
[Unit]
Description=DSpace Statistics Indexer
[Timer]
# twice a day, at 6AM and 6PM
OnCalendar=*-*-* 06:00:00,18:00:00
# Add a random delay of 03600 seconds
RandomizedDelaySec=3600
Persistent=true
[Install]
WantedBy=timers.target

View File

@ -1,11 +1,18 @@
from config import SQLITE_DB
import sqlite3
def database_connection():
def database_connection_rw():
connection = sqlite3.connect(SQLITE_DB)
# allow iterating over row results by column key
connection.row_factory = sqlite3.Row
return connection
def database_connection_ro():
connection = sqlite3.connect('file:{0}?mode=ro'.format(SQLITE_DB), uri=True)
# allow iterating over row results by column key
connection.row_factory = sqlite3.Row
return connection
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,18 +1,45 @@
#!/usr/bin/env python
#
# Tested with Python 3.6
# See DSpace Solr docs for tips about parameters
# https://wiki.duraspace.org/display/DSPACE/Solr
# indexer.py
#
# Copyright 2018 Alan Orth.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ---
#
# Connects to a DSpace Solr statistics core and ingests item views and downloads
# into a SQLite database for use with other applications (an API, for example).
#
# This script is written for Python 3 and requires several modules that you can
# install with pip (I recommend setting up a Python virtual environment first):
#
# $ pip install SolrClient
#
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
# See: https://wiki.duraspace.org/display/DSPACE/Solr
#
# Tested with Python 3.5 and 3.6.
from config import SOLR_CORE
from database import database_connection
from database import database_connection_rw
from solr import solr_connection
def index_views():
print("Populating database with item views.")
# determine the total number of items with views (aka Solr's numFound)
res = solr.query(SOLR_CORE, {
res = solr.query('statistics', {
'q':'type:2',
'fq':'isBot:false AND statistics_type:view',
'facet':True,
@ -28,7 +55,7 @@ def index_views():
while results_current_page <= results_num_pages:
print('Page {0} of {1}.'.format(results_current_page, results_num_pages))
res = solr.query(SOLR_CORE, {
res = solr.query('statistics', {
'q':'type:2',
'fq':'isBot:false AND statistics_type:view',
'facet':True,
@ -43,7 +70,9 @@ def index_views():
views = res.get_facets()
# in this case iterate over the 'id' dict and get the item ids and views
for item_id, item_views in views['id'].items():
db.execute('''REPLACE INTO itemviews VALUES (?, ?)''', (item_id, item_views))
db.execute('''INSERT INTO items(id, views) VALUES(?, ?)
ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
(item_id, item_views))
db.commit()
@ -53,7 +82,7 @@ def index_downloads():
print("Populating database with item downloads.")
# determine the total number of items with downloads (aka Solr's numFound)
res = solr.query(SOLR_CORE, {
res = solr.query('statistics', {
'q':'type:0',
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
'facet':True,
@ -69,7 +98,7 @@ def index_downloads():
while results_current_page <= results_num_pages:
print('Page {0} of {1}.'.format(results_current_page, results_num_pages))
res = solr.query(SOLR_CORE, {
res = solr.query('statistics', {
'q':'type:0',
'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
'facet':True,
@ -84,20 +113,20 @@ def index_downloads():
downloads = res.get_facets()
# in this case iterate over the 'owningItem' dict and get the item ids and downloads
for item_id, item_downloads in downloads['owningItem'].items():
db.execute('''REPLACE INTO itemdownloads VALUES (?, ?)''', (item_id, item_downloads))
db.execute('''INSERT INTO items(id, downloads) VALUES(?, ?)
ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
(item_id, item_downloads))
db.commit()
results_current_page += 1
db = database_connection()
db = database_connection_rw()
solr = solr_connection()
# use separate views and downloads tables so we can REPLACE INTO carelessly (ie, item may have views but no downloads)
db.execute('''CREATE TABLE IF NOT EXISTS itemviews
(id integer primary key, views integer)''')
db.execute('''CREATE TABLE IF NOT EXISTS itemdownloads
(id integer primary key, downloads integer)''')
# create table to store item views and downloads
db.execute('''CREATE TABLE IF NOT EXISTS items
(id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)''')
index_views()
index_downloads()