Mirror of https://github.com/ilri/dspace-statistics-api.git (synced 2025-05-10 15:16:02 +02:00)

Compare commits: 134 commits
Commits in this compare (SHA1):

78900b5d85, eb08832bf8, c2ec780ad9, df8ebc8bf1, 0d4be5f4c8, 30dc7f1939,
77194707fd, 10c1f8bdcc, da74943da2, fc8348ab29, 15c3299b99, d36be5ee50,
2f45d27554, b8356f7a87, 2136dc79ce, ed60120cef, c027f01b48, 754663f062,
507699e58a, a016916995, 6fd2827a7c, 62142eb79e, fda0321942, 963aa245c8,
568ff2eebb, deecb8a10b, 12f45d7c08, f65089f9ce, 1db5cf1c29, e581c4b1aa,
e8d356c9ca, 34a9b8d629, 41e3d66a0e, 9b2a6137b4, 600b986f99, 49a7790794,
f2deba627c, 9323513794, daf15610f2, 4ede966dbb, 3580473a6d, 071c24535f,
4291aecac4, 46bf537e88, eaca5354d3, 4600288ee4, 8179563378, b14c3eef4d,
71a789b13f, c68ddacaa4, 9c9e79769e, 2ad5ade556, 7412a09670, bb744a00b8,
7499b89d99, 2c1e4952b1, 379f202c3f, 560fa6056d, 385a34e5d0, d0ea62d2bd,
366ae25b8e, 0f3054ae03, 6bf34235d4, e604d8ca81, fc35b816f3, 9e6a2f7559,
46cfc3ffbc, 2850035a4c, c0b550109a, bfceffd84d, d0552f5047, c3a0bf7f44,
6e47e9c9ee, cd90d618d6, 280d211d56, 806d63137f, f7c7390e4f, 702724e8a4,
36818d03ef, 4cf8656b35, f30a464cd1, 93ae12e313, dc978e9333, 295436fea0,
46a1476ab0, 87dbb6c4df, 3160c44566, 4b72f626d9, 2d3b7620e3, 6e4bc630f7,
44884140e5, 74ff86ee3b, 3327884f21, 8f7450f67a, 28d61fb041, cbc98991b4,
6c28be0463, 42e8f17305, 19a45f3f6f, 505ef31101, 1543cacc54, 2cab456f16,
53615dea2d, 2d8d1e6833, e26e595ea1, a9151b5bbf, 76833d6f5f, a51422273c,
89621af85d, c554404d7f, 90d7a452bd, 431a1c9d64, e1b9d1284f, bac764a0a4,
1a650e57c0, 2db5e02be9, 9e942736b1, ea85393b13, cbeb7c89a7, b0d81a543c,
84801a4ab5, 4e8621e3d9, 2c8430171d, fb60133713, 9e01a80011, a263996582,
ed9d25294e, 5e165d2e88, 8e29fd8a43, 24af83b03f, a87aaba812, 57faec59c8,
06ab254017, 5b5cab8b34
11 .travis.yml (new file)
@@ -0,0 +1,11 @@
language: python
python:
  - "3.5"
  - "3.6"
  - "3.7-dev"
script: pip install -r requirements.txt
branches:
  only:
    - master

# vim: ts=2 sw=2 et
106 CHANGELOG.md
@@ -4,6 +4,112 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.6.1] - 2018-10-31
### Added
- API documentation at root path (/)

## [0.6.0] - 2018-10-31
### Changed
- Refactor project structure (note breaking changes to API and indexing invocation, see contrib and README.md)

## [0.5.2] - 2018-10-28
### Changed
- Update library versions in requirements.txt

## [0.5.1] - 2018-10-24
### Changed
- Use Python's native json instead of ujson

## [0.5.0] - 2018-10-24
### Added
- Example nginx configuration to README.md

### Changed
- Don't initialize Solr connection in API

## [0.4.3] - 2018-10-17
### Changed
- Use pip install as script for Travis CI

### Improved
- Documentation for deployment and testing

## [0.4.2] - 2018-10-04
### Changed
- README.md introduction and requirements
- Use ujson instead of json
- Iterate directly on SQL cursor in `/items` route

### Fixed
- Logic error in SQL for item views

## [0.4.1] - 2018-09-26
### Changed
- Use execute_values() to batch insert records to PostgreSQL

## [0.4.0] - 2018-09-25
### Fixed
- Invalid OnCalendar syntax in dspace-statistics-indexer.timer
- Major logic error in indexer.py

## [0.3.2] - 2018-09-25
### Changed
- /item/id route now returns HTTP 404 if an item is not found

## [0.3.1] - 2018-09-25
### Changed
- Force SolrClient's kazoo dependency to version 2.5.0 to work with Python 3.7
- Add Python 3.7 to Travis CI configuration

## [0.3.0] - 2018-09-25
### Added
- requirements.txt for pip
- Travis CI build configuration for Python 3.5 and 3.6
- Documentation on using the API

### Changed
- The "all items" route from / to /items

## [0.2.1] - 2018-09-24
### Changed
- Environment settings in example systemd unit files
- Use psycopg2.extras.DictCursor for PostgreSQL connection

## [0.2.0] - 2018-09-24
### Changed
- Use PostgreSQL instead of SQLite because UPSERT support needs a very new libsqlite3 whereas it's already in PostgreSQL 9.5+

## [0.1.0] - 2018-09-24
### Changed
- Rename project to "DSpace Statistics API"
- Use read-only database connection in API
- Update systemd units for CGSpace→DSpace rename
- Use UPSERT to simplify database schema and Python logic

### Added
- Example systemd service and timer unit for indexer service
- Add top-level route to expose all item statistics

### Removed
- Ability to customize SOLR_CORE variable

## [0.0.4] - 2018-09-23
### Added
- Added example systemd unit file for API
- Added indexer.py to ingest views and downloads from Solr to a SQLite database

### Changed
- Refactor Solr configuration and connection
- /item route now expects id as part of the URI instead of a query parameter: /item/id
- View and download stats are now fetched from a SQLite database

## [0.0.3] - 2018-09-20
### Changed
- Refactor environment variables into config module
- Simplify Solr query for "downloads"
- Optimize Solr query by using rows=0
- Fix Solr queries for item views

## [0.0.2] - 2018-09-18
### Added
- Ability to get Solr parameters from environment (`SOLR_SERVER` and `SOLR_CORE`)
82 README.md
@@ -1,19 +1,83 @@
# CGSpace Statistics API
A quick and dirty REST API to expose Solr view and download statistics for items in a DSpace repository.
# DSpace Statistics API [![Build Status](https://travis-ci.org/ilri/dspace-statistics-api.svg?branch=master)](https://travis-ci.org/ilri/dspace-statistics-api)
DSpace versions 4.0 and up include a [REST API](https://wiki.duraspace.org/display/DSDOC5x/REST+API) that allows the repository to be queried programmatically. The API exposes information about communities, collections, items, and bitstreams, but not item views or downloads. This project contains a lightweight indexer and a web application to make the view and download statistics available via a simple REST API that can be deployed simultaneously with DSpace's own.

Written and tested in Python 3.6. SolrClient (0.2.1) does not currently run in Python 3.7.0.
You can read more about the Solr queries used to gather the item view and download statistics on the [DSpace wiki](https://wiki.duraspace.org/display/DSPACE/Solr).

## Installation
Create a virtual environment and run it:
## Requirements

    $ virtualenv -p /usr/bin/python3.6 venv
- Python 3.5+
- PostgreSQL version 9.5+ (due to [`UPSERT` support](https://wiki.postgresql.org/wiki/UPSERT))
- DSpace 4+ with [Solr usage statistics enabled](https://wiki.duraspace.org/display/DSDOC5x/SOLR+Statistics)

## Installation and Testing
Create a Python virtual environment and install the dependencies:

    $ python -m venv venv
    $ . venv/bin/activate
    $ pip install falcon gunicorn SolrClient
    $ gunicorn app:api
    $ pip install -r requirements.txt

Set up the environment variables for Solr and PostgreSQL:

    $ export SOLR_SERVER=http://localhost:8080/solr
    $ export DATABASE_NAME=dspacestatistics
    $ export DATABASE_USER=dspacestatistics
    $ export DATABASE_PASS=dspacestatistics
    $ export DATABASE_HOST=localhost

Index the Solr statistics core to populate the PostgreSQL database:

    $ python -m dspace_statistics_api.indexer

Run the REST API:

    $ gunicorn dspace_statistics_api.app

Test to see if there are any statistics:

    $ curl 'http://localhost:8000/items?limit=1'

## Deployment
There are example systemd service and timer units in the `contrib` directory. The API service listens on localhost by default so you will need to expose it publicly using a web server like nginx.

An example nginx configuration is:

```
server {
    #...

    location ~ /rest/statistics/?(.*) {
        access_log /var/log/nginx/statistics.log;
        proxy_pass http://statistics_api/$1$is_args$args;
    }
}

upstream statistics_api {
    server 127.0.0.1:5000;
}
```

This would expose the API at `/rest/statistics`.
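For example, with a hypothetical public hostname of `dspace.example.org` and an arbitrary item id, the proxied API could then be queried as:

    $ curl 'https://dspace.example.org/rest/statistics/item/1234'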
## Using the API
The API exposes the following endpoints:

- GET `/` — return a basic API documentation page.
- GET `/items` — return views and downloads for all items that Solr knows about¹. Accepts `limit` and `page` query parameters for pagination of results (`limit` must be an integer between 1 and 100, and `page` must be an integer greater than or equal to 0).
- GET `/item/id` — return views and downloads for a single item (`id` must be a positive integer). Returns HTTP 404 if an item id is not found.

The item id is the *internal* id for an item. You can get these from the standard DSpace REST API.

¹ We are querying the Solr statistics core, which technically only knows about items that have either views or downloads. If an item is not present here you can assume it has zero views and zero downloads, but not necessarily that it does not exist in the repository.
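As a sketch of the response shape, the field names below come from the `/items` handler in `dspace_statistics_api/app.py`; the counts and page totals are invented for illustration:

    $ curl 'http://localhost:8000/items?limit=1'
    {
      "currentPage": 0,
      "totalPages": 8020,
      "limit": 1,
      "statistics": [
        {"id": 10, "views": 3, "downloads": 5}
      ]
    }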
## Todo

- Take a list of items (POST in JSON?)
- Close DB connection when gunicorn shuts down gracefully
- Better logging
- Tests
- Check if database exists (try/except)
- Version API
- Use JSON in PostgreSQL
- Switch to [Python 3.6+ f-string syntax](https://realpython.com/python-f-strings/)

## License
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
46 app.py (file deleted)
@@ -1,46 +0,0 @@
# Tested with Python 3.6
# See DSpace Solr docs for tips about parameters
# https://wiki.duraspace.org/display/DSPACE/Solr

import falcon
import os
from SolrClient import SolrClient

# Check if Solr connection information was provided in the environment
solr_server = os.environ.get('SOLR_SERVER', 'http://localhost:8080/solr')
solr_core = os.environ.get('SOLR_CORE', 'statistics')

class ItemResource:
    def on_get(self, req, resp):
        """Handles GET requests"""
        # Return HTTPBadRequest if id parameter is not present and valid
        item_id = req.get_param_as_int("id", required=True, min=0)

        solr = SolrClient(solr_server)

        # Get views
        res = solr.query(solr_core, {
            'q':'type:0',
            'fq':'owningItem:{0} AND isBot:false AND statistics_type:view AND -bundleName:ORIGINAL'.format(item_id)
        })

        views = res.get_num_found()

        # Get downloads
        res = solr.query(solr_core, {
            'q':'type:0',
            'fq':'owningItem:{0} AND isBot:false AND statistics_type:view AND -(bundleName:[* TO *] -bundleName:ORIGINAL)'.format(item_id)
        })

        downloads = res.get_num_found()

        statistics = {
            'id': item_id,
            'views': views,
            'downloads': downloads
        }

        resp.media = statistics

api = falcon.API()
api.add_route('/item', ItemResource())
20 contrib/dspace-statistics-api.service (new file)
@@ -0,0 +1,20 @@
[Unit]
Description=DSpace Statistics API
After=network.target

[Service]
Environment=DATABASE_NAME=dspacestatistics
Environment=DATABASE_USER=dspacestatistics
Environment=DATABASE_PASS=dspacestatistics
Environment=DATABASE_HOST=localhost
User=nobody
Group=nogroup
WorkingDirectory=/var/lib/dspace-statistics-api
ExecStart=/var/lib/dspace-statistics-api/venv/bin/gunicorn \
    --bind 127.0.0.1:5000 \
    dspace_statistics_api.app
ExecReload=/bin/kill -s HUP $MAINPID
ExecStop=/bin/kill -s TERM $MAINPID

[Install]
WantedBy=multi-user.target
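A minimal way to install and start this unit, assuming the project is deployed to `/var/lib/dspace-statistics-api` as the unit file expects, is the usual systemd workflow:

    $ sudo cp contrib/dspace-statistics-api.service /etc/systemd/system/
    $ sudo systemctl daemon-reload
    $ sudo systemctl enable --now dspace-statistics-api.service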
17 contrib/dspace-statistics-indexer.service (new file)
@@ -0,0 +1,17 @@
[Unit]
Description=DSpace Statistics Indexer
After=tomcat7.target

[Service]
Environment=SOLR_SERVER=http://localhost:8081/solr
Environment=DATABASE_NAME=dspacestatistics
Environment=DATABASE_USER=dspacestatistics
Environment=DATABASE_PASS=dspacestatistics
Environment=DATABASE_HOST=localhost
User=nobody
Group=nogroup
WorkingDirectory=/var/lib/dspace-statistics-api
ExecStart=/var/lib/dspace-statistics-api/venv/bin/python -m dspace_statistics_api.indexer

[Install]
WantedBy=multi-user.target
12 contrib/dspace-statistics-indexer.timer (new file)
@@ -0,0 +1,12 @@
[Unit]
Description=DSpace Statistics Indexer

[Timer]
# twice a day, at 6AM and 6PM
OnCalendar=*-*-* 06,18:00:00
# Add a random delay of 0–3600 seconds
RandomizedDelaySec=3600
Persistent=true

[Install]
WantedBy=timers.target
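On a reasonably recent systemd the `OnCalendar` expression (the syntax that was fixed in 0.4.0) can be sanity-checked, and the timer is enabled the same way as the service units:

    $ systemd-analyze calendar '*-*-* 06,18:00:00'
    $ sudo systemctl enable --now dspace-statistics-indexer.timer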
0 dspace_statistics_api/__init__.py (new, empty file)
78 dspace_statistics_api/app.py (new file)
@@ -0,0 +1,78 @@
from .database import database_connection
import falcon

db = database_connection()
db.set_session(readonly=True)

class RootResource:
    def on_get(self, req, resp):
        resp.status = falcon.HTTP_200
        resp.content_type = 'text/html'
        with open('dspace_statistics_api/docs/index.html', 'r') as f:
            resp.body = f.read()

class AllItemsResource:
    def on_get(self, req, resp):
        """Handles GET requests"""
        # Return HTTPBadRequest if id parameter is not present and valid
        limit = req.get_param_as_int("limit", min=0, max=100) or 100
        page = req.get_param_as_int("page", min=0) or 0
        offset = limit * page

        cursor = db.cursor()

        # get total number of items so we can estimate the pages
        cursor.execute('SELECT COUNT(id) FROM items')
        pages = round(cursor.fetchone()[0] / limit)

        # get statistics, ordered by id, and use limit and offset to page through results
        cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT {} OFFSET {}'.format(limit, offset))

        # create a list to hold dicts of item stats
        statistics = list()

        # iterate over results and build statistics object
        for item in cursor:
            statistics.append({ 'id': item['id'], 'views': item['views'], 'downloads': item['downloads'] })

        cursor.close()

        message = {
            'currentPage': page,
            'totalPages': pages,
            'limit': limit,
            'statistics': statistics
        }

        resp.media = message

class ItemResource:
    def on_get(self, req, resp, item_id):
        """Handles GET requests"""

        cursor = db.cursor()
        cursor.execute('SELECT views, downloads FROM items WHERE id={}'.format(item_id))
        if cursor.rowcount == 0:
            raise falcon.HTTPNotFound(
                title='Item not found',
                description='The item with id "{}" was not found.'.format(item_id)
            )
        else:
            results = cursor.fetchone()

            statistics = {
                'id': item_id,
                'views': results['views'],
                'downloads': results['downloads']
            }

            resp.media = statistics

        cursor.close()

api = application = falcon.API()
api.add_route('/', RootResource())
api.add_route('/items', AllItemsResource())
api.add_route('/item/{item_id:int}', ItemResource())

# vim: set sw=4 ts=4 expandtab:
11 dspace_statistics_api/config.py (new file)
@@ -0,0 +1,11 @@
import os

# Check if Solr connection information was provided in the environment
SOLR_SERVER = os.environ.get('SOLR_SERVER', 'http://localhost:8080/solr')

DATABASE_NAME = os.environ.get('DATABASE_NAME', 'dspacestatistics')
DATABASE_USER = os.environ.get('DATABASE_USER', 'dspacestatistics')
DATABASE_PASS = os.environ.get('DATABASE_PASS', 'dspacestatistics')
DATABASE_HOST = os.environ.get('DATABASE_HOST', 'localhost')

# vim: set sw=4 ts=4 expandtab:
12 dspace_statistics_api/database.py (new file)
@@ -0,0 +1,12 @@
from .config import DATABASE_NAME
from .config import DATABASE_USER
from .config import DATABASE_PASS
from .config import DATABASE_HOST
import psycopg2, psycopg2.extras

def database_connection():
    connection = psycopg2.connect("dbname={} user={} password={} host='{}'".format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST), cursor_factory=psycopg2.extras.DictCursor)

    return connection

# vim: set sw=4 ts=4 expandtab:
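The connection is created with `psycopg2.extras.DictCursor` so that rows can be read by column name, which is how the API handlers access `item['views']` and `item['downloads']`. A minimal sketch, assuming the `items` table has already been populated by the indexer:

```python
from dspace_statistics_api.database import database_connection

# Open a connection and read one row; DictCursor lets us index columns by name.
db = database_connection()
cursor = db.cursor()
cursor.execute('SELECT id, views, downloads FROM items LIMIT 1')
row = cursor.fetchone()
if row is not None:
    print(row['id'], row['views'], row['downloads'])
cursor.close()
db.close()
```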
20 dspace_statistics_api/docs/index.html (new file)
@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
    <meta charset="UTF-8">
    <title>DSpace Statistics API</title>
</head>
<body>
    <h1>DSpace Statistics API</h1>
    <p>This site is running the <a href="https://github.com/ilri/dspace-statistics-api" title="DSpace Statistics API project">DSpace Statistics API</a>. The following endpoints are available:</p>
    <ul>
        <li>GET <code>/</code> — return a basic API documentation page.</li>
        <li>GET <code>/items</code> — return views and downloads for all items that Solr knows about¹. Accepts <code>limit</code> and <code>page</code> query parameters for pagination of results (<code>limit</code> must be an integer between 1 and 100, and <code>page</code> must be an integer greater than or equal to 0).</li>
        <li>GET <code>/item/id</code> — return views and downloads for a single item (<code>id</code> must be a positive integer). Returns HTTP 404 if an item id is not found.</li>
    </ul>

    <p>The item id is the <em>internal</em> id for an item. You can get these from the standard DSpace REST API.</p>

    <p>¹ We are querying the Solr statistics core, which technically only knows about items that have either views or downloads. If an item is not present here you can assume it has zero views and zero downloads, but not necessarily that it does not exist in the repository.</p>
</body>
</html>
172 dspace_statistics_api/indexer.py (new file)
@@ -0,0 +1,172 @@
#
# indexer.py
#
# Copyright 2018 Alan Orth.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ---
#
# Connects to a DSpace Solr statistics core and ingests item views and downloads
# into a PostgreSQL database for use by other applications (like an API).
#
# This script is written for Python 3.5+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
#   $ pip install SolrClient psycopg2-binary
#
# See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
# See: https://wiki.duraspace.org/display/DSPACE/Solr

from .database import database_connection
import json
import psycopg2.extras
from .solr import solr_connection

def index_views():
    # get total number of distinct facets for items with a minimum of 1 view,
    # otherwise Solr returns all kinds of weird ids that are actually not in
    # the database. Also, stats are expensive, but we need stats.calcdistinct
    # so we can get the countDistinct summary.
    #
    # see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
    res = solr.query('statistics', {
        'q':'type:2',
        'fq':'isBot:false AND statistics_type:view',
        'facet':True,
        'facet.field':'id',
        'facet.mincount':1,
        'facet.limit':1,
        'facet.offset':0,
        'stats':True,
        'stats.field':'id',
        'stats.calcdistinct':True
    }, rows=0)

    # get total number of distinct facets (countDistinct)
    results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']

    # divide results into "pages" (cast to int to effectively round down)
    results_per_page = 100
    results_num_pages = int(results_totalNumFacets / results_per_page)
    results_current_page = 0

    cursor = db.cursor()

    # create an empty list to store values for batch insertion
    data = []

    while results_current_page <= results_num_pages:
        print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))

        res = solr.query('statistics', {
            'q':'type:2',
            'fq':'isBot:false AND statistics_type:view',
            'facet':True,
            'facet.field':'id',
            'facet.mincount':1,
            'facet.limit':results_per_page,
            'facet.offset':results_current_page * results_per_page
        }, rows=0)

        # SolrClient's get_facets() returns a dict of dicts
        views = res.get_facets()
        # in this case iterate over the 'id' dict and get the item ids and views
        for item_id, item_views in views['id'].items():
            data.append((item_id, item_views))

        # do a batch insert of values from the current "page" of results
        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
        db.commit()

        # clear all items from the list so we can populate it with the next batch
        data.clear()

        results_current_page += 1

    cursor.close()

def index_downloads():
    # get the total number of distinct facets for items with at least 1 download
    res = solr.query('statistics', {
        'q':'type:0',
        'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
        'facet':True,
        'facet.field':'owningItem',
        'facet.mincount':1,
        'facet.limit':1,
        'facet.offset':0,
        'stats':True,
        'stats.field':'owningItem',
        'stats.calcdistinct':True
    }, rows=0)

    # get total number of distinct facets (countDistinct)
    results_totalNumFacets = json.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']

    # divide results into "pages" (cast to int to effectively round down)
    results_per_page = 100
    results_num_pages = int(results_totalNumFacets / results_per_page)
    results_current_page = 0

    cursor = db.cursor()

    # create an empty list to store values for batch insertion
    data = []

    while results_current_page <= results_num_pages:
        print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))

        res = solr.query('statistics', {
            'q':'type:0',
            'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
            'facet':True,
            'facet.field':'owningItem',
            'facet.mincount':1,
            'facet.limit':results_per_page,
            'facet.offset':results_current_page * results_per_page
        }, rows=0)

        # SolrClient's get_facets() returns a dict of dicts
        downloads = res.get_facets()
        # in this case iterate over the 'owningItem' dict and get the item ids and downloads
        for item_id, item_downloads in downloads['owningItem'].items():
            data.append((item_id, item_downloads))

        # do a batch insert of values from the current "page" of results
        sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
        db.commit()

        # clear all items from the list so we can populate it with the next batch
        data.clear()

        results_current_page += 1

    cursor.close()

db = database_connection()
solr = solr_connection()

# create table to store item views and downloads
cursor = db.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS items
                  (id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)''')
index_views()
index_downloads()

db.close()

# vim: set sw=4 ts=4 expandtab:
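For reference, the paged facet query inside the `index_views()` loop corresponds to a plain HTTP request against the statistics core like the sketch below (assuming the default `SOLR_SERVER` of `http://localhost:8080/solr` and the first page of 100 facets):

```
$ curl 'http://localhost:8080/solr/statistics/select?q=type:2&fq=isBot:false+AND+statistics_type:view&facet=true&facet.field=id&facet.mincount=1&facet.limit=100&facet.offset=0&rows=0'
```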
9 dspace_statistics_api/solr.py (new file)
@@ -0,0 +1,9 @@
from .config import SOLR_SERVER
from SolrClient import SolrClient

def solr_connection():
    connection = SolrClient(SOLR_SERVER)

    return connection

# vim: set sw=4 ts=4 expandtab:
12 requirements.txt (new file)
@@ -0,0 +1,12 @@
certifi==2018.10.15
chardet==3.0.4
falcon==1.4.1
gunicorn==19.9.0
idna==2.7
kazoo==2.5.0
psycopg2-binary==2.7.5
python-mimeparse==1.6.0
requests==2.20.0
six==1.11.0
-e git://github.com/alanorth/SolrClient.git@c629e3475be37c82770b2be61748be7e29882648#egg=SolrClient
urllib3==1.24