Mirror of https://github.com/ilri/dspace-statistics-api.git
Synced 2025-05-10 07:06:01 +02:00

Compare commits: 28 commits
Commits in this comparison (SHA1 only):

- daf15610f2
- 4ede966dbb
- 3580473a6d
- 071c24535f
- 4291aecac4
- 46bf537e88
- eaca5354d3
- 4600288ee4
- 8179563378
- b14c3eef4d
- 71a789b13f
- c68ddacaa4
- 9c9e79769e
- 2ad5ade556
- 7412a09670
- bb744a00b8
- 7499b89d99
- 2c1e4952b1
- 379f202c3f
- 560fa6056d
- 385a34e5d0
- d0ea62d2bd
- 366ae25b8e
- 0f3054ae03
- 6bf34235d4
- e604d8ca81
- fc35b816f3
- 9e6a2f7559
CHANGELOG.md (18 lines changed)

@@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.4.2] - 2018-10-04
+### Changed
+- README.md introduction and requirements
+- Use ujson instead of json
+- Iterate directly on SQL cursor in `/items` route
+
+### Fixed
+- Logic error in SQL for item views
+
+## [0.4.1] - 2018-09-26
+### Changed
+- Use execute_values() to batch insert records to PostgreSQL
+
+## [0.4.0] - 2018-09-25
+### Fixed
+- Invalid OnCalendar syntax in dspace-statistics-indexer.timer
+- Major logic error in indexer.py
+
 ## [0.3.2] - 2018-09-25
 ## Changed
 - /item/id route now returns HTTP 404 if an item is not found
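The 0.4.1 entry above refers to psycopg2's `execute_values()` helper, which expands a single `VALUES %s` placeholder into one multi-row INSERT instead of issuing one statement per record. A minimal sketch of the pattern, assuming a local `dspacestatistics` database with the `items` table this project uses (the SQL and template strings are the ones that appear in the indexer.py diff below):

    import psycopg2
    import psycopg2.extras

    # assumed local database; the project builds its real connection in database.py
    connection = psycopg2.connect('dbname=dspacestatistics')
    cursor = connection.cursor()

    # one round trip inserts the whole batch, and ON CONFLICT turns it into an UPSERT
    data = [(1, 100), (2, 34), (3, 8)]
    sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
    psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
    connection.commit()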
README.md (23 lines changed)

@@ -1,16 +1,28 @@
 # DSpace Statistics API
-A quick and dirty REST API to expose Solr view and download statistics for items in a DSpace repository.
+A simple REST API to expose Solr view and download statistics for items in a DSpace repository. This project contains a standalone indexing component and a WSGI application.
 
 Written and tested in Python 3.5, 3.6, and 3.7. Requires PostgreSQL version 9.5 or greater for [`UPSERT` support](https://wiki.postgresql.org/wiki/UPSERT).
 
-## Installation
-Create a virtual environment and run it:
+## Requirements
+
+- Python 3.5+
+- PostgreSQL version 9.5+ (due to [`UPSERT` support](https://wiki.postgresql.org/wiki/UPSERT))
+- DSpace 4+ with [Solr usage statistics enabled](https://wiki.duraspace.org/display/DSDOC5x/SOLR+Statistics)
+
+## Installation and Testing
+Create a Python virtual environment and install the dependencies:
 
     $ python -m venv venv
     $ . venv/bin/activate
     $ pip install -r requirements.txt
 
+Set up the environment variables for Solr and PostgreSQL:
+
+    $ export SOLR_SERVER=http://localhost:8080/solr
+    $
+
     $ gunicorn app:api
 
 ## Deployment
 There are example systemd service and timer units in the `contrib` directory.
 
 ## Using the API
 The API exposes the following endpoints:

@@ -25,6 +37,9 @@ The API exposes the following endpoints:
 - Close up DB connection when gunicorn shuts down gracefully
 - Better logging
 - Tests
+- Check if database exists (try/except)
+- Version API
+- Use JSON in PostgreSQL
 
 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
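A short sketch of paging through `/items` with `requests` (already pinned in requirements.txt below). The base URL assumes gunicorn's default bind of 127.0.0.1:8000, and the `page` and `limit` query parameters are assumptions inferred from the `limit`, `offset`, `currentPage`, and `totalPages` names in the app.py diff below, as is the `statistics` response key:

    import requests

    # assumed default gunicorn address and assumed query parameter names
    response = requests.get('http://localhost:8000/items', params={'page': 0, 'limit': 100})
    body = response.json()

    print('Page {} of {}'.format(body['currentPage'], body['totalPages']))
    for item in body['statistics']:
        print(item['id'], item['views'], item['downloads'])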
app.py (6 lines changed)

@@ -22,16 +22,16 @@ class AllItemsResource:
 
         # get statistics, ordered by id, and use limit and offset to page through results
         cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT {} OFFSET {}'.format(limit, offset))
-        results = cursor.fetchmany(limit)
-        cursor.close()
 
         # create a list to hold dicts of item stats
         statistics = list()
 
         # iterate over results and build statistics object
-        for item in results:
+        for item in cursor:
             statistics.append({ 'id': item['id'], 'views': item['views'], 'downloads': item['downloads'] })
 
+        cursor.close()
+
         message = {
             'currentPage': page,
             'totalPages': pages,
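This is the 0.4.2 "iterate directly on SQL cursor" change: psycopg2 cursors are iterable, so the intermediate `fetchmany()` list can be dropped and rows are consumed as they stream, with `cursor.close()` moving to after the loop. A minimal standalone sketch of the same pattern, with an assumed connection string; it also passes `LIMIT`/`OFFSET` as bound parameters rather than via `str.format()`, which avoids interpolating values into the SQL text:

    import psycopg2
    import psycopg2.extras

    # assumed connection settings; the project builds its connection in database.py
    connection = psycopg2.connect('dbname=dspacestatistics', cursor_factory=psycopg2.extras.DictCursor)
    cursor = connection.cursor()

    # the cursor itself is iterable, so no fetchmany() buffer is needed
    cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT %s OFFSET %s', (100, 0))
    statistics = [{'id': row['id'], 'views': row['views'], 'downloads': row['downloads']} for row in cursor]

    cursor.close()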
contrib/dspace-statistics-indexer.timer

@@ -3,7 +3,7 @@ Description=DSpace Statistics Indexer
 
 [Timer]
 # twice a day, at 6AM and 6PM
-OnCalendar=*-*-* 06:00:00,18:00:00
+OnCalendar=*-*-* 06,18:00:00
 # Add a random delay of 0–3600 seconds
 RandomizedDelaySec=3600
 Persistent=true
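This is the "invalid OnCalendar syntax" fix from the 0.4.0 changelog entry: in systemd calendar expressions, alternate values belong inside a single time specification, so `06,18:00:00` fires at 06:00 and 18:00 daily, whereas the old spelling does not parse. On systemd 235 or later, `systemd-analyze calendar '*-*-* 06,18:00:00'` can be used to print the normalized form and the next elapse time.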
database.py

@@ -2,8 +2,7 @@ from config import DATABASE_NAME
 from config import DATABASE_USER
 from config import DATABASE_PASS
 from config import DATABASE_HOST
-import psycopg2
-import psycopg2.extras
+import psycopg2, psycopg2.extras
 
 def database_connection():
     connection = psycopg2.connect("dbname={} user={} password={} host='{}'".format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST), cursor_factory=psycopg2.extras.DictCursor)
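For context, a hypothetical sketch of the config module these imports point at: only the variable names are confirmed by the lines above (and `SOLR_SERVER` by the README), while the environment-variable lookups and defaults are illustrative assumptions:

    # hypothetical config.py sketch: names are real, defaults are illustrative
    import os

    SOLR_SERVER = os.environ.get('SOLR_SERVER', 'http://localhost:8080/solr')
    DATABASE_NAME = os.environ.get('DATABASE_NAME', 'dspacestatistics')
    DATABASE_USER = os.environ.get('DATABASE_USER', 'dspacestatistics')
    DATABASE_PASS = os.environ.get('DATABASE_PASS', '')
    DATABASE_HOST = os.environ.get('DATABASE_HOST', 'localhost')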
indexer.py (99 lines changed)

@@ -31,98 +31,129 @@
 # See: https://wiki.duraspace.org/display/DSPACE/Solr
 
 from database import database_connection
+import ujson
+import psycopg2.extras
 from solr import solr_connection
 
 def index_views():
     print("Populating database with item views.")
 
-    # determine the total number of items with views (aka Solr's numFound)
+    # get total number of distinct facets for items with a minimum of 1 view,
+    # otherwise Solr returns all kinds of weird ids that are actually not in
+    # the database. Also, stats are expensive, but we need stats.calcdistinct
+    # so we can get the countDistinct summary.
+    #
+    # see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
     res = solr.query('statistics', {
         'q':'type:2',
         'fq':'isBot:false AND statistics_type:view',
         'facet':True,
         'facet.field':'id',
         'facet.mincount':1,
         'facet.limit':1,
-        'facet.offset':0
+        'facet.offset':0,
+        'stats':True,
+        'stats.field':'id',
+        'stats.calcdistinct':True
     }, rows=0)
 
-    # divide results into "pages" (numFound / 100)
-    results_numFound = res.get_num_found()
+    # get total number of distinct facets (countDistinct)
+    results_totalNumFacets = ujson.loads(res.get_json())['stats']['stats_fields']['id']['countDistinct']
+
+    # divide results into "pages" (cast to int to effectively round down)
     results_per_page = 100
-    results_num_pages = round(results_numFound / results_per_page)
+    results_num_pages = int(results_totalNumFacets / results_per_page)
     results_current_page = 0
 
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
-        print('Page {} of {}.'.format(results_current_page, results_num_pages))
+        print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
 
         res = solr.query('statistics', {
             'q':'type:2',
             'fq':'isBot:false AND statistics_type:view',
             'facet':True,
             'facet.field':'id',
             'facet.mincount':1,
             'facet.limit':results_per_page,
             'facet.offset':results_current_page * results_per_page
-        })
+        }, rows=0)
 
-        # make sure total number of results > 0
-        if res.get_num_found() > 0:
-            # SolrClient's get_facets() returns a dict of dicts
-            views = res.get_facets()
-            # in this case iterate over the 'id' dict and get the item ids and views
-            for item_id, item_views in views['id'].items():
-                cursor.execute('''INSERT INTO items(id, views) VALUES(%s, %s)
-                               ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
-                               (item_id, item_views))
+        # SolrClient's get_facets() returns a dict of dicts
+        views = res.get_facets()
+        # in this case iterate over the 'id' dict and get the item ids and views
+        for item_id, item_views in views['id'].items():
+            data.append((item_id, item_views))
+
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()
 
 def index_downloads():
     print("Populating database with item downloads.")
 
-    # determine the total number of items with downloads (aka Solr's numFound)
+    # get the total number of distinct facets for items with at least 1 download
     res = solr.query('statistics', {
         'q':'type:0',
         'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
         'facet':True,
         'facet.field':'owningItem',
         'facet.mincount':1,
         'facet.limit':1,
-        'facet.offset':0
+        'facet.offset':0,
+        'stats':True,
+        'stats.field':'owningItem',
+        'stats.calcdistinct':True
     }, rows=0)
 
-    # divide results into "pages" (numFound / 100)
-    results_numFound = res.get_num_found()
+    # get total number of distinct facets (countDistinct)
+    results_totalNumFacets = ujson.loads(res.get_json())['stats']['stats_fields']['owningItem']['countDistinct']
+
+    # divide results into "pages" (cast to int to effectively round down)
     results_per_page = 100
-    results_num_pages = round(results_numFound / results_per_page)
+    results_num_pages = int(results_totalNumFacets / results_per_page)
     results_current_page = 0
 
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
-        print('Page {} of {}.'.format(results_current_page, results_num_pages))
+        print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
 
         res = solr.query('statistics', {
             'q':'type:0',
             'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
             'facet':True,
             'facet.field':'owningItem',
             'facet.mincount':1,
             'facet.limit':results_per_page,
             'facet.offset':results_current_page * results_per_page
-        })
+        }, rows=0)
 
-        # make sure total number of results > 0
-        if res.get_num_found() > 0:
-            # SolrClient's get_facets() returns a dict of dicts
-            downloads = res.get_facets()
-            # in this case iterate over the 'owningItem' dict and get the item ids and downloads
-            for item_id, item_downloads in downloads['owningItem'].items():
-                cursor.execute('''INSERT INTO items(id, downloads) VALUES(%s, %s)
-                               ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
-                               (item_id, item_downloads))
+        # SolrClient's get_facets() returns a dict of dicts
+        downloads = res.get_facets()
+        # in this case iterate over the 'owningItem' dict and get the item ids and downloads
+        for item_id, item_downloads in downloads['owningItem'].items():
+            data.append((item_id, item_downloads))
+
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()
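The switch from `round()` to `int()` above changes the paging arithmetic at the boundaries: Python 3's `round()` rounds halves to the nearest even number, so 150 facets would give `round(1.5) == 2` pages while 50 would give `round(0.5) == 0`, whereas `int()` always truncates and the loop's `<=` bound then requests pages 0 through N inclusive. A standalone check of that arithmetic:

    # check that int() truncation plus an inclusive loop covers every facet page
    results_per_page = 100

    for total_facets in (1, 50, 99, 100, 150, 250):
        results_num_pages = int(total_facets / results_per_page)
        offsets = [page * results_per_page for page in range(results_num_pages + 1)]
        # e.g. 250 facets -> offsets [0, 100, 200], which reach all 250
        print(total_facets, offsets)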
requirements.txt (1 line changed)

@@ -9,4 +9,5 @@ python-mimeparse==1.6.0
 requests==2.19.1
 six==1.11.0
 SolrClient==0.2.1
+ujson==1.35
 urllib3==1.23
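`ujson==1.35` is the dependency added above, matching the 0.4.2 "use ujson instead of json" entry; indexer.py uses it to read `countDistinct` from the raw Solr stats response. A tiny self-contained check of that access path against a made-up payload:

    import ujson

    # made-up payload shaped like the Solr stats response parsed in indexer.py
    raw = '{"stats": {"stats_fields": {"id": {"countDistinct": 1234}}}}'
    results_totalNumFacets = ujson.loads(raw)['stats']['stats_fields']['id']['countDistinct']
    assert results_totalNumFacets == 1234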