Mirror of https://github.com/ilri/dspace-statistics-api.git

Merge pull request #4 from ilri/database-refactor

Database refactor
Alan Orth authored on 2018-11-07 17:54:04 +02:00, committed by GitHub
commit a6ce44e852
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
5 changed files with 109 additions and 102 deletions

CHANGELOG.md

@@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Hound CI configuration to validate pull requests against PEP 8 code style with Flake8
 - Configuration for [pipenv](https://pipenv.readthedocs.io/en/latest/)
 
+## Changed
+- Use a database management class with Python context management to automatically open/close connections and cursors
+
 ## Changed
 - Validate code against PEP 8 style guide with Flake8
 
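Editor's note: the "Changed" entry above refers to Python's context-management protocol. Any object that implements __enter__ and __exit__ can drive a with statement, which guarantees cleanup even when the block raises. A minimal standalone sketch of the pattern follows; the ManagedFile name and file path are illustrative only, not part of this commit:

class ManagedFile:
    '''Toy context manager: acquire in __enter__, release in __exit__.'''

    def __init__(self, path):
        self.path = path

    def __enter__(self):
        # acquire the resource and hand it to the with-block
        self.handle = open(self.path, 'w')
        return self.handle

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # runs unconditionally, even if the with-block raised
        self.handle.close()

with ManagedFile('/tmp/example.txt') as f:
    f.write('cleanup happens automatically\n')

DatabaseManager below applies exactly this pattern to a psycopg2 connection.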

README.md

@@ -70,7 +70,6 @@ The item id is the *internal* id for an item. You can get these from the standar
 ## Todo
 
-- Close DB connection when gunicorn shuts down gracefully
 - Better logging
 - Tests
 - Check if database exists (try/except)
 

dspace_statistics_api/app.py

@ -1,9 +1,6 @@
from .database import database_connection from .database import DatabaseManager
import falcon import falcon
db = database_connection()
db.set_session(readonly=True)
class RootResource: class RootResource:
def on_get(self, req, resp): def on_get(self, req, resp):
@@ -21,23 +18,23 @@ class AllItemsResource:
         page = req.get_param_as_int("page", min=0) or 0
         offset = limit * page
 
-        cursor = db.cursor()
+        with DatabaseManager() as db:
+            db.set_session(readonly=True)
 
-        # get total number of items so we can estimate the pages
-        cursor.execute('SELECT COUNT(id) FROM items')
-        pages = round(cursor.fetchone()[0] / limit)
+            with db.cursor() as cursor:
+                # get total number of items so we can estimate the pages
+                cursor.execute('SELECT COUNT(id) FROM items')
+                pages = round(cursor.fetchone()[0] / limit)
 
-        # get statistics, ordered by id, and use limit and offset to page through results
-        cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT {} OFFSET {}'.format(limit, offset))
+                # get statistics, ordered by id, and use limit and offset to page through results
+                cursor.execute('SELECT id, views, downloads FROM items ORDER BY id ASC LIMIT {} OFFSET {}'.format(limit, offset))
 
-        # create a list to hold dicts of item stats
-        statistics = list()
+                # create a list to hold dicts of item stats
+                statistics = list()
 
-        # iterate over results and build statistics object
-        for item in cursor:
-            statistics.append({'id': item['id'], 'views': item['views'], 'downloads': item['downloads']})
-
-        cursor.close()
+                # iterate over results and build statistics object
+                for item in cursor:
+                    statistics.append({'id': item['id'], 'views': item['views'], 'downloads': item['downloads']})
 
         message = {
             'currentPage': page,
@@ -53,25 +50,27 @@ class ItemResource:
     def on_get(self, req, resp, item_id):
         """Handles GET requests"""
 
-        cursor = db.cursor()
-        cursor.execute('SELECT views, downloads FROM items WHERE id={}'.format(item_id))
-        if cursor.rowcount == 0:
-            raise falcon.HTTPNotFound(
-                title='Item not found',
-                description='The item with id "{}" was not found.'.format(item_id)
-            )
-        else:
-            results = cursor.fetchone()
+        with DatabaseManager() as db:
+            db.set_session(readonly=True)
 
-            statistics = {
-                'id': item_id,
-                'views': results['views'],
-                'downloads': results['downloads']
-            }
+            with db.cursor() as cursor:
+                cursor = db.cursor()
+                cursor.execute('SELECT views, downloads FROM items WHERE id={}'.format(item_id))
+                if cursor.rowcount == 0:
+                    raise falcon.HTTPNotFound(
+                        title='Item not found',
+                        description='The item with id "{}" was not found.'.format(item_id)
+                    )
+                else:
+                    results = cursor.fetchone()
 
-            resp.media = statistics
+                    statistics = {
+                        'id': item_id,
+                        'views': results['views'],
+                        'downloads': results['downloads']
+                    }
 
-        cursor.close()
+                    resp.media = statistics
 
 
 api = application = falcon.API()

dspace_statistics_api/database.py

@@ -7,9 +7,17 @@ import psycopg2
 import psycopg2.extras
 
 
-def database_connection():
-    connection = psycopg2.connect("dbname={} user={} password={} host={} port={}".format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST, DATABASE_PORT), cursor_factory=psycopg2.extras.DictCursor)
-
-    return connection
+class DatabaseManager():
+    '''Manage database connection.'''
+
+    def __init__(self):
+        self._connection_uri = 'dbname={} user={} password={} host={} port={}'.format(DATABASE_NAME, DATABASE_USER, DATABASE_PASS, DATABASE_HOST, DATABASE_PORT)
+
+    def __enter__(self):
+        self._connection = psycopg2.connect(self._connection_uri, cursor_factory=psycopg2.extras.DictCursor)
+        return self._connection
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self._connection.close()
 
 
 # vim: set sw=4 ts=4 expandtab:
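Editor's note: two details of DatabaseManager are worth spelling out. First, __enter__ returns the raw psycopg2 connection rather than the manager itself, so callers keep the normal connection API (set_session(), cursor(), commit()); psycopg2 connections and cursors are themselves context managers, which is what the `with db.cursor() as cursor:` blocks in app.py and the indexer rely on. Second, __exit__ closes the connection without committing, and psycopg2 discards uncommitted changes on close, so writers must call db.commit() explicitly, as the indexer does. A sketch of the intended usage (import path assumed from the repository layout; the query is illustrative):

from dspace_statistics_api.database import DatabaseManager

# the connection is opened by __enter__ and closed by __exit__, even on error
with DatabaseManager() as db:
    db.set_session(readonly=True)

    # psycopg2 cursors are context managers too; the cursor closes on exit
    with db.cursor() as cursor:
        cursor.execute('SELECT COUNT(id) FROM items')
        print(cursor.fetchone()[0])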

dspace_statistics_api/indexer.py

@@ -29,7 +29,7 @@
 # See: https://solrclient.readthedocs.io/en/latest/SolrClient.html
 # See: https://wiki.duraspace.org/display/DSPACE/Solr
 
-from .database import database_connection
+from .database import DatabaseManager
 import json
 import psycopg2.extras
 from .solr import solr_connection
@@ -63,41 +63,39 @@ def index_views():
     results_num_pages = int(results_totalNumFacets / results_per_page)
     results_current_page = 0
 
-    cursor = db.cursor()
+    with DatabaseManager() as db:
+        with db.cursor() as cursor:
+            # create an empty list to store values for batch insertion
+            data = []
 
-    # create an empty list to store values for batch insertion
-    data = []
+            while results_current_page <= results_num_pages:
+                print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
 
-    while results_current_page <= results_num_pages:
-        print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
+                res = solr.query('statistics', {
+                    'q': 'type:2',
+                    'fq': 'isBot:false AND statistics_type:view',
+                    'facet': True,
+                    'facet.field': 'id',
+                    'facet.mincount': 1,
+                    'facet.limit': results_per_page,
+                    'facet.offset': results_current_page * results_per_page
+                }, rows=0)
 
-        res = solr.query('statistics', {
-            'q': 'type:2',
-            'fq': 'isBot:false AND statistics_type:view',
-            'facet': True,
-            'facet.field': 'id',
-            'facet.mincount': 1,
-            'facet.limit': results_per_page,
-            'facet.offset': results_current_page * results_per_page
-        }, rows=0)
+                # SolrClient's get_facets() returns a dict of dicts
+                views = res.get_facets()
+                # in this case iterate over the 'id' dict and get the item ids and views
+                for item_id, item_views in views['id'].items():
+                    data.append((item_id, item_views))
 
-        # SolrClient's get_facets() returns a dict of dicts
-        views = res.get_facets()
-        # in this case iterate over the 'id' dict and get the item ids and views
-        for item_id, item_views in views['id'].items():
-            data.append((item_id, item_views))
+                # do a batch insert of values from the current "page" of results
+                sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
+                psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
+                db.commit()
 
-        # do a batch insert of values from the current "page" of results
-        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
-        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
-        db.commit()
+                # clear all items from the list so we can populate it with the next batch
+                data.clear()
 
-        # clear all items from the list so we can populate it with the next batch
-        data.clear()
-
-        results_current_page += 1
-
-    cursor.close()
+                results_current_page += 1
 
 
 def index_downloads():
@@ -123,53 +121,53 @@ def index_downloads():
     results_num_pages = int(results_totalNumFacets / results_per_page)
     results_current_page = 0
 
-    cursor = db.cursor()
+    with DatabaseManager() as db:
+        with db.cursor() as cursor:
+            # create an empty list to store values for batch insertion
+            data = []
 
-    # create an empty list to store values for batch insertion
-    data = []
+            while results_current_page <= results_num_pages:
+                print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
 
-    while results_current_page <= results_num_pages:
-        print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
+                res = solr.query('statistics', {
+                    'q': 'type:0',
+                    'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
+                    'facet': True,
+                    'facet.field': 'owningItem',
+                    'facet.mincount': 1,
+                    'facet.limit': results_per_page,
+                    'facet.offset': results_current_page * results_per_page
+                }, rows=0)
 
-        res = solr.query('statistics', {
-            'q': 'type:0',
-            'fq': 'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
-            'facet': True,
-            'facet.field': 'owningItem',
-            'facet.mincount': 1,
-            'facet.limit': results_per_page,
-            'facet.offset': results_current_page * results_per_page
-        }, rows=0)
+                # SolrClient's get_facets() returns a dict of dicts
+                downloads = res.get_facets()
+                # in this case iterate over the 'owningItem' dict and get the item ids and downloads
+                for item_id, item_downloads in downloads['owningItem'].items():
+                    data.append((item_id, item_downloads))
 
-        # SolrClient's get_facets() returns a dict of dicts
-        downloads = res.get_facets()
-        # in this case iterate over the 'owningItem' dict and get the item ids and downloads
-        for item_id, item_downloads in downloads['owningItem'].items():
-            data.append((item_id, item_downloads))
+                # do a batch insert of values from the current "page" of results
+                sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
+                psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
+                db.commit()
 
-        # do a batch insert of values from the current "page" of results
-        sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
-        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
-        db.commit()
+                # clear all items from the list so we can populate it with the next batch
+                data.clear()
 
-        # clear all items from the list so we can populate it with the next batch
-        data.clear()
-
-        results_current_page += 1
-
-    cursor.close()
+                results_current_page += 1
 
 
-db = database_connection()
 solr = solr_connection()
 
-# create table to store item views and downloads
-cursor = db.cursor()
-cursor.execute('''CREATE TABLE IF NOT EXISTS items
-                  (id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)''')
+with DatabaseManager() as db:
+    with db.cursor() as cursor:
+        # create table to store item views and downloads
+        cursor.execute('''CREATE TABLE IF NOT EXISTS items
+                          (id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)''')
+
+    # commit the table creation before closing the database connection
+    db.commit()
 
 index_views()
 index_downloads()
 
-db.close()
-
 # vim: set sw=4 ts=4 expandtab:
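Editor's note: the batch upserts in the indexer lean on psycopg2.extras.execute_values, which expands a single VALUES %s placeholder into one multi-row INSERT, so each Solr facet page costs one database round trip instead of one per item. A self-contained sketch under assumed settings (the DSN and sample data are placeholders, not the project's configuration):

import psycopg2
import psycopg2.extras

# placeholder DSN; the real values come from the app's environment configuration
connection = psycopg2.connect('dbname=dspacestatistics user=dspacestatistics')

# (item id, view count) pairs, as built from one page of Solr facets
data = [(1, 150), (2, 34), (3, 980)]

# 'with connection' wraps the block in a transaction and commits on success
with connection:
    with connection.cursor() as cursor:
        # one INSERT for the whole batch; ON CONFLICT turns it into an upsert
        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')

connection.close()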