1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2024-11-22 14:25:01 +01:00

indexer.py: Use single items table with UPSERT

I was using two separate tables for item views and downloads without
realizing that SQLite didn't support FULL OUTER JOIN, which would be
needed to get views and downloads for a given item in a single query.

Instead I can use one table with a default value of 0 for both views
and downloads, and then use "UPSERT" to populate the statistics. This
is a newish SQL concept that allows you to attempt an INSERT and then
specify an action to perform in case of conflict. This works well in
SQLite and actually simplifies my Python logic greatly!

Note that the "excluded" table qualifier is a special keyword that
allows you to reference the value that would have been inserted.

See: https://www.sqlite.org/lang_UPSERT.html
This commit is contained in:
Alan Orth 2018-09-24 14:19:50 +03:00
parent 53615dea2d
commit 2cab456f16
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -70,7 +70,9 @@ def index_views():
views = res.get_facets() views = res.get_facets()
# in this case iterate over the 'id' dict and get the item ids and views # in this case iterate over the 'id' dict and get the item ids and views
for item_id, item_views in views['id'].items(): for item_id, item_views in views['id'].items():
db.execute('''REPLACE INTO itemviews VALUES (?, ?)''', (item_id, item_views)) db.execute('''INSERT INTO items(id, views) VALUES(?, ?)
ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
(item_id, item_views))
db.commit() db.commit()
@ -111,7 +113,9 @@ def index_downloads():
downloads = res.get_facets() downloads = res.get_facets()
# in this case iterate over the 'owningItem' dict and get the item ids and downloads # in this case iterate over the 'owningItem' dict and get the item ids and downloads
for item_id, item_downloads in downloads['owningItem'].items(): for item_id, item_downloads in downloads['owningItem'].items():
db.execute('''REPLACE INTO itemdownloads VALUES (?, ?)''', (item_id, item_downloads)) db.execute('''INSERT INTO items(id, downloads) VALUES(?, ?)
ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
(item_id, item_downloads))
db.commit() db.commit()
@ -120,11 +124,9 @@ def index_downloads():
db = database_connection_rw() db = database_connection_rw()
solr = solr_connection() solr = solr_connection()
# use separate views and downloads tables so we can REPLACE INTO carelessly (ie, item may have views but no downloads) # create table to store item views and downloads
db.execute('''CREATE TABLE IF NOT EXISTS itemviews db.execute('''CREATE TABLE IF NOT EXISTS items
(id integer primary key, views integer)''') (id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)''')
db.execute('''CREATE TABLE IF NOT EXISTS itemdownloads
(id integer primary key, downloads integer)''')
index_views() index_views()
index_downloads() index_downloads()