
indexer.py: Use psycopg2's execute_values to batch inserts

Batch inserts are much faster than a series of individual inserts
because they drastically reduce the overhead caused by round-trip
communication with the server. My tests in development confirm:

  - cursor.execute(): 19 seconds
  - execute_values(): 14 seconds

I'm currently only working with 4,500 rows, but I will experiment
with larger data sets as well as larger batch sizes. For example,
a user on the PostgreSQL mailing list reports inserting 10,000 rows
with a page size of 100.

See: http://initd.org/psycopg/docs/extras.html#psycopg2.extras.execute_values
See: https://github.com/psycopg/psycopg2/issues/491#issuecomment-276551038
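
For reference, the batching pattern boils down to the following
minimal sketch (assuming a local database and the items table from
this repository; the DSN and sample rows here are placeholders):

  import psycopg2
  import psycopg2.extras

  db = psycopg2.connect('dbname=dspacestatistics')  # placeholder DSN
  cursor = db.cursor()

  # collect one tuple per row instead of issuing one INSERT per row
  data = [(1, 100), (2, 250), (3, 75)]

  # execute_values() interpolates the tuples into multi-row INSERT
  # statements, sending up to page_size rows (default 100) per round trip
  sql = ('INSERT INTO items(id, views) VALUES %s '
         'ON CONFLICT(id) DO UPDATE SET views=excluded.views')
  psycopg2.extras.execute_values(cursor, sql, data,
                                 template='(%s, %s)', page_size=100)

  db.commit()
  cursor.close()

The page_size parameter is the "page size" the mailing list thread
refers to: raising it sends fewer, larger statements at the cost of
more memory per batch.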
Alan Orth 2018-09-26 23:10:29 +03:00
parent d0ea62d2bd
commit 385a34e5d0
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9


@@ -32,6 +32,7 @@
 from database import database_connection
 import json
+import psycopg2.extras
 from solr import solr_connection
 
 
 def index_views():
@@ -64,6 +65,9 @@ def index_views():
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
         print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
@@ -84,12 +88,16 @@ def index_views():
         views = res.get_facets()
         # in this case iterate over the 'id' dict and get the item ids and views
         for item_id, item_views in views['id'].items():
-            cursor.execute('''INSERT INTO items(id, views) VALUES(%s, %s)
-                           ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
-                           (item_id, item_views))
+            data.append((item_id, item_views))
 
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.views'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()
@@ -119,6 +127,9 @@ def index_downloads():
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
         print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
@@ -136,12 +147,16 @@ def index_downloads():
         downloads = res.get_facets()
         # in this case iterate over the 'owningItem' dict and get the item ids and downloads
         for item_id, item_downloads in downloads['owningItem'].items():
-            cursor.execute('''INSERT INTO items(id, downloads) VALUES(%s, %s)
-                           ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
-                           (item_id, item_downloads))
+            data.append((item_id, item_downloads))
 
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()