Mirror of https://github.com/ilri/dspace-statistics-api.git (synced 2024-11-26 08:08:19 +01:00)
indexer.py: Use psycopg2's execute_values to batch inserts
Batch inserts are much faster than a series of individual inserts because they drastically reduce the overhead caused by round-trip communication with the server. My tests in development confirm:

- cursor.execute(): 19 seconds
- execute_values(): 14 seconds

I'm currently only working with 4,500 rows, but I will experiment with larger data sets, as well as larger batches. For example, on the PostgreSQL mailing list a user reports doing 10,000 rows with a page size of 100.

See: http://initd.org/psycopg/docs/extras.html#psycopg2.extras.execute_values
See: https://github.com/psycopg/psycopg2/issues/491#issuecomment-276551038
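As a rough illustration of the approach (not code from this repository), the following sketch shows how execute_values() turns a list of (id, views) tuples into a single batched upsert. The DSN, sample rows, and page_size value are assumptions made for the example.

# Minimal sketch of a batched upsert with psycopg2's execute_values().
# The DSN, sample data, and page_size are illustrative assumptions, not
# values taken from this repository's configuration.
import psycopg2
import psycopg2.extras

connection = psycopg2.connect('dbname=dspacestatistics')  # hypothetical DSN
cursor = connection.cursor()

# values accumulated from one "page" of Solr results (sample data)
data = [(101, 150), (102, 42), (103, 7)]

# one INSERT statement per page_size rows, upserting on the id column
sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET views=excluded.views'
psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)', page_size=100)

connection.commit()
cursor.close()

The template argument tells psycopg2 how to render each tuple into the VALUES list, and page_size (default 100) caps how many rows are packed into each statement, which appears to be the "page size" the mailing-list report above refers to.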
parent d0ea62d2bd
commit 385a34e5d0

 indexer.py | 27 +++++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)
--- a/indexer.py
+++ b/indexer.py
@@ -32,6 +32,7 @@
 from database import database_connection
 import json
+import psycopg2.extras
 from solr import solr_connection
 
 
 def index_views():
@@ -64,6 +65,9 @@ def index_views():
 
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
         print('Indexing item views (page {} of {})'.format(results_current_page, results_num_pages))
 
@@ -84,12 +88,16 @@ def index_views():
         views = res.get_facets()
         # in this case iterate over the 'id' dict and get the item ids and views
         for item_id, item_views in views['id'].items():
-            cursor.execute('''INSERT INTO items(id, views) VALUES(%s, %s)
-                           ON CONFLICT(id) DO UPDATE SET downloads=excluded.views''',
-                           (item_id, item_views))
+            data.append((item_id, item_views))
 
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, views) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.views'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()
@@ -119,6 +127,9 @@ def index_downloads():
 
     cursor = db.cursor()
 
+    # create an empty list to store values for batch insertion
+    data = []
+
     while results_current_page <= results_num_pages:
         print('Indexing item downloads (page {} of {})'.format(results_current_page, results_num_pages))
 
@@ -136,12 +147,16 @@ def index_downloads():
         downloads = res.get_facets()
         # in this case iterate over the 'owningItem' dict and get the item ids and downloads
         for item_id, item_downloads in downloads['owningItem'].items():
-            cursor.execute('''INSERT INTO items(id, downloads) VALUES(%s, %s)
-                           ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads''',
-                           (item_id, item_downloads))
+            data.append((item_id, item_downloads))
 
+        # do a batch insert of values from the current "page" of results
+        sql = 'INSERT INTO items(id, downloads) VALUES %s ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads'
+        psycopg2.extras.execute_values(cursor, sql, data, template='(%s, %s)')
         db.commit()
 
+        # clear all items from the list so we can populate it with the next batch
+        data.clear()
+
         results_current_page += 1
 
     cursor.close()
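For reference, here is a sketch of how the 19-vs-14-second comparison in the commit message could be reproduced. The row count, database name, and table contents are assumptions; the timings quoted above came from indexing real Solr statistics, not from this snippet.

# Hypothetical benchmark comparing per-row INSERTs with execute_values();
# assumes an items(id, views) table and a local database named dspacestatistics.
import time

import psycopg2
import psycopg2.extras

connection = psycopg2.connect('dbname=dspacestatistics')
cursor = connection.cursor()

rows = [(i, i % 500) for i in range(4500)]  # roughly the data volume mentioned above

# individual statements: one client/server round trip per row
start = time.time()
for item_id, item_views in rows:
    cursor.execute('INSERT INTO items(id, views) VALUES(%s, %s) '
                   'ON CONFLICT(id) DO UPDATE SET views=excluded.views',
                   (item_id, item_views))
connection.commit()
print('cursor.execute(): {:.1f} seconds'.format(time.time() - start))

# batched statements: page_size rows per round trip
start = time.time()
psycopg2.extras.execute_values(cursor,
                               'INSERT INTO items(id, views) VALUES %s '
                               'ON CONFLICT(id) DO UPDATE SET views=excluded.views',
                               rows, template='(%s, %s)', page_size=100)
connection.commit()
print('execute_values(): {:.1f} seconds'.format(time.time() - start))

cursor.close()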