From 2db5e02be90cd0d3e3d7912642eee252083d9eb2 Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Sun, 23 Sep 2018 16:47:48 +0300
Subject: [PATCH] Add indexer.py

Standalone script to ingest item views and downloads from Solr into
SQLite.
---
 .gitignore |   1 +
 indexer.py | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100755 indexer.py

diff --git a/.gitignore b/.gitignore
index 82adb58..0cb8f26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
 venv
+*.db
diff --git a/indexer.py b/indexer.py
new file mode 100755
index 0000000..2e50e11
--- /dev/null
+++ b/indexer.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+#
+# Tested with Python 3.6
+# See DSpace Solr docs for tips about parameters
+# https://wiki.duraspace.org/display/DSPACE/Solr
+
+from config import SOLR_CORE
+from database import database_connection
+from solr import solr_connection
+
+def index_views():
+    print("Populating database with item views.")
+
+    # determine the total number of items with views (aka Solr's numFound)
+    res = solr.query(SOLR_CORE, {
+        'q':'type:2',
+        'fq':'isBot:false AND statistics_type:view',
+        'facet':True,
+        'facet.field':'id',
+    }, rows=0)
+
+    # divide results into "pages" (numFound / 100)
+    results_numFound = res.get_num_found()
+    results_per_page = 100
+    results_num_pages = round(results_numFound / results_per_page)
+    results_current_page = 0
+
+    while results_current_page <= results_num_pages:
+        print('Page {0} of {1}.'.format(results_current_page, results_num_pages))
+
+        res = solr.query(SOLR_CORE, {
+            'q':'type:2',
+            'fq':'isBot:false AND statistics_type:view',
+            'facet':True,
+            'facet.field':'id',
+            'facet.limit':results_per_page,
+            'facet.offset':results_current_page * results_per_page
+        })
+
+        # make sure total number of results > 0
+        if res.get_num_found() > 0:
+            # SolrClient's get_facets() returns a dict of dicts
+            views = res.get_facets()
+            # in this case iterate over the 'id' dict and get the item ids and views
+            for item_id, item_views in views['id'].items():
+                db.execute('''REPLACE INTO itemviews VALUES (?, ?)''', (item_id, item_views))
+
+        db.commit()
+
+        results_current_page += 1
+
+def index_downloads():
+    print("Populating database with item downloads.")
+
+    # determine the total number of items with downloads (aka Solr's numFound)
+    res = solr.query(SOLR_CORE, {
+        'q':'type:0',
+        'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
+        'facet':True,
+        'facet.field':'owningItem',
+    }, rows=0)
+
+    # divide results into "pages" (numFound / 100)
+    results_numFound = res.get_num_found()
+    results_per_page = 100
+    results_num_pages = round(results_numFound / results_per_page)
+    results_current_page = 0
+
+    while results_current_page <= results_num_pages:
+        print('Page {0} of {1}.'.format(results_current_page, results_num_pages))
+
+        res = solr.query(SOLR_CORE, {
+            'q':'type:0',
+            'fq':'isBot:false AND statistics_type:view AND bundleName:ORIGINAL',
+            'facet':True,
+            'facet.field':'owningItem',
+            'facet.limit':results_per_page,
+            'facet.offset':results_current_page * results_per_page
+        })
+
+        # make sure total number of results > 0
+        if res.get_num_found() > 0:
+            # SolrClient's get_facets() returns a dict of dicts
+            downloads = res.get_facets()
+            # in this case iterate over the 'owningItem' dict and get the item ids and downloads
+            for item_id, item_downloads in downloads['owningItem'].items():
+                db.execute('''REPLACE INTO itemdownloads VALUES (?, ?)''', (item_id, item_downloads))
+
+        db.commit()
+
+        results_current_page += 1
+
+db = database_connection()
+solr = solr_connection()
+
+# use separate views and downloads tables so we can REPLACE INTO carelessly (ie, item may have views but no downloads)
+db.execute('''CREATE TABLE IF NOT EXISTS itemviews
+                  (id integer primary key, views integer)''')
+db.execute('''CREATE TABLE IF NOT EXISTS itemdownloads
+                  (id integer primary key, downloads integer)''')
+index_views()
+index_downloads()
+
+db.close()
+
+# vim: set sw=4 ts=4 expandtab: