From f4c053ef76b1803df41a6ebe505df0762c8822ff Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 25 Sep 2018 19:05:02 +0300 Subject: [PATCH] Update notes for 2018-09-25 --- content/posts/2018-09.md | 19 +++++++++++++++++++ docs/2018-09/index.html | 29 ++++++++++++++++++++++++++--- docs/sitemap.xml | 10 +++++----- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/content/posts/2018-09.md b/content/posts/2018-09.md index 2a0bfe10f..b02107501 100644 --- a/content/posts/2018-09.md +++ b/content/posts/2018-09.md @@ -468,5 +468,24 @@ $ psql -h localhost -U postgres dspacestatistics dspacestatistics=> CREATE TABLE IF NOT EXISTS items dspacestatistics-> (id INT PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0) ``` +## 2018-09-25 + +- I deployed the DSpace statistics API on CGSpace, but when I ran the indexer it wanted to index 180,000 pages of item views +- I'm not even sure how that's possible, as we only have 74,000 items! +- I need to inspect the `id` values that are returned for views and cross check them with the `owningItem` values for bitstream downloads... +- Also, I could try to check all IDs against the items table to see if they are actually items (perhaps the Solr `id` field doesn't correspond with *actual* DSpace items?) +- I want to purge the bot hits from the Solr statistics core, as I am now realizing that I don't give a shit about tens of millions of hits by Google and Bing indexing my shit every day (at least not in Solr!) +- CGSpace's Solr core has 150,000,000 documents in it... and it's still pretty fast to query, but it's really a maintenance and backup burden +- DSpace Test currently has about 2,000,000 documents with `isBot:true` in its Solr statistics core, and the size on disk is 2GB (it's not much, but I have to test this somewhere!) 
+- According to the [DSpace 5.x Solr documentation](https://wiki.duraspace.org/display/DSDOC5x/SOLR+Statistics+Maintenance) I can use `dspace stats-util -f`, so let's try it: + +``` +$ dspace stats-util -f +``` + +- The command comes back after a few seconds and I still see 2,000,000 documents in the statistics core with `isBot:true` +- I was just writing a message to the dspace-tech mailing list and then I decided to check the number of bot view events on DSpace Test again, and now it's 201 instead of 2,000,000, and the statistics core is only 30MB now! +- I will set the `logBots = false` property in `dspace/config/modules/usage-statistics.cfg` on DSpace Test and check if the number of `isBot:true` events goes up any more... +- I restarted the server with `logBots = false` and after it came back up I see 266 events with `isBot:true` (maybe they were buffered)... I will check again tomorrow diff --git a/docs/2018-09/index.html b/docs/2018-09/index.html index 47b8e7c5a..b15a70b1a 100644 --- a/docs/2018-09/index.html +++ b/docs/2018-09/index.html @@ -18,7 +18,7 @@ I’m testing the new DSpace 5.8 branch in my Ubuntu 18.04 environment and I " /> - + +

2018-09-25

+ + + +
$ dspace stats-util -f
+
+ + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 6b1e66dbd..34d6cd195 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-09/ - 2018-09-25T02:24:43+03:00 + 2018-09-25T11:33:05+03:00 @@ -184,7 +184,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-09-25T02:24:43+03:00 + 2018-09-25T11:33:05+03:00 0 @@ -195,7 +195,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-09-25T02:24:43+03:00 + 2018-09-25T11:33:05+03:00 0 @@ -207,13 +207,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-09-25T02:24:43+03:00 + 2018-09-25T11:33:05+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-09-25T02:24:43+03:00 + 2018-09-25T11:33:05+03:00 0