From 65d9c4010349dc187431b68b87f0a0b0c6659855 Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Thu, 7 Nov 2019 18:22:19 +0200
Subject: [PATCH] Update notes for 2019-11-07
---
content/posts/2019-11.md | 34 ++++++++++++++++++++++
docs/2019-11/index.html | 61 +++++++++++++++++++++++++++++++---------
docs/sitemap.xml | 10 +++----
3 files changed, 87 insertions(+), 18 deletions(-)
diff --git a/content/posts/2019-11.md b/content/posts/2019-11.md
index d0c4e7d83..b0c7119e2 100644
--- a/content/posts/2019-11.md
+++ b/content/posts/2019-11.md
@@ -177,5 +177,39 @@ $ http --print b 'http://localhost:8081/solr/statistics/select?q=userAgent:www.g
- CCAFS finally confirmed that they do indeed need the confusing new project tag that looks like a duplicate
- They had proposed a batch of new tags in 2019-09 and we never merged them due to this uncertainty
- I have now merged the changes in to the `5_x-prod` branch ([#432](https://github.com/ilri/DSpace/pull/432))
+- I am reconsidering the move of `cg.identifier.dataurl` to `cg.hasMetadata` in CG Core v2
+ - The values of this field are mostly links to data sets on Dataverse and partner sites
+ - I opened an [issue on GitHub](https://github.com/AgriculturalSemantics/cg-core/issues/10) to ask Marie-Angelique for clarification
+- Looking into CGSpace statistics again
+ - I searched for hits in Solr from the BUbiNG bot and found 63,000 in the `statistics-2018` core:
+
+```
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:BUbiNG*' | xmllint --format - | grep numFound
+
+```
+
+ - Similar for com.plumanalytics, Grammarly, and ltx71!
+
+```
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*com.plumanalytics*' | xmllint --format - | grep numFound
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*Grammarly*' | xmllint --format - | grep numFound
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*ltx71*' | xmllint --format - | grep numFound
+
+```
+
+- Deleting these seems to work, for example the 105,000 ltx71 records from 2018:
+
+```
+$ http --print b 'http://localhost:8081/solr/statistics-2018/update?stream.body=<delete><query>userAgent:*ltx71*</query><query>type:0</query></delete>&commit=true'
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*ltx71*' | xmllint --format - | grep numFound
+
+```
+
+- I wrote a quick bash script to check all these user agents against the CGSpace Solr statistics cores
+ - For years 2010 until 2019 there are 1.6 million hits from these spider user agents
+ - For 2019 alone there are 740,000, over half of which come from Unpaywall!
diff --git a/docs/2019-11/index.html b/docs/2019-11/index.html
index 5af7f6f8e..3875c4f60 100644
--- a/docs/2019-11/index.html
+++ b/docs/2019-11/index.html
@@ -34,7 +34,7 @@ Let’s see how many of the REST API requests were for bitstreams (because t
-
+
@@ -73,9 +73,9 @@ Let’s see how many of the REST API requests were for bitstreams (because t
"@type": "BlogPosting",
"headline": "November, 2019",
"url": "https:\/\/alanorth.github.io\/cgspace-notes\/2019-11\/",
- "wordCount": "1088",
+ "wordCount": "1293",
"datePublished": "2019-11-04T12:20:30+02:00",
- "dateModified": "2019-11-06T09:35:51+02:00",
+ "dateModified": "2019-11-07T12:40:25+02:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -323,21 +323,56 @@ $ http –print Hh ‘http://localhost:8081/solr/statistics/select?q=userAgent:www.gnip.com&fq=dateYearMonth%3A2019-11' | xmllint –format - | grep numFound
$ http --print b 'http://localhost:8081/solr/statistics/select?q=userAgent:www.gnip.com&fq=dateYearMonth%3A2019-11' | xmllint --format - | grep numFound
-
+
+
+
+- So the blocking seems to be working because "www\.gnip\.com" is one of the new patterns added to the spiders file...
+
+## 2019-11-07
+
+- CCAFS finally confirmed that they do indeed need the confusing new project tag that looks like a duplicate
+ - They had proposed a batch of new tags in 2019-09 and we never merged them due to this uncertainty
+ - I have now merged the changes in to the `5_x-prod` branch ([#432](https://github.com/ilri/DSpace/pull/432))
+- I am reconsidering the move of `cg.identifier.dataurl` to `cg.hasMetadata` in CG Core v2
+ - The values of this field are mostly links to data sets on Dataverse and partner sites
+ - I opened an [issue on GitHub](https://github.com/AgriculturalSemantics/cg-core/issues/10) to ask Marie-Angelique for clarification
+- Looking into CGSpace statistics again
+ - I searched for hits in Solr from the BUbiNG bot and found 63,000 in the `statistics-2018` core:
+
+
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:BUbiNG*' | xmllint --format - | grep numFound
+
+
+
+ - Similar for com.plumanalytics, Grammarly, and ltx71!
+
+
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*com.plumanalytics*' | xmllint --format - | grep numFound
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*Grammarly*' | xmllint --format - | grep numFound
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*ltx71*' | xmllint --format - | grep numFound
+
+
+
+- Deleting these seems to work, for example the 105,000 ltx71 records from 2018:
+
+
+
+$ http --print b 'http://localhost:8081/solr/statistics-2018/update?stream.body=<delete><query>userAgent:*ltx71*</query><query>type:0</query></delete>&commit=true'
+$ http --print b 'http://localhost:8081/solr/statistics-2018/select?facet=true&facet.field=ip&facet.mincount=1&type:0&q=userAgent:*ltx71*' | xmllint --format - | grep numFound
+
```
-- So the blocking seems to be working because “www.gnip.com” is one of the new patterns added to the spiders file…
-
-
-2019-11-07
+I wrote a quick bash script to check all these user agents against the CGSpace Solr statistics cores
-- CCAFS finally confirmed that they do indeed need the confusing new project tag that looks like a duplicate
-
-
-- They had proposed a batch of new tags in 2019-09 and we never merged them due to this uncertainty
-- I have now merged the changes in to the
5_x-prod
branch (#432)
+- For years 2010 until 2019 there are 1.6 million hits from these spider user agents
+- For 2019 alone there are 740,000, over half of which come from Unpaywall!
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index c52533bd9..544aeff63 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -4,27 +4,27 @@
https://alanorth.github.io/cgspace-notes/categories/
- 2019-11-06T09:35:51+02:00
+ 2019-11-07T12:40:25+02:00
https://alanorth.github.io/cgspace-notes/
- 2019-11-06T09:35:51+02:00
+ 2019-11-07T12:40:25+02:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2019-11-06T09:35:51+02:00
+ 2019-11-07T12:40:25+02:00
https://alanorth.github.io/cgspace-notes/2019-11/
- 2019-11-06T09:35:51+02:00
+ 2019-11-07T12:40:25+02:00
https://alanorth.github.io/cgspace-notes/posts/
- 2019-11-06T09:35:51+02:00
+ 2019-11-07T12:40:25+02:00