diff --git a/content/posts/2021-02.md b/content/posts/2021-02.md index abeab64d8..eb51ab8d1 100644 --- a/content/posts/2021-02.md +++ b/content/posts/2021-02.md @@ -234,4 +234,89 @@ $ curl -XDELETE 'http://localhost:9200/openrxv-items-temp' # start indexing in AReS ``` +## 2021-02-08 + +- Finish rotating the AReS indexes after the harvesting last night: + +```console +$ curl -s 'http://localhost:9200/openrxv-items-temp/_count?q=*&pretty' +{ + "count" : 100983, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + } +} +$ curl -X PUT "localhost:9200/openrxv-items/_settings" -H 'Content-Type: application/json' -d'{"settings": {"index.blocks.write":true}}' +$ curl -s -X POST http://localhost:9200/openrxv-items/_clone/openrxv-items-2021-02-08 +$ curl -XDELETE 'http://localhost:9200/openrxv-items' +$ curl -X PUT "localhost:9200/openrxv-items-temp/_settings" -H 'Content-Type: application/json' -d'{"settings": {"index.blocks.write": true}}' +$ curl -s -X POST http://localhost:9200/openrxv-items-temp/_clone/openrxv-items +$ curl -XDELETE 'http://localhost:9200/openrxv-items-temp' +$ curl -XDELETE 'http://localhost:9200/openrxv-items-2021-02-08' +``` + +## 2021-02-10 + +- Talk to Abdullah from CodeObia about a few of the issues we filed on OpenRXV + - Verify a fix he made for the issue with spaces in template file names + - He says that the [Angular expressions support should be enabled](https://github.com/ilri/OpenRXV/issues/49), but I tried it and couldn't get a few simple examples working +- Atmire responded to a few issues today: + - First, the one about a crash while exporting a community CSV, which appears to be a [vanilla DSpace issue with a patch in DSpace 6.4](https://jira.lyrasis.org/browse/DS-4211) + - Second, the MQM batch consumer issue, which appears to be harmless log spam in *most* cases and they have sent a patch that adjusts the logging as such + - Third, a version bump for CUA to fix the 
`java.lang.UnsupportedOperationException: Multiple update components target the same field:solr_update_time_stamp` error +- I cherry-picked the patches for DS-4211 and was able to export the ILRI community finally, but the results are almost twice as many items as in the community! + - Investigating with csvcut I see there are some ids that appear up to five, six, or seven times! + +```console +$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | wc -l +30354 +$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | sort -u | wc -l +18555 +$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | sort | uniq -c | sort -h | tail + 5 c21a79e5-e24e-4861-aa07-e06703d1deb7 + 5 c2460aa1-ae28-4003-9a99-2d7c5cd7fd38 + 5 d73fb3ae-9fac-4f7e-990f-e394f344246c + 5 dc0e24fa-b7f5-437e-ac09-e15c0704be00 + 5 dc50bcca-0abf-473f-8770-69d5ab95cc33 + 5 e714bdf9-cc0f-4d9a-a808-d572e25c9238 + 6 7dfd1c61-9e8c-4677-8d41-e1c4b11d867d + 6 fb76888c-03ae-4d53-b27d-87d7ca91371a + 6 ff42d1e6-c489-492c-a40a-803cabd901ed + 7 094e9e1d-09ff-40ca-a6b9-eca580936147 +``` + +- I added a comment to that bug to ask if this is a side effect of the patch +- I started working on tagging pre-2010 ILRI items with license information, like we talked about with Peter and Abenet last week + - Due to the export bug I had to sort and remove duplicates first, then use csvgrep to filter out books and journal articles: + +```console +$ csvcut -c 'id,dc.date.issued,dc.date.issued[],dc.date.issued[en_US],dc.rights,dc.rights[],dc.rights[en],dc.rights[en_US],dc.publisher,dc.publisher[],dc.publisher[en_US],dc.type[en_US]' /tmp/2021-02-10-ILRI.csv | csvgrep -c 'dc.type[en_US]' -r '^.+[^(Journal Item|Journal Article|Book|Book Chapter)]' +``` + +- I imported the CSV into OpenRefine and converted the date text values to date types so I could facet by dates before 2010: + +```console +if(diff(value,"01/01/2010".toDate(),"days")<0, true, false) +``` + +- Then I filtered by publisher to make sure they were only ours: + +```console +or( + 
value.contains("International Livestock Research Institute"), + value.contains("ILRI"), + value.contains("International Livestock Centre for Africa"), + value.contains("ILCA"), + value.contains("ILRAD"), + value.contains("International Laboratory for Research on Animal Diseases") +) +``` + +- I tagged these pre-2010 items with "Other" if they didn't already have a license +- I checked 2010 to 2015, and 2016 to date, but they were all tagged already! +- In the end I added the "Other" license to 1,523 items from before 2010 + diff --git a/docs/2021-02/index.html b/docs/2021-02/index.html index 296b1bf7b..2be749242 100644 --- a/docs/2021-02/index.html +++ b/docs/2021-02/index.html @@ -32,7 +32,7 @@ $ curl -s 'http://localhost:9200/openrxv-items-temp/_count?q=*&pretty - + @@ -70,9 +70,9 @@ $ curl -s 'http://localhost:9200/openrxv-items-temp/_count?q=*&pretty "@type": "BlogPosting", "headline": "February, 2021", "url": "https://alanorth.github.io/cgspace-notes/2021-02/", - "wordCount": "1517", + "wordCount": "2017", "datePublished": "2021-02-01T10:13:54+02:00", - "dateModified": "2021-02-06T14:00:36+02:00", + "dateModified": "2021-02-07T16:27:36+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -389,7 +389,93 @@ sys 2m26.050s
$ curl -XDELETE 'http://localhost:9200/openrxv-items-temp'
# start indexing in AReS
-
+$ curl -s 'http://localhost:9200/openrxv-items-temp/_count?q=*&pretty'
+{
+ "count" : 100983,
+ "_shards" : {
+ "total" : 1,
+ "successful" : 1,
+ "skipped" : 0,
+ "failed" : 0
+ }
+}
+$ curl -X PUT "localhost:9200/openrxv-items/_settings" -H 'Content-Type: application/json' -d'{"settings": {"index.blocks.write":true}}'
+$ curl -s -X POST http://localhost:9200/openrxv-items/_clone/openrxv-items-2021-02-08
+$ curl -XDELETE 'http://localhost:9200/openrxv-items'
+$ curl -X PUT "localhost:9200/openrxv-items-temp/_settings" -H 'Content-Type: application/json' -d'{"settings": {"index.blocks.write": true}}'
+$ curl -s -X POST http://localhost:9200/openrxv-items-temp/_clone/openrxv-items
+$ curl -XDELETE 'http://localhost:9200/openrxv-items-temp'
+$ curl -XDELETE 'http://localhost:9200/openrxv-items-2021-02-08'
+
java.lang.UnsupportedOperationException: Multiple update components target the same field:solr_update_time_stamp error
$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | wc -l
+30354
+$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | sort -u | wc -l
+18555
+$ csvcut -c id /tmp/2021-02-10-ILRI.csv | sed '1d' | sort | uniq -c | sort -h | tail
+ 5 c21a79e5-e24e-4861-aa07-e06703d1deb7
+ 5 c2460aa1-ae28-4003-9a99-2d7c5cd7fd38
+ 5 d73fb3ae-9fac-4f7e-990f-e394f344246c
+ 5 dc0e24fa-b7f5-437e-ac09-e15c0704be00
+ 5 dc50bcca-0abf-473f-8770-69d5ab95cc33
+ 5 e714bdf9-cc0f-4d9a-a808-d572e25c9238
+ 6 7dfd1c61-9e8c-4677-8d41-e1c4b11d867d
+ 6 fb76888c-03ae-4d53-b27d-87d7ca91371a
+ 6 ff42d1e6-c489-492c-a40a-803cabd901ed
+ 7 094e9e1d-09ff-40ca-a6b9-eca580936147
+
$ csvcut -c 'id,dc.date.issued,dc.date.issued[],dc.date.issued[en_US],dc.rights,dc.rights[],dc.rights[en],dc.rights[en_US],dc.publisher,dc.publisher[],dc.publisher[en_US],dc.type[en_US]' /tmp/2021-02-10-ILRI.csv | csvgrep -c 'dc.type[en_US]' -r '^.+[^(Journal Item|Journal Article|Book|Book Chapter)]'
+
if(diff(value,"01/01/2010".toDate(),"days")<0, true, false)
+
or(
+ value.contains("International Livestock Research Institute"),
+ value.contains("ILRI"),
+ value.contains("International Livestock Centre for Africa"),
+ value.contains("ILCA"),
+ value.contains("ILRAD"),
+ value.contains("International Laboratory for Research on Animal Diseases")
+)
+