From d2c037d0de642ff8258b1077d9aa0dab0e6de4c9 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 19 Aug 2020 22:08:33 +0300 Subject: [PATCH] Add notes for 2020-08-19 --- content/posts/2020-08.md | 57 ++++++++++++-- docs/2020-08/index.html | 100 ++++++++++++++++++++++-- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/sitemap.xml | 10 +-- 20 files changed, 166 insertions(+), 35 deletions(-) diff --git a/content/posts/2020-08.md b/content/posts/2020-08.md index d9a376330..4842f9e7f 100644 --- a/content/posts/2020-08.md +++ b/content/posts/2020-08.md @@ -432,12 +432,6 @@ $ export JAVA_OPTS='-Dfile.encoding=UTF-8 -Xmx2048m' $ chrt -b 0 dspace dsrun com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdateCLI -t 12 -c statistics-2016 ``` -- Then I see there are 849,000 docs with `id: -1` and `type: 5` so I should purge those too probably: - -``` -$ curl -s "http://localhost:8081/solr/statistics-2017/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>id:\-1</query></delete>' -``` - - Altmetric asked for a dump of CGSpace's OAI "sets" so they can update their affiliation mappings - I did it in a kinda ghetto way: @@ -450,4 +444,55 @@ $ for num in {0..1300..100}; do cat /tmp/$num.xml >> /tmp/cgspace-oai-sets.xml; - This produces one file that has all the sets, albeit with 14 pages of responses concatenated into one document, but that's how theirs was in the first place... 
- Help Bizu with a restricted item for CIAT +## 2020-08-16 + +- The com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdateCLI script that was processing 2015 records last night started spitting shit tons of errors and created 120GB of logs... +- I looked at a few of the UIDs that it was having problems with and they were unmigrated ones... so I purged them in 2015 and all the rest of the statistics cores + +``` +$ curl -s "http://localhost:8081/solr/statistics-2015/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>id:/.*unmigrated.*/</query></delete>' +... +$ curl -s "http://localhost:8081/solr/statistics-2010/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>id:/.*unmigrated.*/</query></delete>' +``` + +## 2020-08-19 + +- I tested the DSpace 5 and DSpace 6 versions of the [country code tagger curation task](https://github.com/ilri/cgspace-java-helpers) and noticed a few things + - The DSpace 5.8 version finishes in 2 hours and 1 minute + - The DSpace 6.3 version ran for over 12 hours and didn't even finish (I killed it) + - Furthermore, it seems that each item is curated once for each collection it appears in, causing about 115,000 items to be processed, even though we only have about 87,000 +- I had been running the tasks on the entire repository with `-i 10568/0`, but I think I might need to try again with the special `all` option before writing to the dspace-tech mailing list for help + - Actually I just tested the `all` option on DSpace 5.8 and it still does many of the items multiple times, once for each of their mappings +- I finished the Atmire stats processing on all cores on DSpace Test: + - statistics: + - 2,040,385 docs: 2h 28m 49s + - statistics-2019: + - 8,960,000 docs: 12h 7s + - 1,780,575 docs: 2h 7m 29s + - statistics-2018: + - 2,200,000 docs: 12h 1m 11s + - 2,100,000 docs: 12h 4m 19s + - ? 
+ - statistics-2017: + - 1,970,000 docs: 12h 5m 45s + - 2,000,000 docs: 12h 5m 38s + - 1,312,674 docs: 4h 14m 23s + - statistics-2016: + - 1,669,020 docs: 12h 4m 3s + - 1,650,000 docs: 12h 7m 40s + - 850,611 docs: 44m 52s + - statistics-2014: + - 4,832,334 docs: 3h 53m 41s + - statistics-2013: + - 4,509,891 docs: 3h 18m 44s + - statistics-2012: + - 3,716,857 docs: 2h 36m 21s + - statistics-2011: + - 1,645,426 docs: 1h 11m 41s +- As far as I can tell, the processing became much faster once I purged all the unmigrated records + - It took about six days for the processing according to the times above, though 2015 is missing... hmm +- Now I am testing the Atmire Listings and Reports + - On both my local test and DSpace Test I get no results when searching for "Orth, A." and "Orth, Alan" or even Delia Grace, but the Discovery index is up to date and I have eighteen items... + - I sent a message to Atmire... + diff --git a/docs/2020-08/index.html b/docs/2020-08/index.html index 0aff276c3..12081ce43 100644 --- a/docs/2020-08/index.html +++ b/docs/2020-08/index.html @@ -19,7 +19,7 @@ It is class based so I can easily add support for other vocabularies, and the te - + @@ -43,9 +43,9 @@ It is class based so I can easily add support for other vocabularies, and the te "@type": "BlogPosting", "headline": "August, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-08/", - "wordCount": "2800", + "wordCount": "3168", "datePublished": "2020-08-02T15:35:54+03:00", - "dateModified": "2020-08-13T17:56:39+03:00", + "dateModified": "2020-08-14T11:22:16+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -599,10 +599,6 @@ Caused by: java.lang.NullPointerException $ export JAVA_OPTS='-Dfile.encoding=UTF-8 -Xmx2048m' $ chrt -b 0 dspace dsrun com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdateCLI -t 12 -c statistics-2016 -
$ curl -s "http://localhost:8081/solr/statistics-2017/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>id:\-1</query></delete>'
-