From 461428c92660460a75ea978b31a7c2a3c5d51f7a Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 20 Dec 2020 16:47:45 +0200 Subject: [PATCH] Add notes for 2020-12-20 --- content/posts/2020-12.md | 59 ++++++++++++++++++++ docs/2019-01/index.html | 4 +- docs/2020-12/index.html | 73 +++++++++++++++++++++++-- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/sitemap.xml | 10 ++-- 24 files changed, 154 insertions(+), 32 deletions(-) diff --git a/content/posts/2020-12.md b/content/posts/2020-12.md index be59bd059..7c0f36814 100644 --- a/content/posts/2020-12.md +++ b/content/posts/2020-12.md @@ -535,4 +535,63 @@ $ ./fix-metadata-values.py -i 2020-12-17-update-ILRI-author.csv -db dspace -u ds $ csvcut -c 'dc.identifier.citation[en_US],dc.identifier.uri,dc.identifier.uri[],dc.identifier.uri[en_US],dc.date.issued,dc.date.issued[],dc.date.issued[en_US],cg.identifier.status[en_US]' ~/Downloads/10568-80099.csv | csvgrep -c 'cg.identifier.status[en_US]' -m 'Limited Access' | csvgrep -c 'dc.date.issued' -m 2020 -c 'dc.date.issued[]' -m 2020 -c 'dc.date.issued[en_US]' -m 2020 > /tmp/limited-2020.csv ``` +## 2020-12-18 + +- I added support for indexing community views and downloads to [dspace-statistics-api](https://github.com/ilri/dspace-statistics-api) + - I still have to add the API endpoints to make the stats available + - 
Also, I played a little bit with Swagger via [falcon-swagger-ui](https://github.com/rdidyk/falcon-swagger-ui) and I think I can get that working for better API documentation / testing +- Atmire sent some feedback on the DeduplicateValuesProcessor + - They confirm that it should process _all_ duplicates, not just those in `owningComm` and `owningColl` + - They asked me to try it again on DSpace Test now that I've resync'd the Solr statistics cores from production + - I started processing the statistics core on DSpace Test + +## 2020-12-20 + +- The DeduplicateValuesProcessor has been running on DSpace Test for two days and it almost completed its second twelve-hour run, but crashed near the end: + +```console +... +Run 1 — 100% — 8,230,000/8,239,228 docs — 39s — 9h 8m 31s +Exception: Java heap space +java.lang.OutOfMemoryError: Java heap space + at java.util.Arrays.copyOfRange(Arrays.java:3664) + at java.lang.String.<init>(String.java:207) + at org.noggit.CharArr.toString(CharArr.java:164) + at org.apache.solr.common.util.JavaBinCodec.readStr(JavaBinCodec.java:599) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:180) + at org.apache.solr.common.util.JavaBinCodec.readArray(JavaBinCodec.java:492) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:186) + at org.apache.solr.common.util.JavaBinCodec.readSolrDocument(JavaBinCodec.java:360) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:219) + at org.apache.solr.common.util.JavaBinCodec.readArray(JavaBinCodec.java:492) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:186) + at org.apache.solr.common.util.JavaBinCodec.readSolrDocumentList(JavaBinCodec.java:374) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:221) + at org.apache.solr.common.util.JavaBinCodec.readOrderedMap(JavaBinCodec.java:125) + at org.apache.solr.common.util.JavaBinCodec.readVal(JavaBinCodec.java:188) + at 
org.apache.solr.common.util.JavaBinCodec.unmarshal(JavaBinCodec.java:116) + at org.apache.solr.client.solrj.impl.BinaryResponseParser.processResponse(BinaryResponseParser.java:43) + at org.apache.solr.client.solrj.impl.HttpSolrServer.executeMethod(HttpSolrServer.java:528) + at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:210) + at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:206) + at org.apache.solr.client.solrj.request.QueryRequest.process(QueryRequest.java:91) + at org.apache.solr.client.solrj.SolrServer.query(SolrServer.java:301) + at com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdater.getNextSetOfSolrDocuments(SourceFile:392) + at com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdater.performRun(SourceFile:157) + at com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdater.update(SourceFile:128) + at com.atmire.statistics.util.update.atomic.AtomicStatisticsUpdateCLI.main(SourceFile:78) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:229) + at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:81) +``` + +- That was with a JVM heap of 512m +- I looked in Solr and found dozens of duplicates of each field again... 
+ - I sent [feedback to Atmire](https://tracker.atmire.com/tickets-cgiar-ilri/view-ticket?id=839) +- I finished the technical work on adding community and collection support to the DSpace Statistics API + - I still need to update the tests as well as the documentation + diff --git a/docs/2019-01/index.html b/docs/2019-01/index.html index 42108e3bd..aa32b7cf7 100644 --- a/docs/2019-01/index.html +++ b/docs/2019-01/index.html @@ -60,7 +60,7 @@ I don’t see anything interesting in the web server logs around that time t "@type": "BlogPosting", "headline": "January, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-01/", - "wordCount": "5532", + "wordCount": "5531", "datePublished": "2019-01-02T09:48:30+02:00", "dateModified": "2020-10-19T15:23:30+03:00", "author": { @@ -949,7 +949,7 @@ $ http 'http://localhost:8081/solr/statistics/select?indent=on&rows=0&q= - +