From 5f76797488f7ed4a9924f359b8cf134203a48007 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 26 Oct 2020 16:34:45 +0300 Subject: [PATCH] Add notes for 2020-10-26 --- content/posts/2020-10.md | 107 +++++++++++++++++++++ docs/2020-10/index.html | 122 +++++++++++++++++++++++- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/sitemap.xml | 10 +- 22 files changed, 249 insertions(+), 28 deletions(-) diff --git a/content/posts/2020-10.md b/content/posts/2020-10.md index 60f5909a1..f9983f797 100644 --- a/content/posts/2020-10.md +++ b/content/posts/2020-10.md @@ -656,4 +656,111 @@ $ http 'http://localhost:9200/openrxv-items-final/_search?_source_includes=affil - replace: International Livestock Research Institute - I re-uploaded the mappings to Elasticsearch like I did yesterday and restarted the harvesting +## 2020-10-24 + +- Atmire sent a small version bump to CUA (6.x-4.1.10-ilri-RC5) to fix the logging of bot requests when `usage-statistics.logBots` is false + - I tested it by making several requests to DSpace Test with the `RTB website BOT` and `Delphi 2009` user agents and can verify that they are no longer logged +- I spent a few hours working on mappings on AReS + - I decided to do a full re-harvest on AReS with *no mappings* so I could extract the CRPs and affiliations to see how much work they needed + - I worked on my Python script to process some 
cleanups of the values to create find/replace mappings for common scenarios: + - Removing acronyms from the end of strings + - Removing "CRP on " from strings + - The problem is that the mappings are applied to all fields, and we want to keep "CGIAR Research Program on ..." in the authors, but not in the CRPs field + - Really the best solution is to have each repository use the same controlled vocabularies + +## 2020-10-25 + +- I re-installed DSpace Test with a fresh snapshot of CGSpace's to test the DSpace 6 upgrade (the last time was in 2020-05, and we've fixed a lot of issues since then): + +``` +$ cp dspace/etc/postgres/update-sequences.sql /tmp/dspace5-update-sequences.sql +$ git checkout origin/6_x-dev-atmire-modules +$ chrt -b 0 mvn -U -Dmirage2.on=true -Dmirage2.deps.included=false clean package +$ sudo su - postgres +$ psql dspacetest -c 'CREATE EXTENSION pgcrypto;' +$ psql dspacetest -c "DELETE FROM schema_version WHERE version IN ('5.8.2015.12.03.3');" +$ exit +$ sudo systemctl stop tomcat7 +$ cd dspace/target/dspace-installer +$ rm -rf /blah/dspacetest/config/spring +$ ant update +$ dspace database migrate +(10 minutes) +$ sudo systemctl start tomcat7 +(discovery indexing starts) +``` + +- Then I started processing the Solr stats one core and 1 million records at a time: + +``` +$ export JAVA_OPTS='-Dfile.encoding=UTF-8 -Xmx2048m' +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics +``` + +- After the fifth or so run I got this error: + +``` +Exception: Error while creating field 'p_group_id{type=uuid,properties=indexed,stored,multiValued}' from value '10' +org.apache.solr.client.solrj.impl.HttpSolrServer$RemoteSolrException: Error while creating 
field 'p_group_id{type=uuid,properties=indexed,stored,multiValued}' from value '10' + at org.apache.solr.client.solrj.impl.HttpSolrServer.executeMethod(HttpSolrServer.java:552) + at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:210) + at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:206) + at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:124) + at org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:68) + at org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:54) + at org.dspace.util.SolrUpgradePre6xStatistics.batchUpdateStats(SolrUpgradePre6xStatistics.java:161) + at org.dspace.util.SolrUpgradePre6xStatistics.run(SolrUpgradePre6xStatistics.java:456) + at org.dspace.util.SolrUpgradePre6xStatistics.main(SolrUpgradePre6xStatistics.java:365) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:229) + at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:81) +``` + +- So basically, as I saw at this same step in 2020-05, there are some documents that have IDs that have *not* been converted to UUID, and have *not* been labeled as "unmigrated" either... + - I see there are about 217,000 of them, 99% of which are of `type: 5` which is "search" + - I purged them: + +``` +$ curl -s "http://localhost:8083/solr/statistics/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "(*:* NOT id:/.{36}/) AND (*:* NOT id:/.+-unmigrated/)" +``` + +- Then I restarted the `solr-upgrade-statistics-6x` process, which apparently had no records left to process +- I started processing the statistics-2019 core... 
+ - I managed to process 7.5 million records in 7 hours without any errors! + +## 2020-10-26 + +- The statistics processing on the statistics-2018 core errored after 1.8 million records: + +``` +Exception: Java heap space +java.lang.OutOfMemoryError: Java heap space +``` + +- I had the same problem when I processed the statistics-2018 core in 2020-07 and 2020-08 + - I will try to purge some unmigrated records (around 460,000), most of which are of `type: 5` (search) so not relevant to our views and downloads anyways: + +```console +$ curl -s "http://localhost:8083/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "id:/.+-unmigrated/" +``` + +- I restarted the process and it crashed again a few minutes later + - I increased the memory to 4096m and tried again + - It eventually completed, after which time I purge all remaining 350,000 unmigrated records (99% of which were `type: 5`): + +``` +$ curl -s "http://localhost:8083/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "(*:* NOT id:/.{36}/) AND (*:* NOT id:/.+-unmigrated/)" +``` + +- Then I started processing the statistics-2017 core... 
+- I filed an issue with Atmire about the duplicate values in the `owningComm` and `containerCommunity` fields in Solr: https://tracker.atmire.com/tickets-cgiar-ilri/view-ticket?id=839 +- Add new ORCID identifier for [Perle LATRE DE LATE](https://orcid.org/0000-0003-3871-6277) to controlled vocabulary + diff --git a/docs/2020-10/index.html b/docs/2020-10/index.html index 4ac638314..042a8b338 100644 --- a/docs/2020-10/index.html +++ b/docs/2020-10/index.html @@ -23,7 +23,7 @@ During the FlywayDB migration I got an error: - + @@ -51,9 +51,9 @@ During the FlywayDB migration I got an error: "@type": "BlogPosting", "headline": "October, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-10/", - "wordCount": "4350", + "wordCount": "5014", "datePublished": "2020-10-06T16:55:54+03:00", - "dateModified": "2020-10-22T11:58:26+03:00", + "dateModified": "2020-10-24T22:23:06+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -432,7 +432,7 @@ $ http --print Hh https://dspacetest.cgiar.org/rest/bitstreams/dfa1d9c3-75d3-438
  • Reference: https://lucene.apache.org/core/4_0_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Regexp_Searches
  • -
  • I added [Ss]pider to the Tomcat Crawler Sessions Manager Valve regex because this can catch a few more generic bots and force them to use the same Tomcat JSESSIONID
  • +
  • I added [Ss]pider to the Tomcat Crawler Session Manager Valve regex because this can catch a few more generic bots and force them to use the same Tomcat JSESSIONID
  • I added a few of the patterns from above to our local agents list and ran the check-spider-hits.sh on CGSpace:
  • $ ./check-spider-hits.sh -f dspace/config/spiders/agents/ilri -s statistics -u http://localhost:8083/solr -p
    @@ -865,6 +865,120 @@ $ csvcut -c 'id,dc.subject[],dc.subject[en_US],cg.subject.ilri[],cg.subject.ilri
     
     
  • I re-uploaded the mappings to Elasticsearch like I did yesterday and restarted the harvesting
  • +

    2020-10-24

    +
      +
    • Atmire sent a small version bump to CUA (6.x-4.1.10-ilri-RC5) to fix the logging of bot requests when usage-statistics.logBots is false +
        +
      • I tested it by making several requests to DSpace Test with the RTB website BOT and Delphi 2009 user agents and can verify that they are no longer logged
      • +
      +
    • +
    • I spent a few hours working on mappings on AReS +
        +
      • I decided to do a full re-harvest on AReS with no mappings so I could extract the CRPs and affiliations to see how much work they needed
      • +
      • I worked on my Python script to process some cleanups of the values to create find/replace mappings for common scenarios: +
          +
        • Removing acronyms from the end of strings
        • +
        • Removing “CRP on ” from strings
        • +
        +
      • +
      • The problem is that the mappings are applied to all fields, and we want to keep “CGIAR Research Program on …” in the authors, but not in the CRPs field
      • +
      • Really the best solution is to have each repository use the same controlled vocabularies
      • +
      +
    • +
    +

    2020-10-25

    +
      +
    • I re-installed DSpace Test with a fresh snapshot of CGSpace to test the DSpace 6 upgrade (the last time was in 2020-05, and we’ve fixed a lot of issues since then):
    • +
    +
    $ cp dspace/etc/postgres/update-sequences.sql /tmp/dspace5-update-sequences.sql
    +$ git checkout origin/6_x-dev-atmire-modules
    +$ chrt -b 0 mvn -U -Dmirage2.on=true -Dmirage2.deps.included=false clean package
    +$ sudo su - postgres
    +$ psql dspacetest -c 'CREATE EXTENSION pgcrypto;'
    +$ psql dspacetest -c "DELETE FROM schema_version WHERE version IN ('5.8.2015.12.03.3');"
    +$ exit
    +$ sudo systemctl stop tomcat7
    +$ cd dspace/target/dspace-installer
    +$ rm -rf /blah/dspacetest/config/spring
    +$ ant update
    +$ dspace database migrate
    +(10 minutes)
    +$ sudo systemctl start tomcat7
    +(discovery indexing starts)
    +
      +
    • Then I started processing the Solr stats one core and 1 million records at a time:
    • +
    +
    $ export JAVA_OPTS='-Dfile.encoding=UTF-8 -Xmx2048m'
    +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics
    +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics
    +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics
    +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics
    +$ chrt -b 0 dspace solr-upgrade-statistics-6x -n 1000000 -i statistics
    +
      +
    • After the fifth or so run I got this error:
    • +
    +
    Exception: Error while creating field 'p_group_id{type=uuid,properties=indexed,stored,multiValued}' from value '10'
    +org.apache.solr.client.solrj.impl.HttpSolrServer$RemoteSolrException: Error while creating field 'p_group_id{type=uuid,properties=indexed,stored,multiValued}' from value '10'
    +        at org.apache.solr.client.solrj.impl.HttpSolrServer.executeMethod(HttpSolrServer.java:552)
    +        at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:210)
    +        at org.apache.solr.client.solrj.impl.HttpSolrServer.request(HttpSolrServer.java:206)
    +        at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:124)
    +        at org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:68)
    +        at org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:54)
    +        at org.dspace.util.SolrUpgradePre6xStatistics.batchUpdateStats(SolrUpgradePre6xStatistics.java:161)
    +        at org.dspace.util.SolrUpgradePre6xStatistics.run(SolrUpgradePre6xStatistics.java:456)
    +        at org.dspace.util.SolrUpgradePre6xStatistics.main(SolrUpgradePre6xStatistics.java:365)
    +        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    +        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    +        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    +        at java.lang.reflect.Method.invoke(Method.java:498)
    +        at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:229)
    +        at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:81)
    +
      +
    • So basically, as I saw at this same step in 2020-05, there are some documents that have IDs that have not been converted to UUID, and have not been labeled as “unmigrated” either… +
        +
      • I see there are about 217,000 of them, 99% of which are of type: 5 which is “search”
      • +
      • I purged them:
      • +
      +
    • +
    +
    $ curl -s "http://localhost:8083/solr/statistics/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>(*:* NOT id:/.{36}/) AND (*:* NOT id:/.+-unmigrated/)</query></delete>"
    +
      +
    • Then I restarted the solr-upgrade-statistics-6x process, which apparently had no records left to process
    • +
    • I started processing the statistics-2019 core… +
        +
      • I managed to process 7.5 million records in 7 hours without any errors!
      • +
      +
    • +
    +

    2020-10-26

    +
      +
    • The statistics processing on the statistics-2018 core errored after 1.8 million records:
    • +
    +
    Exception: Java heap space
    +java.lang.OutOfMemoryError: Java heap space
    +
      +
    • I had the same problem when I processed the statistics-2018 core in 2020-07 and 2020-08 +
        +
      • I will try to purge some unmigrated records (around 460,000), most of which are of type: 5 (search) so not relevant to our views and downloads anyways:
      • +
      +
    • +
    +
    $ curl -s "http://localhost:8083/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>id:/.+-unmigrated/</query></delete>"
    +
      +
    • I restarted the process and it crashed again a few minutes later +
        +
      • I increased the memory to 4096m and tried again
      • +
      • It eventually completed, after which time I purged all remaining 350,000 unmigrated records (99% of which were type: 5):
      • +
      +
    • +
    +
    $ curl -s "http://localhost:8083/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>(*:* NOT id:/.{36}/) AND (*:* NOT id:/.+-unmigrated/)</query></delete>"
    +
    diff --git a/docs/categories/index.html b/docs/categories/index.html index 458565277..0c633681b 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 7bad869ed..9a57f827b 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index cfe8a85b1..86dccf801 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 8e8371fe9..4e1014efd 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 7a2caecfa..aedb1773d 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 069f7a4fd..8f59be00e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 3f9b799f8..ade2cff9b 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index bd630378e..457c3e936 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index ea45cb01e..1a85fbe94 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 5d8229acb..05106cf39 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git 
a/docs/page/6/index.html b/docs/page/6/index.html index 44a91df4c..290a4a7a0 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 4b8e4b103..b3189a34d 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 0a480f383..85239c05d 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 4a00bd38c..7f489494f 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 3655fcb9a..78e945313 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 6a0054343..338a367cd 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 2383b0b34..b1d821f0b 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index b10d18af3..832d3284c 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 4b4c9ae38..0f8c4dfe8 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 6b72a91f8..f01d32fa8 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,27 +4,27 @@ https://alanorth.github.io/cgspace-notes/categories/ - 2020-10-22T11:58:26+03:00 + 2020-10-24T22:23:06+03:00 https://alanorth.github.io/cgspace-notes/ - 
2020-10-22T11:58:26+03:00 + 2020-10-24T22:23:06+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2020-10-22T11:58:26+03:00 + 2020-10-24T22:23:06+03:00 https://alanorth.github.io/cgspace-notes/2020-10/ - 2020-10-22T11:58:26+03:00 + 2020-10-24T22:23:06+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2020-10-22T11:58:26+03:00 + 2020-10-24T22:23:06+03:00