diff --git a/content/posts/2018-07.md b/content/posts/2018-07.md index c59eeaa56..3d74e9532 100644 --- a/content/posts/2018-07.md +++ b/content/posts/2018-07.md @@ -53,4 +53,46 @@ $ dspace database migrate ignored - Discuss AgriKnowledge including our Handle identifier on their harvested items from CGSpace - They seem to be only interested in Gates-funded outputs, for example: https://www.agriknowledge.org/files/tm70mv21t +## 2018-07-03 + +- Finally finish with the CIFOR Archive records (a total of 2448): + - I mapped the 50 items that were duplicates from elsewhere in CGSpace into [CIFOR Archive](https://cgspace.cgiar.org/handle/10568/16702) + - I did one last check of the remaining 2398 items and found eight that have a `cg.identifier.doi` that links to some URL other than a DOI, so I moved those to `cg.identifier.url` and `cg.identifier.googleurl` as appropriate + - Also, thirteen items had a DOI in their citation, but did not have a `cg.identifier.doi` field, so I added those + - Then I imported those 2398 items in two batches (to deal with memory issues): + +``` +$ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx1024m" +$ dspace metadata-import -e aorth@mjanja.ch -f /tmp/2018-06-27-New-CIFOR-Archive.csv +$ dspace metadata-import -e aorth@mjanja.ch -f /tmp/2018-06-27-New-CIFOR-Archive2.csv +``` + +- I noticed there are many items that use HTTP instead of HTTPS for their Google Books URL, and some missing HTTP entirely: + +``` +dspace=# select count(*) from metadatavalue where resource_type_id=2 and metadata_field_id=222 and text_value like 'http://books.google.%'; + count +------- + 785 +dspace=# select count(*) from metadatavalue where resource_type_id=2 and metadata_field_id=222 and text_value ~ '^books\.google\..*'; + count +------- + 4 +``` + +- I think I should fix that as well as some other garbage values like "test" and "dspace.ilri.org" etc: + +``` +dspace=# begin; +dspace=# update metadatavalue set text_value = regexp_replace(text_value, 
'http://books.google', 'https://books.google') where resource_type_id=2 and metadata_field_id=222 and text_value like 'http://books.google.%'; +UPDATE 785 +dspace=# update metadatavalue set text_value = regexp_replace(text_value, 'books.google', 'https://books.google') where resource_type_id=2 and metadata_field_id=222 and text_value ~ '^books\.google\..*'; +UPDATE 4 +dspace=# update metadatavalue set text_value='https://books.google.com/books?id=meF1CLdPSF4C' where resource_type_id=2 and metadata_field_id=222 and text_value='meF1CLdPSF4C'; +UPDATE 1 +dspace=# delete from metadatavalue where resource_type_id=2 and metadata_field_id=222 and metadata_value_id in (2299312, 10684, 10700, 996403); +DELETE 4 +dspace=# commit; +``` + diff --git a/docs/2015-11/index.html b/docs/2015-11/index.html index 912589abe..83f94ba1e 100644 --- a/docs/2015-11/index.html +++ b/docs/2015-11/index.html @@ -53,7 +53,7 @@ $ psql -c 'SELECT * from pg_stat_activity;' | grep idle | grep -c cgspac "/> - + diff --git a/docs/2015-12/index.html b/docs/2015-12/index.html index b5f572d0b..bf327cc45 100644 --- a/docs/2015-12/index.html +++ b/docs/2015-12/index.html @@ -55,7 +55,7 @@ Replace lzop with xz in log compression cron jobs on DSpace Test—it uses less "/> - + diff --git a/docs/2016-01/index.html b/docs/2016-01/index.html index cb587f882..5f89c15e3 100644 --- a/docs/2016-01/index.html +++ b/docs/2016-01/index.html @@ -45,7 +45,7 @@ Update GitHub wiki for documentation of maintenance tasks. 
"/> - + diff --git a/docs/2016-02/index.html b/docs/2016-02/index.html index 1acaed481..fc3aab130 100644 --- a/docs/2016-02/index.html +++ b/docs/2016-02/index.html @@ -59,7 +59,7 @@ Also, lots of things like “COTE D`LVOIRE” and “COTE D IVOIRE&r "/> - + diff --git a/docs/2016-03/index.html b/docs/2016-03/index.html index 3f66fbff7..51bb2e0bb 100644 --- a/docs/2016-03/index.html +++ b/docs/2016-03/index.html @@ -45,7 +45,7 @@ Reinstall my local (Mac OS X) DSpace stack with Tomcat 7, PostgreSQL 9.3, and Ja "/> - + diff --git a/docs/2016-04/index.html b/docs/2016-04/index.html index 91f9ee997..5e815f956 100644 --- a/docs/2016-04/index.html +++ b/docs/2016-04/index.html @@ -49,7 +49,7 @@ Also, I noticed the checker log has some errors we should pay attention to: "/> - + diff --git a/docs/2016-05/index.html b/docs/2016-05/index.html index 064e4820f..14931f3c4 100644 --- a/docs/2016-05/index.html +++ b/docs/2016-05/index.html @@ -53,7 +53,7 @@ There are 3,000 IPs accessing the REST API in a 24-hour period! 
"/> - + diff --git a/docs/2016-06/index.html b/docs/2016-06/index.html index 5842ccaf1..3732e51d5 100644 --- a/docs/2016-06/index.html +++ b/docs/2016-06/index.html @@ -51,7 +51,7 @@ Working on second phase of metadata migration, looks like this will work for mov "/> - + diff --git a/docs/2016-07/index.html b/docs/2016-07/index.html index 50b2feb8a..d114cfd78 100644 --- a/docs/2016-07/index.html +++ b/docs/2016-07/index.html @@ -67,7 +67,7 @@ In this case the select query was showing 95 results before the update "/> - + diff --git a/docs/2016-08/index.html b/docs/2016-08/index.html index 4fefde962..52822e7c5 100644 --- a/docs/2016-08/index.html +++ b/docs/2016-08/index.html @@ -61,7 +61,7 @@ $ git rebase -i dspace-5.5 "/> - + diff --git a/docs/2016-09/index.html b/docs/2016-09/index.html index b2f223691..f16da0b25 100644 --- a/docs/2016-09/index.html +++ b/docs/2016-09/index.html @@ -53,7 +53,7 @@ $ ldapsearch -x -H ldaps://svcgroot2.cgiarad.org:3269/ -b "dc=cgiarad,dc=or "/> - + diff --git a/docs/2016-10/index.html b/docs/2016-10/index.html index cfe599970..2bdbf3865 100644 --- a/docs/2016-10/index.html +++ b/docs/2016-10/index.html @@ -61,7 +61,7 @@ I exported a random item’s metadata as CSV, deleted all columns except id "/> - + diff --git a/docs/2016-11/index.html b/docs/2016-11/index.html index 8bbe636bd..b40f8db86 100644 --- a/docs/2016-11/index.html +++ b/docs/2016-11/index.html @@ -45,7 +45,7 @@ Add dc.type to the output options for Atmire’s Listings and Reports module "/> - + diff --git a/docs/2016-12/index.html b/docs/2016-12/index.html index 42c40e0ee..db9b2a224 100644 --- a/docs/2016-12/index.html +++ b/docs/2016-12/index.html @@ -69,7 +69,7 @@ Another worrying error from dspace.log is: "/> - + diff --git a/docs/2017-01/index.html b/docs/2017-01/index.html index d92958b32..d4d744b73 100644 --- a/docs/2017-01/index.html +++ b/docs/2017-01/index.html @@ -45,7 +45,7 @@ I asked on the dspace-tech mailing list because it seems to be broken, and actua "/> - + 
diff --git a/docs/2017-02/index.html b/docs/2017-02/index.html index a2a068387..53c0e7ab2 100644 --- a/docs/2017-02/index.html +++ b/docs/2017-02/index.html @@ -73,7 +73,7 @@ Looks like we’ll be using cg.identifier.ccafsprojectpii as the field name "/> - + diff --git a/docs/2017-03/index.html b/docs/2017-03/index.html index f24cc3e04..ae7fa55f3 100644 --- a/docs/2017-03/index.html +++ b/docs/2017-03/index.html @@ -77,7 +77,7 @@ $ identify ~/Desktop/alc_contrastes_desafios.jpg "/> - + diff --git a/docs/2017-04/index.html b/docs/2017-04/index.html index 4cd3ea8a3..e809161ff 100644 --- a/docs/2017-04/index.html +++ b/docs/2017-04/index.html @@ -63,7 +63,7 @@ $ [dspace]/bin/dspace filter-media -f -i 10568/16498 -p "ImageMagick PDF Th "/> - + diff --git a/docs/2017-05/index.html b/docs/2017-05/index.html index 58ecb5b88..22eea021d 100644 --- a/docs/2017-05/index.html +++ b/docs/2017-05/index.html @@ -29,7 +29,7 @@ - + diff --git a/docs/2017-06/index.html b/docs/2017-06/index.html index f9657b350..ec1832f26 100644 --- a/docs/2017-06/index.html +++ b/docs/2017-06/index.html @@ -29,7 +29,7 @@ - + diff --git a/docs/2017-07/index.html b/docs/2017-07/index.html index 334f9b92e..4668d4bbb 100644 --- a/docs/2017-07/index.html +++ b/docs/2017-07/index.html @@ -57,7 +57,7 @@ We can use PostgreSQL’s extended output format (-x) plus sed to format the "/> - + diff --git a/docs/2017-08/index.html b/docs/2017-08/index.html index f0fdcbf76..4c9f74e38 100644 --- a/docs/2017-08/index.html +++ b/docs/2017-08/index.html @@ -77,7 +77,7 @@ Then I cleaned up the author authorities and HTML characters in OpenRefine and s "/> - + diff --git a/docs/2017-09/index.html b/docs/2017-09/index.html index 1c015adee..d549f897f 100644 --- a/docs/2017-09/index.html +++ b/docs/2017-09/index.html @@ -53,7 +53,7 @@ Ask Sisay to clean up the WLE approvers a bit, as Marianne’s user account "/> - + diff --git a/docs/2017-10/index.html b/docs/2017-10/index.html index ee17e74e3..cb6fc3014 100644 --- 
a/docs/2017-10/index.html +++ b/docs/2017-10/index.html @@ -57,7 +57,7 @@ Add Katherine Lutz to the groups for content submission and edit steps of the CG "/> - + diff --git a/docs/2017-11/index.html b/docs/2017-11/index.html index 04853138d..88f692e2e 100644 --- a/docs/2017-11/index.html +++ b/docs/2017-11/index.html @@ -77,7 +77,7 @@ COPY 54701 "/> - + diff --git a/docs/2017-12/index.html b/docs/2017-12/index.html index 2e09f244a..85b1c9e05 100644 --- a/docs/2017-12/index.html +++ b/docs/2017-12/index.html @@ -47,7 +47,7 @@ The list of connections to XMLUI and REST API for today: "/> - + diff --git a/docs/2018-01/index.html b/docs/2018-01/index.html index e3dd9c357..8b6bbe63f 100644 --- a/docs/2018-01/index.html +++ b/docs/2018-01/index.html @@ -185,7 +185,7 @@ Danny wrote to ask for help renewing the wildcard ilri.org certificate and I adv "/> - + diff --git a/docs/2018-02/index.html b/docs/2018-02/index.html index 29426cc50..b6e4074c4 100644 --- a/docs/2018-02/index.html +++ b/docs/2018-02/index.html @@ -47,7 +47,7 @@ I copied the logic in the jmx_tomcat_dbpools provided by Ubuntu’s munin-pl "/> - + diff --git a/docs/2018-03/index.html b/docs/2018-03/index.html index d67fccff8..98bf22ab6 100644 --- a/docs/2018-03/index.html +++ b/docs/2018-03/index.html @@ -41,7 +41,7 @@ Export a CSV of the IITA community metadata for Martin Mueller "/> - + diff --git a/docs/2018-04/index.html b/docs/2018-04/index.html index 30047b659..c93d12c93 100644 --- a/docs/2018-04/index.html +++ b/docs/2018-04/index.html @@ -43,7 +43,7 @@ Catalina logs at least show some memory errors yesterday: "/> - + diff --git a/docs/2018-05/index.html b/docs/2018-05/index.html index 586b797bc..b1149e654 100644 --- a/docs/2018-05/index.html +++ b/docs/2018-05/index.html @@ -55,7 +55,7 @@ Also, I switched it to use OpenJDK instead of Oracle Java, as well as re-worked "/> - + diff --git a/docs/2018-06/index.html b/docs/2018-06/index.html index c6f5b734b..145c499d1 100644 --- a/docs/2018-06/index.html 
+++ b/docs/2018-06/index.html @@ -83,7 +83,7 @@ sys 2m7.289s "/> - + diff --git a/docs/2018-07/index.html b/docs/2018-07/index.html index a9196a571..a64cd9aab 100644 --- a/docs/2018-07/index.html +++ b/docs/2018-07/index.html @@ -30,7 +30,7 @@ There is insufficient memory for the Java Runtime Environment to continue. - + @@ -61,7 +61,7 @@ There is insufficient memory for the Java Runtime Environment to continue. "/> - + @@ -71,9 +71,9 @@ There is insufficient memory for the Java Runtime Environment to continue. "@type": "BlogPosting", "headline": "July, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-07/", - "wordCount": "210", + "wordCount": "469", "datePublished": "2018-07-01T12:56:54+03:00", - "dateModified": "2018-07-01T18:05:01+03:00", + "dateModified": "2018-07-02T17:33:38+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -195,6 +195,54 @@ $ dspace database migrate ignored
  • They seem to be only interested in Gates-funded outputs, for example: https://www.agriknowledge.org/files/tm70mv21t
  • +

    2018-07-03

    + + + +
    $ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx1024m"
    +$ dspace metadata-import -e aorth@mjanja.ch -f /tmp/2018-06-27-New-CIFOR-Archive.csv
    +$ dspace metadata-import -e aorth@mjanja.ch -f /tmp/2018-06-27-New-CIFOR-Archive2.csv
    +
    + + + +
    dspace=# select count(*) from metadatavalue where resource_type_id=2 and metadata_field_id=222 and text_value like 'http://books.google.%';
    + count
    +-------
    +   785
    +dspace=# select count(*) from metadatavalue where resource_type_id=2 and metadata_field_id=222 and text_value ~ '^books\.google\..*';
    + count
    +-------
    +     4
    +
    + + + +
    dspace=# begin;
    +dspace=# update metadatavalue set text_value = regexp_replace(text_value, 'http://books.google', 'https://books.google') where resource_type_id=2 and metadata_field_id=222 and text_value like 'http://books.google.%';
    +UPDATE 785
    +dspace=# update metadatavalue set text_value = regexp_replace(text_value, 'books.google', 'https://books.google') where resource_type_id=2 and metadata_field_id=222 and text_value ~ '^books\.google\..*';
    +UPDATE 4
    +dspace=# update metadatavalue set text_value='https://books.google.com/books?id=meF1CLdPSF4C' where resource_type_id=2 and metadata_field_id=222 and text_value='meF1CLdPSF4C';
    +UPDATE 1
    +dspace=# delete from metadatavalue where resource_type_id=2 and metadata_field_id=222 and metadata_value_id in (2299312, 10684, 10700, 996403);
    +DELETE 4
    +dspace=# commit;
    +
    + diff --git a/docs/404.html b/docs/404.html index 096e2ad06..4668f29a1 100644 --- a/docs/404.html +++ b/docs/404.html @@ -26,7 +26,7 @@ - + diff --git a/docs/categories/index.html b/docs/categories/index.html index e2828235c..849aae9c0 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 92509a862..c8b620ce3 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/categories/page/2/index.html b/docs/categories/page/2/index.html index 10fc75354..f11f5d468 100644 --- a/docs/categories/page/2/index.html +++ b/docs/categories/page/2/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/categories/page/3/index.html b/docs/categories/page/3/index.html index 2b9699810..d2272b4f7 100644 --- a/docs/categories/page/3/index.html +++ b/docs/categories/page/3/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/categories/page/4/index.html b/docs/categories/page/4/index.html index 8a2cb2e5a..3f236f2b8 100644 --- a/docs/categories/page/4/index.html +++ b/docs/categories/page/4/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/cgiar-library-migration/index.html b/docs/cgiar-library-migration/index.html index e658d1d82..725330b7a 100644 --- a/docs/cgiar-library-migration/index.html +++ b/docs/cgiar-library-migration/index.html @@ -29,7 +29,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 06b65a39f..6adecfe60 100644 --- a/docs/index.html +++ b/docs/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 19fbbdcb7..3239b5fd3 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index b8b263b95..d3bde8d1d 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/page/4/index.html 
b/docs/page/4/index.html index 05d60a4e3..f99881832 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index dbb49f9da..fd5597421 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 62b5598b4..d432c6fd3 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index a70092980..13821a89a 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index be99c60ed..4f309142f 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 318a7dea8..66d8fe337 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-07/ - 2018-07-01T18:05:01+03:00 + 2018-07-02T17:33:38+03:00 @@ -174,7 +174,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-07-01T18:05:01+03:00 + 2018-07-02T17:33:38+03:00 0 @@ -185,7 +185,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-07-01T18:05:01+03:00 + 2018-07-02T17:33:38+03:00 0 @@ -197,13 +197,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-07-01T18:05:01+03:00 + 2018-07-02T17:33:38+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-07-01T18:05:01+03:00 + 2018-07-02T17:33:38+03:00 0 diff --git a/docs/tags/index.html b/docs/tags/index.html index 0db5a810c..fbaa1062f 100644 --- a/docs/tags/index.html +++ b/docs/tags/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/notes/index.html b/docs/tags/notes/index.html index 46ed777ae..52ec56f99 100644 --- a/docs/tags/notes/index.html +++ b/docs/tags/notes/index.html @@ -26,7 +26,7 @@ - + 
diff --git a/docs/tags/notes/page/2/index.html b/docs/tags/notes/page/2/index.html index 6aa99aaf6..a02a27bdd 100644 --- a/docs/tags/notes/page/2/index.html +++ b/docs/tags/notes/page/2/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/notes/page/3/index.html b/docs/tags/notes/page/3/index.html index d37ac9a78..de895c3c9 100644 --- a/docs/tags/notes/page/3/index.html +++ b/docs/tags/notes/page/3/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/notes/page/4/index.html b/docs/tags/notes/page/4/index.html index 20e1b1737..c2e563b32 100644 --- a/docs/tags/notes/page/4/index.html +++ b/docs/tags/notes/page/4/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/page/2/index.html b/docs/tags/page/2/index.html index d3f3ab556..413313067 100644 --- a/docs/tags/page/2/index.html +++ b/docs/tags/page/2/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/page/3/index.html b/docs/tags/page/3/index.html index 6613e0f62..476760a9f 100644 --- a/docs/tags/page/3/index.html +++ b/docs/tags/page/3/index.html @@ -26,7 +26,7 @@ - + diff --git a/docs/tags/page/4/index.html b/docs/tags/page/4/index.html index 6581c7ff2..d874856a2 100644 --- a/docs/tags/page/4/index.html +++ b/docs/tags/page/4/index.html @@ -26,7 +26,7 @@ - +