From 3a1e203aa6d9555f1f234fe0bd1ef11850164137 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 9 May 2017 00:43:02 +0300 Subject: [PATCH] Update notes for 2017-05-08 --- content/post/2017-05.md | 13 +++++++++++++ public/2017-05/index.html | 20 +++++++++++++++++--- public/sitemap.xml | 10 +++++----- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/content/post/2017-05.md b/content/post/2017-05.md index b7872f9d3..322101fab 100644 --- a/content/post/2017-05.md +++ b/content/post/2017-05.md @@ -56,6 +56,19 @@ $ ./fix-metadata-values.py -i ccafs-flagships-may7.csv -f cg.subject.ccafs -t co - Start working on CGIAR Library migration - We decided to use AIP export to preserve the hierarchies and handles of communities and collections +- When ingesting some collections I was getting `java.lang.OutOfMemoryError: GC overhead limit exceeded`, which can be solved by disabling the GC timeout with `-XX:-UseGCOverheadLimit` +- Other times I was getting an error about heap space, so I kept bumping the RAM allocation by 512MB each time (up to 4096m!) it crashed +- This leads to tens of thousands of abandoned files in the assetstore, which need to be cleaned up using `dspace cleanup -v`, or else you'll run out of disk space +- In the end I realized it's better to use submission mode (`-s`) to ingest the community object as a single AIP without its children, followed by each of the collections: + +``` +$ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx2048m -XX:-UseGCOverheadLimit" +$ [dspace]/bin/dspace packager -s -o ignoreHandle=false -t AIP -e some@user.com -p 10568/87775 /home/aorth/10947-1/10947-1.zip +$ for collection in /home/aorth/10947-1/COLLECTION@10947-*; do [dspace]/bin/dspace packager -s -o ignoreHandle=false -t AIP -e aorth@mjanja.ch -p 10947/1 $collection; done +$ for item in /home/aorth/10947-1/ITEM@10947-*; do [dspace]/bin/dspace packager -r -f -u -t AIP -e aorth@mjanja.ch $item; done +``` + +- Note that in submission mode DSpace ignores the handle specified in `mets.xml` in the zip file, so you need to turn that off with `-o ignoreHandle=false` - Give feedback to CIFOR about their data quality: - Suggestion: uppercase dc.subject, cg.coverage.region, and cg.coverage.subregion in your crosswalk so they match CGSpace and therefore can be faceted / reported on easier - Suggestion: use CGSpace's CRP names (cg.contributor.crp), see: dspace/config/input-forms.xml diff --git a/public/2017-05/index.html b/public/2017-05/index.html index 39c861375..da5458782 100644 --- a/public/2017-05/index.html +++ b/public/2017-05/index.html @@ -13,7 +13,7 @@ - + @@ -45,9 +45,9 @@ "@type": "BlogPosting", "headline": "May, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-05/", - "wordCount": "464", + "wordCount": "653", "datePublished": "2017-05-01T16:21:52+02:00", - "dateModified": "2017-05-08T17:51:55+03:00", + "dateModified": "2017-05-08T20:20:52+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -181,6 +181,20 @@ + +
$ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx2048m -XX:-UseGCOverheadLimit"
+$ [dspace]/bin/dspace packager -s -o ignoreHandle=false -t AIP -e some@user.com -p 10568/87775 /home/aorth/10947-1/10947-1.zip
+$ for collection in /home/aorth/10947-1/COLLECTION@10947-*; do [dspace]/bin/dspace packager -s -o ignoreHandle=false -t AIP -e aorth@mjanja.ch -p 10947/1 $collection; done
+$ for item in /home/aorth/10947-1/ITEM@10947-*; do [dspace]/bin/dspace packager -r -f -u -t AIP -e aorth@mjanja.ch $item; done
+
+ +