From c053b90504f75bc537b07bb5573cddef7ccb3167 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 5 Feb 2019 08:59:54 +0200 Subject: [PATCH] Update notes for 2019-02-05 --- content/posts/2019-02.md | 30 ++++++++++++++++++++++++++++++ docs/2019-02/index.html | 39 ++++++++++++++++++++++++++++++++++++--- docs/robots.txt | 2 +- docs/sitemap.xml | 20 ++++++++++---------- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/content/posts/2019-02.md b/content/posts/2019-02.md index 2c3b21d1d..17bb554b2 100644 --- a/content/posts/2019-02.md +++ b/content/posts/2019-02.md @@ -160,4 +160,34 @@ COPY 321 - At this rate I think I just need to stop paying attention to these alerts—DSpace gets thrashed when people use the APIs properly and there's nothing we can do to improve REST API performance! - Perhaps I just need to keep increasing the Linode alert threshold (currently 300%) for this host? +## 2019-02-05 + +- Peter sent me corrections and deletions for the CTA subjects and as usual, there were encoding errors with some accents in his file +- In other news, it seems that the GREL syntax regarding booleans changed in OpenRefine recently, so I need to update some expressions like the one I use to detect encoding errors to use `toString()`: + +``` +or( + isNotNull(value.match(/.*\uFFFD.*/)), + isNotNull(value.match(/.*\u00A0.*/)), + isNotNull(value.match(/.*\u200A.*/)), + isNotNull(value.match(/.*\u2019.*/)), + isNotNull(value.match(/.*\u00b4.*/)), + isNotNull(value.match(/.*\u007e.*/)) +).toString() +``` + +- Testing the corrections for sixty-five items and sixteen deletions using my [fix-metadata-values.py](https://gist.github.com/alanorth/df92cbfb54d762ba21b28f7cd83b6897) and [delete-metadata-values.py](https://gist.github.com/alanorth/bd7d58c947f686401a2b1fadc78736be) scripts: + +``` +$ ./fix-metadata-values.py -i 2019-02-04-Correct-65-CTA-Subjects.csv -f cg.subject.cta -t CORRECT -m 124 -db dspace -u dspace -p 'fuu' -d +$ ./delete-metadata-values.py -i 
2019-02-04-Delete-16-CTA-Subjects.csv -f cg.subject.cta -m 124 -db dspace -u dspace -p 'fuu' -d +``` + +- I applied them on DSpace Test and CGSpace and started a full Discovery re-index: + +``` +$ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx1024m" +$ time schedtool -D -e ionice -c2 -n7 nice -n19 dspace index-discovery -b +``` + diff --git a/docs/2019-02/index.html b/docs/2019-02/index.html index 511f1bdc1..ae3710cb6 100644 --- a/docs/2019-02/index.html +++ b/docs/2019-02/index.html @@ -42,7 +42,7 @@ sys 0m1.979s - + @@ -89,9 +89,9 @@ sys 0m1.979s "@type": "BlogPosting", "headline": "February, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-02/", - "wordCount": "846", + "wordCount": "990", "datePublished": "2019-02-01T21:37:30+02:00", - "dateModified": "2019-02-04T20:09:20+02:00", + "dateModified": "2019-02-04T23:05:12+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -338,6 +338,39 @@ COPY 321
  • Perhaps I just need to keep increasing the Linode alert threshold (currently 300%) for this host?
  • +

    2019-02-05

    + + + +
    or(
    +  isNotNull(value.match(/.*\uFFFD.*/)),
    +  isNotNull(value.match(/.*\u00A0.*/)),
    +  isNotNull(value.match(/.*\u200A.*/)),
    +  isNotNull(value.match(/.*\u2019.*/)),
    +  isNotNull(value.match(/.*\u00b4.*/)),
    +  isNotNull(value.match(/.*\u007e.*/))
    +).toString()
    +
    + + + +
    $ ./fix-metadata-values.py -i 2019-02-04-Correct-65-CTA-Subjects.csv -f cg.subject.cta -t CORRECT -m 124 -db dspace -u dspace -p 'fuu' -d
    +$ ./delete-metadata-values.py -i 2019-02-04-Delete-16-CTA-Subjects.csv -f cg.subject.cta -m 124 -db dspace -u dspace -p 'fuu' -d
    +
    + + + +
    $ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx1024m"
    +$ time schedtool -D -e ionice -c2 -n7 nice -n19 dspace index-discovery -b
    +
    + diff --git a/docs/robots.txt b/docs/robots.txt index 66dc42771..d85aff9f8 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -44,7 +44,7 @@ Disallow: /cgspace-notes/2015-12/ Disallow: /cgspace-notes/2015-11/ Disallow: /cgspace-notes/ Disallow: /cgspace-notes/categories/ -Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/categories/notes/ +Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/posts/ Disallow: /cgspace-notes/tags/ diff --git a/docs/sitemap.xml b/docs/sitemap.xml index e222fce9f..54f4c587e 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-02/ - 2019-02-04T20:09:20+02:00 + 2019-02-04T23:05:12+02:00 @@ -209,7 +209,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-02-04T20:09:20+02:00 + 2019-02-04T23:05:12+02:00 0 @@ -218,27 +218,27 @@ 0 - - https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-02-04T20:09:20+02:00 - 0 - - https://alanorth.github.io/cgspace-notes/categories/notes/ 2018-03-09T22:10:33+02:00 0 + + https://alanorth.github.io/cgspace-notes/tags/notes/ + 2019-02-04T23:05:12+02:00 + 0 + + https://alanorth.github.io/cgspace-notes/posts/ - 2019-02-04T20:09:20+02:00 + 2019-02-04T23:05:12+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-02-04T20:09:20+02:00 + 2019-02-04T23:05:12+02:00 0