From c070fda9b3583988916fb0e5ebe487c18a03655e Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 25 Mar 2018 22:46:48 +0300 Subject: [PATCH] Add notes for 2018-03-25 --- content/posts/2018-03.md | 45 ++++++++++++++++++++++++++++++ docs/2018-03/index.html | 60 ++++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 +++---- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/content/posts/2018-03.md b/content/posts/2018-03.md index 893716c4f..7cc2e8f6c 100644 --- a/content/posts/2018-03.md +++ b/content/posts/2018-03.md @@ -445,3 +445,48 @@ isNotNull(value.match(/.*\ufffd.*/)) - More work on the Ubuntu 18.04 readiness stuff for the [Ansible playbooks](https://github.com/ilri/rmg-ansible-public) - The playbook now uses the system's Ruby and Node.js so I don't have to manually install RVM and NVM after + +## 2018-03-25 + +- Looking at Peter's author corrections and trying to work out a way to find errors in OpenRefine easily +- I can find all names that have acceptable characters using a GREL expression like: + +``` +isNotNull(value.match(/.*[a-zA-ZáÁéèïíñØøöóúü].*/)) +``` + +- But it's probably better to just say which characters I know for sure are not valid (like parentheses, pipe, or weird Unicode characters): + +``` +or( + isNotNull(value.match(/.*[(|)].*/)), + isNotNull(value.match(/.*\uFFFD.*/)), + isNotNull(value.match(/.*\u00A0.*/)), + isNotNull(value.match(/.*\u200A.*/)) +) +``` + +- And here's one combined GREL expression to check for items marked as to delete or check so I can flag them and export them to a separate CSV (though perhaps it's time to add delete support to my `fix-metadata-values.py` script): + +``` +or( + isNotNull(value.match(/.*delete.*/i)), + isNotNull(value.match(/.*remove.*/i)), + isNotNull(value.match(/.*check.*/i)) +) +``` + +- So I guess the routine in OpenRefine is: + - Transform: trim leading/trailing whitespace + - Transform: collapse consecutive whitespace + - Custom text facet for items to delete/check + - 
Custom text facet for illegal characters + +- Test the corrections and deletions locally, then run them on CGSpace: + +``` +$ ./fix-metadata-values.py -i /tmp/Correct-2928-Authors-2018-03-21.csv -db dspace -u dspace -p 'fuuu' -f dc.contributor.author -t correct -m 3 +$ ./delete-metadata-values.py -i /tmp/Delete-8-Authors-2018-03-21.csv -f dc.contributor.author -m 3 -db dspacetest -u dspace -p 'fuuu' +``` + +- Afterwards I started a full Discovery reindexing diff --git a/docs/2018-03/index.html b/docs/2018-03/index.html index 72c06dfe7..30b6a835f 100644 --- a/docs/2018-03/index.html +++ b/docs/2018-03/index.html @@ -20,7 +20,7 @@ Export a CSV of the IITA community metadata for Martin Mueller - + @@ -51,9 +51,9 @@ Export a CSV of the IITA community metadata for Martin Mueller "@type": "BlogPosting", "headline": "March, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-03/", - "wordCount": "2509", + "wordCount": "2695", "datePublished": "2018-03-02T16:07:54+02:00", - "dateModified": "2018-03-22T23:07:03+02:00", + "dateModified": "2018-03-24T22:03:00+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -626,6 +626,60 @@ sys 2m45.135s
  • The playbook now uses the system’s Ruby and Node.js so I don’t have to manually install RVM and NVM after
  • +

    2018-03-25

    + + + +
    isNotNull(value.match(/.*[a-zA-ZáÁéèïíñØøöóúü].*/))
    +
    + + + +
    or(
    +  isNotNull(value.match(/.*[(|)].*/)),
    +  isNotNull(value.match(/.*\uFFFD.*/)),
    +  isNotNull(value.match(/.*\u00A0.*/)),
    +  isNotNull(value.match(/.*\u200A.*/))
    +)
    +
    + + + +
    or(
    +  isNotNull(value.match(/.*delete.*/i)),
    +  isNotNull(value.match(/.*remove.*/i)),
    +  isNotNull(value.match(/.*check.*/i))
    +)
    +
    + + + +
    $ ./fix-metadata-values.py -i /tmp/Correct-2928-Authors-2018-03-21.csv -db dspace -u dspace -p 'fuuu' -f dc.contributor.author -t correct -m 3
    +$ ./delete-metadata-values.py -i /tmp/Delete-8-Authors-2018-03-21.csv -f dc.contributor.author -m 3 -db dspacetest -u dspace -p 'fuuu'
    +
    + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index f222dafb5..bad98e612 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-03/ - 2018-03-22T23:07:03+02:00 + 2018-03-24T22:03:00+02:00 @@ -154,7 +154,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-03-22T23:07:03+02:00 + 2018-03-24T22:03:00+02:00 0 @@ -165,7 +165,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-03-22T23:07:03+02:00 + 2018-03-24T22:03:00+02:00 0 @@ -177,13 +177,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-03-22T23:07:03+02:00 + 2018-03-24T22:03:00+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-03-22T23:07:03+02:00 + 2018-03-24T22:03:00+02:00 0