From f37fb890929e8bdbd934e795bdcfd1209ac8515c Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Tue, 4 Sep 2018 17:08:34 +0300
Subject: [PATCH] Update notes for 2018-09-04

---
 content/posts/2018-09.md | 10 +++++++++-
 docs/2018-05/index.html  |  8 ++++----
 docs/2018-09/index.html  | 16 ++++++++++++----
 docs/sitemap.xml         | 12 ++++++------
 4 files changed, 31 insertions(+), 15 deletions(-)
diff --git a/content/posts/2018-09.md b/content/posts/2018-09.md
index 9e81e2581..aa7d1ec86 100644
--- a/content/posts/2018-09.md
+++ b/content/posts/2018-09.md
@@ -54,7 +54,15 @@ Caused by: java.lang.RuntimeException: Failed to startup the DSpace Service Mana
 - I'm looking over the latest round of IITA records from Sisay: [Mercy1806_August_29](https://dspacetest.cgiar.org/handle/10568/104230)
   - All fields are split with multiple columns like `cg.authorship.types` and `cg.authorship.types[]`
   - This makes it super annoying to do the checks and cleanup, so I will merge them (also time consuming)
-  - Five issue dates had values like `2013-5` so I corrected them to be `2013-05`
+  - Five items had `dc.date.issued` values like `2013-5` so I corrected them to be `2013-05`
   - Several metadata fields had values with newlines in them (even in some titles!), which I fixed by trimming the consecutive whitespaces in Open Refine
+  - Many (196!) items from before 2011 are indicated as having a CRP, but CRPs didn't exist then so this is impossible
+    - I got all items that were from 2011 and onwards using a custom facet with this GREL on the `dc.date.issued` column: `isNotNull(value.match(/201[1-8].*/))` and then blanking their CRPs
+  - Some affiliations with only one separator (|) for multiple values
+  - I replaced smart quotes like `’` with plain ones
+  - Some inconsistencies in `cg.subject.iita` like COWPEA and COWPEAS, and YAM and YAMS, etc, as well as some spelling mistakes like IMPACT ASSESSMENTN
+  - Some values in the `dc.identifier.isbn` are actually ISSNs so I moved them to the `dc.identifier.issn` column
+  - I found one invalid ISSN using a custom text facet with the regex from the [ISSN page on Wikipedia](https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format): `isNotBlank(value.match(/^\d{4}-\d{3}[\dxX]$/))`
+  - One invalid value for `dc.type`
 
 <!-- vim: set sw=2 ts=2: -->
diff --git a/docs/2018-05/index.html b/docs/2018-05/index.html
index 8438c60d4..71376ee77 100644
--- a/docs/2018-05/index.html
+++ b/docs/2018-05/index.html
@@ -22,7 +22,7 @@ Also, I switched it to use OpenJDK instead of Oracle Java, as well as re-worked
 " />
 <meta property="og:type" content="article" />
 <meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2018-05/" /><meta property="article:published_time" content="2018-05-01T16:43:54&#43;03:00"/>
-<meta property="article:modified_time" content="2018-05-31T15:53:12-07:00"/>
+<meta property="article:modified_time" content="2018-09-04T16:15:26&#43;03:00"/>
 <meta name="twitter:card" content="summary"/>
 <meta name="twitter:title" content="May, 2018"/>
 <meta name="twitter:description" content="2018-05-01
@@ -49,9 +49,9 @@ Also, I switched it to use OpenJDK instead of Oracle Java, as well as re-worked
   "@type": "BlogPosting",
   "headline": "May, 2018",
   "url": "https://alanorth.github.io/cgspace-notes/2018-05/",
-  "wordCount": "3502",
+  "wordCount": "3503",
   "datePublished": "2018-05-01T16:43:54&#43;03:00",
-  "dateModified": "2018-05-31T15:53:12-07:00",
+  "dateModified": "2018-09-04T16:15:26&#43;03:00",
   "author": {
     "@type": "Person",
     "name": "Alan Orth"
@@ -294,7 +294,7 @@ Livestock and Fish
 <li>Just a note to myself, I figured out how to get reconcile-csv to run from source rather than running the old pre-compiled JAR file:</li>
 </ul>
 
-<pre><code>$ lein run /tmp/crps.csv id
+<pre><code>$ lein run /tmp/crps.csv name id
 </code></pre>
 
 <ul>
diff --git a/docs/2018-09/index.html b/docs/2018-09/index.html
index cf86cefaa..d52e532df 100644
--- a/docs/2018-09/index.html
+++ b/docs/2018-09/index.html
@@ -18,7 +18,7 @@ I&rsquo;m testing the new DSpace 5.8 branch in my Ubuntu 18.04 environment and I
 " />
 <meta property="og:type" content="article" />
 <meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2018-09/" /><meta property="article:published_time" content="2018-09-02T09:55:54&#43;03:00"/>
-<meta property="article:modified_time" content="2018-09-03T16:47:24&#43;03:00"/>
+<meta property="article:modified_time" content="2018-09-04T13:25:13&#43;03:00"/>
 <meta name="twitter:card" content="summary"/>
 <meta name="twitter:title" content="September, 2018"/>
 <meta name="twitter:description" content="2018-09-02
@@ -41,9 +41,9 @@ I&rsquo;m testing the new DSpace 5.8 branch in my Ubuntu 18.04 environment and I
   "@type": "BlogPosting",
   "headline": "September, 2018",
   "url": "https://alanorth.github.io/cgspace-notes/2018-09/",
-  "wordCount": "538",
+  "wordCount": "668",
   "datePublished": "2018-09-02T09:55:54&#43;03:00",
-  "dateModified": "2018-09-03T16:47:24&#43;03:00",
+  "dateModified": "2018-09-04T13:25:13&#43;03:00",
   "author": {
     "@type": "Person",
     "name": "Alan Orth"
@@ -166,8 +166,16 @@ Caused by: java.lang.RuntimeException: Failed to startup the DSpace Service Mana
 <ul>
 <li>All fields are split with multiple columns like <code>cg.authorship.types</code> and <code>cg.authorship.types[]</code></li>
 <li>This makes it super annoying to do the checks and cleanup, so I will merge them (also time consuming)</li>
-<li>Five issue dates had values like <code>2013-5</code> so I corrected them to be <code>2013-05</code></li>
+<li>Five items had <code>dc.date.issued</code> values like <code>2013-5</code> so I corrected them to be <code>2013-05</code></li>
 <li>Several metadata fields had values with newlines in them (even in some titles!), which I fixed by trimming the consecutive whitespaces in Open Refine</li>
+<li>Many (196!) items from before 2011 are indicated as having a CRP, but CRPs didn&rsquo;t exist then so this is impossible</li>
+<li>I got all items that were from 2011 and onwards using a custom facet with this GREL on the <code>dc.date.issued</code> column: <code>isNotNull(value.match(/201[1-8].*/))</code> and then blanking their CRPs</li>
+<li>Some affiliations with only one separator (|) for multiple values</li>
+<li>I replaced smart quotes like <code>’</code> with plain ones</li>
+<li>Some inconsistencies in <code>cg.subject.iita</code> like COWPEA and COWPEAS, and YAM and YAMS, etc, as well as some spelling mistakes like IMPACT ASSESSMENTN</li>
+<li>Some values in the <code>dc.identifier.isbn</code> are actually ISSNs so I moved them to the <code>dc.identifier.issn</code> column</li>
+<li>I found one invalid ISSN using a custom text facet with the regex from the <a href="https://en.wikipedia.org/wiki/International_Standard_Serial_Number#Code_format">ISSN page on Wikipedia</a>: <code>isNotBlank(value.match(/^\d{4}-\d{3}[\dxX]$/))</code></li>
+<li>One invalid value for <code>dc.type</code></li>
 </ul></li>
 </ul>
 
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index b71ab02d5..496215727 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -4,7 +4,7 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/2018-09/</loc>
-    <lastmod>2018-09-03T16:47:24+03:00</lastmod>
+    <lastmod>2018-09-04T13:25:13+03:00</lastmod>
   </url>
   
   <url>
@@ -24,7 +24,7 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/2018-05/</loc>
-    <lastmod>2018-05-31T15:53:12-07:00</lastmod>
+    <lastmod>2018-09-04T16:15:26+03:00</lastmod>
   </url>
   
   <url>
@@ -184,7 +184,7 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/</loc>
-    <lastmod>2018-09-03T16:47:24+03:00</lastmod>
+    <lastmod>2018-09-04T13:25:13+03:00</lastmod>
     <priority>0</priority>
   </url>
   
@@ -195,7 +195,7 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/tags/notes/</loc>
-    <lastmod>2018-09-03T16:47:24+03:00</lastmod>
+    <lastmod>2018-09-04T13:25:13+03:00</lastmod>
     <priority>0</priority>
   </url>
   
@@ -207,13 +207,13 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/posts/</loc>
-    <lastmod>2018-09-03T16:47:24+03:00</lastmod>
+    <lastmod>2018-09-04T13:25:13+03:00</lastmod>
     <priority>0</priority>
   </url>
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/tags/</loc>
-    <lastmod>2018-09-03T16:47:24+03:00</lastmod>
+    <lastmod>2018-09-04T13:25:13+03:00</lastmod>
     <priority>0</priority>
   </url>