From a386dda3f7a365d316901287abda3fc82c821ef8 Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Thu, 16 Jan 2020 12:49:21 +0200
Subject: [PATCH] Add notes for 2020-01-16

---
 content/posts/2020-01.md | 18 ++++++++++++++++++
 docs/2020-01/index.html  | 30 ++++++++++++++++++++++++++----
 docs/sitemap.xml         | 10 +++++-----
 3 files changed, 49 insertions(+), 9 deletions(-)
diff --git a/content/posts/2020-01.md b/content/posts/2020-01.md
index f29bffe82..35a57974f 100644
--- a/content/posts/2020-01.md
+++ b/content/posts/2020-01.md
@@ -112,4 +112,22 @@ COPY 1325
 $ ./fix-metadata-values.py -i 2020-01-15-fix-8-ilri-subjects.csv -db dspace -u dspace -p 'fuuu' -f cg.subject.ilri -m 203 -t correct -d
 ```
 
+## 2020-01-16
+
+- Extract a list of CIAT subjects from CGSpace for Elizabeth Arnaud from Bioversity:
+
+```
+dspace=# \COPY (SELECT DISTINCT text_value as "cg.subject.ciat", count(*) FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 122 GROUP BY text_value ORDER BY count DESC) to /tmp/2020-01-16-ciat-subjects.csv WITH CSV HEADER;
+COPY 35
+```
+
+- Start examining the 175 IITA records that Bosede originally sent in October, 2019 (201907.xls)
+  - We had delayed processing them because DSpace Test (linode19) was testing CG Core v2 implementation for the last few months
+  - Sisay uploaded the records to DSpace Test as [IITA_201907_Jan13](https://dspacetest.cgiar.org/handle/10568/106567)
+  - I started first with basic sanity checks using my csv-metadata-quality tool and found twenty-two items with extra whitespace, invalid multi-value separators, and duplicates, which means Sisay did not do any quality checking on the data
+  - I corrected one invalid AGROVOC subject
+  - Validate and normalize affiliations against our 2019-04 list using reconcile-csv and OpenRefine:
+    - `$ lein run ~/src/git/DSpace/2019-04-08-affiliations.csv name id`
+    - I always forget how to copy the reconciled values in OpenRefine, but you need to make a new column and populate it using this GREL: `if(cell.recon.matched, cell.recon.match.name, value)`
+
 <!-- vim: set sw=2 ts=2: -->
diff --git a/docs/2020-01/index.html b/docs/2020-01/index.html
index 38abb0d2f..2d6e897dd 100644
--- a/docs/2020-01/index.html
+++ b/docs/2020-01/index.html
@@ -29,7 +29,7 @@ I tweeted the CGSpace repository link
 <meta property="og:type" content="article" />
 <meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2020-01/" />
 <meta property="article:published_time" content="2020-01-06T10:48:30+02:00" />
-<meta property="article:modified_time" content="2020-01-14T20:40:41+02:00" />
+<meta property="article:modified_time" content="2020-01-15T13:51:35+02:00" />
 
 <meta name="twitter:card" content="summary"/>
 <meta name="twitter:title" content="January, 2020"/>
@@ -63,9 +63,9 @@ I tweeted the CGSpace repository link
   "@type": "BlogPosting",
   "headline": "January, 2020",
   "url": "https:\/\/alanorth.github.io\/cgspace-notes\/2020-01\/",
-  "wordCount": "765",
+  "wordCount": "941",
   "datePublished": "2020-01-06T10:48:30+02:00",
-  "dateModified": "2020-01-14T20:40:41+02:00",
+  "dateModified": "2020-01-15T13:51:35+02:00",
   "author": {
     "@type": "Person",
     "name": "Alan Orth"
@@ -236,7 +236,29 @@ COPY 1325
 <li>I noticed a few errors in the ILRI subjects so I fixed them locally and on CGSpace (linode18) using my <code>fix-metadata.py</code> script:</li>
 </ul>
 <pre><code>$ ./fix-metadata-values.py -i 2020-01-15-fix-8-ilri-subjects.csv -db dspace -u dspace -p 'fuuu' -f cg.subject.ilri -m 203 -t correct -d
-</code></pre><!-- raw HTML omitted -->
+</code></pre><h2 id="2020-01-16">2020-01-16</h2>
+<ul>
+<li>Extract a list of CIAT subjects from CGSpace for Elizabeth Arnaud from Bioversity:</li>
+</ul>
+<pre><code>dspace=# \COPY (SELECT DISTINCT text_value as &quot;cg.subject.ciat&quot;, count(*) FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 122 GROUP BY text_value ORDER BY count DESC) to /tmp/2020-01-16-ciat-subjects.csv WITH CSV HEADER;
+COPY 35
+</code></pre><ul>
+<li>Start examining the 175 IITA records that Bosede originally sent in October, 2019 (201907.xls)
+<ul>
+<li>We had delayed processing them because DSpace Test (linode19) was testing CG Core v2 implementation for the last few months</li>
+<li>Sisay uploaded the records to DSpace Test as <a href="https://dspacetest.cgiar.org/handle/10568/106567">IITA_201907_Jan13</a></li>
+<li>I started first with basic sanity checks using my csv-metadata-quality tool and found twenty-two items with extra whitespace, invalid multi-value separators, and duplicates, which means Sisay did not do any quality checking on the data</li>
+<li>I corrected one invalid AGROVOC subject</li>
+<li>Validate and normalize affiliations against our 2019-04 list using reconcile-csv and OpenRefine:
+<ul>
+<li><code>$ lein run ~/src/git/DSpace/2019-04-08-affiliations.csv name id</code></li>
+<li>I always forget how to copy the reconciled values in OpenRefine, but you need to make a new column and populate it using this GREL: <code>if(cell.recon.matched, cell.recon.match.name, value)</code></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+<!-- raw HTML omitted -->
 
   
 
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 66194b81b..af2dbe7f7 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -4,27 +4,27 @@
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/categories/</loc>
-    <lastmod>2020-01-14T20:40:41+02:00</lastmod>
+    <lastmod>2020-01-15T13:51:35+02:00</lastmod>
   </url>
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/</loc>
-    <lastmod>2020-01-14T20:40:41+02:00</lastmod>
+    <lastmod>2020-01-15T13:51:35+02:00</lastmod>
   </url>
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/2020-01/</loc>
-    <lastmod>2020-01-14T20:40:41+02:00</lastmod>
+    <lastmod>2020-01-15T13:51:35+02:00</lastmod>
   </url>
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/categories/notes/</loc>
-    <lastmod>2020-01-14T20:40:41+02:00</lastmod>
+    <lastmod>2020-01-15T13:51:35+02:00</lastmod>
   </url>
   
   <url>
     <loc>https://alanorth.github.io/cgspace-notes/posts/</loc>
-    <lastmod>2020-01-14T20:40:41+02:00</lastmod>
+    <lastmod>2020-01-15T13:51:35+02:00</lastmod>
   </url>
   
   <url>