From 856181c13b26dd58499d678cb9b02f273a8167f3 Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Tue, 17 Jan 2017 16:41:23 +0200
Subject: [PATCH] Add notes for 2017-01-17

---
 content/post/2017-01.md     | 33 +++++++++++++++++++++++++++++++
 public/2017-01/index.html   | 39 ++++++++++++++++++++++++++++++++++++-
 public/index.xml            | 37 +++++++++++++++++++++++++++++++++++
 public/post/index.xml       | 37 +++++++++++++++++++++++++++++++++++
 public/tags/notes/index.xml | 37 +++++++++++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 1 deletion(-)
diff --git a/content/post/2017-01.md b/content/post/2017-01.md
index 43a82e45e..f58a6e2ba 100644
--- a/content/post/2017-01.md
+++ b/content/post/2017-01.md
@@ -171,3 +171,36 @@ delete from collection2item where item_id = '80596' and id not in (90792, 90806,
 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */
 delete from collection2item where id = '91082';
 ```
+
+## 2017-01-17
+
+- Helping clean up some file names in the 232 CIAT records that Sisay worked on last week
+- There are about 30 files with `%20` (space) and Spanish accents in the file name
+- At first I thought we should fix these, but actually it is [prescribed by the W3C working group to convert these to UTF-8 and URL-encode them](https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1)!
+- And the file names don't really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore
+- Seems like the only ones I should replace are the `'` apostrophe characters, as `%27`:
+
+```
+value.replace("'",'%27')
+```
+
+- Add the item's Type to the filename column as a hint to SAF Builder so it can set a more useful description field:
+
+```
+value + "__description:" + cells["dc.type"].value
+```
+
+- Test importing of the new CIAT records (actually there are 232, not 234):
+
+```
+$ JAVA_OPTS="-Xmx512m -Dfile.encoding=UTF-8" /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &> /tmp/ciat.log
+```
+
+- Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB
+- These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:
+
+```
+$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+```
diff --git a/public/2017-01/index.html b/public/2017-01/index.html
index 033ed7ba5..ce9f8ece2 100644
--- a/public/2017-01/index.html
+++ b/public/2017-01/index.html
@@ -28,7 +28,7 @@
 
 
 <meta itemprop="dateModified" content="2017-01-02T10:43:00&#43;03:00" />
-<meta itemprop="wordCount" content="884">
+<meta itemprop="wordCount" content="1104">
 
 
 
@@ -301,6 +301,43 @@ delete from collection2item where item_id = '80596' and id not in (90792, 90806,
 delete from collection2item where id = '91082';
 </code></pre>
 
+<h2 id="2017-01-17">2017-01-17</h2>
+
+<ul>
+<li>Helping clean up some file names in the 232 CIAT records that Sisay worked on last week</li>
+<li>There are about 30 files with <code>%20</code> (space) and Spanish accents in the file name</li>
+<li>At first I thought we should fix these, but actually it is <a href="https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1">prescribed by the W3C working group to convert these to UTF-8 and URL-encode them</a>!</li>
+<li>And the file names don&rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore</li>
+<li>Seems like the only ones I should replace are the <code>'</code> apostrophe characters, as <code>%27</code>:</li>
+</ul>
+
+<pre><code>value.replace(&quot;'&quot;,'%27')
+</code></pre>
+
+<ul>
+<li>Add the item&rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:</li>
+</ul>
+
+<pre><code>value + &quot;__description:&quot; + cells[&quot;dc.type&quot;].value
+</code></pre>
+
+<ul>
+<li>Test importing of the new CIAT records (actually there are 232, not 234):</li>
+</ul>
+
+<pre><code>$ JAVA_OPTS=&quot;-Xmx512m -Dfile.encoding=UTF-8&quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;&gt; /tmp/ciat.log
+</code></pre>
+
+<ul>
+<li>Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB</li>
+<li>These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:</li>
+</ul>
+
+<pre><code>$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+</code></pre>
+
 
   
 
diff --git a/public/index.xml b/public/index.xml
index 877b025fd..1a899027e 100644
--- a/public/index.xml
+++ b/public/index.xml
@@ -207,6 +207,43 @@ delete from collection2item where item_id = &#39;80596&#39; and id not in (90792
 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */
 delete from collection2item where id = &#39;91082&#39;;
 &lt;/code&gt;&lt;/pre&gt;
+
+&lt;h2 id=&#34;2017-01-17&#34;&gt;2017-01-17&lt;/h2&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Helping clean up some file names in the 232 CIAT records that Sisay worked on last week&lt;/li&gt;
+&lt;li&gt;There are about 30 files with &lt;code&gt;%20&lt;/code&gt; (space) and Spanish accents in the file name&lt;/li&gt;
+&lt;li&gt;At first I thought we should fix these, but actually it is &lt;a href=&#34;https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1&#34;&gt;prescribed by the W3C working group to convert these to UTF-8 and URL-encode them&lt;/a&gt;!&lt;/li&gt;
+&lt;li&gt;And the file names don&amp;rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore&lt;/li&gt;
+&lt;li&gt;Seems like the only ones I should replace are the &lt;code&gt;&#39;&lt;/code&gt; apostrophe characters, as &lt;code&gt;%27&lt;/code&gt;:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value.replace(&amp;quot;&#39;&amp;quot;,&#39;%27&#39;)
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Add the item&amp;rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value + &amp;quot;__description:&amp;quot; + cells[&amp;quot;dc.type&amp;quot;].value
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Test importing of the new CIAT records (actually there are 232, not 234):&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ JAVA_OPTS=&amp;quot;-Xmx512m -Dfile.encoding=UTF-8&amp;quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;amp;&amp;gt; /tmp/ciat.log
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB&lt;/li&gt;
+&lt;li&gt;These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+&lt;/code&gt;&lt;/pre&gt;
 </description>
     </item>
     
diff --git a/public/post/index.xml b/public/post/index.xml
index 75bd2d4a9..7165cfa86 100644
--- a/public/post/index.xml
+++ b/public/post/index.xml
@@ -207,6 +207,43 @@ delete from collection2item where item_id = &#39;80596&#39; and id not in (90792
 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */
 delete from collection2item where id = &#39;91082&#39;;
 &lt;/code&gt;&lt;/pre&gt;
+
+&lt;h2 id=&#34;2017-01-17&#34;&gt;2017-01-17&lt;/h2&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Helping clean up some file names in the 232 CIAT records that Sisay worked on last week&lt;/li&gt;
+&lt;li&gt;There are about 30 files with &lt;code&gt;%20&lt;/code&gt; (space) and Spanish accents in the file name&lt;/li&gt;
+&lt;li&gt;At first I thought we should fix these, but actually it is &lt;a href=&#34;https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1&#34;&gt;prescribed by the W3C working group to convert these to UTF-8 and URL-encode them&lt;/a&gt;!&lt;/li&gt;
+&lt;li&gt;And the file names don&amp;rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore&lt;/li&gt;
+&lt;li&gt;Seems like the only ones I should replace are the &lt;code&gt;&#39;&lt;/code&gt; apostrophe characters, as &lt;code&gt;%27&lt;/code&gt;:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value.replace(&amp;quot;&#39;&amp;quot;,&#39;%27&#39;)
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Add the item&amp;rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value + &amp;quot;__description:&amp;quot; + cells[&amp;quot;dc.type&amp;quot;].value
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Test importing of the new CIAT records (actually there are 232, not 234):&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ JAVA_OPTS=&amp;quot;-Xmx512m -Dfile.encoding=UTF-8&amp;quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;amp;&amp;gt; /tmp/ciat.log
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB&lt;/li&gt;
+&lt;li&gt;These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+&lt;/code&gt;&lt;/pre&gt;
 </description>
     </item>
     
diff --git a/public/tags/notes/index.xml b/public/tags/notes/index.xml
index 9ab63a25e..739b7a720 100644
--- a/public/tags/notes/index.xml
+++ b/public/tags/notes/index.xml
@@ -206,6 +206,43 @@ delete from collection2item where item_id = &#39;80596&#39; and id not in (90792
 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */
 delete from collection2item where id = &#39;91082&#39;;
 &lt;/code&gt;&lt;/pre&gt;
+
+&lt;h2 id=&#34;2017-01-17&#34;&gt;2017-01-17&lt;/h2&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Helping clean up some file names in the 232 CIAT records that Sisay worked on last week&lt;/li&gt;
+&lt;li&gt;There are about 30 files with &lt;code&gt;%20&lt;/code&gt; (space) and Spanish accents in the file name&lt;/li&gt;
+&lt;li&gt;At first I thought we should fix these, but actually it is &lt;a href=&#34;https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1&#34;&gt;prescribed by the W3C working group to convert these to UTF-8 and URL-encode them&lt;/a&gt;!&lt;/li&gt;
+&lt;li&gt;And the file names don&amp;rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore&lt;/li&gt;
+&lt;li&gt;Seems like the only ones I should replace are the &lt;code&gt;&#39;&lt;/code&gt; apostrophe characters, as &lt;code&gt;%27&lt;/code&gt;:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value.replace(&amp;quot;&#39;&amp;quot;,&#39;%27&#39;)
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Add the item&amp;rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;value + &amp;quot;__description:&amp;quot; + cells[&amp;quot;dc.type&amp;quot;].value
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Test importing of the new CIAT records (actually there are 232, not 234):&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ JAVA_OPTS=&amp;quot;-Xmx512m -Dfile.encoding=UTF-8&amp;quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;amp;&amp;gt; /tmp/ciat.log
+&lt;/code&gt;&lt;/pre&gt;
+
+&lt;ul&gt;
+&lt;li&gt;Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB&lt;/li&gt;
+&lt;li&gt;These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;pre&gt;&lt;code&gt;$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+&lt;/code&gt;&lt;/pre&gt;
 </description>
     </item>