From 856181c13b26dd58499d678cb9b02f273a8167f3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 17 Jan 2017 16:41:23 +0200 Subject: [PATCH] Add notes for 2017-01-17 --- content/post/2017-01.md | 33 +++++++++++++++++++++++++++++++ public/2017-01/index.html | 39 ++++++++++++++++++++++++++++++++++++- public/index.xml | 37 +++++++++++++++++++++++++++++++++++ public/post/index.xml | 37 +++++++++++++++++++++++++++++++++++ public/tags/notes/index.xml | 37 +++++++++++++++++++++++++++++++++++ 5 files changed, 182 insertions(+), 1 deletion(-) diff --git a/content/post/2017-01.md b/content/post/2017-01.md index 43a82e45e..f58a6e2ba 100644 --- a/content/post/2017-01.md +++ b/content/post/2017-01.md @@ -171,3 +171,36 @@ delete from collection2item where item_id = '80596' and id not in (90792, 90806, /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */ delete from collection2item where id = '91082'; ``` + +## 2017-01-17 + +- Helping clean up some file names in the 232 CIAT records that Sisay worked on last week +- There are about 30 files with `%20` (space) and Spanish accents in the file name +- At first I thought we should fix these, but actually it is [prescribed by the W3 working group to convert these to UTF8 and URL encode them](https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1)! 
+- And the file names don't really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore +- Seems like the only ones I should replace are the `'` apostrophe characters, as `%27`: + +``` +value.replace("'",'%27') +``` + +- Add the item's Type to the filename column as a hint to SAF Builder so it can set a more useful description field: + +``` +value + "__description:" + cells["dc.type"].value +``` + +- Test importing of the new CIAT records (actually there are 232, not 234): + +``` +$ JAVA_OPTS="-Xmx512m -Dfile.encoding=UTF-8" /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568 +/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &> /tmp/ciat.log +``` + +- Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB +- These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much: + +``` +$ convert -compress Zip -density 150x150 input.pdf output.pdf +$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf +``` diff --git a/public/2017-01/index.html b/public/2017-01/index.html index 033ed7ba5..ce9f8ece2 100644 --- a/public/2017-01/index.html +++ b/public/2017-01/index.html @@ -28,7 +28,7 @@ - + @@ -301,6 +301,43 @@ delete from collection2item where item_id = '80596' and id not in (90792, 90806, delete from collection2item where id = '91082'; +

2017-01-17

+ + + +
value.replace("'",'%27')
+
+ + + +
value + "__description:" + cells["dc.type"].value
+
+ + + +
$ JAVA_OPTS="-Xmx512m -Dfile.encoding=UTF-8" /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568
+/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &> /tmp/ciat.log
+
+ + + +
$ convert -compress Zip -density 150x150 input.pdf output.pdf
+$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf
+
+ diff --git a/public/index.xml b/public/index.xml index 877b025fd..1a899027e 100644 --- a/public/index.xml +++ b/public/index.xml @@ -207,6 +207,43 @@ delete from collection2item where item_id = '80596' and id not in (90792 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */ delete from collection2item where id = '91082'; </code></pre> + +<h2 id="2017-01-17">2017-01-17</h2> + +<ul> +<li>Helping clean up some file names in the 232 CIAT records that Sisay worked on last week</li> +<li>There are about 30 files with <code>%20</code> (space) and Spanish accents in the file name</li> +<li>At first I thought we should fix these, but actually it is <a href="https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1">prescribed by the W3 working group to convert these to UTF8 and URL encode them</a>!</li> +<li>And the file names don&rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore</li> +<li>Seems like the only ones I should replace are the <code>'</code> apostrophe characters, as <code>%27</code>:</li> +</ul> + +<pre><code>value.replace(&quot;'&quot;,'%27') +</code></pre> + +<ul> +<li>Add the item&rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:</li> +</ul> + +<pre><code>value + &quot;__description:&quot; + cells[&quot;dc.type&quot;].value +</code></pre> + +<ul> +<li>Test importing of the new CIAT records (actually there are 232, not 234):</li> +</ul> + +<pre><code>$ JAVA_OPTS=&quot;-Xmx512m -Dfile.encoding=UTF-8&quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568 +/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;&gt; /tmp/ciat.log +</code></pre> + +<ul> +<li>Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB</li> +<li>These are scanned from paper and likely have no compression, so we should try to test if 
these compression techniques help without compromising the quality too much:</li> +</ul> + +<pre><code>$ convert -compress Zip -density 150x150 input.pdf output.pdf +$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf +</code></pre> diff --git a/public/post/index.xml b/public/post/index.xml index 75bd2d4a9..7165cfa86 100644 --- a/public/post/index.xml +++ b/public/post/index.xml @@ -207,6 +207,43 @@ delete from collection2item where item_id = '80596' and id not in (90792 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */ delete from collection2item where id = '91082'; </code></pre> + +<h2 id="2017-01-17">2017-01-17</h2> + +<ul> +<li>Helping clean up some file names in the 232 CIAT records that Sisay worked on last week</li> +<li>There are about 30 files with <code>%20</code> (space) and Spanish accents in the file name</li> +<li>At first I thought we should fix these, but actually it is <a href="https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1">prescribed by the W3 working group to convert these to UTF8 and URL encode them</a>!</li> +<li>And the file names don&rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore</li> +<li>Seems like the only ones I should replace are the <code>'</code> apostrophe characters, as <code>%27</code>:</li> +</ul> + +<pre><code>value.replace(&quot;'&quot;,'%27') +</code></pre> + +<ul> +<li>Add the item&rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful description field:</li> +</ul> + +<pre><code>value + &quot;__description:&quot; + cells[&quot;dc.type&quot;].value +</code></pre> + +<ul> +<li>Test importing of the new CIAT records (actually there are 232, not 234):</li> +</ul> + +<pre><code>$ JAVA_OPTS=&quot;-Xmx512m -Dfile.encoding=UTF-8&quot; /home/dspacetest.cgiar.org/bin/dspace import --add 
--eperson=aorth@mjanja.ch --collection=10568 +/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;&gt; /tmp/ciat.log +</code></pre> + +<ul> +<li>Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB</li> +<li>These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:</li> +</ul> + +<pre><code>$ convert -compress Zip -density 150x150 input.pdf output.pdf +$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf +</code></pre> diff --git a/public/tags/notes/index.xml b/public/tags/notes/index.xml index 9ab63a25e..739b7a720 100644 --- a/public/tags/notes/index.xml +++ b/public/tags/notes/index.xml @@ -206,6 +206,43 @@ delete from collection2item where item_id = '80596' and id not in (90792 /* 1 incorrect mapping: https://cgspace.cgiar.org/handle/10568/78658 */ delete from collection2item where id = '91082'; </code></pre> + +<h2 id="2017-01-17">2017-01-17</h2> + +<ul> +<li>Helping clean up some file names in the 232 CIAT records that Sisay worked on last week</li> +<li>There are about 30 files with <code>%20</code> (space) and Spanish accents in the file name</li> +<li>At first I thought we should fix these, but actually it is <a href="https://www.w3.org/TR/html4/appendix/notes.html#h-B.2.1">prescribed by the W3 working group to convert these to UTF8 and URL encode them</a>!</li> +<li>And the file names don&rsquo;t really matter either, as long as the SAF Builder tool can read them—after that DSpace renames them with a hash in the assetstore</li> +<li>Seems like the only ones I should replace are the <code>'</code> apostrophe characters, as <code>%27</code>:</li> +</ul> + +<pre><code>value.replace(&quot;'&quot;,'%27') +</code></pre> + +<ul> +<li>Add the item&rsquo;s Type to the filename column as a hint to SAF Builder so it can set a more useful 
description field:</li> +</ul> + +<pre><code>value + &quot;__description:&quot; + cells[&quot;dc.type&quot;].value +</code></pre> + +<ul> +<li>Test importing of the new CIAT records (actually there are 232, not 234):</li> +</ul> + +<pre><code>$ JAVA_OPTS=&quot;-Xmx512m -Dfile.encoding=UTF-8&quot; /home/dspacetest.cgiar.org/bin/dspace import --add --eperson=aorth@mjanja.ch --collection=10568 +/79042 --source /home/aorth/CIAT_234/SimpleArchiveFormat/ --mapfile=/tmp/ciat.map &amp;&gt; /tmp/ciat.log +</code></pre> + +<ul> +<li>Many of the PDFs are 20, 30, 40, 50+ MB, which makes a total of 4GB</li> +<li>These are scanned from paper and likely have no compression, so we should try to test if these compression techniques help without compromising the quality too much:</li> +</ul> + +<pre><code>$ convert -compress Zip -density 150x150 input.pdf output.pdf +$ gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output.pdf input.pdf +</code></pre>