From 46bdcb7a4573ef090be8bd2e8d0cb6ac33ea4f23 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 9 Aug 2021 17:10:45 +0300 Subject: [PATCH] Add notes for 2021-08-09 --- content/posts/2021-08.md | 32 +++++++++++++++++++++ docs/2021-08/index.html | 38 +++++++++++++++++++++++-- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/sitemap.xml | 10 +++---- 25 files changed, 94 insertions(+), 30 deletions(-) diff --git a/content/posts/2021-08.md b/content/posts/2021-08.md index 71d7cd00d..66d182097 100644 --- a/content/posts/2021-08.md +++ b/content/posts/2021-08.md @@ -178,5 +178,37 @@ $ csvcut -c 'id,cg.issn,cg.issn[],cg.issn[en],cg.issn[en_US],cg.isbn,cg.isbn[],c - Then in OpenRefine I merged all null, blank, and en fields into the `en_US` one for each, removed all spaces, fixed invalid multi-value separators, removed everything other than ISSN/ISBNs themselves - In total it was a few thousand metadata entries or so so I had to split the CSV with `xsv split` in order to process it - I was reminded again how DSpace 6 is very fucking slow when it comes to any database-related operations, as it takes over an hour to process 200 metadata changes... 
+ - In total it was 1,195 changes to ISSN and ISBN metadata fields + +## 2021-08-09 + +- Extract all unique ISSNs to look up on Sherpa Romeo and Crossref + +```console +$ csvcut -c 'cg.issn[en_US]' ~/Downloads/2021-08-08-CGSpace-ISBN-ISSN.csv | csvgrep -c 1 -r '^[0-9]{4}' | sed 1d | sort | uniq > /tmp/2021-08-09-issns.txt +$ ./ilri/sherpa-issn-lookup.py -a mehhhhhhhhhhhhh -i /tmp/2021-08-09-issns.txt -o /tmp/2021-08-09-journals-sherpa-romeo.csv +$ ./ilri/crossref-issn-lookup.py -e me@cgiar.org -i /tmp/2021-08-09-issns.txt -o /tmp/2021-08-09-journals-crossref.csv +``` + +- Then I updated the CSV headers for each and joined the CSVs on the issn column: + +```console +$ sed -i '1s/journal title/sherpa romeo journal title/' /tmp/2021-08-09-journals-sherpa-romeo.csv +$ sed -i '1s/journal title/crossref journal title/' /tmp/2021-08-09-journals-crossref.csv +$ csvjoin -c issn /tmp/2021-08-09-journals-sherpa-romeo.csv /tmp/2021-08-09-journals-crossref.csv > /tmp/2021-08-09-journals-all.csv +``` + +- In OpenRefine I faceted by blank in each column and copied the values from the other, then created a new column to indicate whether the values were the same with this GREL: + +```console +if(cells['sherpa romeo journal title'].value == cells['crossref journal title'].value,"same","different") +``` + +- Then I exported the list of journals that differ and sent it to Peter for comments and corrections + - I want to build an updated controlled vocabulary so I can update CGSpace and reconcile our existing metadata against it +- Convert my `generate-thumbnails.py` script to use libvips instead of GraphicsMagick + - It is faster and uses less memory than GraphicsMagick (and ImageMagick), and produces nice thumbnails from PDFs + - One drawback is that libvips uses Poppler instead of GraphicsMagick, which apparently means that it can't work in CMYK + - I tested one item (10568/51999) that uses CMYK and the thumbnail looked OK (closer to the original than GraphicsMagick), so I'm not 
sure... diff --git a/docs/2021-08/index.html b/docs/2021-08/index.html index ebc91e4f8..b5782c2f9 100644 --- a/docs/2021-08/index.html +++ b/docs/2021-08/index.html @@ -18,7 +18,7 @@ I decided to upgrade linode20 from Ubuntu 18.04 to 20.04 - + @@ -42,9 +42,9 @@ I decided to upgrade linode20 from Ubuntu 18.04 to 20.04 "@type": "BlogPosting", "headline": "August, 2021", "url": "https://alanorth.github.io/cgspace-notes/2021-08/", - "wordCount": "1288", + "wordCount": "1537", "datePublished": "2021-08-01T09:01:07+03:00", - "dateModified": "2021-08-06T09:08:15+03:00", + "dateModified": "2021-08-09T08:38:44+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -295,6 +295,38 @@ Total number of hits from bots: 4492 + + +

2021-08-09

+ +
$ csvcut -c 'cg.issn[en_US]' ~/Downloads/2021-08-08-CGSpace-ISBN-ISSN.csv | csvgrep -c 1 -r '^[0-9]{4}' | sed 1d | sort | uniq > /tmp/2021-08-09-issns.txt
+$ ./ilri/sherpa-issn-lookup.py -a mehhhhhhhhhhhhh -i /tmp/2021-08-09-issns.txt -o /tmp/2021-08-09-journals-sherpa-romeo.csv
+$ ./ilri/crossref-issn-lookup.py -e me@cgiar.org -i /tmp/2021-08-09-issns.txt -o /tmp/2021-08-09-journals-crossref.csv
+
+
$ sed -i '1s/journal title/sherpa romeo journal title/' /tmp/2021-08-09-journals-sherpa-romeo.csv
+$ sed -i '1s/journal title/crossref journal title/' /tmp/2021-08-09-journals-crossref.csv
+$ csvjoin -c issn /tmp/2021-08-09-journals-sherpa-romeo.csv /tmp/2021-08-09-journals-crossref.csv > /tmp/2021-08-09-journals-all.csv
+
+
if(cells['sherpa romeo journal title'].value == cells['crossref journal title'].value,"same","different")
+
diff --git a/docs/categories/index.html b/docs/categories/index.html index 4fa09f197..398c442a5 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 556b420fd..fae4a3ecc 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 5559c9939..6de22e301 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 0243175df..08ee12512 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 11aec734d..571c184c5 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 8f4aecc4f..2e0fe515d 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 7ded0ed70..916d47477 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 97e792e4e..8adf735f8 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 4eadf6e62..38eca8d69 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 142ffe24c..ffad9f8db 100644 --- a/docs/page/4/index.html 
+++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 792ed18ee..03e3c4237 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index b491cc9a6..62df12e4d 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 43c98a304..00f140bda 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index 02baea481..62d113060 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 86a073fd7..e5fa61442 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index f7d7a6563..7e9ae9075 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 9fcc59d92..709953cee 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 6ab2d655b..7cc40a595 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 2b4bb1302..8c4627b73 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 80b87818e..0f0b0c11a 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 
44a9af68f..8f358876c 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index 0492ff413..7d0c04932 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 78e923056..d032d4d6f 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/2021-08/ - 2021-08-06T09:08:15+03:00 + 2021-08-09T08:38:44+03:00 https://alanorth.github.io/cgspace-notes/categories/ - 2021-08-08T17:07:54+03:00 + 2021-08-09T08:38:44+03:00 https://alanorth.github.io/cgspace-notes/ - 2021-08-08T17:07:54+03:00 + 2021-08-09T08:38:44+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2021-08-08T17:07:54+03:00 + 2021-08-09T08:38:44+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2021-08-08T17:07:54+03:00 + 2021-08-09T08:38:44+03:00 https://alanorth.github.io/cgspace-notes/2021-07/ 2021-08-01T16:19:05+03:00