diff --git a/content/posts/2021-06.md b/content/posts/2021-06.md index c5394dcee..b4f41ce93 100644 --- a/content/posts/2021-06.md +++ b/content/posts/2021-06.md @@ -85,4 +85,35 @@ elasticdump --input=/home/aorth/openrxv-items_data.json --output=http://localhos $ podman unshare chown 1000:1000 /home/aorth/.local/share/containers/storage/volumes/docker_esData_7/_data ``` +- The new OpenRXV harvesting method by Moayad uses pages of 10 items instead of 100 and it's much faster + - I harvested 90,000+ items from DSpace Test in ~3 hours + - There seem to be some issues with the health check step though + +## 2021-06-17 + +- I ported my ilri/resolve-addresses.py script from using the IPAPI.co API to using the local GeoIP2 databases + - The new script is ilri/resolve-addresses-geoip2.py and it is much faster and works offline with no API rate limits +- Teams meeting with the CGIAR Metadata Working Group to discuss CGSpace and open repositories and the way forward +- More work with Moayad on OpenRXV harvesting issues + - Using a JSON export from elasticdump we debugged the duplicate checker plugin and found that there are indeed duplicates: + +```console +$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | wc -l +90459 +$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | sort | uniq | wc -l +90380 +$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | sort | uniq -c | sort -h +... 
+ 2 "10568/99409" + 2 "10568/99410" + 2 "10568/99411" + 2 "10568/99516" + 3 "10568/102093" + 3 "10568/103524" + 3 "10568/106664" + 3 "10568/106940" + 3 "10568/107195" + 3 "10568/96546" +``` + diff --git a/docs/2021-06/index.html b/docs/2021-06/index.html index ca6d5e00d..025169ac7 100644 --- a/docs/2021-06/index.html +++ b/docs/2021-06/index.html @@ -20,7 +20,7 @@ I simply started it and AReS was running again: - + @@ -46,9 +46,9 @@ I simply started it and AReS was running again: "@type": "BlogPosting", "headline": "June, 2021", "url": "https://alanorth.github.io/cgspace-notes/2021-06/", - "wordCount": "627", + "wordCount": "817", "datePublished": "2021-06-01T10:51:07+03:00", - "dateModified": "2021-06-14T15:09:07+03:00", + "dateModified": "2021-06-16T18:31:15+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -209,6 +209,44 @@ elasticdump --input=/home/aorth/openrxv-items_data.json --output=http://localhos
$ podman unshare chown 1000:1000 /home/aorth/.local/share/containers/storage/volumes/docker_esData_7/_data
+
+

2021-06-17

+ +
$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | wc -l
+90459
+$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | sort | uniq | wc -l
+90380
+$ grep -oE '"handle":"[[:digit:]]+/[[:digit:]]+"' openrxv-items_data.json | awk -F: '{print $2}' | sort | uniq -c | sort -h
+...
+      2 "10568/99409"
+      2 "10568/99410"
+      2 "10568/99411"
+      2 "10568/99516"
+      3 "10568/102093"
+      3 "10568/103524"
+      3 "10568/106664"
+      3 "10568/106940"
+      3 "10568/107195"
+      3 "10568/96546"
 
diff --git a/docs/categories/index.html b/docs/categories/index.html index d4560cb1b..a50e4970f 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index ab6f3e8d5..c0df3320a 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index b6788b861..a093b205c 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index b52bb83cc..ad69bd0fc 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index ed239c8dd..08c1ef445 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 4bf88c868..5eb4b2a67 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index f5b38e687..6a58929b0 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index bab75bbba..b644cf2be 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 4474fa79e..48b5d06cd 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index a6feb840e..c029300cf 100644 --- a/docs/page/4/index.html 
+++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 71f167c96..0213ad306 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index b7df5cd3c..aa9580600 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 52333d1a1..294cfc289 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index fc09cb62e..f1208897f 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index a7c3a35ea..a4ca2bf27 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index abc00dc6b..a9e8e1792 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index ab332783d..44ca45b4d 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index de276e530..bf6010f00 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 0ea551a45..89773c1fb 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 62093e967..cfa8a8c99 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 
2fe1d161f..cefc3ee7e 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index 7d8824a5c..1796c2111 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 7609d5e8c..6d1f9620c 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2021-06-14T15:09:07+03:00 + 2021-06-16T18:31:15+03:00 https://alanorth.github.io/cgspace-notes/ - 2021-06-14T15:09:07+03:00 + 2021-06-16T18:31:15+03:00 https://alanorth.github.io/cgspace-notes/2021-06/ - 2021-06-14T15:09:07+03:00 + 2021-06-16T18:31:15+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2021-06-14T15:09:07+03:00 + 2021-06-16T18:31:15+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2021-06-14T15:09:07+03:00 + 2021-06-16T18:31:15+03:00 https://alanorth.github.io/cgspace-notes/2021-05/ 2021-05-30T22:09:06+03:00