diff --git a/content/posts/2022-12.md b/content/posts/2022-12.md
index 298149fc3..e2c5e1825 100644
--- a/content/posts/2022-12.md
+++ b/content/posts/2022-12.md
@@ -25,4 +25,42 @@ categories: ["Notes"]
- [Set the description when submitting bitstreams to CGSpace](https://github.com/CodeObia/MEL/issues/11067)
- [Some items have a Creative Commons license, but are Limited Access and bitstreams are locked](https://github.com/CodeObia/MEL/issues/11068)
+## 2022-12-03
+
+- I downloaded a fresh copy of CLARISA's institutions list as well as ROR's latest dump from 2022-12-01 to check how many are matching:
+
+```console
+$ curl -s https://api.clarisa.cgiar.org/api/institutions | json_pp > ~/Downloads/2022-12-03-CLARISA-institutions.json
+$ jq -r '.[] | .name' ~/Downloads/2022-12-03-CLARISA-institutions.json > ~/Downloads/2022-12-03-CLARISA-institutions.txt
+$ ./ilri/ror-lookup.py -i ~/Downloads/2022-12-03-CLARISA-institutions.txt -o /tmp/clarisa-ror-matches.csv -r v1.15-2022-12-01-ror-data.json
+$ csvgrep -c matched -m true /tmp/clarisa-ror-matches.csv | wc -l
+1864
+$ wc -l ~/Downloads/2022-12-03-CLARISA-institutions.txt
+7060 /home/aorth/Downloads/2022-12-03-CLARISA-institutions.txt
+```
+
+- Out of the box they match 26.4%, but there are many institutions with multiple languages in the text value, as well as countries in parentheses, so I think it could be higher
+- If I replace the slashes and remove the countries at the end there are slightly more matches, around 29%:
+
+```console
+$ sed -e 's_ / _\n_' -e 's_/_\n_' -e 's/ \?(.*)$//' ~/Downloads/2022-12-03-CLARISA-institutions.txt > ~/Downloads/2022-12-03-CLARISA-institutions-alan.txt
+```
+
+- I checked CGSpace's top 1,000 institutions too, first exporting from PostgreSQL:
+
+```console
+localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC LIMIT 1000) to /tmp/2022-11-22-affiliations.csv;
+```
+
+- Then cutting (tab is the default delimiter):
+
+```console
+$ cut -f 1 /tmp/2022-11-22-affiliations.csv > 2022-11-22-affiliations.txt
+$ ./ilri/ror-lookup.py -i 2022-11-22-affiliations.txt -o /tmp/cgspace-matches.csv -r v1.15-2022-12-01-ror-data.json
+$ csvgrep -c matched -m true /tmp/cgspace-matches.csv | wc -l
+542
+```
+
+- So that's a 54% match for our top institutions
+
diff --git a/docs/2022-11/index.html b/docs/2022-11/index.html
index 2ecefbd0a..b39c890cf 100644
--- a/docs/2022-11/index.html
+++ b/docs/2022-11/index.html
@@ -24,7 +24,7 @@ I reverted the Cocoon autosave change because it was more of a nuissance that Pe
-
+
@@ -56,7 +56,7 @@ I reverted the Cocoon autosave change because it was more of a nuissance that Pe
"url": "https://alanorth.github.io/cgspace-notes/2022-11/",
"wordCount": "3414",
"datePublished": "2022-11-01T09:11:36+03:00",
- "dateModified": "2022-11-30T18:21:20+03:00",
+ "dateModified": "2022-12-03T10:46:29+03:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
diff --git a/docs/2022-12/index.html b/docs/2022-12/index.html
index df3b07b6c..e1c991fac 100644
--- a/docs/2022-12/index.html
+++ b/docs/2022-12/index.html
@@ -20,7 +20,7 @@ Replace “East Asia” with “Eastern Asia” region on CGSpac
-
+
@@ -46,9 +46,9 @@ Replace “East Asia” with “Eastern Asia” region on CGSpac
"@type": "BlogPosting",
"headline": "December, 2022",
"url": "https://alanorth.github.io/cgspace-notes/2022-12/",
- "wordCount": "159",
+ "wordCount": "376",
"datePublished": "2022-12-01T08:52:36+03:00",
- "dateModified": "2022-12-01T08:52:36+03:00",
+ "dateModified": "2022-12-03T10:46:29+03:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -147,6 +147,36 @@ Replace “East Asia” with “Eastern Asia” region on CGSpac
+
2022-12-03
+
+- I downloaded a fresh copy of CLARISA’s institutions list as well as ROR’s latest dump from 2022-12-01 to check how many are matching:
+
+$ curl -s https://api.clarisa.cgiar.org/api/institutions | json_pp > ~/Downloads/2022-12-03-CLARISA-institutions.json
+$ jq -r '.[] | .name' ~/Downloads/2022-12-03-CLARISA-institutions.json > ~/Downloads/2022-12-03-CLARISA-institutions.txt
+$ ./ilri/ror-lookup.py -i ~/Downloads/2022-12-03-CLARISA-institutions.txt -o /tmp/clarisa-ror-matches.csv -r v1.15-2022-12-01-ror-data.json
+$ csvgrep -c matched -m true /tmp/clarisa-ror-matches.csv | wc -l
+1864
+$ wc -l ~/Downloads/2022-12-03-CLARISA-institutions.txt
+7060 /home/aorth/Downloads/2022-12-03-CLARISA-institutions.txt
+
+- Out of the box they match 26.4%, but there are many institutions with multiple languages in the text value, as well as countries in parentheses, so I think it could be higher
+- If I replace the slashes and remove the countries at the end there are slightly more matches, around 29%:
+
+$ sed -e 's_ / _\n_' -e 's_/_\n_' -e 's/ \?(.*)$//' ~/Downloads/2022-12-03-CLARISA-institutions.txt > ~/Downloads/2022-12-03-CLARISA-institutions-alan.txt
+
+- I checked CGSpace’s top 1,000 institutions too, first exporting from PostgreSQL:
+
+localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC LIMIT 1000) to /tmp/2022-11-22-affiliations.csv;
+
+- Then cutting (tab is the default delimiter):
+
+$ cut -f 1 /tmp/2022-11-22-affiliations.csv > 2022-11-22-affiliations.txt
+$ ./ilri/ror-lookup.py -i 2022-11-22-affiliations.txt -o /tmp/cgspace-matches.csv -r v1.15-2022-12-01-ror-data.json
+$ csvgrep -c matched -m true /tmp/cgspace-matches.csv | wc -l
+542
+
+- So that’s a 54% match for our top institutions
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index bdceb410f..9b849ac1c 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index f968b5f02..3ea31043e 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index b8a384f2d..901915776 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index 968138bb5..19f8f0f33 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index 1cf25177e..40a38c196 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html
index 581ec46bf..3b493984c 100644
--- a/docs/categories/notes/page/5/index.html
+++ b/docs/categories/notes/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html
index 0b126d51b..d87cf5cee 100644
--- a/docs/categories/notes/page/6/index.html
+++ b/docs/categories/notes/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html
index 1eafc5154..116a2dc85 100644
--- a/docs/categories/notes/page/7/index.html
+++ b/docs/categories/notes/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index 3cfe3e9d3..0df7aa0d4 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index 222fcbb41..70c6cbca9 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index 788529a77..14e58fd74 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index 29cf8f409..3deb0e556 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index ed3261745..4eed5db1e 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index 1c148ea4b..341a2ee17 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/7/index.html b/docs/page/7/index.html
index 2d7247697..f75b79305 100644
--- a/docs/page/7/index.html
+++ b/docs/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/8/index.html b/docs/page/8/index.html
index f8db332ab..09c090634 100644
--- a/docs/page/8/index.html
+++ b/docs/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/9/index.html b/docs/page/9/index.html
index 85fd1e16f..06866be86 100644
--- a/docs/page/9/index.html
+++ b/docs/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index f44640299..f1fb94b6c 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index 3beb3c00a..59cb12efc 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index fa693b156..5c79727ee 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index a89a88d60..537d5f814 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index 80926bb39..5095d567d 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index 6a7567ab0..f27f235e1 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html
index d03d9ac51..653e4bd87 100644
--- a/docs/posts/page/7/index.html
+++ b/docs/posts/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html
index b911ef0af..636163f5c 100644
--- a/docs/posts/page/8/index.html
+++ b/docs/posts/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html
index c2dcec5bb..92e141144 100644
--- a/docs/posts/page/9/index.html
+++ b/docs/posts/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index d1a5c84f8..cc7c66b25 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -3,22 +3,22 @@
xmlns:xhtml="http://www.w3.org/1999/xhtml">
https://alanorth.github.io/cgspace-notes/categories/
- 2022-12-01T08:52:36+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/
- 2022-12-01T08:52:36+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/2022-12/
- 2022-12-01T08:52:36+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2022-12-01T08:52:36+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/posts/
- 2022-12-01T08:52:36+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/2022-11/
- 2022-11-30T18:21:20+03:00
+ 2022-12-03T10:46:29+03:00
https://alanorth.github.io/cgspace-notes/2022-10/
2022-10-31T16:59:47+03:00