diff --git a/content/posts/2022-06.md b/content/posts/2022-06.md
index 1e572bc47..eb3ed8f54 100644
--- a/content/posts/2022-06.md
+++ b/content/posts/2022-06.md
@@ -145,4 +145,55 @@ $ grep -c 'Adding ORCID' /tmp/orcids2.log
- Meeting with Salem to discuss metadata between CGSpace and MEL
- We started working through his spreadsheet and then the Internet dropped
+## 2022-06-23
+
+- Start looking at country names between MEL, CGSpace, and standards like UN M.49 and GeoNames
+ - I used `xmllint` to extract the countries from CGSpace's input forms:
+
+```console
+$ xmllint --xpath '//value-pairs[@value-pairs-name="countrylist"]/pair/stored-value/node()' dspace/config/input-forms.xml > /tmp/cgspace-countries.txt
+```
+
+- Then I wrote a Python script (`countries-to-csv.py`) to read them and save their names alongside the ISO 3166-1 alpha-2 code
+- Then I joined them with the other lists:
+
+```console
+$ csvjoin --outer -c alpha2 ~/Downloads/clarisa-countries.csv ~/Downloads/UNSD\ —\ Methodology.csv ~/Downloads/geonames-countries.csv /tmp/cgspace-countries.csv /tmp/mel-countries.csv> /tmp/countries.csv
+```
+
+- This mostly worked fine, and is much easier than writing another Python script with Pandas...
+
+## 2022-06-24
+
+- Spent some more time working on my `countries-to-csv.py` script to fix some logic errors
+- Then re-export the UN M.49 countries to a clean list because the one I did yesterday somehow has errors:
+
+```console
+$ csvcut -d ';' -c 'ISO-alpha2 Code,Country or Area' ~/Downloads/UNSD\ —\ Methodology.csv | sed -e '1s/ISO-alpha2 Code/alpha2/' -e '1s/Country or Area/UN M.49 Name/' > ~/Downloads/un-countries.csv
+```
+
+- Check the number of lines in each file:
+
+```console
+$ wc -l clarisa-countries.csv un-countries.csv cgspace-countries.csv mel-countries.csv
+ 250 clarisa-countries.csv
+ 250 un-countries.csv
+ 198 cgspace-countries.csv
+ 258 mel-countries.csv
+```
+
+- I am seeing strange results with csvjoin's `--outer` join, which I need in order to keep unmatched terms from both the left and right files...
+ - Using `xsv join --full` is giving me better results:
+
+```console
+$ xsv join --full alpha2 ~/Downloads/clarisa-countries.csv alpha2 ~/Downloads/un-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-xsv-full.csv
+```
+
+- Then adding the CGSpace and MEL countries:
+
+```console
+$ xsv join --full alpha2 /tmp/clarisa-un-xsv-full.csv alpha2 /tmp/cgspace-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-xsv-full.csv
+$ xsv join --full alpha2 /tmp/clarisa-un-cgspace-xsv-full.csv alpha2 /tmp/mel-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-mel-xsv-full.csv
+```
+
diff --git a/docs/2022-06/index.html b/docs/2022-06/index.html
index c156fe59c..7eda4854a 100644
--- a/docs/2022-06/index.html
+++ b/docs/2022-06/index.html
@@ -26,7 +26,7 @@ There seem to be many more of these:
-
+
@@ -58,9 +58,9 @@ There seem to be many more of these:
"@type": "BlogPosting",
"headline": "June, 2022",
"url": "https://alanorth.github.io/cgspace-notes/2022-06/",
- "wordCount": "939",
+ "wordCount": "1190",
"datePublished": "2022-06-06T09:01:36+03:00",
- "dateModified": "2022-06-21T16:59:04+03:00",
+ "dateModified": "2022-06-23T08:40:53+03:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -299,7 +299,51 @@ There seem to be many more of these:
-
+
2022-06-23
+
+- Start looking at country names between MEL, CGSpace, and standards like UN M.49 and GeoNames
+
+- I used
xmllint
to extract the countries from CGSpace’s input forms:
+
+
+
+$ xmllint --xpath '//value-pairs[@value-pairs-name="countrylist"]/pair/stored-value/node()' dspace/config/input-forms.xml > /tmp/cgspace-countries.txt
+
+- Then I wrote a Python script (
countries-to-csv.py
) to read them and save their names alongside the ISO 3166-1 Alpha2 code
+- Then I joined them with the other lists:
+
+$ csvjoin --outer -c alpha2 ~/Downloads/clarisa-countries.csv ~/Downloads/UNSD\ —\ Methodology.csv ~/Downloads/geonames-countries.csv /tmp/cgspace-countries.csv /tmp/mel-countries.csv> /tmp/countries.csv
+
+- This mostly worked fine, and is much easier than writing another Python script with Pandas…
+
+2022-06-24
+
+- Spent some more time working on my
countries-to-csv.py
script to fix some logic errors
+- Then re-export the UN M.49 countries to a clean list because the one I did yesterday somehow has errors:
+
+$ csvcut -d ';' -c 'ISO-alpha2 Code,Country or Area' ~/Downloads/UNSD\ —\ Methodology.csv | sed -e '1s/ISO-alpha2 Code/alpha2/' -e '1s/Country or Area/UN M.49 Name/' > ~/Downloads/un-countries.csv
+
+- Check the number of lines in each file:
+
+$ wc -l clarisa-countries.csv un-countries.csv cgspace-countries.csv mel-countries.csv
+ 250 clarisa-countries.csv
+ 250 un-countries.csv
+ 198 cgspace-countries.csv
+ 258 mel-countries.csv
+
+- I am seeing strange results with csvjoin’s
--outer
join, which I need in order to keep unmatched terms from both the left and right files…
+
+- Using
xsv join --full
is giving me better results:
+
+
+
+$ xsv join --full alpha2 ~/Downloads/clarisa-countries.csv alpha2 ~/Downloads/un-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-xsv-full.csv
+
+- Then adding the CGSpace and MEL countries:
+
+$ xsv join --full alpha2 /tmp/clarisa-un-xsv-full.csv alpha2 /tmp/cgspace-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-xsv-full.csv
+$ xsv join --full alpha2 /tmp/clarisa-un-cgspace-xsv-full.csv alpha2 /tmp/mel-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-mel-xsv-full.csv
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index 673fe4843..58f5e3207 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index 8d53c655d..8944c79b4 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index 82c9e749c..3b8017ff1 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index f097950fa..d8f0cec68 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index 6f3836675..b7a0474b2 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html
index e3b726b35..2b9b0873e 100644
--- a/docs/categories/notes/page/5/index.html
+++ b/docs/categories/notes/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html
index 2e7852615..c2fe67f0b 100644
--- a/docs/categories/notes/page/6/index.html
+++ b/docs/categories/notes/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index 1a0b9f726..330ff5d20 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index 9e449ef01..3fee38b37 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index 0e3c3e29d..d07dae2a5 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index da5e0fb27..58f6648ba 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index 46eca58d7..749bc2e47 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index 3047ad95b..8836193e5 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/7/index.html b/docs/page/7/index.html
index 3eb269583..e86c2e4db 100644
--- a/docs/page/7/index.html
+++ b/docs/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/8/index.html b/docs/page/8/index.html
index b7d0218dd..053639dd2 100644
--- a/docs/page/8/index.html
+++ b/docs/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/9/index.html b/docs/page/9/index.html
index c9ae57918..987f40f49 100644
--- a/docs/page/9/index.html
+++ b/docs/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index 4cf5cf67a..dc634782b 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index 18837cee8..bcb29cab1 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index d6427bca2..111d6c54f 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index 036b70ed9..a4029a2e7 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index 346dce8a3..38c16390a 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index 56baf9ad8..2fb218b09 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html
index 8fd51e48c..d22189289 100644
--- a/docs/posts/page/7/index.html
+++ b/docs/posts/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html
index 317595051..cb39e74f5 100644
--- a/docs/posts/page/8/index.html
+++ b/docs/posts/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html
index ca0736433..68f0aaa92 100644
--- a/docs/posts/page/9/index.html
+++ b/docs/posts/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 10018467a..edba0c90e 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -3,19 +3,19 @@
xmlns:xhtml="http://www.w3.org/1999/xhtml">
https://alanorth.github.io/cgspace-notes/categories/
- 2022-06-21T16:59:04+03:00
+ 2022-06-23T08:40:53+03:00
https://alanorth.github.io/cgspace-notes/
- 2022-06-21T16:59:04+03:00
+ 2022-06-23T08:40:53+03:00
https://alanorth.github.io/cgspace-notes/2022-06/
- 2022-06-21T16:59:04+03:00
+ 2022-06-23T08:40:53+03:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2022-06-21T16:59:04+03:00
+ 2022-06-23T08:40:53+03:00
https://alanorth.github.io/cgspace-notes/posts/
- 2022-06-21T16:59:04+03:00
+ 2022-06-23T08:40:53+03:00
https://alanorth.github.io/cgspace-notes/2022-05/
2022-05-30T16:00:02+03:00