From b913ff5353f186a0faf5d1b0c9be4daeb3ffe4f9 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Fri, 24 Jun 2022 14:49:37 +0300 Subject: [PATCH] Add notes for 2022-06-24 --- content/posts/2022-06.md | 51 ++++++++++++++++++++++++ docs/2022-06/index.html | 52 +++++++++++++++++++++++-- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/categories/notes/page/6/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/page/9/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/posts/page/9/index.html | 2 +- docs/sitemap.xml | 10 ++--- 28 files changed, 129 insertions(+), 34 deletions(-) diff --git a/content/posts/2022-06.md b/content/posts/2022-06.md index 1e572bc47..eb3ed8f54 100644 --- a/content/posts/2022-06.md +++ b/content/posts/2022-06.md @@ -145,4 +145,55 @@ $ grep -c 'Adding ORCID' /tmp/orcids2.log - Meeting with Salem to discuss metadata between CGSpace and MEL - We started working through his spreadsheet and then the Internet dropped +## 2022-06-23 + +- Start looking at country names between MEL, CGSpace, and standards like UN M.49 and GeoNames + - I used `xmllint` to extract the countries from CGSpace's input forms: + +```console +$ xmllint --xpath '//value-pairs[@value-pairs-name="countrylist"]/pair/stored-value/node()' dspace/config/input-forms.xml > /tmp/cgspace-countries.txt +``` + +- Then I wrote a Python 
script (`countries-to-csv.py`) to read them and save their names alongside the ISO 3166-1 Alpha2 code +- Then I joined them with the other lists: + +```console +$ csvjoin --outer -c alpha2 ~/Downloads/clarisa-countries.csv ~/Downloads/UNSD\ —\ Methodology.csv ~/Downloads/geonames-countries.csv /tmp/cgspace-countries.csv /tmp/mel-countries.csv> /tmp/countries.csv +``` + +- This mostly worked fine, and is much easier than writing another Python script with Pandas... + +## 2022-06-24 + +- Spent some more time working on my `countries-to-csv.py` script to fix some logic errors +- Then re-export the UN M.49 countries to a clean list because the one I did yesterday somehow has errors: + +```console +$ csvcut -d ';' -c 'ISO-alpha2 Code,Country or Area' ~/Downloads/UNSD\ —\ Methodology.csv | sed -e '1s/ISO-alpha2 Code/alpha2/' -e '1s/Country or Area/UN M.49 Name/' > ~/Downloads/un-countries.csv +``` + +- Check the number of lines in each file: + +``` +$ wc -l clarisa-countries.csv un-countries.csv cgspace-countries.csv mel-countries.csv + 250 clarisa-countries.csv + 250 un-countries.csv + 198 cgspace-countries.csv + 258 mel-countries.csv +``` + +- I am seeing strange results with csvjoin's `--outer` join, which I need in order to keep unmatched terms from both the left and right files... 
+ - Using `xsv join --full` is giving me better results: + +``` +$ xsv join --full alpha2 ~/Downloads/clarisa-countries.csv alpha2 ~/Downloads/un-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-xsv-full.csv +``` + +- Then adding the CGSpace and MEL countries: + +```console +$ xsv join --full alpha2 /tmp/clarisa-un-xsv-full.csv alpha2 /tmp/cgspace-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-xsv-full.csv +$ xsv join --full alpha2 /tmp/clarisa-un-cgspace-xsv-full.csv alpha2 /tmp/mel-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-mel-xsv-full.csv +``` + diff --git a/docs/2022-06/index.html b/docs/2022-06/index.html index c156fe59c..7eda4854a 100644 --- a/docs/2022-06/index.html +++ b/docs/2022-06/index.html @@ -26,7 +26,7 @@ There seem to be many more of these: - + @@ -58,9 +58,9 @@ There seem to be many more of these: "@type": "BlogPosting", "headline": "June, 2022", "url": "https://alanorth.github.io/cgspace-notes/2022-06/", - "wordCount": "939", + "wordCount": "1190", "datePublished": "2022-06-06T09:01:36+03:00", - "dateModified": "2022-06-21T16:59:04+03:00", + "dateModified": "2022-06-23T08:40:53+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -299,7 +299,51 @@ There seem to be many more of these: - +

2022-06-23

+ +
$ xmllint --xpath '//value-pairs[@value-pairs-name="countrylist"]/pair/stored-value/node()' dspace/config/input-forms.xml > /tmp/cgspace-countries.txt
+
+
$ csvjoin --outer -c alpha2 ~/Downloads/clarisa-countries.csv ~/Downloads/UNSD\ —\ Methodology.csv ~/Downloads/geonames-countries.csv /tmp/cgspace-countries.csv /tmp/mel-countries.csv> /tmp/countries.csv
+
+

2022-06-24

+ +
$ csvcut -d ';' -c 'ISO-alpha2 Code,Country or Area' ~/Downloads/UNSD\ —\ Methodology.csv | sed -e '1s/ISO-alpha2 Code/alpha2/' -e '1s/Country or Area/UN M.49 Name/' > ~/Downloads/un-countries.csv
+
+
$ wc -l clarisa-countries.csv un-countries.csv cgspace-countries.csv mel-countries.csv
+  250 clarisa-countries.csv
+  250 un-countries.csv
+  198 cgspace-countries.csv
+  258 mel-countries.csv
+
+
$ xsv join --full alpha2 ~/Downloads/clarisa-countries.csv alpha2 ~/Downloads/un-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-xsv-full.csv
+
+
$ xsv join --full alpha2 /tmp/clarisa-un-xsv-full.csv alpha2 /tmp/cgspace-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-xsv-full.csv
+$ xsv join --full alpha2 /tmp/clarisa-un-cgspace-xsv-full.csv alpha2 /tmp/mel-countries.csv | xsv select '!alpha2[1]' > /tmp/clarisa-un-cgspace-mel-xsv-full.csv
+
diff --git a/docs/categories/index.html b/docs/categories/index.html index 673fe4843..58f5e3207 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 8d53c655d..8944c79b4 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 82c9e749c..3b8017ff1 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index f097950fa..d8f0cec68 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 6f3836675..b7a0474b2 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index e3b726b35..2b9b0873e 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 2e7852615..c2fe67f0b 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 1a0b9f726..330ff5d20 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 9e449ef01..3fee38b37 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html 
b/docs/page/3/index.html index 0e3c3e29d..d07dae2a5 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index da5e0fb27..58f6648ba 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 46eca58d7..749bc2e47 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 3047ad95b..8836193e5 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 3eb269583..e86c2e4db 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index b7d0218dd..053639dd2 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index c9ae57918..987f40f49 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 4cf5cf67a..dc634782b 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 18837cee8..bcb29cab1 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index d6427bca2..111d6c54f 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 036b70ed9..a4029a2e7 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html 
b/docs/posts/page/5/index.html index 346dce8a3..38c16390a 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 56baf9ad8..2fb218b09 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 8fd51e48c..d22189289 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index 317595051..cb39e74f5 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index ca0736433..68f0aaa92 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 10018467a..edba0c90e 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2022-06-21T16:59:04+03:00 + 2022-06-23T08:40:53+03:00 https://alanorth.github.io/cgspace-notes/ - 2022-06-21T16:59:04+03:00 + 2022-06-23T08:40:53+03:00 https://alanorth.github.io/cgspace-notes/2022-06/ - 2022-06-21T16:59:04+03:00 + 2022-06-23T08:40:53+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2022-06-21T16:59:04+03:00 + 2022-06-23T08:40:53+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2022-06-21T16:59:04+03:00 + 2022-06-23T08:40:53+03:00 https://alanorth.github.io/cgspace-notes/2022-05/ 2022-05-30T16:00:02+03:00