diff --git a/content/posts/2022-08.md b/content/posts/2022-08.md
index 11dbb0ea9..92a68fe31 100644
--- a/content/posts/2022-08.md
+++ b/content/posts/2022-08.md
@@ -156,4 +156,57 @@ $ xsv join --left id ~/Downloads/2022-08-18-MELIAs-UTF-8-With-Files.csv id ~/Dow
- I created a SAF bundle and imported the 749 MELIAs to DSpace Test
- I found thirteen items on CGSpace with dates in format "DD/MM/YYYY" so I fixed those
+## 2022-08-20
+
+- Peter sent me back the results of the duplicate checking on the Gender presentations
+ - There were only a handful of duplicates, so I used the IDs in the spreadsheet to flag and delete them in OpenRefine
+- I had a new idea about matching AGROVOC subjects and countries in OpenRefine
+ - I was previously splitting up the text value field (title/abstract/etc) by spaces and searching for each word in the list of terms/countries like this:
+
+```console
+with open(r"/tmp/cgspace-countries.txt",'r') as f :
+ countries = [name.rstrip().lower() for name in f]
+
+return "||".join([x for x in value.split(' ') if x.lower() in countries])
+```
+
+- But that misses multi-word terms/countries with spaces, so we can search the other way around by using a regex for each term/country and checking if it appears in the text value field:
+
+```console
+import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+ terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+```
+
+- Now we are only limited by our small (~1,400) list of AGROVOC subjects, so I did an export from PostgreSQL of all `dcterms.subject` values and am looking them up against AGROVOC's API right now:
+
+```console
+localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value AS "dcterms.subject", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 187 GROUP BY "dcterms.subject" ORDER BY count DESC) to /tmp/2022-08-20-agrovoc.csv WITH CSV HEADER;
+COPY 21685
+$ csvcut -c 1 /tmp/2022-08-20-agrovoc.csv | sed 1d > /tmp/all-subjects.txt
+$ ./ilri/agrovoc-lookup.py -i /tmp/all-subjects.txt -o 2022-08-20-all-subjects-results.csv
+$ csvgrep -c 'number of matches' -m 0 -i /tmp/2022-08-20-all-subjects-results.csv.bak | csvcut -c 1 | sed 1d > /tmp/agrovoc-subjects.txt
+$ wc -l /tmp/agrovoc-subjects.txt
+11834 /tmp/agrovoc-subjects.txt
+```
+
+- Then I created a new column joining the title and abstract, and ran the Jython expression above against this new file with 11,000 AGROVOC terms
+ - Then I joined that column with Peter's `dcterms.subject` column and then deduplicated it with this Jython:
+
+```console
+res = []
+
+[res.append(x) for x in value.split("||") if x not in res]
+
+return "||".join(res)
+```
+
+- This is way better, but you end up getting a bunch of countries, regions, and short words like "gates" matching in AGROVOC that are inappropriate (we typically don't tag these in AGROVOC) or incorrect (gates, as in windows or doors, not the funding agency)
+ - I did a text facet in OpenRefine and removed a bunch of these by eye
+- Then I finished adding the `dcterms.relation` and CRP metadata flagged by Peter on the Gender presentations
+ - I'm waiting for him to send me the PDFs and then I will upload them to DSpace Test
+
diff --git a/docs/2022-08/index.html b/docs/2022-08/index.html
index 915918467..3c2e4398c 100644
--- a/docs/2022-08/index.html
+++ b/docs/2022-08/index.html
@@ -14,7 +14,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
-
+
@@ -34,9 +34,9 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
"@type": "BlogPosting",
"headline": "August, 2022",
"url": "https://alanorth.github.io/cgspace-notes/2022-08/",
- "wordCount": "1446",
+ "wordCount": "1862",
"datePublished": "2022-08-01T10:22:36+03:00",
- "dateModified": "2022-08-18T22:43:37-07:00",
+ "dateModified": "2022-08-19T21:55:36-07:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -294,6 +294,66 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
I created a SAF bundle and imported the 749 MELIAs to DSpace Test
I found thirteen items on CGSpace with dates in format “DD/MM/YYYY” so I fixed those
+2022-08-20
+
+- Peter sent me back the results of the duplicate checking on the Gender presentations
+
+- There were only a handful of duplicates, so I used the IDs in the spreadsheet to flag and delete them in OpenRefine
+
+
+- I had a new idea about matching AGROVOC subjects and countries in OpenRefine
+
+- I was previously splitting up the text value field (title/abstract/etc) by spaces and searching for each word in the list of terms/countries like this:
+
+
+
+with open(r"/tmp/cgspace-countries.txt",'r') as f :
+ countries = [name.rstrip().lower() for name in f]
+
+return "||".join([x for x in value.split(' ') if x.lower() in countries])
+
+- But that misses multi-word terms/countries with spaces, so we can search the other way around by using a regex for each term/country and checking if it appears in the text value field:
+
+import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+ terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+
+- Now we are only limited by our small (~1,400) list of AGROVOC subjects, so I did an export from PostgreSQL of all
dcterms.subject
values and am looking them up against AGROVOC’s API right now:
+
+localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value AS "dcterms.subject", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 187 GROUP BY "dcterms.subject" ORDER BY count DESC) to /tmp/2022-08-20-agrovoc.csv WITH CSV HEADER;
+COPY 21685
+$ csvcut -c 1 /tmp/2022-08-20-agrovoc.csv | sed 1d > /tmp/all-subjects.txt
+$ ./ilri/agrovoc-lookup.py -i /tmp/all-subjects.txt -o 2022-08-20-all-subjects-results.csv
+$ csvgrep -c 'number of matches' -m 0 -i /tmp/2022-08-20-all-subjects-results.csv.bak | csvcut -c 1 | sed 1d > /tmp/agrovoc-subjects.txt
+$ wc -l /tmp/agrovoc-subjects.txt
+11834 /tmp/agrovoc-subjects.txt
+
+- Then I created a new column joining the title and abstract, and ran the Jython expression above against this new file with 11,000 AGROVOC terms
+
+- Then I joined that column with Peter’s
dcterms.subject
column and then deduplicated it with this Jython:
+
+
+
+res = []
+
+[res.append(x) for x in value.split("||") if x not in res]
+
+return "||".join(res)
+
+- This is way better, but you end up getting a bunch of countries, regions, and short words like “gates” matching in AGROVOC that are inappropriate (we typically don’t tag these in AGROVOC) or incorrect (gates, as in windows or doors, not the funding agency)
+
+- I did a text facet in OpenRefine and removed a bunch of these by eye
+
+
+- Then I finished adding the
dcterms.relation
and CRP metadata flagged by Peter on the Gender presentations
+
+- I’m waiting for him to send me the PDFs and then I will upload them to DSpace Test
+
+
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index de2bd60f1..6a2d772e3 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index 03b55892d..b3c914fdf 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index ab779e046..6f09d7636 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index 6b5f905b0..0850149ac 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index 81016c70b..59126843b 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html
index 9e74f5921..8b37d429d 100644
--- a/docs/categories/notes/page/5/index.html
+++ b/docs/categories/notes/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html
index 17c7f89d0..af13dac32 100644
--- a/docs/categories/notes/page/6/index.html
+++ b/docs/categories/notes/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html
index 9335cab8d..5c17ec54c 100644
--- a/docs/categories/notes/page/7/index.html
+++ b/docs/categories/notes/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index 73bd226d8..5c048ec24 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index a7fdf026b..90f96ae8f 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index bd711a0c5..10d76b476 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index 8473b0cf8..5dd27bd2c 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index 188ba74fd..a2d8cb213 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index a4f564d00..5f7cf63b1 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/7/index.html b/docs/page/7/index.html
index 7de054cd5..ad3155e2f 100644
--- a/docs/page/7/index.html
+++ b/docs/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/8/index.html b/docs/page/8/index.html
index ba71b452b..eef5951d4 100644
--- a/docs/page/8/index.html
+++ b/docs/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/9/index.html b/docs/page/9/index.html
index 981e2e119..57d04a6fe 100644
--- a/docs/page/9/index.html
+++ b/docs/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index 584f3ed47..0d33652fa 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index dc87ade39..5778bb4ea 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index cbe3091d0..c9c1fee45 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index 0f80c0ff0..a26f92a55 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index f62a16be1..b0ba8623b 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index 1c15eebaf..b6fb856e6 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html
index 4e5fcc00a..0c2a1d522 100644
--- a/docs/posts/page/7/index.html
+++ b/docs/posts/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html
index 2f0bc5d0b..743562ce0 100644
--- a/docs/posts/page/8/index.html
+++ b/docs/posts/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html
index 76e0a0094..632b6a85f 100644
--- a/docs/posts/page/9/index.html
+++ b/docs/posts/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 65f8257cf..e08ea14df 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -3,19 +3,19 @@
xmlns:xhtml="http://www.w3.org/1999/xhtml">
https://alanorth.github.io/cgspace-notes/2022-08/
- 2022-08-18T22:43:37-07:00
+ 2022-08-19T21:55:36-07:00
https://alanorth.github.io/cgspace-notes/categories/
- 2022-08-18T22:43:37-07:00
+ 2022-08-19T21:55:36-07:00
https://alanorth.github.io/cgspace-notes/
- 2022-08-18T22:43:37-07:00
+ 2022-08-19T21:55:36-07:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2022-08-18T22:43:37-07:00
+ 2022-08-19T21:55:36-07:00
https://alanorth.github.io/cgspace-notes/posts/
- 2022-08-18T22:43:37-07:00
+ 2022-08-19T21:55:36-07:00
https://alanorth.github.io/cgspace-notes/2022-07/
2022-07-31T15:49:35+03:00