diff --git a/content/posts/2022-08.md b/content/posts/2022-08.md
index ea59f423a..6eafea0e7 100644
--- a/content/posts/2022-08.md
+++ b/content/posts/2022-08.md
@@ -248,4 +248,53 @@ $ dspace import --add --eperson=fuu@fuu.com --source /tmp/SimpleArchiveFormat --
- I created a [GitHub issue for OpenRXV compatibility issues with DSpace 7](https://github.com/ilri/OpenRXV/issues/133)
+## 2022-08-24
+
+- Start working on the MARLO OICRs
+ - First I extracted the filenames and IDs from the v2 metadata file, then left-joined them to the UTF-8 version of the metadata on the `cg.number (series/report No.)` column:
+
+```console
+$ xsv select 'cg.number (series/report No.),File' OICRS\ Metadata\ v2.csv > /tmp/OICR-files.csv
+$ xsv join --left 'cg.number (series/report No.)' OICRS\ metadata\ utf8\ 20220816_JM.csv 'cg.number (series/report No.)' /tmp/OICR-files.csv > OICRs-UTF-8-with-files.csv
+```
+
+- After that I imported it into OpenRefine for data cleaning
+ - To enrich the metadata I combined the title and abstract into a new field and then checked my list of 11,000 AGROVOC terms against it
+ - First, create a new column with this GREL:
+
+```console
+cells["dc.title"].value + " " + cells["dcterms.abstract"].value
+```
+
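+- Then use this Jython on the new column to find every AGROVOC term that appears in the combined text as a whole word (case-insensitive), joining the matches with "||":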
+
+```python
+import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+ terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+```
+
+- After that I de-duplicated the terms using this Jython:
+
+```python
+res = []
+
+[res.append(x) for x in value.split("||") if x not in res]
+
+return "||".join(res)
+```
+
+- Then I split the multi-values on "||" and used a text facet to remove some countries and other nonsense terms that matched, like "gates" and "al" and "s"
+ - Then I did the same for countries
+- Then I exported the CSV and started searching for duplicates so that I could add them as relations:
+
+```console
+$ ./ilri/check-duplicates.py -i ~/Downloads/2022-08-24-OICRs.csv -u dspace -db dspace -p 'omg' -o /tmp/oicrs-matches.csv
+```
+
+- Oh wow, I actually found one OICR already uploaded to CGSpace... I have to ask Jose about that
+
diff --git a/docs/2022-08/index.html b/docs/2022-08/index.html
index b3667b9eb..a841765f5 100644
--- a/docs/2022-08/index.html
+++ b/docs/2022-08/index.html
@@ -14,7 +14,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
-
+
@@ -34,9 +34,9 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
"@type": "BlogPosting",
"headline": "August, 2022",
"url": "https://alanorth.github.io/cgspace-notes/2022-08/",
- "wordCount": "2068",
+ "wordCount": "2309",
"datePublished": "2022-08-01T10:22:36+03:00",
- "dateModified": "2022-08-20T22:37:35-07:00",
+ "dateModified": "2022-08-23T12:14:14-07:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -395,6 +395,54 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
+2022-08-24
+
+- Start working on the MARLO OICRs
+
+- First I extracted the filenames and IDs from the v2 metadata file, then joined it with the UTF-8 version:
+
+
+
+$ xsv select 'cg.number (series/report No.),File' OICRS\ Metadata\ v2.csv > /tmp/OICR-files.csv
+$ xsv join --left 'cg.number (series/report No.)' OICRS\ metadata\ utf8\ 20220816_JM.csv 'cg.number (series/report No.)' /tmp/OICR-files.csv > OICRs-UTF-8-with-files.csv
+
+- After that I imported it into OpenRefine for data cleaning
+
+- To enrich the metadata I combined the title and abstract into a new field and then checked my list of 11,000 AGROVOC terms against it
+- First, create a new column with this GREL:
+
+
+
+cells["dc.title"].value + " " + cells["dcterms.abstract"].value
+
+import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+ terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+
+- After that I de-duplicated the terms using this Jython:
+
+res = []
+
+[res.append(x) for x in value.split("||") if x not in res]
+
+return "||".join(res)
+
+- Then I split the multi-values on “||” and used a text facet to remove some countries and other nonsense terms that matched, like “gates” and “al” and “s”
+
+- Then I did the same for countries
+
+
+- Then I exported the CSV and started searching for duplicates so that I can add them as relations:
+
+$ ./ilri/check-duplicates.py -i ~/Downloads/2022-08-24-OICRs.csv -u dspace -db dspace -p 'omg' -o /tmp/oicrs-matches.csv
+
+- Oh wow, I actually found one OICR already uploaded to CGSpace… I have to ask Jose about that
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index 03e01d14e..ea7d4ffb1 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index 1b02125ea..6aee51001 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index 53ba87018..d0dad4128 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index 9dd09ec79..49f55389f 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index c66cb372d..616fa64f4 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html
index 102cca294..67d912087 100644
--- a/docs/categories/notes/page/5/index.html
+++ b/docs/categories/notes/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html
index 3ee93ed35..dd46d1aac 100644
--- a/docs/categories/notes/page/6/index.html
+++ b/docs/categories/notes/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html
index 59d45cf7d..f9c35ab8b 100644
--- a/docs/categories/notes/page/7/index.html
+++ b/docs/categories/notes/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index e3f9bf770..aad5eb4ef 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index 80d907d58..a9190a2f3 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index e0add7fa5..4b88bf7fe 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index 3309e40ca..9411adb19 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index 373f751ea..da4289635 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index 6df6025f9..09756f346 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/7/index.html b/docs/page/7/index.html
index 878b859e0..ee0bed253 100644
--- a/docs/page/7/index.html
+++ b/docs/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/8/index.html b/docs/page/8/index.html
index e9bf18a8a..232c6575c 100644
--- a/docs/page/8/index.html
+++ b/docs/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/9/index.html b/docs/page/9/index.html
index 6bfd5235b..c2cdba6cd 100644
--- a/docs/page/9/index.html
+++ b/docs/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index 450903703..dbb405b5a 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index 5d446685e..de66caec3 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index 18b10e3c3..68f84e4c6 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index 838630c09..f2de71a41 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index bc03bd5ff..0187d160d 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index fd5fc5b02..fe7a8ff8e 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html
index 30bdc2fe4..90d2847fa 100644
--- a/docs/posts/page/7/index.html
+++ b/docs/posts/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html
index dbaee2c65..d97e8056e 100644
--- a/docs/posts/page/8/index.html
+++ b/docs/posts/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html
index 5d75b7206..49ae71c7e 100644
--- a/docs/posts/page/9/index.html
+++ b/docs/posts/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 1c36ce00f..d1bc175b8 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -3,19 +3,19 @@
xmlns:xhtml="http://www.w3.org/1999/xhtml">
https://alanorth.github.io/cgspace-notes/2022-08/
- 2022-08-20T22:37:35-07:00
+ 2022-08-23T12:14:14-07:00
https://alanorth.github.io/cgspace-notes/categories/
- 2022-08-20T22:37:35-07:00
+ 2022-08-23T12:14:14-07:00
https://alanorth.github.io/cgspace-notes/
- 2022-08-20T22:37:35-07:00
+ 2022-08-23T12:14:14-07:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2022-08-20T22:37:35-07:00
+ 2022-08-23T12:14:14-07:00
https://alanorth.github.io/cgspace-notes/posts/
- 2022-08-20T22:37:35-07:00
+ 2022-08-23T12:14:14-07:00
https://alanorth.github.io/cgspace-notes/2022-07/
2022-07-31T15:49:35+03:00