From 2c0b9ce100c1737d83b0e959a6ff0d63b79de6f3 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 24 Feb 2019 16:58:00 -0800 Subject: [PATCH] Update notes --- content/posts/2019-02.md | 64 +++++++++++++++++++++++++++++++ docs/2019-02/index.html | 83 ++++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 ++--- 3 files changed, 149 insertions(+), 8 deletions(-) diff --git a/content/posts/2019-02.md b/content/posts/2019-02.md index be06b54fc..0a66b31cf 100644 --- a/content/posts/2019-02.md +++ b/content/posts/2019-02.md @@ -1046,4 +1046,68 @@ COPY 33 - PLANT PRODUCTION & HEALTH research theme to items with PLANT HEALTH subject - NUTRITION & HUMAN HEALTH research theme to items with NUTRITION subject +## 2019-02-22 + +- Help Udana from WLE with some issues related to CGSpace items on their [Publications website](https://www.wle.cgiar.org/publications) + - He wanted some IWMI items to show up in their publications website + - The items were mapped into WLE collections, but still weren't showing up on the publications website + - I told him that he needs to add the `cg.identifier.wletheme` to the items so that the website indexer finds them + - A few days ago he added the metadata to [10568/93011](https://cgspace.cgiar.org/handle/10568/93011) and now I see that the item is present on the [WLE publications website](https://www.wle.cgiar.org/resource-recovery-waste-business-models-energy-nutrient-and-water-reuse-low-and-middle-income) +- Start looking at IITA's latest round of batch uploads called ["IITA_Feb_14" on DSpace Test](https://dspacetest.cgiar.org/handle/10568/108684) + - One misspelled authorship type + - A few dozen incorrect or inconsistent affiliations (I dumped a list of the top 1500 affiliations and reconciled against it, but it was still a lot of work) + - One issue with smart quotes in countries + - A few IITA subjects with syntax errors + - Some whitespace and consistency issues in sponsorships + - Eight items with invalid ISBN: 0-471-98560-3 + - Two 
incorrectly formatted ISSNs + - Lots of incorrect values in subjects, but that's a difficult problem to do in an automated way + +- I figured out how to query AGROVOC from OpenRefine using Jython by creating a custom text facet: + +``` +import json +import re +import urllib +import urllib2 + +pattern = re.compile('^S[A-Z ]+$') +if pattern.match(value): + url = 'http://agrovoc.uniroma2.it/agrovoc/rest/v1/search?query=' + urllib.quote_plus(value) + '&lang=en' + get = urllib2.urlopen(url) + data = json.load(get) + if len(data['results']) == 1: + return "matched" + +return "unmatched" +``` + +- You have to make sure to URL encode the value with `quote_plus()` and it totally works, but it seems to refresh the facets (and therefore re-query everything) when you select a facet so that makes it basically unusable +- There is a [good resource discussing OpenRefine, Jython, and web scraping](https://programminghistorian.org/en/lessons/fetch-and-parse-data-with-openrefine#example-2-url-queries-and-parsing-json) + +## 2019-02-24 + +- I decided to try to validate the AGROVOC subjects in IITA's recent batch upload by dumping all their terms, checking them in en/es/fr with `agrovoc-lookup.py`, then reconciling against the final list using reconcile-csv with OpenRefine +- I'm not sure how to deal with terms like "CORN" that are alternative labels (`altLabel`) in AGROVOC where the preferred label (`prefLabel`) would be "MAIZE" +- For example, [a query](http://agrovoc.uniroma2.it/agrovoc/rest/v1/search?query=CORN*&lang=en) for `CORN*` returns: + +``` + "results": [ + { + "altLabel": "corn (maize)", + "lang": "en", + "prefLabel": "maize", + "type": [ + "skos:Concept" + ], + "uri": "http://aims.fao.org/aos/agrovoc/c_12332", + "vocab": "agrovoc" + }, +``` + +- There are dozens of other entries like "corn (soft wheat)", "corn (zea)", "corn bran", "Cornales", etc. that could potentially match, and determining programmatically whether they are related is difficult +- Shit, and then there are 
terms like "GENETIC DIVERSITY" that should [technically be](http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/page/c_33952) "genetic diversity (as resource)" +- I applied all changes to the IITA Feb 14 batch data except the affiliations and sponsorships because I think I made some mistakes with the copying of reconciled values so I will try to look at those again separately +- I went back and re-did the affiliations and sponsorships and then applied them on the IITA Feb 14 collection on DSpace Test + diff --git a/docs/2019-02/index.html b/docs/2019-02/index.html index 2d2235c5d..714836a3c 100644 --- a/docs/2019-02/index.html +++ b/docs/2019-02/index.html @@ -42,7 +42,7 @@ sys 0m1.979s - + @@ -89,9 +89,9 @@ sys 0m1.979s "@type": "BlogPosting", "headline": "February, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-02/", - "wordCount": "6074", + "wordCount": "6551", "datePublished": "2019-02-01T21:37:30+02:00", - "dateModified": "2019-02-21T17:21:37-08:00", + "dateModified": "2019-02-21T18:16:33-08:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -1347,6 +1347,83 @@ COPY 33 +

2019-02-22

+ + + +
import json
+import re
+import urllib
+import urllib2
+
+pattern = re.compile('^S[A-Z ]+$')
+if pattern.match(value):
+  url = 'http://agrovoc.uniroma2.it/agrovoc/rest/v1/search?query=' + urllib.quote_plus(value) + '&lang=en'
+  get = urllib2.urlopen(url)
+  data = json.load(get)
+  if len(data['results']) == 1:
+    return "matched"
+
+return "unmatched"
+
+ + + +

2019-02-24

+ + + +
    "results": [
+        {
+            "altLabel": "corn (maize)",
+            "lang": "en",
+            "prefLabel": "maize",
+            "type": [
+                "skos:Concept"
+            ],
+            "uri": "http://aims.fao.org/aos/agrovoc/c_12332",
+            "vocab": "agrovoc"
+        },
+
+ + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 7f5f41c1d..589d9b7a4 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-02/ - 2019-02-21T17:21:37-08:00 + 2019-02-21T18:16:33-08:00 @@ -209,7 +209,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-02-21T17:21:37-08:00 + 2019-02-21T18:16:33-08:00 0 @@ -220,7 +220,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-02-21T17:21:37-08:00 + 2019-02-21T18:16:33-08:00 0 @@ -232,13 +232,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2019-02-21T17:21:37-08:00 + 2019-02-21T18:16:33-08:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-02-21T17:21:37-08:00 + 2019-02-21T18:16:33-08:00 0