From 53f60284e2af6ac5dd57314f6177dad91d856633 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 30 Jun 2022 09:41:54 +0300 Subject: [PATCH] Add notes for 2022-06-29 --- content/posts/2022-06.md | 55 +++++++++++++++++++++ docs/2022-06/index.html | 63 +++++++++++++++++++++++-- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/categories/notes/page/6/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/page/9/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/posts/page/9/index.html | 2 +- docs/sitemap.xml | 10 ++-- 28 files changed, 144 insertions(+), 34 deletions(-) diff --git a/content/posts/2022-06.md b/content/posts/2022-06.md index eacee10d5..f0343b1e6 100644 --- a/content/posts/2022-06.md +++ b/content/posts/2022-06.md @@ -200,4 +200,59 @@ $ xsv join --full alpha2 /tmp/clarisa-un-cgspace-xsv-full.csv alpha2 /tmp/mel-co - Start a harvest on AReS +## 2022-06-28 + +- Start working on the CGSpace subject export for FAO +- First I exported a list of all metadata in our `dcterms.subject` and other center-specific subject fields with their counts: + +```console +localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value AS "subject", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id IN (187, 120, 210, 122, 215, 127, 208, 124, 128, 123, 125, 135, 
203, 236, 238, 119) GROUP BY "subject" ORDER BY count DESC) to /tmp/2022-06-28-cgspace-subjects.csv WITH CSV HEADER; +COPY 27010 +``` + +- Then I extracted the subjects and looked them up against AGROVOC: + +```console +$ csvcut -c subject /tmp/2022-06-28-cgspace-subjects.csv | sed '1d' > /tmp/2022-06-28-cgspace-subjects.txt +$ ./ilri/agrovoc-lookup.py -i /tmp/2022-06-28-cgspace-subjects.txt -o /tmp/2022-06-28-cgspace-subjects-results.csv +``` + +- I keep getting timeouts after every five or ten requests, so this will not be feasible for 27,000 subjects! +- I think I will have to write some custom script to use the AGROVOC RDF file + - Using rdflib to open the 1.2GB `agrovoc_lod.rdf` file takes several minutes and doesn't seem very efficient +- I tried using [lightrdf](https://github.com/ozekik/lightrdf) and it's much quicker, but the documentation is limiting and I'm not sure how to search yet + - I had to try in different Python versions because 3.10.x is apparently too new +- For future reference I was able to search with lightrdf: + +```console +import lightrdf +parser = lightrdf.Parser() +# prints millions of lines +for triple in parser.parse("./agrovoc_lod.rdf", base_iri=None): + print(triple) +agrovoc = lightrdf.RDFDocument('agrovoc_lod.rdf'); +# all results for prefix http://aims.fao.org/aos/agrovoc/c_5 +for triple in agrovoc.search_triples('http://aims.fao.org/aos/agrovoc/c_5', None, None): + print(triple) +('http://aims.fao.org/aos/agrovoc/c_5', 'http://www.w3.org/2004/02/skos/core#altLabel', '"Abalone"@de') +('http://aims.fao.org/aos/agrovoc/c_5', 'http://www.w3.org/2004/02/skos/core#prefLabel', '"abalones"@en') +# all stuff for abalones in English +for triple in agrovoc.search_triples(None, None, '"abalones"@en'): + print(triple) +``` + +- I ran the `agrovoc-lookup.py` from a Linode server and it completed without issues... 
hmmm + +## 2022-06-29 + +- Continue working on the list of non-AGROVOC subjects to report to FAO + - I got a one-liner to get the list of non-AGROVOC subjects and join them with their counts: + +```console +$ csvgrep -c 'number of matches' -m 0 /tmp/2022-06-28-cgspace-subjects-results.csv \ + | csvcut -c subject \ + | csvjoin -c subject /tmp/2022-06-28-cgspace-subjects.csv - \ + > /tmp/2022-06-28-cgspace-non-agrovoc.csv +``` + diff --git a/docs/2022-06/index.html b/docs/2022-06/index.html index fff9cf95f..c63208405 100644 --- a/docs/2022-06/index.html +++ b/docs/2022-06/index.html @@ -26,7 +26,7 @@ There seem to be many more of these: - + @@ -58,9 +58,9 @@ There seem to be many more of these: "@type": "BlogPosting", "headline": "June, 2022", "url": "https://alanorth.github.io/cgspace-notes/2022-06/", - "wordCount": "1196", + "wordCount": "1520", "datePublished": "2022-06-06T09:01:36+03:00", - "dateModified": "2022-06-24T14:49:37+03:00", + "dateModified": "2022-06-26T18:11:33+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -347,7 +347,62 @@ There seem to be many more of these: - +

2022-06-28

+ +
localhost/dspacetest= ☘ \COPY (SELECT DISTINCT text_value AS "subject", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id IN (187, 120, 210, 122, 215, 127, 208, 124, 128, 123, 125, 135, 203, 236, 238, 119) GROUP BY "subject" ORDER BY count DESC) to /tmp/2022-06-28-cgspace-subjects.csv WITH CSV HEADER;
+COPY 27010
+
+
$ csvcut -c subject /tmp/2022-06-28-cgspace-subjects.csv | sed '1d' > /tmp/2022-06-28-cgspace-subjects.txt
+$ ./ilri/agrovoc-lookup.py -i /tmp/2022-06-28-cgspace-subjects.txt -o /tmp/2022-06-28-cgspace-subjects-results.csv
+
+
import lightrdf
+parser = lightrdf.Parser()
+# prints millions of lines
+for triple in parser.parse("./agrovoc_lod.rdf", base_iri=None):
+     print(triple)
+agrovoc = lightrdf.RDFDocument('agrovoc_lod.rdf');
+# all results for prefix http://aims.fao.org/aos/agrovoc/c_5
+for triple in agrovoc.search_triples('http://aims.fao.org/aos/agrovoc/c_5', None, None):
+     print(triple)
+('http://aims.fao.org/aos/agrovoc/c_5', 'http://www.w3.org/2004/02/skos/core#altLabel', '"Abalone"@de')
+('http://aims.fao.org/aos/agrovoc/c_5', 'http://www.w3.org/2004/02/skos/core#prefLabel', '"abalones"@en')
+# all stuff for abalones in English
+for triple in agrovoc.search_triples(None, None, '"abalones"@en'):
+     print(triple)
+
+

2022-06-29

+ +
$ csvgrep -c 'number of matches' -m 0 /tmp/2022-06-28-cgspace-subjects-results.csv \
+  | csvcut -c subject \
+  | csvjoin -c subject /tmp/2022-06-28-cgspace-subjects.csv - \
+  > /tmp/2022-06-28-cgspace-non-agrovoc.csv
+
diff --git a/docs/categories/index.html b/docs/categories/index.html index 30afee7b0..361ceb8d5 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index c6d98c4e0..a7f21ae4a 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index ab3b26a7d..aafc6d6d5 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index f4347364e..43f684302 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index b51e458ee..737f4eb76 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 44c4e97ec..b15a331c8 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 73e5e9d09..9ae458335 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 6e1fcd741..8e8a2ec84 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 33af54823..9949fcc24 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html 
b/docs/page/3/index.html index db81f61b7..285fe7780 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 325809ac0..0e7b00aed 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 86d073b6d..7a052cd29 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 831986755..2c111dcc5 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 368728ac2..c9cb37fc7 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index 59e3c3cc8..afe2d7a22 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index e0c81fc19..0e33dd4cb 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 5a6089025..46f761653 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index bdfdcb3db..baf0bb255 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index b8c2ce3d6..6ee14fd33 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 1cf718df3..b14900c29 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html 
b/docs/posts/page/5/index.html index 07059ac60..1aab32af4 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 85f1fe920..868432f48 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index e43dfbdb2..c6a9ab217 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index b70a10af0..0b327d406 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index d20bf1bb4..d65f1cd39 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 9ca4823cc..5271ebf4c 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2022-06-24T14:49:37+03:00 + 2022-06-26T18:11:33+03:00 https://alanorth.github.io/cgspace-notes/ - 2022-06-24T14:49:37+03:00 + 2022-06-26T18:11:33+03:00 https://alanorth.github.io/cgspace-notes/2022-06/ - 2022-06-24T14:49:37+03:00 + 2022-06-26T18:11:33+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2022-06-24T14:49:37+03:00 + 2022-06-26T18:11:33+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2022-06-24T14:49:37+03:00 + 2022-06-26T18:11:33+03:00 https://alanorth.github.io/cgspace-notes/2022-05/ 2022-05-30T16:00:02+03:00