From befe3a3a58bf7fd7aa08ec824e0d49ed8522f3cd Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 27 May 2024 21:40:09 +0300 Subject: [PATCH] Add notes for 2024-05-27 --- content/posts/2024-05.md | 82 +++++++++++++++++++++++++ docs/2024-05/index.html | 74 +++++++++++++++++++++- docs/categories/index.html | 2 +- docs/categories/index.xml | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/index.xml | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/categories/notes/page/6/index.html | 2 +- docs/categories/notes/page/7/index.html | 2 +- docs/categories/notes/page/8/index.html | 2 +- docs/categories/notes/page/9/index.html | 2 +- docs/index.html | 2 +- docs/index.xml | 2 +- docs/page/10/index.html | 2 +- docs/page/11/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/page/9/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/index.xml | 2 +- docs/posts/page/10/index.html | 2 +- docs/posts/page/11/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/posts/page/9/index.html | 2 +- docs/sitemap.xml | 10 +-- 39 files changed, 194 insertions(+), 44 deletions(-) diff --git a/content/posts/2024-05.md b/content/posts/2024-05.md index 1532deede..cef38224b 100644 --- a/content/posts/2024-05.md +++ b/content/posts/2024-05.md @@ -106,4 +106,86 @@ Continue working through alternative duplicate matching for IFPRI - One thing I think I can say for sure is that the default similarity factor in my script is 0.6, and I 
rarely see legitimate duplicates with such similarity so I might increase this to 0.7 to reduce the number of items I have to check - Also, the difference in issue dates is currently 365, but I should reduce that a bit, perhaps to 270 days (9 months) +## 2024-05-22 + +- Finalize and upload the IFPRI 2020–2021 batch set + - I used a new technique to get missing licenses via Crossref (it's Python 2 because of OpenRefine's Jython): + +```python +import urllib2 + +doi = cells['cg.identifier.doi[en_US]'].value +url = "https://api.crossref.org/works/" + doi +useragent = "Python (mailto:a.o@cgiar.org)" + +request = urllib2.Request(url.encode("utf-8"), headers={"User-Agent" : useragent}) +get = urllib2.urlopen(request) + +return get.read().decode('utf-8') +``` + +## 2024-05-23 + +- Finalize last of the duplicates I found for the IFPRI 2020–2021 batch set (those that we missed initially due to mismatched types) +- Export a new list of IFPRI redirects from CONTENTdm: + +```console +$ csvgrep -c 'dc.description.provenance[en_US]' -r 'Original URLs? from IFPRI CONTENTdm' cgspace.csv \ + | csvcut -c 'id,dc.description.provenance[en_US],dc.identifier.uri[en_US]' \ + | tee /tmp/ifpri-redirects.csv \ + | csvstat --count +4004 +``` + +I found a way to get abstracts from PLOS + - They offer an API that returns XML including the JATS-formatted abstracts + - I created a new column in OpenRefine by fetching specially crafted URLs based on the DOIs using this GREL: + +```console +"https://journals.plos.org/plosone/article/file?id=" + cells['doi'].value + '&type=manuscript' +``` + +Then used `value.parseXml()` on the resulting text to extract the abstract's text: + +```console +value.parseXml().select("abstract")[0].xmlText() +``` + +This doesn't preserve `<p>` tags though... + - Oh, nice, this does! + +```console +forEach(value.parseHtml().select("abstract p"), i, i.htmlText()).join("\r\n\r\n") +``` + +For each paragraph inside an abstract, get the inner text and join them as one string separated by two newlines... + - Ah, some articles have multiple abstracts, for example: https://journals.plos.org/plosone/article/file?id=https://doi.org/10.1371/journal.pntd.0001859&type=manuscript + - I need to select the abstract that does **not** have any attributes (using [Jsoup selector syntax](https://jsoup.org/apidocs/org/jsoup/select/Selector.html)) + +```console +forEach(value.parseXml().select("abstract:not([*]) p"), i, i.xmlText()).join("\r\n\r\n") +``` + +Testing `xsv` (Rust) versus `csvkit` (Python) to filter all items with DOIs from a DSpace dump with 118,000 items: + +```console +$ time xsv search -s doi 'doi\.org' /tmp/cgspace-minimal.csv | xsv select doi | xsv count +27339 +xsv search -s doi 'doi\.org' /tmp/cgspace-minimal.csv 0.06s user 0.03s system 98% cpu 0.091 total +xsv select doi 0.02s user 0.02s system 40% cpu 0.091 total +xsv count 0.01s user 0.00s system 9% cpu 0.090 total +$ time csvgrep -c doi -m 'doi.org' /tmp/cgspace-minimal.csv | csvcut -c doi | csvstat --count +27339 +csvgrep -c doi -m 'doi.org' /tmp/cgspace-minimal.csv 1.15s user 0.06s system 95% cpu 1.273 total +csvcut -c doi 0.42s user 0.05s system 36% cpu 1.283 total +csvstat --count 0.20s user 0.03s system 18% cpu 1.298 total +``` + +## 2024-05-27 + +- Working on IFPRI datasets batch migration + - 732 items total + - 6 duplicates on CGSpace + - 6 duplicates within set that need investigation + diff --git a/docs/2024-05/index.html b/docs/2024-05/index.html index 53b6f3263..59441010a 100644 --- a/docs/2024-05/index.html +++ b/docs/2024-05/index.html @@ -18,7 +18,7 @@ Then I did some work to add missing abstracts (about 900!), volumes, issues, lic - + @@ -42,9 +42,9 @@ Then I did some work to add missing abstracts (about 900!), volumes, issues, lic 
"@type": "BlogPosting", "headline": "May, 2024", "url": "https://alanorth.github.io/cgspace-notes/2024-05/", - "wordCount": "652", + "wordCount": "1023", "datePublished": "2024-05-01T10:39:00+03:00", - "dateModified": "2024-05-13T16:24:11+03:00", + "dateModified": "2024-05-20T17:34:14+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -224,6 +224,74 @@ dspace=# \COPY (SELECT i.uuid, m.text_value AS submitted_by FROM item i JOIN met

  • One thing I think I can say for sure is that the default similarity factor in my script is 0.6, and I rarely see legitimate duplicates with such similarity so I might increase this to 0.7 to reduce the number of items I have to check
  • Also, the difference in issue dates is currently 365, but I should reduce that a bit, perhaps to 270 days (9 months)
  • +

    2024-05-22

    + +
    import urllib2
    +
    +doi = cells['cg.identifier.doi[en_US]'].value
    +url = "https://api.crossref.org/works/" + doi
    +useragent = "Python (mailto:a.o@cgiar.org)"
    +
    +request = urllib2.Request(url.encode("utf-8"), headers={"User-Agent" : useragent})
    +get = urllib2.urlopen(request)
    +
    +return get.read().decode('utf-8')
    +

    2024-05-23

    + +
    $ csvgrep -c 'dc.description.provenance[en_US]' -r 'Original URLs? from IFPRI CONTENTdm' cgspace.csv \
    +  | csvcut -c 'id,dc.description.provenance[en_US],dc.identifier.uri[en_US]' \
    +  | tee /tmp/ifpri-redirects.csv \
    +  | csvstat --count
    +4004
    +

    I found a way to get abstracts from PLOS

    + +
    "https://journals.plos.org/plosone/article/file?id=" + cells['doi'].value + '&type=manuscript'
    +

    Then used value.parseXml() on the resulting text to extract the abstract’s text:

    +
    value.parseXml().select("abstract")[0].xmlText()
    +

    This doesn’t preserve <p> tags though…

    + +
    forEach(value.parseHtml().select("abstract p"), i, i.htmlText()).join("\r\n\r\n")
    +

    For each paragraph inside an abstract, get the inner text and join them as one string separated by two newlines…

    + +
    forEach(value.parseXml().select("abstract:not([*]) p"), i, i.xmlText()).join("\r\n\r\n")
    +

    Testing xsv (Rust) versus csvkit (Python) to filter all items with DOIs from a DSpace dump with 118,000 items:

    +
    $ time xsv search -s doi 'doi\.org' /tmp/cgspace-minimal.csv | xsv select doi | xsv count
    +27339
    +xsv search -s doi 'doi\.org' /tmp/cgspace-minimal.csv  0.06s user 0.03s system 98% cpu 0.091 total
    +xsv select doi  0.02s user 0.02s system 40% cpu 0.091 total
    +xsv count  0.01s user 0.00s system 9% cpu 0.090 total
    +$ time csvgrep -c doi -m 'doi.org' /tmp/cgspace-minimal.csv | csvcut -c doi | csvstat --count
    +27339
    +csvgrep -c doi -m 'doi.org' /tmp/cgspace-minimal.csv  1.15s user 0.06s system 95% cpu 1.273 total
    +csvcut -c doi  0.42s user 0.05s system 36% cpu 1.283 total
    +csvstat --count  0.20s user 0.03s system 18% cpu 1.298 total
    +

    2024-05-27

    + diff --git a/docs/categories/index.html b/docs/categories/index.html index 0482d7345..f51bb0fc0 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/index.xml b/docs/categories/index.xml index a5364eabb..7ba7811a7 100644 --- a/docs/categories/index.xml +++ b/docs/categories/index.xml @@ -6,7 +6,7 @@ Recent content in Categories on CGSpace Notes Hugo en-us - Thu, 16 May 2024 08:27:56 +0300 + Mon, 20 May 2024 17:34:14 +0300 Notes diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 21d5dca13..c974b7f5e 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.xml b/docs/categories/notes/index.xml index 928b7181d..8208cffa0 100644 --- a/docs/categories/notes/index.xml +++ b/docs/categories/notes/index.xml @@ -6,7 +6,7 @@ Recent content in Notes on CGSpace Notes Hugo en-us - Thu, 16 May 2024 08:27:56 +0300 + Mon, 20 May 2024 17:34:14 +0300 May, 2024 diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index a31f2a9fa..ea988161c 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index de60f0d0a..9f6d6cb7c 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 79d9a8169..b445d01a6 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index ad6dba77c..9bef268d5 100644 --- a/docs/categories/notes/page/5/index.html +++ 
b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 36c6f103a..aff2030a2 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html index 4ffdecbdc..1aae8c34f 100644 --- a/docs/categories/notes/page/7/index.html +++ b/docs/categories/notes/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/8/index.html b/docs/categories/notes/page/8/index.html index ddb5302e2..bbdea7541 100644 --- a/docs/categories/notes/page/8/index.html +++ b/docs/categories/notes/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/9/index.html b/docs/categories/notes/page/9/index.html index da0152455..55fc75d1c 100644 --- a/docs/categories/notes/page/9/index.html +++ b/docs/categories/notes/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 9e9811282..02abbc1f2 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.xml b/docs/index.xml index b50ffbde2..353435c43 100644 --- a/docs/index.xml +++ b/docs/index.xml @@ -6,7 +6,7 @@ Recent content on CGSpace Notes Hugo en-us - Thu, 16 May 2024 08:27:56 +0300 + Mon, 20 May 2024 17:34:14 +0300 May, 2024 diff --git a/docs/page/10/index.html b/docs/page/10/index.html index 836c1e4af..04b5ef922 100644 --- a/docs/page/10/index.html +++ b/docs/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/11/index.html b/docs/page/11/index.html index f560f48c1..79a307365 100644 --- a/docs/page/11/index.html +++ b/docs/page/11/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 25294ed53..2415122b5 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git 
a/docs/page/3/index.html b/docs/page/3/index.html index 088b5c756..c8f1e87bc 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 05ece51a1..5019e5552 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index d00790b23..7351cad20 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 037d31cfc..5c3fc9084 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 40d1435e7..d870b7b4b 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index 266612096..77b2cedd8 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index 2f981ecf5..fb97b0453 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 1f456d8b6..9c39b21d6 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.xml b/docs/posts/index.xml index 3d03faa73..1528b64bd 100644 --- a/docs/posts/index.xml +++ b/docs/posts/index.xml @@ -6,7 +6,7 @@ Recent content in Posts on CGSpace Notes Hugo en-us - Thu, 16 May 2024 08:27:56 +0300 + Mon, 20 May 2024 17:34:14 +0300 May, 2024 diff --git a/docs/posts/page/10/index.html b/docs/posts/page/10/index.html index b015f73f4..6d65ae499 100644 --- a/docs/posts/page/10/index.html +++ b/docs/posts/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/11/index.html b/docs/posts/page/11/index.html index 868af12bd..7cf6018f3 100644 --- 
a/docs/posts/page/11/index.html +++ b/docs/posts/page/11/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 38c688667..0ac232cda 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 0e0b1f5a8..f98e8a98d 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 6d2539a09..3bdc46ed1 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 4499d9f67..6d725340e 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index c3d11c600..45baa0fe8 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 47aec0a33..8a6d85785 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index 28d3d8d64..65108817a 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index 19cf97506..f596012e2 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 99299cb07..a1250e591 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2024-05-16T08:27:56+03:00 + 2024-05-20T17:34:14+03:00 
https://alanorth.github.io/cgspace-notes/ - 2024-05-16T08:27:56+03:00 + 2024-05-20T17:34:14+03:00 https://alanorth.github.io/cgspace-notes/2024-05/ - 2024-05-13T16:24:11+03:00 + 2024-05-20T17:34:14+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2024-05-16T08:27:56+03:00 + 2024-05-20T17:34:14+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2024-05-16T08:27:56+03:00 + 2024-05-20T17:34:14+03:00 https://alanorth.github.io/cgspace-notes/2024-04/ 2024-04-29T17:21:28+03:00