From e92683406578a95879478921de9c76290c2dc004 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sat, 18 Mar 2023 17:42:40 +0300 Subject: [PATCH] Add notes for 2023-03-18 --- content/posts/2023-03.md | 142 +++++++++++++++++++++ docs/2023-03/index.html | 159 +++++++++++++++++++++++- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/categories/notes/page/6/index.html | 2 +- docs/categories/notes/page/7/index.html | 2 +- docs/index.html | 2 +- docs/page/10/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/page/9/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/10/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/posts/page/9/index.html | 2 +- docs/sitemap.xml | 10 +- 31 files changed, 330 insertions(+), 37 deletions(-) diff --git a/content/posts/2023-03.md b/content/posts/2023-03.md index 130a3db5c..53b0ff5e0 100644 --- a/content/posts/2023-03.md +++ b/content/posts/2023-03.md @@ -257,4 +257,146 @@ $ ./ilri/resolve_orcids.py -i /tmp/2023-03-14-orcids.txt -o /tmp/2023-03-14-orci $ ./ilri/update_orcids.py -i /tmp/2023-03-14-orcids-names.txt -db dspace -u dspace -p 'fuuu' -m 247 ``` +## 2023-03-15 + +- Jawoo was asking about possibilities to harvest PDFs from CGSpace for some kind of AI chatbot integration + - I see we have 45,000 PDFs (format ID 2) + +```console +localhost/dspacetest= ☘ SELECT COUNT(*) FROM bitstream WHERE NOT deleted AND 
bitstream_format_id=2; + count +─────── + 45281 +(1 row) +``` + +- Rework some of my Python scripts to use a common `db_connect` function from util +- I reworked my `post_bitstreams.py` script to be able to overwrite bitstreams if requested + - The use case is to upload thumbnails for all the journal articles where we have these horrible pixelated journal covers + - I replaced JPEG thumbnails for ~896 ILRI publications by exporting a list of DOIs from the 10568/3 collection that were CC-BY, getting their PDFs from Sci-Hub, and then posting them with my new script + +## 2023-03-16 + +- Continue working on the ILRI publication thumbnails + - There were about sixty-four that had existing PNG "journal cover" thumbnails that didn't get replaced because I only overwrote the JPEG ones yesterday + - Now I generated a list of those bitstream UUIDs and deleted them with a shell script via the REST API +- I made a [pull request on DSpace 7 to update the bitstream format registry for PNG, WebP, and AVIF](https://github.com/DSpace/DSpace/pull/8722) +- Export CGSpace to perform mappings to Initiatives collections +- I also used this export to find CC-BY items with DOIs that had JPEGs or PNGs in their provenance, meaning that the submitter likely submitted a low-quality "journal cover" for the item + - I found about 330 of them and got most of their PDFs from Sci-Hub and replaced the crappy thumbnails with real ones where Sci-Hub had them (~245) +- In related news, I realized you can get an [API key from Elsevier and download the PDFs from their API](https://stackoverflow.com/questions/59202176/python-download-papers-from-sciencedirect-by-doi-with-requests): + +```python +import requests + +api_key = 'fuuuuuuuuu' +doi = "10.1016/j.foodqual.2021.104362" +request_url = f'https://api.elsevier.com/content/article/doi:{doi}' + +headers = { + 'X-ELS-APIKEY': api_key, + 'Accept': 'application/pdf' +} + +with requests.get(request_url, stream=True, headers=headers) as r: + if 
r.status_code == 200: + with open("article.pdf", "wb") as f: + for chunk in r.iter_content(chunk_size=1024*1024): + f.write(chunk) +``` + +- The question is, how do we know if a DOI is Elsevier or not... +- CGIAR Repositories Working Group meeting + - We discussed controlled vocabularies for funders + - I suggested checking our combined lists against Crossref and ROR +- Export a list of donors from `cg.contributor.donor` on CGSpace: + +```console +localhost/dspacetest= ☘ \COPY (SELECT DISTINCT(text_value) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=248) to /tmp/2023-03-16-donors.txt; +COPY 1521 +``` + +- Then resolve them against Crossref's funders API: + +```console +$ ./ilri/crossref_funders_lookup.py -e fuuuu@cgiar.org -i /tmp/2023-03-16-donors.txt -o ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv -d +$ csvgrep -c matched -m true ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv | wc -l +472 +$ sed 1d ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv | wc -l +1521 +``` + +- That's a 31% hit rate, but I see some simple near-misses that didn't match, like "Bill and Melinda Gates Foundation" instead of "Bill & Melinda Gates Foundation" + +## 2023-03-17 + +- I did the same lookup of CGSpace donors on ROR's 2022-12-01 data dump: + +```console +$ ./ilri/ror_lookup.py -i /tmp/2023-03-16-donors.txt -o ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv -r v1.15-2022-12-01-ror-data.json +$ csvgrep -c matched -m true ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv | wc -l +407 +$ sed 1d ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv | wc -l +1521 +``` + +- That's a 26.7% hit rate +- As for the number of funders in each dataset + - Crossref has about 34,000 + - ROR has 15,000 if "FundRef" data is a proxy for that: + +```console +$ grep -c -rsI FundRef v1.15-2022-12-01-ror-data.json +15162 +``` + +- On a related note, I remembered that Crossref has a list of DOI prefixes and publishers: 
https://doi.crossref.org/getPrefixPublisher + - In Python I can look up publishers by prefix easily, here with a list comprehension that filters on the prefix: + +```console +In [10]: [publisher for publisher in publishers if '10.3390' in publisher['prefixes']] +Out[10]: +[{'prefixes': ['10.1989', '10.32545', '10.20944', '10.3390', '10.35995'], + 'name': 'MDPI AG', + 'memberId': 1968}] +``` + +- And in OpenRefine, if I create a new column based on the DOI using Jython: + +```python +import json + +with open("/home/aorth/src/git/DSpace/publisher-doi-prefixes.json", "rb") as f: + publishers = json.load(f) + +doi_prefix = value.split("/")[3] + +publisher = [publisher for publisher in publishers if doi_prefix in publisher['prefixes']] + +return publisher[0]['name'] +``` + +- ... though this is very slow and hung OpenRefine when I tried it +- I added the ability to overwrite multiple bitstream formats at once in `post_bitstreams.py`: + +```console +$ ./ilri/post_bitstreams.py -i test.csv -u https://dspacetest.cgiar.org/rest -e fuuu@example.com -p 'fffnjnjn' -d -s 2B40C7C4E34CEFCF5AFAE4B75A8C52E2 --overwrite JPEG --overwrite PNG -n +Session valid: 2B40C7C4E34CEFCF5AFAE4B75A8C52E2 +Opened test.csv +384142cb-58b9-4e64-bcdc-0a8cc34888b3: checking for existing bitstreams in THUMBNAIL bundle +> (DRY RUN) Deleting bitstream: IFPRI Malawi_Maize Market Report_February_202_anonymous.pdf.jpg (16883cb0-1fc8-4786-a04f-32132e0617d4) +> (DRY RUN) Deleting bitstream: AgroEcol_Newsletter_2.png (7e9cd434-45a6-4d55-8d56-4efa89d73813) +> (DRY RUN) Uploading file: 10568-129666.pdf.jpg +``` + +- I learned how to use Python's built-in `logging` module and it simplifies all my debug and info printing + - I re-factored a few scripts to use the new logging + +## 2023-03-18 + +- I applied changes for publishers on 16,000 items in batches of 5,000 +- While working on my `post_bitstreams.py` script I realized the Tomcat Crawler Session Manager valve that groups bot user agents into sessions is causing my login to fail 
the first time, every time + - I've disabled it for now and will check the Munin session graphs after some time to see if it makes a difference + - In any case I have much better spider user agent lists in DSpace now than I did years ago when I started using the Crawler Session Manager valve + diff --git a/docs/2023-03/index.html b/docs/2023-03/index.html index 45b7ad76e..a2dc2def3 100644 --- a/docs/2023-03/index.html +++ b/docs/2023-03/index.html @@ -16,7 +16,7 @@ I finally got through with porting the input form from DSpace 6 to DSpace 7 - + @@ -38,9 +38,9 @@ I finally got through with porting the input form from DSpace 6 to DSpace 7 "@type": "BlogPosting", "headline": "March, 2023", "url": "https://alanorth.github.io/cgspace-notes/2023-03/", - "wordCount": "1984", + "wordCount": "2804", "datePublished": "2023-03-01T07:58:36+03:00", - "dateModified": "2023-03-13T21:22:25+03:00", + "dateModified": "2023-03-15T08:03:48+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -401,7 +401,158 @@ pd.options.mode.nullable_dtypes = True
  • Then update them in the database:
  • $ ./ilri/update_orcids.py -i /tmp/2023-03-14-orcids-names.txt -db dspace -u dspace -p 'fuuu' -m 247
    -
    +

    2023-03-15

    + +
    localhost/dspacetest= ☘ SELECT COUNT(*) FROM bitstream WHERE NOT deleted AND bitstream_format_id=2;
    + count 
    +───────
    + 45281
    +(1 row)
    +
    +

    2023-03-16

    + +
    import requests
    +
    +api_key = 'fuuuuuuuuu'
    +doi = "10.1016/j.foodqual.2021.104362"
    +request_url = f'https://api.elsevier.com/content/article/doi:{doi}'
    +
    +headers = {
    +    'X-ELS-APIKEY': api_key,
    +    'Accept': 'application/pdf'
    +}
    +
    +with requests.get(request_url, stream=True, headers=headers) as r:
    +    if r.status_code == 200:
    +        with open("article.pdf", "wb") as f:
    +            for chunk in r.iter_content(chunk_size=1024*1024):
    +                f.write(chunk)
    +
    +
    localhost/dspacetest= ☘ \COPY (SELECT DISTINCT(text_value) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=248) to /tmp/2023-03-16-donors.txt;
    +COPY 1521
    +
    +
    $ ./ilri/crossref_funders_lookup.py -e fuuuu@cgiar.org -i /tmp/2023-03-16-donors.txt -o ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv -d
    +$ csvgrep -c matched -m true ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv | wc -l
    +472
    +$ sed 1d ~/Downloads/2023-03-16-cgspace-crossref-funders-results.csv | wc -l 
    +1521
    +
    +

    2023-03-17

    + +
    $ ./ilri/ror_lookup.py -i /tmp/2023-03-16-donors.txt -o ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv -r v1.15-2022-12-01-ror-data.json
    +$ csvgrep -c matched -m true ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv | wc -l                                            
    +407
    +$ sed 1d ~/Downloads/2023-03-16-cgspace-ror-funders-results.csv | wc -l
    +1521
    +
    +
    $ grep -c -rsI FundRef v1.15-2022-12-01-ror-data.json    
    +15162
    +
    +
    In [10]: [publisher for publisher in publishers if '10.3390' in publisher['prefixes']]
    +Out[10]: 
    +[{'prefixes': ['10.1989', '10.32545', '10.20944', '10.3390', '10.35995'],
    +  'name': 'MDPI AG',
    +  'memberId': 1968}]
    +
    +
    import json
    +
    +with open("/home/aorth/src/git/DSpace/publisher-doi-prefixes.json", "rb") as f:
    +    publishers = json.load(f)
    +
    +doi_prefix = value.split("/")[3]
    +
    +publisher = [publisher for publisher in publishers if doi_prefix in publisher['prefixes']]
    +
    +return publisher[0]['name']
    +
    +
    $ ./ilri/post_bitstreams.py -i test.csv -u https://dspacetest.cgiar.org/rest -e fuuu@example.com -p 'fffnjnjn' -d -s 2B40C7C4E34CEFCF5AFAE4B75A8C52E2 --overwrite JPEG --overwrite PNG -n
    +Session valid: 2B40C7C4E34CEFCF5AFAE4B75A8C52E2
    +Opened test.csv
    +384142cb-58b9-4e64-bcdc-0a8cc34888b3: checking for existing bitstreams in THUMBNAIL bundle
    +> (DRY RUN) Deleting bitstream: IFPRI Malawi_Maize Market Report_February_202_anonymous.pdf.jpg (16883cb0-1fc8-4786-a04f-32132e0617d4)
    +> (DRY RUN) Deleting bitstream: AgroEcol_Newsletter_2.png (7e9cd434-45a6-4d55-8d56-4efa89d73813)
    +> (DRY RUN) Uploading file: 10568-129666.pdf.jpg
    +
    +

    2023-03-18

    + + diff --git a/docs/categories/index.html b/docs/categories/index.html index 75dc69d0d..6f78850fa 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index e3741e3be..6629e2b1d 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 18f39aa40..7faff6e30 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 7b99c3c31..639ffb777 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 27d8fe9ac..75d8f9d37 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 2ef6fa559..154e859d9 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index d87fa1477..ebf0f2155 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html index 0c1bbd7f3..cbde3978b 100644 --- a/docs/categories/notes/page/7/index.html +++ b/docs/categories/notes/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index ee5f99b04..436d12678 100644 --- a/docs/index.html +++ b/docs/index.html @@ 
-10,7 +10,7 @@ - + diff --git a/docs/page/10/index.html b/docs/page/10/index.html index 017f14111..ac050a30a 100644 --- a/docs/page/10/index.html +++ b/docs/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 7a168849a..9ef8c24dd 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 3c1a15c97..0bcee4ade 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 812af2a33..89e93c08e 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 2318df757..bbfe9ec34 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 493d3a9ea..a90729235 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 29a22ae13..147215d0e 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index ea2ed8a29..6696d199e 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index 0d29bf707..f6abce6f8 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index eaae827fb..77f6c60ce 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/10/index.html b/docs/posts/page/10/index.html index 5d1c77b9c..51d339ff7 100644 --- a/docs/posts/page/10/index.html +++ b/docs/posts/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git 
a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index ba6c624ef..cf4766359 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 07b89ac15..1de9e52b6 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 65c7b3191..0b5dc0135 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index c763e1b1a..5d15d834e 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index cad7b6978..028b2c617 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 7c8777e2d..e26ee6c63 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index aa4fd5035..1e4e6773a 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index 1efc81430..b2ce00b0c 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 625afd54c..295345035 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2023-03-14T14:30:17+03:00 + 2023-03-15T08:03:48+03:00 https://alanorth.github.io/cgspace-notes/ - 2023-03-14T14:30:17+03:00 + 2023-03-15T08:03:48+03:00 
https://alanorth.github.io/cgspace-notes/2023-03/ - 2023-03-13T21:22:25+03:00 + 2023-03-15T08:03:48+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2023-03-14T14:30:17+03:00 + 2023-03-15T08:03:48+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2023-03-14T14:30:17+03:00 + 2023-03-15T08:03:48+03:00 https://alanorth.github.io/cgspace-notes/2023-02/ 2023-03-01T08:30:25+03:00