mirror of
https://github.com/alanorth/cgspace-notes.git
synced 2024-12-01 19:08:18 +01:00
838 lines
53 KiB
HTML
838 lines
53 KiB
HTML
<!DOCTYPE html>
|
||
<html lang="en" >
|
||
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||
|
||
|
||
<meta property="og:title" content="September, 2022" />
|
||
<meta property="og:description" content="2022-09-01
|
||
|
||
A bit of work on the “Mapping CG Core–CGSpace–MEL–MARLO Types” spreadsheet
|
||
I tested an item submission on DSpace Test with the Cocoon org.apache.cocoon.uploads.autosave=false change
|
||
|
||
The submission works as expected
|
||
|
||
|
||
Start debugging some region-related issues with csv-metadata-quality
|
||
|
||
I created a new test file test-geography.csv with some different scenarios
|
||
I also fixed a few bugs and improved the region-matching logic
|
||
|
||
|
||
" />
|
||
<meta property="og:type" content="article" />
|
||
<meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2022-09/" />
|
||
<meta property="article:published_time" content="2022-09-01T09:41:36+03:00" />
|
||
<meta property="article:modified_time" content="2022-09-30T17:29:50+03:00" />
|
||
|
||
|
||
|
||
<meta name="twitter:card" content="summary"/>
|
||
<meta name="twitter:title" content="September, 2022"/>
|
||
<meta name="twitter:description" content="2022-09-01
|
||
|
||
A bit of work on the “Mapping CG Core–CGSpace–MEL–MARLO Types” spreadsheet
|
||
I tested an item submission on DSpace Test with the Cocoon org.apache.cocoon.uploads.autosave=false change
|
||
|
||
The submission works as expected
|
||
|
||
|
||
Start debugging some region-related issues with csv-metadata-quality
|
||
|
||
I created a new test file test-geography.csv with some different scenarios
|
||
I also fixed a few bugs and improved the region-matching logic
|
||
|
||
|
||
"/>
|
||
<meta name="generator" content="Hugo 0.105.0">
|
||
|
||
|
||
|
||
<script type="application/ld+json">
|
||
{
|
||
"@context": "http://schema.org",
|
||
"@type": "BlogPosting",
|
||
"headline": "September, 2022",
|
||
"url": "https://alanorth.github.io/cgspace-notes/2022-09/",
|
||
"wordCount": "3621",
|
||
"datePublished": "2022-09-01T09:41:36+03:00",
|
||
"dateModified": "2022-09-30T17:29:50+03:00",
|
||
"author": {
|
||
"@type": "Person",
|
||
"name": "Alan Orth"
|
||
},
|
||
"keywords": "Notes"
|
||
}
|
||
</script>
|
||
|
||
|
||
|
||
<link rel="canonical" href="https://alanorth.github.io/cgspace-notes/2022-09/">
|
||
|
||
<title>September, 2022 | CGSpace Notes</title>
|
||
|
||
|
||
<!-- combined, minified CSS -->
|
||
|
||
<link href="https://alanorth.github.io/cgspace-notes/css/style.c6ba80bc50669557645abe05f86b73cc5af84408ed20f1551a267bc19ece8228.css" rel="stylesheet" integrity="sha256-xrqAvFBmlVdkWr4F+GtzzFr4RAjtIPFVGiZ7wZ7Ogig=" crossorigin="anonymous">
|
||
|
||
|
||
<!-- minified Font Awesome for SVG icons -->
|
||
|
||
<script defer src="https://alanorth.github.io/cgspace-notes/js/fontawesome.min.f5072c55a0721857184db93a50561d7dc13975b4de2e19db7f81eb5f3fa57270.js" integrity="sha256-9QcsVaByGFcYTbk6UFYdfcE5dbTeLhnbf4HrXz+lcnA=" crossorigin="anonymous"></script>
|
||
|
||
<!-- RSS 2.0 feed -->
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
<body>
|
||
|
||
|
||
<div class="blog-masthead">
|
||
<div class="container">
|
||
<nav class="nav blog-nav">
|
||
<a class="nav-link " href="https://alanorth.github.io/cgspace-notes/">Home</a>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<header class="blog-header">
|
||
<div class="container">
|
||
<h1 class="blog-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/" rel="home">CGSpace Notes</a></h1>
|
||
<p class="lead blog-description" dir="auto">Documenting day-to-day work on the <a href="https://cgspace.cgiar.org">CGSpace</a> repository.</p>
|
||
</div>
|
||
</header>
|
||
|
||
|
||
|
||
|
||
<div class="container">
|
||
<div class="row">
|
||
<div class="col-sm-8 blog-main">
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-09/">September, 2022</a></h2>
|
||
<p class="blog-post-meta">
|
||
<time datetime="2022-09-01T09:41:36+03:00">Thu Sep 01, 2022</time>
|
||
in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-09-01">2022-09-01</h2>
|
||
<ul>
|
||
<li>A bit of work on the “Mapping CG Core–CGSpace–MEL–MARLO Types” spreadsheet</li>
|
||
<li>I tested an item submission on DSpace Test with the Cocoon <code>org.apache.cocoon.uploads.autosave=false</code> change
|
||
<ul>
|
||
<li>The submission works as expected</li>
|
||
</ul>
|
||
</li>
|
||
<li>Start debugging some region-related issues with csv-metadata-quality
|
||
<ul>
|
||
<li>I created a new test file <code>test-geography.csv</code> with some different scenarios</li>
|
||
<li>I also fixed a few bugs and improved the region-matching logic</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<ul>
|
||
<li>I filed <a href="https://github.com/konstantinstadler/country_converter/issues/115">an issue for the “South-eastern Asia” case mismatch in country_converter</a> on GitHub</li>
|
||
<li>Meeting with Moayad to discuss OpenRXV developments
|
||
<ul>
|
||
<li>He demoed his new multiple dashboards feature and I helped him rebase those changes to master so we can test them more</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-02">2022-09-02</h2>
|
||
<ul>
|
||
<li>I worked a bit more on exclusion and skipping logic in csv-metadata-quality
|
||
<ul>
|
||
<li>I also pruned and updated all the Python dependencies</li>
|
||
<li>Then I released <a href="https://github.com/ilri/csv-metadata-quality/releases/tag/v0.6.0">version 0.6.0</a> now that the excludes and region matching support is working way better</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-05">2022-09-05</h2>
|
||
<ul>
|
||
<li>Started a harvest on AReS last night</li>
|
||
<li>Looking over the Solr statistics from last month I see many user agents that look suspicious:
|
||
<ul>
|
||
<li>Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.2; WOW64; Trident/7.0; .NET4.0E; .NET4.0C)</li>
|
||
<li>Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 77.0.3865.90 Safari / 537.36</li>
|
||
<li>Mozilla/5.0 (Windows NT 10.0; WOW64; Rv:50.0) Gecko/20100101 Firefox/50.0</li>
|
||
<li>Mozilla/5.0 (X11; Linux i686; rv:2.0b12pre) Gecko/20110204 Firefox/4.0b12pre</li>
|
||
<li>Mozilla/5.0 (Windows NT 10.0; Win64; x64; Xbox; Xbox One) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36 Edge/44.18363.8131</li>
|
||
<li>Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)</li>
|
||
<li>Mozilla/4.0 (compatible; MSIE 4.5; Windows 98;)</li>
|
||
<li>curb</li>
|
||
<li>bitdiscovery</li>
|
||
<li>omgili/0.5 +http://omgili.com</li>
|
||
<li>Mozilla/5.0 (compatible)</li>
|
||
<li>Vizzit</li>
|
||
<li>Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0</li>
|
||
<li>Mozilla/5.0 (Android; Mobile; rv:13.0) Gecko/13.0 Firefox/13.0</li>
|
||
<li>Java/17-ea</li>
|
||
<li>AdobeUxTechC4-Async/3.0.12 (win32)</li>
|
||
<li>ZaloPC-win32-24v473</li>
|
||
<li>Mozilla/5.0/Firefox/42.0 - nbertaupete95(at)gmail.com</li>
|
||
<li>Scoop.it</li>
|
||
<li>Mozilla/5.0 (Windows NT 6.1; rv:27.0) Gecko/20100101 Firefox/27.0</li>
|
||
<li>Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)</li>
|
||
<li>ows NT 10.0; WOW64; rv: 50.0) Gecko/20100101 Firefox/50.0</li>
|
||
<li>WebAPIClient</li>
|
||
<li>Mozilla/5.0 Firefox/26.0</li>
|
||
<li>Mozilla/5.0 (compatible; woorankreview/2.0; +https://www.woorank.com/)</li>
|
||
</ul>
|
||
</li>
|
||
<li>For example, some are apparently using versions of Firefox that are over ten years old, and some are obviously trying to look like valid user agents, but making typos (<code>Mozilla / 5.0</code>)</li>
|
||
<li>Tons of hosts making requests like this:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>GET /bitstream/handle/10568/109408/Milk%20testing%20lab%20protocol.pdf?sequence=1&amp;isAllowed=\x22&gt;&lt;script%20&gt;alert(String.fromCharCode(88,83,83))&lt;/script&gt; HTTP/1.1" 400 5 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64; Rv:50.0) Gecko/20100101 Firefox/50.0
|
||
</span></span></code></pre></div><ul>
|
||
<li>I got a list of hosts making requests like that so I can purge their hits:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># zcat /var/log/nginx/<span style="color:#f92672">{</span>access,library-access,oai,rest<span style="color:#f92672">}</span>.log.<span style="color:#f92672">[</span>123<span style="color:#f92672">]</span>*.gz | grep <span style="color:#e6db74">'String.fromCharCode('</span> | awk <span style="color:#e6db74">'{print $1}'</span> | sort -u > /tmp/ips.txt
|
||
</span></span></code></pre></div><ul>
|
||
<li>I purged 4,718 hits from IPs</li>
|
||
<li>I see some new Hetzner ranges that I hadn’t blocked yet apparently?
|
||
<ul>
|
||
<li>I got a <a href="https://www.ipqualityscore.com/asn-details/AS24940/hetzner-online-gmbh">list of Hetzner’s IPs from IP Quality Score</a> then added them to the existing ones in my Ansible playbooks:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ awk <span style="color:#e6db74">'{print $1}'</span> /tmp/hetzner.txt | wc -l
|
||
</span></span><span style="display:flex;"><span>36
|
||
</span></span><span style="display:flex;"><span>$ sort -u /tmp/hetzner-combined.txt | wc -l
|
||
</span></span><span style="display:flex;"><span>49
|
||
</span></span></code></pre></div><ul>
|
||
<li>I will add this new list to nginx’s <code>bot-networks.conf</code> so they get throttled on scraping XMLUI and get classified as bots in Solr statistics</li>
|
||
<li>Then I purged hits from the following user agents:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/check-spider-hits.sh -f /tmp/agents
|
||
</span></span><span style="display:flex;"><span>Found 374 hits from curb in statistics
|
||
</span></span><span style="display:flex;"><span>Found 350 hits from bitdiscovery in statistics
|
||
</span></span><span style="display:flex;"><span>Found 564 hits from omgili in statistics
|
||
</span></span><span style="display:flex;"><span>Found 390 hits from Vizzit in statistics
|
||
</span></span><span style="display:flex;"><span>Found 9125 hits from AdobeUxTechC4-Async in statistics
|
||
</span></span><span style="display:flex;"><span>Found 97 hits from ZaloPC-win32-24v473 in statistics
|
||
</span></span><span style="display:flex;"><span>Found 518 hits from nbertaupete95 in statistics
|
||
</span></span><span style="display:flex;"><span>Found 218 hits from Scoop.it in statistics
|
||
</span></span><span style="display:flex;"><span>Found 584 hits from WebAPIClient in statistics
|
||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>Total number of hits from bots: 12220
|
||
</span></span></code></pre></div><ul>
|
||
<li>Then I will add these user agents to the ILRI spider override in DSpace</li>
|
||
</ul>
|
||
<h2 id="2022-09-06">2022-09-06</h2>
|
||
<ul>
|
||
<li>I’m testing dspace-statistics-api with our DSpace 7 test server
|
||
<ul>
|
||
<li>After setting up the env and the database the <code>python -m dspace_statistics_api.indexer</code> runs without issues</li>
|
||
<li>While playing with Solr I tried to search for statistics from this month using <code>time:2022-09*</code> but I get this error: “Can’t run prefix queries on numeric fields”</li>
|
||
<li>I guess that the syntax in Solr changed since 4.10…</li>
|
||
<li>This works, but is super annoying: <code>time:[2022-09-01T00:00:00Z TO 2022-09-30T23:59:59Z]</code></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-07">2022-09-07</h2>
|
||
<ul>
|
||
<li>I tested the controlled-vocabulary changes on DSpace 6 and they work fine
|
||
<ul>
|
||
<li>Last week I found that DSpace 7 is more strict with controlled vocabularies and requires IDs for all node values</li>
|
||
<li>This is a pain because it means I have to re-do the IDs in each file every time I update them</li>
|
||
<li>If I add <code>id="0000"</code> to each, then I can use <a href="https://vim.fandom.com/wiki/Making_a_list_of_numbers#Substitute_with_ascending_numbers">this vim expression</a> <code>let i=0001 | g/0000/s//\=i/ | let i=i+1</code> to replace the numbers with increments starting from 1</li>
|
||
</ul>
|
||
</li>
|
||
<li>Meeting with Marie Angelique, Abenet, Sara, and Margarita to continue the discussion about Types from last week
|
||
<ul>
|
||
<li>We made progress with concrete actions and will continue next week</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-08">2022-09-08</h2>
|
||
<ul>
|
||
<li>I had a meeting with Nicky from UNEP to discuss issues they are having with their DSpace
|
||
<ul>
|
||
<li>I told her about the meeting of DSpace community people that we’re planning at ILRI in the next few weeks</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-09">2022-09-09</h2>
|
||
<ul>
|
||
<li>Add some value mappings to AReS because I see a lot of incorrect regions and countries</li>
|
||
<li>I also found some values that were blank in CGSpace so I deleted them:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>dspace=# BEGIN;
|
||
</span></span><span style="display:flex;"><span>BEGIN
|
||
</span></span><span style="display:flex;"><span>dspace=# DELETE FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_value='';
|
||
</span></span><span style="display:flex;"><span>DELETE 70
|
||
</span></span><span style="display:flex;"><span>dspace=# COMMIT;
|
||
</span></span><span style="display:flex;"><span>COMMIT
|
||
</span></span></code></pre></div><ul>
|
||
<li>Start a full Discovery index on CGSpace to catch these changes in the Discovery</li>
|
||
</ul>
|
||
<h2 id="2022-09-11">2022-09-11</h2>
|
||
<ul>
|
||
<li>Today is Sunday and I see the load on the server is high
|
||
<ul>
|
||
<li>Google and a bunch of other bots have been blocked on XMLUI for the past two weeks so it’s not from them!</li>
|
||
<li>Looking at the top IPs this morning:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># cat /var/log/nginx/<span style="color:#f92672">{</span>access,library-access,oai,rest<span style="color:#f92672">}</span>.log /var/log/nginx/<span style="color:#f92672">{</span>access,library-access,oai,rest<span style="color:#f92672">}</span>.log.1 | grep <span style="color:#e6db74">'11/Sep/2022'</span> | awk <span style="color:#e6db74">'{print $1}'</span> | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">40</span>
|
||
</span></span><span style="display:flex;"><span>...
|
||
</span></span><span style="display:flex;"><span> 165 64.233.172.79
|
||
</span></span><span style="display:flex;"><span> 166 87.250.224.34
|
||
</span></span><span style="display:flex;"><span> 200 69.162.124.231
|
||
</span></span><span style="display:flex;"><span> 202 216.244.66.198
|
||
</span></span><span style="display:flex;"><span> 385 207.46.13.149
|
||
</span></span><span style="display:flex;"><span> 398 207.46.13.147
|
||
</span></span><span style="display:flex;"><span> 421 66.249.64.185
|
||
</span></span><span style="display:flex;"><span> 422 157.55.39.81
|
||
</span></span><span style="display:flex;"><span> 442 2a01:4f8:1c17:5550::1
|
||
</span></span><span style="display:flex;"><span> 451 64.124.8.36
|
||
</span></span><span style="display:flex;"><span> 578 137.184.159.211
|
||
</span></span><span style="display:flex;"><span> 597 136.243.228.195
|
||
</span></span><span style="display:flex;"><span> 1185 66.249.64.183
|
||
</span></span><span style="display:flex;"><span> 1201 157.55.39.80
|
||
</span></span><span style="display:flex;"><span> 3135 80.248.237.167
|
||
</span></span><span style="display:flex;"><span> 4794 54.195.118.125
|
||
</span></span><span style="display:flex;"><span> 5486 45.5.186.2
|
||
</span></span><span style="display:flex;"><span> 6322 2a01:7e00::f03c:91ff:fe9a:3a37
|
||
</span></span><span style="display:flex;"><span> 9556 66.249.64.181
|
||
</span></span></code></pre></div><ul>
|
||
<li>The top is still Google, but all the requests are HTTP 503 because I classified them as bots for XMLUI at least</li>
|
||
<li>Then there’s 80.248.237.167, which is using a normal user agent and scraping Discovery
|
||
<ul>
|
||
<li>That IP is on Internet Vikings aka Internetbolaget and we are already marking that subnet as ‘bot’ for XMLUI so most of these requests are HTTP 503</li>
|
||
</ul>
|
||
</li>
|
||
<li>On another note, I’m curious to explore enabling caching of certain REST API responses
|
||
<ul>
|
||
<li>For example, where the use is for harvesting rather than actual clients getting bitstreams or thumbnails, it seems there might be a benefit to speeding these up for subsequent requestors:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log | grep -v retrieve | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">10</span>
|
||
</span></span><span style="display:flex;"><span> 4 /rest/items/3f692ddd-7856-4bf0-a587-99fb3df0688a/bitstreams
|
||
</span></span><span style="display:flex;"><span> 4 /rest/items/3f692ddd-7856-4bf0-a587-99fb3df0688a/metadata
|
||
</span></span><span style="display:flex;"><span> 4 /rest/items/b014e36f-b496-43d8-9148-cc9db8a6efac/bitstreams
|
||
</span></span><span style="display:flex;"><span> 4 /rest/items/b014e36f-b496-43d8-9148-cc9db8a6efac/metadata
|
||
</span></span><span style="display:flex;"><span> 5 /rest/handle/10568/110310?expand=all
|
||
</span></span><span style="display:flex;"><span> 5 /rest/handle/10568/89980?expand=all
|
||
</span></span><span style="display:flex;"><span> 5 /rest/handle/10568/97614?expand=all
|
||
</span></span><span style="display:flex;"><span> 6 /rest/handle/10568/107086?expand=all
|
||
</span></span><span style="display:flex;"><span> 6 /rest/handle/10568/108503?expand=all
|
||
</span></span><span style="display:flex;"><span> 6 /rest/handle/10568/98424?expand=all
|
||
</span></span></code></pre></div><ul>
|
||
<li>I specifically have to not cache things like requests for bitstreams because those are from actual users and we need to keep the real requests so we get the statistics hit
|
||
<ul>
|
||
<li>Will be interesting to check the results above as the day goes on (now 10AM)</li>
|
||
<li>To estimate the potential savings from caching I will check how many non-bitstream requests are made versus how many are made more than once (updated the next morning using yesterday’s log):</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort -u | wc -l
|
||
</span></span><span style="display:flex;"><span>33733
|
||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 > 1'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>5637
|
||
</span></span></code></pre></div><ul>
|
||
<li>In the afternoon I started a harvest on AReS (which should affect the numbers above also)</li>
|
||
<li>I enabled an nginx proxy cache on DSpace Test for this location regex: <code>location ~ /rest/(handle|items|collections|communities)/.+</code></li>
|
||
</ul>
|
||
<h2 id="2022-09-12">2022-09-12</h2>
|
||
<ul>
|
||
<li>I am testing harvesting DSpace Test via AReS with the nginx proxy cache enabled
|
||
<ul>
|
||
<li>I had to tune the regular expression in nginx a bit because the REST requests OpenRXV uses weren’t matching</li>
|
||
<li>Now I’m trying this one: <code>/rest/(handle|items|collections|communities)/?</code></li>
|
||
<li>Testing in <a href="https://regex101.com/r/vPz11y/1">regex101.com</a> with this test string:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<pre tabindex="0"><code>/rest/handle/10568/27611
|
||
/rest/items?expand=metadata,parentCommunityList,parentCollectionList,bitstreams&amp;limit=10&amp;offset=36270
|
||
/rest/handle/10568/110310?expand=all
|
||
/rest/rest/bitstreams/28926633-c7c2-49c2-afa8-6d81cadc2316/retrieve
|
||
/rest/bitstreams/15412/retrieve
|
||
/rest/items/083dbb0d-11e2-4dfe-902b-eb48e4640d04/metadata
|
||
/rest/items/083dbb0d-11e2-4dfe-902b-eb48e4640d04/bitstreams
|
||
/rest/collections/edea23c0-0ebd-4525-90b0-0b401f997704/items
|
||
/rest/items/14507941-aff2-4d57-90bd-03a0733ad859/metadata
|
||
/rest/communities/b38ea726-475f-4247-a961-0d0b76e67f85/collections
|
||
/rest/collections/e994c450-6ff7-41c6-98df-51e5c424049e/items?limit=10000
|
||
</code></pre><ul>
|
||
<li>I estimate that it will take about 1GB of cache to harvest 100,000 items from CGSpace with OpenRXV (10,000 pages)</li>
|
||
<li>Basically all but 4 and 5 (bitstreams) should match</li>
|
||
<li>Upload 682 OICRs from MARLO to CGSpace
|
||
<ul>
|
||
<li>We had tested these on DSpace Test last month along with the MELIAs, Policies, and Innovations, but we decided to upload the OICRs first so that other things can link against them as related items</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-14">2022-09-14</h2>
|
||
<ul>
|
||
<li>Meeting with Peter, Abenet, Indira, and Michael about CGSpace rollout plan for the Initiatives</li>
|
||
</ul>
|
||
<h2 id="2022-09-16">2022-09-16</h2>
|
||
<ul>
|
||
<li>Meeting with Marie-Angelique, Abenet, Margarita, and Sara about types for CG Core
|
||
<ul>
|
||
<li>We are about halfway through the list of types now, with concrete actions for CG Core and CGSpace</li>
|
||
<li>We will meet next in two weeks to hopefully finalize the list, then we can move on to definitions</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-18">2022-09-18</h2>
|
||
<ul>
|
||
<li>Deploy the <code>org.apache.cocoon.uploads.autosave=false</code> change on CGSpace</li>
|
||
<li>Start a harvest on AReS</li>
|
||
</ul>
|
||
<h2 id="2022-09-19">2022-09-19</h2>
|
||
<ul>
|
||
<li>Deploy the nginx proxy cache for /rest requests on CGSpace
|
||
<ul>
|
||
<li>I had tested this last week on DSpace Test</li>
|
||
<li>By my counts on CGSpace yesterday (Sunday, a busy day for the REST API), we had 5,654 URLs that were requested more than once, and it tails off after that towards two, three, four, etc:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 > 1'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>5654
|
||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 == 2'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>4710
|
||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 == 3'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>814
|
||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 == 4'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>86
|
||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $7}'</span> /var/log/nginx/rest.log.1 | grep -v retrieve | sort | uniq -c | awk <span style="color:#e6db74">'$1 == 5'</span> | wc -l
|
||
</span></span><span style="display:flex;"><span>39
|
||
</span></span></code></pre></div><ul>
|
||
<li>For now I guess requests that were done two or three times by different clients will be cached and that’s a win, and I expect more and more REST API activity soon when initiatives and One CGIAR stuff picks up</li>
|
||
</ul>
|
||
<h2 id="2022-09-20">2022-09-20</h2>
|
||
<ul>
|
||
<li>I checked the status of the nginx REST API cache on CGSpace and it was stuck at 7,083 items for hours:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># find /var/cache/nginx/rest_cache/ -type f | wc -l
|
||
</span></span><span style="display:flex;"><span>7083
|
||
</span></span></code></pre></div><ul>
|
||
<li>The proxy cache key zone is currently 1m, which can store ~8,000 keys, so that could be what we’re running into
|
||
<ul>
|
||
<li>I increased it to 2m and will keep monitoring it</li>
|
||
</ul>
|
||
</li>
|
||
<li>CIP webmaster contacted me to say they are having problems harvesting CGSpace from their WordPress
|
||
<ul>
|
||
<li>I am not sure if there are issues due to the REST API caching I enabled…</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-21">2022-09-21</h2>
|
||
<ul>
|
||
<li>Planning the Nairobi DSpace Users meeting with Abenet</li>
|
||
<li>Planning to have a call about MEL submitting to CGSpace on Monday with Mohammed Salem
|
||
<ul>
|
||
<li>I created two collections on DSpace Test: one with a workflow, and one without</li>
|
||
<li>According to my notes from <a href="/cgspace-notes/2020-10/">2020-10</a> the account must be in the admin group in order to submit via the REST API, so I added it to the admin group of each collection</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-22">2022-09-22</h2>
|
||
<ul>
|
||
<li>Nairobi DSpace users meeting at ILRI</li>
|
||
<li>I found a few users that didn’t have ORCID iDs and were missing tags on CGSpace so I tagged them:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>dc.contributor.author,cg.creator.identifier
|
||
</span></span><span style="display:flex;"><span>"Alonso, Silvia","Silvia Alonso: 0000-0002-0565-536X"
|
||
</span></span><span style="display:flex;"><span>"Goopy, John P.","John Goopy: 0000-0001-7177-1310"
|
||
</span></span><span style="display:flex;"><span>"Korir, Daniel","Daniel Korir: 0000-0002-1356-8039"
|
||
</span></span><span style="display:flex;"><span>"Leitner, Sonja","Sonja Leitner: 0000-0002-1276-8071"
|
||
</span></span><span style="display:flex;"><span>"Fèvre, Eric M.","Eric M. Fèvre: 0000-0001-8931-4986"
|
||
</span></span><span style="display:flex;"><span>"Galiè, Alessandra","Alessandra Galie: 0000-0001-9868-7733"
|
||
</span></span><span style="display:flex;"><span>"Baltenweck, Isabelle","Isabelle Baltenweck: 0000-0002-4147-5921"
|
||
</span></span><span style="display:flex;"><span>"Robinson, Timothy P.","Timothy Robinson: 0000-0002-4266-963X"
|
||
</span></span><span style="display:flex;"><span>"Lannerstad, Mats","Mats Lannerstad: 0000-0002-5116-3198"
|
||
</span></span><span style="display:flex;"><span>"Graham, Michael","Michael Graham: 0000-0002-1189-8640"
|
||
</span></span><span style="display:flex;"><span>"Merbold, Lutz","Lutz Merbold: 0000-0003-4974-170X"
|
||
</span></span><span style="display:flex;"><span>"Rufino, Mariana C.","Mariana Rufino: 0000-0003-4293-3290"
|
||
</span></span><span style="display:flex;"><span>"Wilkes, Andreas","Andreas Wilkes: 0000-0001-7546-991X"
|
||
</span></span><span style="display:flex;"><span>"van der Weerden, T.","Tony van der Weerden: 0000-0002-6999-2584"
|
||
</span></span><span style="display:flex;"><span>"Vermeulen, S.","Sonja Vermeulen: 0000-0001-6242-9513"
|
||
</span></span><span style="display:flex;"><span>"Vermeulen, Sonja","Sonja Vermeulen: 0000-0001-6242-9513"
|
||
</span></span><span style="display:flex;"><span>"Vermeulen, Sonja J.","Sonja Vermeulen: 0000-0001-6242-9513"
|
||
</span></span><span style="display:flex;"><span>"Hung Nguyen-Viet","Hung Nguyen-Viet: 0000-0003-1549-2733"
|
||
</span></span><span style="display:flex;"><span>"Herrero, Mario T.","Mario Herrero: 0000-0002-7741-5090"
|
||
</span></span><span style="display:flex;"><span>"Thornton, Philip K.","Philip Thornton: 0000-0002-1854-0182"
|
||
</span></span><span style="display:flex;"><span>"Duncan, Alan J.","Alan Duncan: 0000-0002-3954-3067"
|
||
</span></span><span style="display:flex;"><span>"Lukuyu, Ben A.","Ben Lukuyu: 0000-0002-9374-3553"
|
||
</span></span><span style="display:flex;"><span>"Lindahl, Johanna F.","Johanna Lindahl: 0000-0002-1175-0398"
|
||
</span></span><span style="display:flex;"><span>"Okeyo Mwai, Ally","Ally Okeyo Mwai: 0000-0003-2379-7801"
|
||
</span></span><span style="display:flex;"><span>"Wieland, Barbara","Barbara Wieland: 0000-0003-4020-9186"
|
||
</span></span><span style="display:flex;"><span>"Omore, Amos O.","Amos Omore: 0000-0001-9213-9891"
|
||
</span></span><span style="display:flex;"><span>"Randolph, Thomas F.","Thomas Fitz Randolph: 0000-0003-1849-9877"
|
||
</span></span><span style="display:flex;"><span>"Staal, Steven J.","Steven Staal: 0000-0002-1244-1773"
|
||
</span></span><span style="display:flex;"><span>"Hanotte, Olivier H.","Olivier Hanotte: 0000-0002-2877-4767"
|
||
</span></span><span style="display:flex;"><span>"Dessie, Tadelle","Tadelle Dessie: 0000-0002-1630-0417"
|
||
</span></span><span style="display:flex;"><span>"Dione, Michel M.","Michel Dione: 0000-0001-7812-5776"
|
||
</span></span><span style="display:flex;"><span>"Gebremedhin, Berhanu","Berhanu Gebremedhin: 0000-0002-3168-2783"
|
||
</span></span><span style="display:flex;"><span>"Ouma, Emily A.","Emily Ouma: 0000-0002-3123-1376"
|
||
</span></span><span style="display:flex;"><span>"Roesel, Kristina","Kristina Roesel: 0000-0002-2553-1129"
|
||
</span></span><span style="display:flex;"><span>"Bishop, Richard P.","Richard Bishop: 0000-0002-3720-9970"
|
||
</span></span><span style="display:flex;"><span>"Lapar, Ma. Lucila","Ma. Lucila Lapar: 0000-0002-4214-9845"
|
||
</span></span><span style="display:flex;"><span>"Rich, Karl M.","Karl Rich: 0000-0002-5581-9553"
|
||
</span></span><span style="display:flex;"><span>"Hoekstra, Dirk","Dirk Hoekstra: 0000-0002-6111-6627"
|
||
</span></span><span style="display:flex;"><span>"Nene, Vishvanath","Vishvanath Nene: 0000-0001-7066-4169"
|
||
</span></span><span style="display:flex;"><span>"Patel, S.P.","Sonal Henson: 0000-0002-2002-5462"
|
||
</span></span><span style="display:flex;"><span>"Hanson, Jean","Jean Hanson: 0000-0002-3648-2641"
|
||
</span></span><span style="display:flex;"><span>"Marshall, Karen","Karen Marshall: 0000-0003-4197-1455"
|
||
</span></span><span style="display:flex;"><span>"Notenbaert, An Maria Omer","An Maria Omer Notenbaert: 0000-0002-6266-2240"
|
||
</span></span><span style="display:flex;"><span>"Ojango, Julie M.K.","Ojango J.M.K.: 0000-0003-0224-5370"
|
||
</span></span><span style="display:flex;"><span>"Wijk, Mark T. van","Mark van Wijk: 0000-0003-0728-8839"
|
||
</span></span><span style="display:flex;"><span>"Tarawali, Shirley A.","Shirley Tarawali: 0000-0001-9398-8780"
|
||
</span></span><span style="display:flex;"><span>"Naessens, Jan","Jan Naessens: 0000-0002-7075-9915"
|
||
</span></span><span style="display:flex;"><span>"Butterbach-Bahl, Klaus","Klaus Butterbach-Bahl: 0000-0001-9499-6598"
|
||
</span></span><span style="display:flex;"><span>"Poole, Elizabeth J.","Elizabeth Jane Poole: 0000-0002-8570-794X"
|
||
</span></span><span style="display:flex;"><span>"Mulema, Annet A.","Annet Mulema: 0000-0003-4192-3939"
|
||
</span></span><span style="display:flex;"><span>"Dror, Iddo","Iddo Dror: 0000-0002-0800-7456"
|
||
</span></span><span style="display:flex;"><span>"Ballantyne, Peter G.","Peter G. Ballantyne: 0000-0001-9346-2893"
|
||
</span></span><span style="display:flex;"><span>"Baker, Derek","Derek Baker: 0000-0001-6020-6973"
|
||
</span></span><span style="display:flex;"><span>"Ericksen, Polly J.","Polly Ericksen: 0000-0002-5775-7691"
|
||
</span></span><span style="display:flex;"><span>"Jones, Christopher S.","Chris Jones: 0000-0001-9096-9728"
|
||
</span></span><span style="display:flex;"><span>"Mude, Andrew G.","Andrew Mude: 0000-0003-4903-6613"
|
||
</span></span><span style="display:flex;"><span>"Puskur, Ranjitha","Ranjitha Puskur: 0000-0002-9112-3414"
|
||
</span></span><span style="display:flex;"><span>"Kiara, Henry K.","Henry Kiara: 0000-0001-9578-1636"
|
||
</span></span><span style="display:flex;"><span>"Gibson, John P.","John Gibson: 0000-0003-0371-2401"
|
||
</span></span><span style="display:flex;"><span>"Flintan, Fiona E.","Fiona Flintan: 0000-0002-9732-097X"
|
||
</span></span><span style="display:flex;"><span>"Mrode, Raphael A.","Raphael Mrode: 0000-0003-1964-5653"
|
||
</span></span><span style="display:flex;"><span>"Mtimet, Nadhem","Nadhem Mtimet: 0000-0003-3125-2828"
|
||
</span></span><span style="display:flex;"><span>"Said, Mohammed Yahya","Mohammed Yahya Said: 0000-0001-8127-6399"
|
||
</span></span><span style="display:flex;"><span>"Pezo, Danilo A.","Danilo Pezo: 0000-0001-5345-5314"
|
||
</span></span><span style="display:flex;"><span>"Haileslassie, Amare","Amare Haileslassie: 0000-0001-5237-9006"
|
||
</span></span><span style="display:flex;"><span>"Wright, Iain A.","Iain Wright: 0000-0002-6216-5308"
|
||
</span></span><span style="display:flex;"><span>"Cadilhon, Joseph J.","Jean-Joseph Cadilhon: 0000-0002-3181-5136"
|
||
</span></span><span style="display:flex;"><span>"Domelevo Entfellner, Jean-Baka","Jean-Baka Domelevo Entfellner: 0000-0002-8282-1325"
|
||
</span></span><span style="display:flex;"><span>"Oyola, Samuel O.","Samuel O. Oyola: 0000-0002-6425-7345"
|
||
</span></span><span style="display:flex;"><span>"Agaba, M.","Morris Agaba: 0000-0001-6777-0382"
|
||
</span></span><span style="display:flex;"><span>"Beebe, Stephen E.","Stephen E Beebe: 0000-0002-3742-9930"
|
||
</span></span><span style="display:flex;"><span>"Ouso, Daniel","Daniel Ouso: 0000-0003-0994-2558"
|
||
</span></span><span style="display:flex;"><span>"Ouso, Daniel O.","Daniel Ouso: 0000-0003-0994-2558"
|
||
</span></span><span style="display:flex;"><span>"Rono, Gilbert K.","Gilbert Kibet-Rono: 0000-0001-8043-5423"
|
||
</span></span><span style="display:flex;"><span>"Kibet, Gilbert","Gilbert Kibet-Rono: 0000-0001-8043-5423"
|
||
</span></span><span style="display:flex;"><span>"Juma, John","John Juma: 0000-0002-1481-5337"
|
||
</span></span><span style="display:flex;"><span>"Juma, J.","John Juma: 0000-0002-1481-5337"
|
||
</span></span><span style="display:flex;"><span>$ ./ilri/add-orcid-identifiers-csv.py -i /tmp/2022-09-22-add-orcids.csv -db dspace -u dspace -p <span style="color:#e6db74">'fuuu'</span>
|
||
</span></span></code></pre></div><ul>
|
||
<li>This adds nearly 5,500 ORCID tags!
|
||
<ul>
|
||
<li>Some of these authors were not in the controlled vocabulary so I added them</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-23">2022-09-23</h2>
|
||
<ul>
|
||
<li>Tag some more ORCID metadata (amended above)</li>
|
||
<li>Meeting with Peter and Abenet to discuss CGSpace issues
|
||
<ul>
|
||
<li>We found a workable solution to the MEL submission issue: they can submit to a dedicated MEL-only collection with no workflow and we will map or move the items after</li>
|
||
</ul>
|
||
</li>
|
||
<li>Pascal says that they have made a <a href="https://github.com/DSpace/DSpace/pull/8415">pull request for their duplicate checker on DSpace 7</a> yayyyy</li>
|
||
</ul>
|
||
<h2 id="2022-09-24">2022-09-24</h2>
|
||
<ul>
|
||
<li>Found some more ORCID identifiers to tag so I added them to the list above</li>
|
||
<li>Start a harvest on AReS around 8PM on Saturday night</li>
|
||
</ul>
|
||
<h2 id="2022-09-25">2022-09-25</h2>
|
||
<ul>
|
||
<li>The harvest on AReS finished and now the load on CGSpace server is still high like always on Sunday mornings
|
||
<ul>
|
||
<li>UptimeRobot says it’s down sigh…</li>
|
||
</ul>
|
||
</li>
|
||
<li>I had an idea to include the HTTP Accept header in the nginx proxy cache key to fix the issue we had with CIP last week
|
||
<ul>
|
||
<li>It seems to work:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<pre tabindex="0"><code>$ http --print Hh 'https://dspacetest.cgiar.org/rest/items?expand=metadata,parentCommunityList,parentCollectionList,bitstreams&amp;limit=10&amp;offset=60'
|
||
...
|
||
Content-Type: application/json
|
||
X-Cache-Status: MISS
|
||
|
||
$ http --print Hh 'https://dspacetest.cgiar.org/rest/items?expand=metadata,parentCommunityList,parentCollectionList,bitstreams&amp;limit=10&amp;offset=60'
|
||
...
|
||
Content-Type: application/json
|
||
X-Cache-Status: HIT
|
||
|
||
$ http --print Hh 'https://dspacetest.cgiar.org/rest/items?expand=metadata,parentCommunityList,parentCollectionList,bitstreams&amp;limit=10&amp;offset=60' Accept:application/xml
|
||
...
|
||
Content-Type: application/xml
|
||
X-Cache-Status: MISS
|
||
|
||
$ http --print Hh 'https://dspacetest.cgiar.org/rest/items?expand=metadata,parentCommunityList,parentCollectionList,bitstreams&amp;limit=10&amp;offset=60' Accept:application/xml
|
||
...
|
||
Content-Type: application/xml
|
||
X-Cache-Status: HIT
|
||
</code></pre><ul>
|
||
<li>This effectively makes our cache half as effective, but hopefully as more people start harvesting the number of requests handled by it will go up</li>
|
||
<li>I will enable this on CGSpace and email Moises from CIP to check if their harvester is working</li>
|
||
</ul>
|
||
<h2 id="2022-09-26">2022-09-26</h2>
|
||
<ul>
|
||
<li>Update welcome text on CGSpace after our meeting last week</li>
|
||
<li>I found another dozen or so ORCIDs for top authors on ILRI’s community on CGSpace and tagged them (~1,100 more metadata fields)</li>
|
||
<li>Last week we discussed moving <code>cg.identifier.googleurl</code> to <code>cg.identifier.url</code> since there is no need to treat Google Books URLs specially anymore as far as we know
|
||
<ul>
|
||
<li>I made the changes to the submission form and the XMLUI item displays, then moved all existing metadata in PostgreSQL:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>dspace= ☘ UPDATE metadatavalue SET metadata_field_id=219 WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=222;
|
||
</span></span><span style="display:flex;"><span>UPDATE 1137
|
||
</span></span></code></pre></div><ul>
|
||
<li>Then I deleted <code>cg.identifier.googleurl</code> from the metadata registry</li>
|
||
<li>Meeting with Salem, Svetlana, Valentina, and Abenet about MEL depositing to CGSpace for the initiatives
|
||
<ul>
|
||
<li>Submitting to a collection without a workflow works as expected, and we can even select another collection (with a workflow) to map the item to from the MEL submission</li>
|
||
<li>The three minor issues we found were:
|
||
<ul>
|
||
<li>MEL still doesn’t send the bitstream</li>
|
||
<li>MEL sends metadata with a download URL on mel.cgiar.org</li>
|
||
<li>MEL sends a JPEG that says “no thumbnail” when an item doesn’t have a thumbnail</li>
|
||
</ul>
|
||
</li>
|
||
<li>I still need to send feedback to the group</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<h2 id="2022-09-27">2022-09-27</h2>
|
||
<ul>
|
||
<li>Find a few more ORCID identifiers missing for ILRI authors and add them to the controlled vocabulary and tag the authors on CGSpace</li>
|
||
<li>Moises from CIP says the WordPress importer worked fine with the current nginx proxy cache settings so it seems adding the HTTP Accept header to the cache key worked</li>
|
||
<li>Update my DSpace 7 environments to 7.4-SNAPSHOT
|
||
<ul>
|
||
<li>I see they have added thumbnails in some places now</li>
|
||
<li>Oh nice, they also added the “recent submissions” to the home page</li>
|
||
</ul>
|
||
</li>
|
||
<li>While talking with Salem about the MEL depositing to CGSpace we discovered an issue with HTTP DELETE on <code>/items/{item id}/bitstreams/{bitstream id}</code> or <code>/bitstreams/{bitstream id}</code>
|
||
<ul>
|
||
<li>DSpace removes the bitstream but keeps the empty <code>THUMBNAIL</code> bundle, which breaks the display in XMLUI</li>
|
||
</ul>
|
||
</li>
|
||
<li>Meeting with Enrico et al. about PRMS reporting for the initiatives</li>
|
||
</ul>
|
||
<h2 id="2022-09-28">2022-09-28</h2>
|
||
<ul>
|
||
<li>I was reading the source code for DSpace 6’s REST API and found that it’s <a href="https://github.com/DSpace/DSpace/blob/dspace-6.4/dspace-rest/src/main/java/org/dspace/rest/ItemsResource.java#L427">not possible to specify a bundle while POSTing a bitstream</a>
|
||
<ul>
|
||
<li>I asked Salem how they do it on MEL and he said they pretend to be a human and do it via XMLUI!</li>
|
||
</ul>
|
||
</li>
|
||
<li>I added a few new ILRI subjects to the input forms on CGSpace
|
||
<ul>
|
||
<li>Both “bushmeat” and “wildlife conservation” are AGROVOC terms, but “wild meat” is not</li>
|
||
<li>The distinction ILRI would like to start making is:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<blockquote>
|
||
<p>Meat comes from any animal, and when at ILRI we specifically make
|
||
reference to it in the context of livestock. However the word bushmeat
|
||
refers to illegal harvesting of meat. wild meat is being used as legal
|
||
harvesting of meat from wildlife and not from livestock.</p>
|
||
</blockquote>
|
||
<ul>
|
||
<li>I added a few more CGIAR authors’ ORCID identifiers to our controlled vocabulary and tagged them on CGSpace (~450 more metadata fields)</li>
|
||
<li>Talking to Salem about ORCID identifiers, we compared lists and they have a bunch that we don’t have:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ cat ~/src/git/DSpace/dspace/config/controlled-vocabularies/cg-creator-identifier.xml ~/Downloads/MEL_ORCID_2022-09-28.csv | <span style="color:#ae81ff">\
|
||
</span></span></span><span style="display:flex;"><span><span style="color:#ae81ff"></span> grep -oE '[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}' | \
|
||
</span></span><span style="display:flex;"><span> sort | \
|
||
</span></span><span style="display:flex;"><span> uniq > /tmp/2022-09-29-combined-orcids.txt
|
||
</span></span><span style="display:flex;"><span>$ cat ~/src/git/DSpace/dspace/config/controlled-vocabularies/cg-creator-identifier.xml | grep -oE <span style="color:#e6db74">'[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}'</span> | sort | uniq | wc -l
|
||
</span></span><span style="display:flex;"><span>1421
|
||
</span></span><span style="display:flex;"><span>$ wc -l /tmp/2022-09-29-combined-orcids.txt
|
||
</span></span><span style="display:flex;"><span>1905 /tmp/2022-09-29-combined-orcids.txt
|
||
</span></span></code></pre></div><ul>
|
||
<li>After combining them I ran them through my <code>resolve-orcids.py</code> script:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/resolve-orcids.py -i /tmp/2022-09-29-combined-orcids.txt -o /tmp/2022-09-29-combined-orcids-names.txt -d
|
||
</span></span></code></pre></div><ul>
|
||
<li>I wrote a script <code>update-orcids.py</code> to read a list of names and ORCID identifiers and update existing metadata in the database to the latest name format</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/update-orcids.py -i ~/src/git/cgspace-submission-guidelines/content/terms/cg-creator-identifier/cg-creator-identifier.txt -db dspace -u dspace -p <span style="color:#e6db74">'fuuu'</span> -m <span style="color:#ae81ff">247</span> -d
|
||
</span></span><span style="display:flex;"><span>Connected to database.
|
||
</span></span><span style="display:flex;"><span>Fixed 9 occurences of: ADEBOWALE AD AKANDE: 0000-0002-6521-3272
|
||
</span></span><span style="display:flex;"><span>Fixed 43 occurences of: Alamu Emmanuel Oladeji (PhD, FIFST, MNIFST): 0000-0001-6263-1359
|
||
</span></span><span style="display:flex;"><span>Fixed 3 occurences of: Alessandra Galie: 0000-0001-9868-7733
|
||
</span></span><span style="display:flex;"><span>Fixed 1 occurences of: Amanda De Filippo: 0000-0002-1536-3221
|
||
</span></span><span style="display:flex;"><span>...
|
||
</span></span></code></pre></div><h2 id="2022-09-29">2022-09-29</h2>
|
||
<ul>
|
||
<li>I’ve been checking the size of the nginx proxy cache the last few days and it always seems to hover around 14,000 entries and 385MB:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># find /var/cache/nginx/rest_cache/ -type f | wc -l
|
||
</span></span><span style="display:flex;"><span>14202
|
||
</span></span><span style="display:flex;"><span># du -sh /var/cache/nginx/rest_cache
|
||
</span></span><span style="display:flex;"><span>384M /var/cache/nginx/rest_cache
|
||
</span></span></code></pre></div><ul>
|
||
<li>Also on that note I’m trying to implement a workaround for a potential caching issue that causes MEL to not be able to update items on DSpace Test
|
||
<ul>
|
||
<li>I <em>think</em> we might need to allow requests with a JSESSIONID to bypass the cache, but I have to verify with Salem</li>
|
||
<li>We can do this with an nginx map:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># Check <span style="color:#66d9ef">if</span> the JSESSIONID cookie is present and contains a 32-character hex
|
||
</span></span><span style="display:flex;"><span># value, which would mean that a user is actively attempting to re-use their
|
||
</span></span><span style="display:flex;"><span># Tomcat session. Then we set the $active_user_session variable and use it
|
||
</span></span><span style="display:flex;"><span># to bypass the nginx proxy cache in REST requests.
|
||
</span></span><span style="display:flex;"><span>map $cookie_jsessionid $active_user_session {
|
||
</span></span><span style="display:flex;"><span> # requests with an empty key are not evaluated by limit_req
|
||
</span></span><span style="display:flex;"><span> # see: http://nginx.org/en/docs/http/ngx_http_limit_req_module.html
|
||
</span></span><span style="display:flex;"><span> default '';
|
||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span> '~[A-Z0-9]{32}' 1;
|
||
</span></span><span style="display:flex;"><span>}
|
||
</span></span></code></pre></div><ul>
|
||
<li>Then in the location block where we do the proxy cache:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span> # Don't cache when user Shift-refreshes (Cache-Control: no-cache) or
|
||
</span></span><span style="display:flex;"><span> # when a client has an active session (see the $cookie_jsessionid map).
|
||
</span></span><span style="display:flex;"><span> proxy_cache_bypass $http_cache_control $active_user_session;
|
||
</span></span><span style="display:flex;"><span> proxy_no_cache $http_cache_control $active_user_session;
|
||
</span></span></code></pre></div><ul>
|
||
<li>I found one client making 10,000 requests using a Windows 98 user agent:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)
|
||
</span></span></code></pre></div><ul>
|
||
<li>They all come from one IP address (129.227.149.43) in Hong Kong
|
||
<ul>
|
||
<li>The IP belongs to a hosting provider called Zenlayer</li>
|
||
<li>I will add this IP to the nginx bot networks and purge its hits</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/check-spider-ip-hits.sh -f /tmp/ip -p
|
||
</span></span><span style="display:flex;"><span>Purging 33027 hits from 129.227.149.43 in statistics
|
||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>Total number of bot hits purged: 33027
|
||
</span></span></code></pre></div><ul>
|
||
<li>So it seems we’ve seen this bot before and the total number is much higher than the 10,000 this month</li>
|
||
<li>I had a call with Salem and we verified that the nginx cache bypass for clients who provide a JSESSIONID fixes their issue with updating items/bitstreams from MEL
|
||
<ul>
|
||
<li>The issue was that they delete all metadata and bitstreams, then add them again to make sure everything is up to date, and in that process they also re-request the item with all expands to get the bitstreams, which ends up getting cached and then they try to delete the old bitstream</li>
|
||
</ul>
|
||
</li>
|
||
<li>I also noticed that someone made a <a href="https://github.com/DSpace/DSpace/pull/8343">pull request to enable POSTing bitstreams to a particular bundle</a> and it works, so that’s awesome!</li>
|
||
</ul>
|
||
<h2 id="2022-09-30">2022-09-30</h2>
|
||
<ul>
|
||
<li>I applied <a href="https://github.com/DSpace/DSpace/pull/8343">the patch for POSTing bitstreams to other bundles</a> on CGSpace</li>
|
||
<li>Testing a few other DSpace 6.4 patches on DSpace Test:
|
||
<ul>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/1901">DS-3791 Make sure the “yearDifference” takes into account that a gap of 10 year contains 11 years</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2501">DS-3873 Limit the usage of PDFBoxThumbnail to PDFs</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2161">Reduce itemCounter init</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2201">ImageMagick: Only execute “identify” on first page</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2371">DS-3881: Show no total results on search-filter</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2699">pass value instead of qualifier to method</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/7993">dspace-api: check for null AND empty qualifier in findByElement()</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/7995">Avoid exporting mapped Item more than once</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/3162">[DS-4574] v. 6 - Upgrade DBCP2 dependency</a></li>
|
||
<li><a href="https://github.com/DSpace/DSpace/pull/2742">bump up pdfbox version on 6.x to match main branch</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<!-- raw HTML omitted -->
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
|
||
|
||
|
||
</div> <!-- /.blog-main -->
|
||
|
||
<aside class="col-sm-3 ml-auto blog-sidebar">
|
||
|
||
|
||
|
||
<section class="sidebar-module">
|
||
<h4>Recent Posts</h4>
|
||
<ol class="list-unstyled">
|
||
|
||
|
||
<li><a href="/cgspace-notes/2022-11/">November, 2022</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2022-10/">October, 2022</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2022-09/">September, 2022</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2022-08/">August, 2022</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2022-07/">July, 2022</a></li>
|
||
|
||
</ol>
|
||
</section>
|
||
|
||
|
||
|
||
|
||
<section class="sidebar-module">
|
||
<h4>Links</h4>
|
||
<ol class="list-unstyled">
|
||
|
||
<li><a href="https://cgspace.cgiar.org">CGSpace</a></li>
|
||
|
||
<li><a href="https://dspacetest.cgiar.org">DSpace Test</a></li>
|
||
|
||
<li><a href="https://github.com/ilri/DSpace">CGSpace @ GitHub</a></li>
|
||
|
||
</ol>
|
||
</section>
|
||
|
||
</aside>
|
||
|
||
|
||
</div> <!-- /.row -->
|
||
</div> <!-- /.container -->
|
||
|
||
|
||
|
||
<footer class="blog-footer">
|
||
<p dir="auto">
|
||
|
||
Blog template created by <a href="https://twitter.com/mdo">@mdo</a>, ported to Hugo by <a href='https://twitter.com/mralanorth'>@mralanorth</a>.
|
||
|
||
</p>
|
||
<p>
|
||
<a href="#">Back to top</a>
|
||
</p>
|
||
</footer>
|
||
|
||
|
||
</body>
|
||
|
||
</html>
|