mirror of
https://github.com/alanorth/cgspace-notes.git
synced 2024-11-27 00:48:19 +01:00
457 lines
16 KiB
HTML
457 lines
16 KiB
HTML
<!DOCTYPE html>
|
||
<html lang="en" >
|
||
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||
|
||
|
||
<meta property="og:title" content="CGSpace Notes" />
|
||
<meta property="og:description" content="Documenting day-to-day work on the CGSpace (https://cgspace.cgiar.org) repository." />
|
||
<meta property="og:type" content="website" />
|
||
<meta property="og:url" content="https://alanorth.github.io/cgspace-notes/" />
|
||
<meta property="og:updated_time" content="2023-10-13T17:17:41+03:00" />
|
||
|
||
|
||
|
||
<meta name="twitter:card" content="summary"/>
|
||
<meta name="twitter:title" content="CGSpace Notes"/>
|
||
<meta name="twitter:description" content="Documenting day-to-day work on the CGSpace (https://cgspace.cgiar.org) repository."/>
|
||
<meta name="generator" content="Hugo 0.119.0">
|
||
|
||
|
||
|
||
<script type="application/ld+json">
|
||
{
|
||
"@context": "http://schema.org",
|
||
"@type": "Blog",
|
||
"headline": "CGSpace Notes",
|
||
"url" : "https://alanorth.github.io/cgspace-notes/",
|
||
"author": {
|
||
"@type": "Person",
|
||
"name": "Alan Orth"
|
||
},
|
||
"dateModified": "2023-10-02T09:05:36+03:00",
|
||
"keywords": "notes, migration",
|
||
"description":"Documenting day-to-day work on the CGSpace (https://cgspace.cgiar.org) repository."
|
||
}
|
||
</script>
|
||
|
||
|
||
<link rel="canonical" href="https://alanorth.github.io/cgspace-notes/">
|
||
|
||
<title>CGSpace Notes</title>
|
||
|
||
|
||
<!-- combined, minified CSS -->
|
||
|
||
<link href="https://alanorth.github.io/cgspace-notes/css/style.c6ba80bc50669557645abe05f86b73cc5af84408ed20f1551a267bc19ece8228.css" rel="stylesheet" integrity="sha256-xrqAvFBmlVdkWr4F+GtzzFr4RAjtIPFVGiZ7wZ7Ogig=" crossorigin="anonymous">
|
||
|
||
|
||
<!-- minified Font Awesome for SVG icons -->
|
||
|
||
<script defer src="https://alanorth.github.io/cgspace-notes/js/fontawesome.min.f5072c55a0721857184db93a50561d7dc13975b4de2e19db7f81eb5f3fa57270.js" integrity="sha256-9QcsVaByGFcYTbk6UFYdfcE5dbTeLhnbf4HrXz+lcnA=" crossorigin="anonymous"></script>
|
||
|
||
<!-- RSS 2.0 feed -->
|
||
<link rel="alternate" type="application/rss+xml" href="https://alanorth.github.io/cgspace-notes/index.xml" title="CGSpace Notes" />
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
<body>
|
||
|
||
|
||
<div class="blog-masthead">
|
||
<div class="container">
|
||
<nav class="nav blog-nav">
|
||
<a class="nav-link active" href="https://alanorth.github.io/cgspace-notes/">Home</a>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
<header class="blog-header">
|
||
<div class="container">
|
||
<h1 class="blog-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/" rel="home">CGSpace Notes</a></h1>
|
||
<p class="lead blog-description" dir="auto">Documenting day-to-day work on the <a href="https://cgspace.cgiar.org">CGSpace</a> repository.</p>
|
||
</div>
|
||
</header>
|
||
|
||
|
||
|
||
|
||
<div class="container">
|
||
<div class="row">
|
||
<div class="col-sm-8 blog-main">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-12/">December, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-12-01T08:52:36+03:00">Thu Dec 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-12-01">2022-12-01</h2>
|
||
<ul>
|
||
<li>Fix some incorrect regions on CGSpace
|
||
<ul>
|
||
<li>I exported the CCAFS and IITA communities, extracted just the country and region columns, then ran them through csv-metadata-quality to fix the regions</li>
|
||
</ul>
|
||
</li>
|
||
<li>Add a few more authors to my CSV with author names and ORCID identifiers and tag 283 items!</li>
|
||
<li>Replace “East Asia” with “Eastern Asia” region on CGSpace (UN M.49 region)</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-12/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-11/">November, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-11-01T09:11:36+03:00">Tue Nov 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-11-01">2022-11-01</h2>
|
||
<ul>
|
||
<li>Last night I re-synced DSpace 7 Test from CGSpace
|
||
<ul>
|
||
<li>I also updated all my local <code>7_x-dev</code> branches on the latest upstreams</li>
|
||
</ul>
|
||
</li>
|
||
<li>I spent some time updating the authorizations in Alliance collections
|
||
<ul>
|
||
<li>I want to make sure they use groups instead of individuals where possible!</li>
|
||
</ul>
|
||
</li>
|
||
<li>I reverted the Cocoon autosave change because it was more of a nuisance that Peter can’t upload CSVs from the web interface, and it is a very low severity security issue</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-11/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-10/">October, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-10-01T19:45:36+03:00">Sat Oct 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-10-01">2022-10-01</h2>
|
||
<ul>
|
||
<li>Start a harvest on AReS last night</li>
|
||
<li>Yesterday I realized how to use <a href="https://im4java.sourceforge.net/docs/dev-guide.html">GraphicsMagick with im4java</a> and I want to re-visit some of my thumbnail tests
|
||
<ul>
|
||
<li>I’m also interested in libvips support via jVips, though last time I checked it was only for Java 8</li>
|
||
<li>I filed <a href="https://github.com/criteo/JVips/issues/141">an issue to ask about Java 11+ support</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-10/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-09/">September, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-09-01T09:41:36+03:00">Thu Sep 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-09-01">2022-09-01</h2>
|
||
<ul>
|
||
<li>A bit of work on the “Mapping CG Core–CGSpace–MEL–MARLO Types” spreadsheet</li>
|
||
<li>I tested an item submission on DSpace Test with the Cocoon <code>org.apache.cocoon.uploads.autosave=false</code> change
|
||
<ul>
|
||
<li>The submission works as expected</li>
|
||
</ul>
|
||
</li>
|
||
<li>Start debugging some region-related issues with csv-metadata-quality
|
||
<ul>
|
||
<li>I created a new test file <code>test-geography.csv</code> with some different scenarios</li>
|
||
<li>I also fixed a few bugs and improved the region-matching logic</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-09/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-08/">August, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-08-01T10:22:36+03:00">Mon Aug 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-08-01">2022-08-01</h2>
|
||
<ul>
|
||
<li>Our request to add <a href="https://github.com/spdx/license-list-XML/issues/1525">CC-BY-3.0-IGO to SPDX</a> was approved a few weeks ago</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-08/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-07/">July, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-07-02T14:07:36+03:00">Sat Jul 02, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-07-02">2022-07-02</h2>
|
||
<ul>
|
||
<li>I learned how to use the Levenshtein functions in PostgreSQL
|
||
<ul>
|
||
<li>The thing is that there is a limit of 255 characters for these functions in PostgreSQL so you need to truncate the strings before comparing</li>
|
||
<li>Also, the trgm functions I’ve used before are case insensitive, but Levenshtein is not, so you need to make sure to lower case both strings first</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-07/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-06/">June, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-06-06T09:01:36+03:00">Mon Jun 06, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-06-06">2022-06-06</h2>
|
||
<ul>
|
||
<li>Look at the Solr statistics on CGSpace
|
||
<ul>
|
||
<li>I see 167,000 hits from a bunch of Microsoft IPs with reverse DNS “msnbot-” using the Solr query <code>dns:*msnbot* AND dns:*.msn.com</code></li>
|
||
<li>I purged these first so I could see the other “real” IPs in the Solr facets</li>
|
||
</ul>
|
||
</li>
|
||
<li>I see 47,500 hits from 80.248.237.167 on a data center ISP in Sweden, using a normal user agent</li>
|
||
<li>I see 13,000 hits from 163.237.216.11 on a data center ISP in Australia, using a normal user agent</li>
|
||
<li>I see 7,300 hits from 208.185.238.57 from Britannica, using a normal user agent
|
||
<ul>
|
||
<li>There seem to be many more of these:</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-06/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-05/">May, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-05-04T09:13:39+03:00">Wed May 04, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-05-04">2022-05-04</h2>
|
||
<ul>
|
||
<li>I found a few more IPs making requests using the shady Chrome 44 user agent in the last few days so I will add them to the block list too:
|
||
<ul>
|
||
<li>18.207.136.176</li>
|
||
<li>185.189.36.248</li>
|
||
<li>50.118.223.78</li>
|
||
<li>52.70.76.123</li>
|
||
<li>3.236.10.11</li>
|
||
</ul>
|
||
</li>
|
||
<li>Looking at the Solr statistics for 2022-04
|
||
<ul>
|
||
<li>52.191.137.59 is Microsoft, but they are using a normal user agent and making tens of thousands of requests</li>
|
||
<li>64.39.98.62 is owned by Qualys, and all their requests are probing for /etc/passwd etc</li>
|
||
<li>185.192.69.15 is in the Netherlands and is using a normal user agent, but making excessive automated HTTP requests to paths forbidden in robots.txt</li>
|
||
<li>157.55.39.159 is owned by Microsoft and identifies as bingbot so I don’t know why its requests were logged in Solr</li>
|
||
<li>52.233.67.176 is owned by Microsoft and uses a normal user agent, but is making excessive automated HTTP requests</li>
|
||
<li>157.55.39.144 is owned by Microsoft and uses a normal user agent, but is making excessive automated HTTP requests</li>
|
||
<li>207.46.13.177 is owned by Microsoft and identifies as bingbot so I don’t know why its requests were logged in Solr</li>
|
||
<li>If I query Solr for <code>time:2022-04* AND dns:*msnbot* AND dns:*.msn.com.</code> I see a handful of IPs that made 41,000 requests</li>
|
||
</ul>
|
||
</li>
|
||
<li>I purged 93,974 hits from these IPs using my <code>check-spider-ip-hits.sh</code> script</li>
|
||
</ul>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-05/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-04/">April, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-04-01T10:53:39+03:00">Fri Apr 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
2022-04-01 I did G1GC tests on DSpace Test (linode26) to complement the CMS tests I did yesterday The Discovery indexing took this long: real 334m33.625s user 227m51.331s sys 3m43.037s 2022-04-04 Start a full harvest on AReS Help Marianne with submit/approve access on a new collection on CGSpace Go back in Gaia’s batch reports to find records that she indicated for replacing on CGSpace (ie, those with better new copies, new versions, etc) Looking at the Solr statistics for 2022-03 on CGSpace I see 54.
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-04/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<article class="blog-post">
|
||
<header>
|
||
<h2 class="blog-post-title" dir="auto"><a href="https://alanorth.github.io/cgspace-notes/2022-03/">March, 2022</a></h2>
|
||
<p class="blog-post-meta"><time datetime="2022-03-01T16:46:54+03:00">Tue Mar 01, 2022</time> by Alan Orth in
|
||
<span class="fas fa-folder" aria-hidden="true"></span> <a href="https://alanorth.github.io/cgspace-notes/categories/notes/" rel="category tag">Notes</a>
|
||
|
||
|
||
</p>
|
||
</header>
|
||
<h2 id="2022-03-01">2022-03-01</h2>
|
||
<ul>
|
||
<li>Send Gaia the last batch of potential duplicates for items 701 to 980:</li>
|
||
</ul>
|
||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c id,dc.title,dcterms.issued,dcterms.type ~/Downloads/2022-03-01-CGSpace-TAC-ICW-batch4-701-980.csv > /tmp/tac4.csv
|
||
</span></span><span style="display:flex;"><span>$ ./ilri/check-duplicates.py -i /tmp/tac4.csv -db dspace -u dspace -p <span style="color:#e6db74">'fuuu'</span> -o /tmp/2022-03-01-tac-batch4-701-980.csv
|
||
</span></span><span style="display:flex;"><span>$ csvcut -c id,filename ~/Downloads/2022-03-01-CGSpace-TAC-ICW-batch4-701-980.csv > /tmp/tac4-filenames.csv
|
||
</span></span><span style="display:flex;"><span>$ csvjoin -c id /tmp/2022-03-01-tac-batch4-701-980.csv /tmp/tac4-filenames.csv > /tmp/2022-03-01-tac-batch4-701-980-filenames.csv
|
||
</span></span></code></pre></div>
|
||
<a href='https://alanorth.github.io/cgspace-notes/2022-03/'>Read more →</a>
|
||
</article>
|
||
|
||
|
||
|
||
|
||
|
||
<nav class="blog-pagination">
|
||
|
||
<a class="btn btn-outline-primary" href="/cgspace-notes/" rel="prev" role="button">Previous page</a>
|
||
<a class="btn btn-outline-primary" href="/cgspace-notes/page/3/" rel="next" role="button">Next page</a>
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
|
||
|
||
|
||
|
||
</div> <!-- /.blog-main -->
|
||
|
||
<aside class="col-sm-3 ml-auto blog-sidebar">
|
||
|
||
|
||
|
||
<section class="sidebar-module">
|
||
<h4>Recent Posts</h4>
|
||
<ol class="list-unstyled">
|
||
|
||
|
||
<li><a href="/cgspace-notes/2023-10/">October, 2023</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2023-09/">September, 2023</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2023-08/">August, 2023</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2023-07/">July, 2023</a></li>
|
||
|
||
<li><a href="/cgspace-notes/2023-06/">June, 2023</a></li>
|
||
|
||
</ol>
|
||
</section>
|
||
|
||
|
||
|
||
|
||
<section class="sidebar-module">
|
||
<h4>Links</h4>
|
||
<ol class="list-unstyled">
|
||
|
||
<li><a href="https://cgspace.cgiar.org">CGSpace</a></li>
|
||
|
||
<li><a href="https://dspacetest.cgiar.org">DSpace Test</a></li>
|
||
|
||
<li><a href="https://github.com/ilri/DSpace">CGSpace @ GitHub</a></li>
|
||
|
||
</ol>
|
||
</section>
|
||
|
||
</aside>
|
||
|
||
|
||
</div> <!-- /.row -->
|
||
</div> <!-- /.container -->
|
||
|
||
|
||
|
||
<footer class="blog-footer">
|
||
<p dir="auto">
|
||
|
||
Blog template created by <a href="https://twitter.com/mdo">@mdo</a>, ported to Hugo by <a href='https://twitter.com/mralanorth'>@mralanorth</a>.
|
||
|
||
</p>
|
||
<p>
|
||
<a href="#">Back to top</a>
|
||
</p>
|
||
</footer>
|
||
|
||
|
||
</body>
|
||
|
||
</html>
|