mirror of
https://github.com/alanorth/cgspace-notes.git
synced 2025-01-27 05:49:12 +01:00
Add notes for 2022-03-04
This commit is contained in:
@ -38,7 +38,7 @@ We agreed to try to do more alignment of affiliations/funders with ROR
|
||||
|
||||
|
||||
"/>
|
||||
<meta name="generator" content="Hugo 0.92.2" />
|
||||
<meta name="generator" content="Hugo 0.93.1" />
|
||||
|
||||
|
||||
|
||||
@ -138,44 +138,44 @@ We agreed to try to do more alignment of affiliations/funders with ROR
|
||||
<ul>
|
||||
<li>I moved a bunch of communities:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/114639 --child<span style="color:#f92672">=</span>10568/115089
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/114639 --child<span style="color:#f92672">=</span>10568/115087
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/108598
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/1
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/35697 --child<span style="color:#f92672">=</span>10568/80211
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/2517
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10947/2517
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/89416
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/3530
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/80099
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/80100
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/34494
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/114644
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/16573
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/42211
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/109945
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/16498
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/99453
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/2983
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/133
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/1208
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/1208
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/56924
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/56924
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/91688
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10947/1 --child<span style="color:#f92672">=</span>10568/91688
|
||||
$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/2515
|
||||
$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10947/1 --child<span style="color:#f92672">=</span>10947/2515
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/114639 --child<span style="color:#f92672">=</span>10568/115089
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/114639 --child<span style="color:#f92672">=</span>10568/115087
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/108598
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/1
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/35697 --child<span style="color:#f92672">=</span>10568/80211
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/2517
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10947/2517
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/89416
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/3530
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/80099
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/80100
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/97114 --child<span style="color:#f92672">=</span>10568/34494
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/114644
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/16573
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117867 --child<span style="color:#f92672">=</span>10568/42211
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/109945
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/16498
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/99453
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/2983
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/133
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/1208
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/1208
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/56924
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10568/117865 --child<span style="color:#f92672">=</span>10568/56924
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10568/91688
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10947/1 --child<span style="color:#f92672">=</span>10568/91688
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --remove --parent<span style="color:#f92672">=</span>10568/83389 --child<span style="color:#f92672">=</span>10947/2515
|
||||
</span></span><span style="display:flex;"><span>$ dspace community-filiator --set --parent<span style="color:#f92672">=</span>10947/1 --child<span style="color:#f92672">=</span>10947/2515
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Remove CPWF and CTA subjects from the Discovery facets</li>
|
||||
<li>Start a full Discovery index on CGSpace:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ time chrt -b <span style="color:#ae81ff">0</span> ionice -c2 -n7 nice -n19 dspace index-discovery -b
|
||||
<span style="color:#960050;background-color:#1e0010">
|
||||
</span><span style="color:#960050;background-color:#1e0010"></span>real 275m15.777s
|
||||
user 182m52.171s
|
||||
sys 2m51.573s
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ time chrt -b <span style="color:#ae81ff">0</span> ionice -c2 -n7 nice -n19 dspace index-discovery -b
|
||||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>real 275m15.777s
|
||||
</span></span><span style="display:flex;"><span>user 182m52.171s
|
||||
</span></span><span style="display:flex;"><span>sys 2m51.573s
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I got a request to confirm validation of CGSpace on openarchives.org, with the requestor’s IP being 128.84.116.66
|
||||
<ul>
|
||||
<li>That is at Cornell… hmmmm who could that be?!</li>
|
||||
@ -192,8 +192,8 @@ sys 2m51.573s
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">45.134.26.171 - - [12/Jan/2022:06:25:27 +0100] "GET /bitstream/handle/10568/81964/varietal-2faea58f.pdf?sequence=1 HTTP/1.1" 200 1157807 "https://cgspace.cgiar.org:443/bitstream/handle/10568/81964/varietal-2faea58f.pdf" "Opera/9.64 (Windows NT 6.1; U; MRA 5.5 (build 02842); ru) Presto/2.1.1)) AND 4734=CTXSYS.DRITHSX.SN(4734,(CHR(113)||CHR(120)||CHR(120)||CHR(112)||CHR(113)||(SELECT (CASE WHEN (4734=4734) THEN 1 ELSE 0 END) FROM DUAL)||CHR(113)||CHR(120)||CHR(113)||CHR(122)||CHR(113))) AND ((3917=3917"
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>45.134.26.171 - - [12/Jan/2022:06:25:27 +0100] "GET /bitstream/handle/10568/81964/varietal-2faea58f.pdf?sequence=1 HTTP/1.1" 200 1157807 "https://cgspace.cgiar.org:443/bitstream/handle/10568/81964/varietal-2faea58f.pdf" "Opera/9.64 (Windows NT 6.1; U; MRA 5.5 (build 02842); ru) Presto/2.1.1)) AND 4734=CTXSYS.DRITHSX.SN(4734,(CHR(113)||CHR(120)||CHR(120)||CHR(112)||CHR(113)||(SELECT (CASE WHEN (4734=4734) THEN 1 ELSE 0 END) FROM DUAL)||CHR(113)||CHR(120)||CHR(113)||CHR(122)||CHR(113))) AND ((3917=3917"
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>3.225.28.105 made 3,000 requests mostly for one CIAT collection on the REST API and it is owned by Amazon
|
||||
<ul>
|
||||
<li>The user agent is sometimes a normal user one, and sometimes <code>Apache-HttpClient/4.3.4 (java 1.5)</code></li>
|
||||
@ -202,27 +202,27 @@ sys 2m51.573s
|
||||
<li>217.182.21.193 made 2,400 requests and is on OVH</li>
|
||||
<li>I purged these hits</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ ./ilri/check-spider-ip-hits.sh -f /tmp/ips.txt -p
|
||||
Purging 26817 hits from 64.39.98.40 in statistics
|
||||
Purging 9446 hits from 45.134.26.171 in statistics
|
||||
Purging 6490 hits from 3.225.28.105 in statistics
|
||||
Purging 11949 hits from 217.182.21.193 in statistics
|
||||
<span style="color:#960050;background-color:#1e0010">
|
||||
</span><span style="color:#960050;background-color:#1e0010"></span>Total number of bot hits purged: 54702
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/check-spider-ip-hits.sh -f /tmp/ips.txt -p
|
||||
</span></span><span style="display:flex;"><span>Purging 26817 hits from 64.39.98.40 in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 9446 hits from 45.134.26.171 in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 6490 hits from 3.225.28.105 in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 11949 hits from 217.182.21.193 in statistics
|
||||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>Total number of bot hits purged: 54702
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Export donors and affiliations from CGSpace database:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">localhost/dspace63= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.donor", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 248 GROUP BY text_value ORDER BY count DESC) to /tmp/2022-02-02-donors.csv WITH CSV HEADER;
|
||||
COPY 1036
|
||||
localhost/dspace63= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC) to /tmp/2022-02-02-affiliations.csv WITH CSV HEADER;
|
||||
COPY 7901
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>localhost/dspace63= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.donor", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 248 GROUP BY text_value ORDER BY count DESC) to /tmp/2022-02-02-donors.csv WITH CSV HEADER;
|
||||
</span></span><span style="display:flex;"><span>COPY 1036
|
||||
</span></span><span style="display:flex;"><span>localhost/dspace63= ☘ \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC) to /tmp/2022-02-02-affiliations.csv WITH CSV HEADER;
|
||||
</span></span><span style="display:flex;"><span>COPY 7901
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Then check matches against the latest ROR dump:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ csvcut -c cg.contributor.donor /tmp/2022-02-02-donors.csv | sed <span style="color:#e6db74">'1d'</span> > /tmp/2022-02-02-donors.txt
|
||||
$ ./ilri/ror-lookup.py -i /tmp/2022-02-02-donors.txt -r 2021-09-23-ror-data.json -o /tmp/donor-ror-matches.csv
|
||||
...
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c cg.contributor.donor /tmp/2022-02-02-donors.csv | sed <span style="color:#e6db74">'1d'</span> > /tmp/2022-02-02-donors.txt
|
||||
</span></span><span style="display:flex;"><span>$ ./ilri/ror-lookup.py -i /tmp/2022-02-02-donors.txt -r 2021-09-23-ror-data.json -o /tmp/donor-ror-matches.csv
|
||||
</span></span><span style="display:flex;"><span>...
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I see we have 258/1036 (24.9%) of our donors matching ROR (as of the 2021-09-23 ROR dump)</li>
|
||||
<li>I see we have 1986/7901 (25.1%) of our affiliations matching ROR (as of the 2021-09-23 ROR dump)</li>
|
||||
<li>Update the PostgreSQL JDBC driver to 42.3.2 in the Ansible Infrastructure playbooks and deploy on DSpace Test</li>
|
||||
@ -245,37 +245,37 @@ $ ./ilri/ror-lookup.py -i /tmp/2022-02-02-donors.txt -r 2021-09-23-ror-data.json
|
||||
<li>I synchronized DSpace Test with a fresh snapshot of CGSpace</li>
|
||||
<li>I noticed a bunch of thumbnails missing for items submitted in the last week on CGSpace so I ran the <code>dspace filter-media</code> script manually and eventually it crashed:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media
|
||||
...
|
||||
SKIPPED: bitstream 48612de7-eec5-4990-8f1b-589a87219a39 (item: 10568/67391) because 'ilri_establishiment.pdf.txt' already exists
|
||||
Generated Thumbnail ilri_establishiment.pdf matches pattern and is replacable.
|
||||
SKIPPED: bitstream 48612de7-eec5-4990-8f1b-589a87219a39 (item: 10568/67391) because 'ilri_establishiment.pdf.jpg' already exists
|
||||
File: Agreement_on_the_Estab_of_ILRI.doc.txt
|
||||
Exception: org.apache.poi.util.LittleEndian.getUnsignedByte([BI)I
|
||||
java.lang.NoSuchMethodError: org.apache.poi.util.LittleEndian.getUnsignedByte([BI)I
|
||||
at org.textmining.extraction.word.model.FormattedDiskPage.<init>(FormattedDiskPage.java:66)
|
||||
at org.textmining.extraction.word.model.CHPFormattedDiskPage.<init>(CHPFormattedDiskPage.java:62)
|
||||
at org.textmining.extraction.word.model.CHPBinTable.<init>(CHPBinTable.java:70)
|
||||
at org.textmining.extraction.word.Word97TextExtractor.getText(Word97TextExtractor.java:122)
|
||||
at org.textmining.extraction.word.Word97TextExtractor.getText(Word97TextExtractor.java:63)
|
||||
at org.dspace.app.mediafilter.WordFilter.getDestinationStream(WordFilter.java:83)
|
||||
at com.atmire.dspace.app.mediafilter.AtmireMediaFilter.processBitstream(AtmireMediaFilter.java:103)
|
||||
at com.atmire.dspace.app.mediafilter.AtmireMediaFilterServiceImpl.filterBitstream(AtmireMediaFilterServiceImpl.java:61)
|
||||
at org.dspace.app.mediafilter.MediaFilterServiceImpl.filterItem(MediaFilterServiceImpl.java:181)
|
||||
at org.dspace.app.mediafilter.MediaFilterServiceImpl.applyFiltersItem(MediaFilterServiceImpl.java:159)
|
||||
at org.dspace.app.mediafilter.MediaFilterServiceImpl.applyFiltersAllItems(MediaFilterServiceImpl.java:111)
|
||||
at org.dspace.app.mediafilter.MediaFilterCLITool.main(MediaFilterCLITool.java:212)
|
||||
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
|
||||
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
|
||||
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
|
||||
at java.lang.reflect.Method.invoke(Method.java:498)
|
||||
at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:229)
|
||||
at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:81)
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media
|
||||
</span></span><span style="display:flex;"><span>...
|
||||
</span></span><span style="display:flex;"><span>SKIPPED: bitstream 48612de7-eec5-4990-8f1b-589a87219a39 (item: 10568/67391) because 'ilri_establishiment.pdf.txt' already exists
|
||||
</span></span><span style="display:flex;"><span>Generated Thumbnail ilri_establishiment.pdf matches pattern and is replacable.
|
||||
</span></span><span style="display:flex;"><span>SKIPPED: bitstream 48612de7-eec5-4990-8f1b-589a87219a39 (item: 10568/67391) because 'ilri_establishiment.pdf.jpg' already exists
|
||||
</span></span><span style="display:flex;"><span>File: Agreement_on_the_Estab_of_ILRI.doc.txt
|
||||
</span></span><span style="display:flex;"><span>Exception: org.apache.poi.util.LittleEndian.getUnsignedByte([BI)I
|
||||
</span></span><span style="display:flex;"><span>java.lang.NoSuchMethodError: org.apache.poi.util.LittleEndian.getUnsignedByte([BI)I
|
||||
</span></span><span style="display:flex;"><span> at org.textmining.extraction.word.model.FormattedDiskPage.<init>(FormattedDiskPage.java:66)
|
||||
</span></span><span style="display:flex;"><span> at org.textmining.extraction.word.model.CHPFormattedDiskPage.<init>(CHPFormattedDiskPage.java:62)
|
||||
</span></span><span style="display:flex;"><span> at org.textmining.extraction.word.model.CHPBinTable.<init>(CHPBinTable.java:70)
|
||||
</span></span><span style="display:flex;"><span> at org.textmining.extraction.word.Word97TextExtractor.getText(Word97TextExtractor.java:122)
|
||||
</span></span><span style="display:flex;"><span> at org.textmining.extraction.word.Word97TextExtractor.getText(Word97TextExtractor.java:63)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.mediafilter.WordFilter.getDestinationStream(WordFilter.java:83)
|
||||
</span></span><span style="display:flex;"><span> at com.atmire.dspace.app.mediafilter.AtmireMediaFilter.processBitstream(AtmireMediaFilter.java:103)
|
||||
</span></span><span style="display:flex;"><span> at com.atmire.dspace.app.mediafilter.AtmireMediaFilterServiceImpl.filterBitstream(AtmireMediaFilterServiceImpl.java:61)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.mediafilter.MediaFilterServiceImpl.filterItem(MediaFilterServiceImpl.java:181)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.mediafilter.MediaFilterServiceImpl.applyFiltersItem(MediaFilterServiceImpl.java:159)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.mediafilter.MediaFilterServiceImpl.applyFiltersAllItems(MediaFilterServiceImpl.java:111)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.mediafilter.MediaFilterCLITool.main(MediaFilterCLITool.java:212)
|
||||
</span></span><span style="display:flex;"><span> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
|
||||
</span></span><span style="display:flex;"><span> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
|
||||
</span></span><span style="display:flex;"><span> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
|
||||
</span></span><span style="display:flex;"><span> at java.lang.reflect.Method.invoke(Method.java:498)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.launcher.ScriptLauncher.runOneCommand(ScriptLauncher.java:229)
|
||||
</span></span><span style="display:flex;"><span> at org.dspace.app.launcher.ScriptLauncher.main(ScriptLauncher.java:81)
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I should look up that issue and report a bug somewhere perhaps, but for now I just forced the JPG thumbnails with:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media -p <span style="color:#e6db74">"ImageMagick PDF Thumbnail"</span> -v >& /tmp/filter-media.log
|
||||
</code></pre></div><h2 id="2022-02-04">2022-02-04</h2>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media -p <span style="color:#e6db74">"ImageMagick PDF Thumbnail"</span> -v >& /tmp/filter-media.log
|
||||
</span></span></code></pre></div><h2 id="2022-02-04">2022-02-04</h2>
|
||||
<ul>
|
||||
<li>I found a thread on the dspace-tech mailing list about the <code>media-filter</code> crash above
|
||||
<ul>
|
||||
@ -284,14 +284,14 @@ java.lang.NoSuchMethodError: org.apache.poi.util.LittleEndian.getUnsignedByte([B
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media -i 10568/67391 -p <span style="color:#e6db74">"Word Text Extractor"</span> -v
|
||||
The following MediaFilters are enabled:
|
||||
Full Filter Name: org.dspace.app.mediafilter.PoiWordFilter
|
||||
org.dspace.app.mediafilter.PoiWordFilter
|
||||
File: Agreement_on_the_Estab_of_ILRI.doc.txt
|
||||
<span style="color:#960050;background-color:#1e0010">
|
||||
</span><span style="color:#960050;background-color:#1e0010"></span>FILTERED: bitstream 31db7d05-5369-4309-adeb-3b888c80b73d (item: 10568/67391) and created 'Agreement_on_the_Estab_of_ILRI.doc.txt'
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace filter-media -i 10568/67391 -p <span style="color:#e6db74">"Word Text Extractor"</span> -v
|
||||
</span></span><span style="display:flex;"><span>The following MediaFilters are enabled:
|
||||
</span></span><span style="display:flex;"><span>Full Filter Name: org.dspace.app.mediafilter.PoiWordFilter
|
||||
</span></span><span style="display:flex;"><span>org.dspace.app.mediafilter.PoiWordFilter
|
||||
</span></span><span style="display:flex;"><span>File: Agreement_on_the_Estab_of_ILRI.doc.txt
|
||||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>FILTERED: bitstream 31db7d05-5369-4309-adeb-3b888c80b73d (item: 10568/67391) and created 'Agreement_on_the_Estab_of_ILRI.doc.txt'
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Meeting with the repositories working group to discuss issues moving forward in the One CGIAR</li>
|
||||
</ul>
|
||||
<h2 id="2022-02-07">2022-02-07</h2>
|
||||
@ -302,20 +302,20 @@ File: Agreement_on_the_Estab_of_ILRI.doc.txt
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">or(
|
||||
isNotNull(value.match('1')),
|
||||
isNotNull(value.match('4')),
|
||||
isNotNull(value.match('5')),
|
||||
isNotNull(value.match('6')),
|
||||
isNotNull(value.match('8')),
|
||||
...
|
||||
isNotNull(value.match('178')),
|
||||
isNotNull(value.match('186')),
|
||||
isNotNull(value.match('188')),
|
||||
isNotNull(value.match('189')),
|
||||
isNotNull(value.match('197'))
|
||||
)
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>or(
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('1')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('4')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('5')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('6')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('8')),
|
||||
</span></span><span style="display:flex;"><span>...
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('178')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('186')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('188')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('189')),
|
||||
</span></span><span style="display:flex;"><span>isNotNull(value.match('197'))
|
||||
</span></span><span style="display:flex;"><span>)
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Then I flagged all of these (seventy-five items)…
|
||||
<ul>
|
||||
<li>I decided to flag the deletes instead of star the keeps because there are some items in the original file that were not marked as duplicates so we have to keep those too</li>
|
||||
@ -323,19 +323,19 @@ isNotNull(value.match('197'))
|
||||
</li>
|
||||
<li>I generated the next batch of 200 items, from IDs 201 to 400, checked them for duplicates, and then added the PDF file names to the CSV for reference:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ csvcut -c id,dc.title,dcterms.issued,dcterms.type ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch201-400.csv > /tmp/tac.csv
|
||||
$ ./ilri/check-duplicates.py -i /tmp/tac.csv -db dspace63 -u dspacetest -p <span style="color:#e6db74">'dom@in34sniper'</span> -o /tmp/2022-02-07-tac-batch2-201-400.csv
|
||||
$ csvcut -c id,filename ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch201-400.csv > /tmp/batch2-filenames.csv
|
||||
$ csvjoin -c id /tmp/2022-02-07-tac-batch2-201-400.csv /tmp/batch2-filenames.csv > /tmp/2022-02-07-tac-batch2-201-400-filenames.csv
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c id,dc.title,dcterms.issued,dcterms.type ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch201-400.csv > /tmp/tac.csv
|
||||
</span></span><span style="display:flex;"><span>$ ./ilri/check-duplicates.py -i /tmp/tac.csv -db dspace63 -u dspacetest -p <span style="color:#e6db74">'dom@in34sniper'</span> -o /tmp/2022-02-07-tac-batch2-201-400.csv
|
||||
</span></span><span style="display:flex;"><span>$ csvcut -c id,filename ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch201-400.csv > /tmp/batch2-filenames.csv
|
||||
</span></span><span style="display:flex;"><span>$ csvjoin -c id /tmp/2022-02-07-tac-batch2-201-400.csv /tmp/batch2-filenames.csv > /tmp/2022-02-07-tac-batch2-201-400-filenames.csv
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Then I sent this second batch of items to Gaia to look at</li>
|
||||
</ul>
|
||||
<h2 id="2022-02-08">2022-02-08</h2>
|
||||
<ul>
|
||||
<li>Create a SAF archive for the first 200 items (IDs 1 to 200) that were <em>not</em> flagged as duplicates and upload them to a <a href="https://dspacetest.cgiar.org/handle/10568/117921">new collection on DSpace Test</a>:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>bngo@mfin.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-08-tac-batch1-1to200.map
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>bngo@mfin.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-08-tac-batch1-1to200.map
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Fix some occurrences of “Hammond, Jim” to be “Hammond, James” on CGSpace</li>
|
||||
<li>Start a full index on AReS</li>
|
||||
</ul>
|
||||
@ -355,12 +355,12 @@ $ csvjoin -c id /tmp/2022-02-07-tac-batch2-201-400.csv /tmp/batch2-filenames.csv
|
||||
<ul>
|
||||
<li>I extract the logs from nginx for yesterday so I can analyze the traffic:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console"># zcat --force /var/log/nginx/access.log.1 /var/log/nginx/access.log.2.gz | grep <span style="color:#e6db74">'09/Feb/2022'</span> > /tmp/feb9-access.log
|
||||
# zcat --force /var/log/nginx/rest.log.1 /var/log/nginx/rest.log.2.gz | grep <span style="color:#e6db74">'09/Feb/2022'</span> > /tmp/feb9-rest.log
|
||||
# awk <span style="color:#e6db74">'{print $1}'</span> /tmp/feb9-* | less | sort -u > /tmp/feb9-ips.txt
|
||||
# wc -l /tmp/feb9-ips.txt
|
||||
11636 /tmp/feb9-ips.tx
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># zcat --force /var/log/nginx/access.log.1 /var/log/nginx/access.log.2.gz | grep <span style="color:#e6db74">'09/Feb/2022'</span> > /tmp/feb9-access.log
|
||||
</span></span><span style="display:flex;"><span># zcat --force /var/log/nginx/rest.log.1 /var/log/nginx/rest.log.2.gz | grep <span style="color:#e6db74">'09/Feb/2022'</span> > /tmp/feb9-rest.log
|
||||
</span></span><span style="display:flex;"><span># awk <span style="color:#e6db74">'{print $1}'</span> /tmp/feb9-* | less | sort -u > /tmp/feb9-ips.txt
|
||||
</span></span><span style="display:flex;"><span># wc -l /tmp/feb9-ips.txt
|
||||
</span></span><span style="display:flex;"><span>11636 /tmp/feb9-ips.tx
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I started resolving them with my <code>resolve-addresses-geoip2.py</code> script</li>
|
||||
<li>In the meantime I am looking at the requests and I see a new user agent: <code>1science Resolver 1.0.0</code>
|
||||
<ul>
|
||||
@ -374,52 +374,52 @@ $ csvjoin -c id /tmp/2022-02-07-tac-batch2-201-400.csv /tmp/batch2-filenames.csv
|
||||
</li>
|
||||
<li>Looking at the top twenty or so ASNs for the resolved IPs I see lots of bot traffic, but nothing malicious:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ csvcut -c asn /tmp/feb9-ips.csv | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">20</span>
|
||||
79 24940
|
||||
89 36908
|
||||
100 9299
|
||||
107 2635
|
||||
110 44546
|
||||
111 16509
|
||||
118 7552
|
||||
120 4837
|
||||
123 50245
|
||||
123 55836
|
||||
147 45899
|
||||
173 33771
|
||||
192 39832
|
||||
202 32934
|
||||
235 29465
|
||||
260 15169
|
||||
466 14618
|
||||
607 24757
|
||||
768 714
|
||||
1214 8075
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c asn /tmp/feb9-ips.csv | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">20</span>
|
||||
</span></span><span style="display:flex;"><span> 79 24940
|
||||
</span></span><span style="display:flex;"><span> 89 36908
|
||||
</span></span><span style="display:flex;"><span> 100 9299
|
||||
</span></span><span style="display:flex;"><span> 107 2635
|
||||
</span></span><span style="display:flex;"><span> 110 44546
|
||||
</span></span><span style="display:flex;"><span> 111 16509
|
||||
</span></span><span style="display:flex;"><span> 118 7552
|
||||
</span></span><span style="display:flex;"><span> 120 4837
|
||||
</span></span><span style="display:flex;"><span> 123 50245
|
||||
</span></span><span style="display:flex;"><span> 123 55836
|
||||
</span></span><span style="display:flex;"><span> 147 45899
|
||||
</span></span><span style="display:flex;"><span> 173 33771
|
||||
</span></span><span style="display:flex;"><span> 192 39832
|
||||
</span></span><span style="display:flex;"><span> 202 32934
|
||||
</span></span><span style="display:flex;"><span> 235 29465
|
||||
</span></span><span style="display:flex;"><span> 260 15169
|
||||
</span></span><span style="display:flex;"><span> 466 14618
|
||||
</span></span><span style="display:flex;"><span> 607 24757
|
||||
</span></span><span style="display:flex;"><span> 768 714
|
||||
</span></span><span style="display:flex;"><span> 1214 8075
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>The same information, but by org name:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ csvcut -c org /tmp/feb9-ips.csv | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">20</span>
|
||||
92 Orange
|
||||
100 Hetzner Online GmbH
|
||||
100 Philippine Long Distance Telephone Company
|
||||
107 AUTOMATTIC
|
||||
110 ALFA TELECOM s.r.o.
|
||||
111 AMAZON-02
|
||||
118 Viettel Group
|
||||
120 CHINA UNICOM China169 Backbone
|
||||
123 Reliance Jio Infocomm Limited
|
||||
123 Serverel Inc.
|
||||
147 VNPT Corp
|
||||
173 SAFARICOM-LIMITED
|
||||
192 Opera Software AS
|
||||
202 FACEBOOK
|
||||
235 MTN NIGERIA Communication limited
|
||||
260 GOOGLE
|
||||
466 AMAZON-AES
|
||||
607 Ethiopian Telecommunication Corporation
|
||||
768 APPLE-ENGINEERING
|
||||
1214 MICROSOFT-CORP-MSN-AS-BLOCK
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c org /tmp/feb9-ips.csv | sort | uniq -c | sort -h | tail -n <span style="color:#ae81ff">20</span>
|
||||
</span></span><span style="display:flex;"><span> 92 Orange
|
||||
</span></span><span style="display:flex;"><span> 100 Hetzner Online GmbH
|
||||
</span></span><span style="display:flex;"><span> 100 Philippine Long Distance Telephone Company
|
||||
</span></span><span style="display:flex;"><span> 107 AUTOMATTIC
|
||||
</span></span><span style="display:flex;"><span> 110 ALFA TELECOM s.r.o.
|
||||
</span></span><span style="display:flex;"><span> 111 AMAZON-02
|
||||
</span></span><span style="display:flex;"><span> 118 Viettel Group
|
||||
</span></span><span style="display:flex;"><span> 120 CHINA UNICOM China169 Backbone
|
||||
</span></span><span style="display:flex;"><span> 123 Reliance Jio Infocomm Limited
|
||||
</span></span><span style="display:flex;"><span> 123 Serverel Inc.
|
||||
</span></span><span style="display:flex;"><span> 147 VNPT Corp
|
||||
</span></span><span style="display:flex;"><span> 173 SAFARICOM-LIMITED
|
||||
</span></span><span style="display:flex;"><span> 192 Opera Software AS
|
||||
</span></span><span style="display:flex;"><span> 202 FACEBOOK
|
||||
</span></span><span style="display:flex;"><span> 235 MTN NIGERIA Communication limited
|
||||
</span></span><span style="display:flex;"><span> 260 GOOGLE
|
||||
</span></span><span style="display:flex;"><span> 466 AMAZON-AES
|
||||
</span></span><span style="display:flex;"><span> 607 Ethiopian Telecommunication Corporation
|
||||
</span></span><span style="display:flex;"><span> 768 APPLE-ENGINEERING
|
||||
</span></span><span style="display:flex;"><span> 1214 MICROSOFT-CORP-MSN-AS-BLOCK
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Most of these are pretty normal except “Serverel” and Hetzner perhaps, but their user agents are pretending to be normal users so who knows…</li>
|
||||
<li>I decided to look in the Solr stats with <code>facet.limit=1000&facet.mincount=1</code> and found a few more definitely non-human agents:
|
||||
<ul>
|
||||
@ -439,25 +439,25 @@ $ csvjoin -c id /tmp/2022-02-07-tac-batch2-201-400.csv /tmp/batch2-filenames.csv
|
||||
</li>
|
||||
<li>I added them to the ILRI override in the DSpace spider list and ran the <code>check-spider-hits.sh</code> script:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ ./ilri/check-spider-hits.sh -f dspace/config/spiders/agents/ilri -p
|
||||
Purging 234 hits from randint in statistics
|
||||
Purging 337 hits from Koha in statistics
|
||||
Purging 1164 hits from scalaj-http in statistics
|
||||
Purging 1528 hits from scpitspi-rs in statistics
|
||||
Purging 3050 hits from lua-resty-http in statistics
|
||||
Purging 1683 hits from AHC in statistics
|
||||
Purging 1129 hits from acebookexternalhit in statistics
|
||||
Purging 534 hits from Iframely in statistics
|
||||
Purging 1022 hits from qbhttp in statistics
|
||||
Purging 330 hits from ^got in statistics
|
||||
Purging 156 hits from ^colly in statistics
|
||||
Purging 38 hits from article-parser in statistics
|
||||
Purging 1148 hits from SomeRandomText in statistics
|
||||
Purging 3126 hits from adreview in statistics
|
||||
Purging 217 hits from 1science in statistics
|
||||
<span style="color:#960050;background-color:#1e0010">
|
||||
</span><span style="color:#960050;background-color:#1e0010"></span>Total number of bot hits purged: 14696
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ ./ilri/check-spider-hits.sh -f dspace/config/spiders/agents/ilri -p
|
||||
</span></span><span style="display:flex;"><span>Purging 234 hits from randint in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 337 hits from Koha in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1164 hits from scalaj-http in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1528 hits from scpitspi-rs in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 3050 hits from lua-resty-http in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1683 hits from AHC in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1129 hits from acebookexternalhit in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 534 hits from Iframely in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1022 hits from qbhttp in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 330 hits from ^got in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 156 hits from ^colly in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 38 hits from article-parser in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 1148 hits from SomeRandomText in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 3126 hits from adreview in statistics
|
||||
</span></span><span style="display:flex;"><span>Purging 217 hits from 1science in statistics
|
||||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>Total number of bot hits purged: 14696
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I don’t have time right now to add any of these to the COUNTER-Robots list…</li>
|
||||
<li>Peter asked me to add a new item type on CGSpace: Opinion Piece</li>
|
||||
<li>Map an item on CGSpace for Maria since she couldn’t find it in the item mapper</li>
|
||||
@ -476,22 +476,22 @@ Purging 217 hits from 1science in statistics
|
||||
<ul>
|
||||
<li>Install PostgreSQL 12 on my local dev environment to start testing DSpace 6.x workflows with it:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ podman run --name dspacedb -v dspacedb_data:/var/lib/postgresql/data -e POSTGRES_PASSWORD<span style="color:#f92672">=</span>postgres -p 5432:5432 -d postgres:12-alpine
|
||||
$ createuser -h localhost -p <span style="color:#ae81ff">5432</span> -U postgres --pwprompt dspacetest
|
||||
$ createdb -h localhost -p <span style="color:#ae81ff">5432</span> -U postgres -O dspacetest --encoding<span style="color:#f92672">=</span>UNICODE dspacetest
|
||||
$ psql -h localhost -U postgres -c <span style="color:#e6db74">'ALTER USER dspacetest SUPERUSER;'</span>
|
||||
$ pg_restore -h localhost -U postgres -d dspacetest -O --role<span style="color:#f92672">=</span>dspacetest -h localhost ~/Downloads/dspace-2022-02-12.backup
|
||||
$ psql -h localhost -U postgres -c <span style="color:#e6db74">'ALTER USER dspacetest NOSUPERUSER;'</span>
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ podman run --name dspacedb -v dspacedb_data:/var/lib/postgresql/data -e POSTGRES_PASSWORD<span style="color:#f92672">=</span>postgres -p 5432:5432 -d postgres:12-alpine
|
||||
</span></span><span style="display:flex;"><span>$ createuser -h localhost -p <span style="color:#ae81ff">5432</span> -U postgres --pwprompt dspacetest
|
||||
</span></span><span style="display:flex;"><span>$ createdb -h localhost -p <span style="color:#ae81ff">5432</span> -U postgres -O dspacetest --encoding<span style="color:#f92672">=</span>UNICODE dspacetest
|
||||
</span></span><span style="display:flex;"><span>$ psql -h localhost -U postgres -c <span style="color:#e6db74">'ALTER USER dspacetest SUPERUSER;'</span>
|
||||
</span></span><span style="display:flex;"><span>$ pg_restore -h localhost -U postgres -d dspacetest -O --role<span style="color:#f92672">=</span>dspacetest -h localhost ~/Downloads/dspace-2022-02-12.backup
|
||||
</span></span><span style="display:flex;"><span>$ psql -h localhost -U postgres -c <span style="color:#e6db74">'ALTER USER dspacetest NOSUPERUSER;'</span>
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Eventually I will update DSpace Test, then CGSpace (time to start paying off some technical debt!)</li>
|
||||
<li>Start a full Discovery re-index on CGSpace:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ time chrt -b <span style="color:#ae81ff">0</span> ionice -c2 -n7 nice -n19 dspace index-discovery -b
|
||||
<span style="color:#960050;background-color:#1e0010">
|
||||
</span><span style="color:#960050;background-color:#1e0010"></span>real 292m49.263s
|
||||
user 201m26.097s
|
||||
sys 3m2.459s
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ time chrt -b <span style="color:#ae81ff">0</span> ionice -c2 -n7 nice -n19 dspace index-discovery -b
|
||||
</span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010">
|
||||
</span></span></span><span style="display:flex;"><span><span style="color:#960050;background-color:#1e0010"></span>real 292m49.263s
|
||||
</span></span><span style="display:flex;"><span>user 201m26.097s
|
||||
</span></span><span style="display:flex;"><span>sys 3m2.459s
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Start a full harvest on AReS</li>
|
||||
</ul>
|
||||
<h2 id="2022-02-14">2022-02-14</h2>
|
||||
@ -503,17 +503,17 @@ sys 3m2.459s
|
||||
</li>
|
||||
</ul>
|
||||
<pre tabindex="0"><code>or(
|
||||
isNotNull(value.match('201')),
|
||||
isNotNull(value.match('203')),
|
||||
isNotNull(value.match('209')),
|
||||
isNotNull(value.match('209')),
|
||||
isNotNull(value.match('215')),
|
||||
isNotNull(value.match('220')),
|
||||
isNotNull(value.match('225')),
|
||||
isNotNull(value.match('226')),
|
||||
isNotNull(value.match('227')),
|
||||
isNotNull(value.match('201')),
|
||||
isNotNull(value.match('203')),
|
||||
isNotNull(value.match('209')),
|
||||
isNotNull(value.match('209')),
|
||||
isNotNull(value.match('215')),
|
||||
isNotNull(value.match('220')),
|
||||
isNotNull(value.match('225')),
|
||||
isNotNull(value.match('226')),
|
||||
isNotNull(value.match('227')),
|
||||
...
|
||||
isNotNull(value.match('396'))
|
||||
isNotNull(value.match('396'))
|
||||
</code></pre><ul>
|
||||
<li>Then I flagged all matching records and exported a CSV to use with SAFBuilder
|
||||
<ul>
|
||||
@ -521,15 +521,15 @@ isNotNull(value.match('396'))
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>fuuu@umm.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-14-tac-batch2-201to400.map
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>fuuu@umm.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-14-tac-batch2-201to400.map
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>Export the next batch from OpenRefine (items with ID 401 to 700), check duplicates, and then join with the file names:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ csvcut -c id,dc.title,dcterms.issued,dcterms.type ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch3-401to700.csv > /tmp/tac3.csv
|
||||
$ ./ilri/check-duplicates.py -i /tmp/tac3.csv -db dspace -u dspace -p <span style="color:#e6db74">'fuuu'</span> -o /tmp/2022-02-14-tac-batch3-401-700.csv
|
||||
$ csvcut -c id,filename ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch3-401to700.csv > /tmp/tac3-filenames.csv
|
||||
$ csvjoin -c id /tmp/2022-02-14-tac-batch3-401-700.csv /tmp/tac3-filenames.csv > /tmp/2022-02-14-tac-batch3-401-700-filenames.csv
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ csvcut -c id,dc.title,dcterms.issued,dcterms.type ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch3-401to700.csv > /tmp/tac3.csv
|
||||
</span></span><span style="display:flex;"><span>$ ./ilri/check-duplicates.py -i /tmp/tac3.csv -db dspace -u dspace -p <span style="color:#e6db74">'fuuu'</span> -o /tmp/2022-02-14-tac-batch3-401-700.csv
|
||||
</span></span><span style="display:flex;"><span>$ csvcut -c id,filename ~/Downloads/2022-01-21-CGSpace-TAC-ICW-batch3-401to700.csv > /tmp/tac3-filenames.csv
|
||||
</span></span><span style="display:flex;"><span>$ csvjoin -c id /tmp/2022-02-14-tac-batch3-401-700.csv /tmp/tac3-filenames.csv > /tmp/2022-02-14-tac-batch3-401-700-filenames.csv
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I sent these 300 items to Gaia…</li>
|
||||
</ul>
|
||||
<h2 id="2022-02-16">2022-02-16</h2>
|
||||
@ -541,36 +541,36 @@ $ csvjoin -c id /tmp/2022-02-14-tac-batch3-401-700.csv /tmp/tac3-filenames.csv &
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console"># systemctl stop tomcat7
|
||||
# pg_ctlcluster <span style="color:#ae81ff">10</span> main stop
|
||||
# tar -cvzpf var-lib-postgresql-10.tar.gz /var/lib/postgresql/10
|
||||
# tar -cvzpf etc-postgresql-10.tar.gz /etc/postgresql/10
|
||||
# pg_ctlcluster <span style="color:#ae81ff">12</span> main stop
|
||||
# pg_dropcluster <span style="color:#ae81ff">12</span> main
|
||||
# pg_upgradecluster <span style="color:#ae81ff">10</span> main
|
||||
# pg_ctlcluster <span style="color:#ae81ff">12</span> main start
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># systemctl stop tomcat7
|
||||
</span></span><span style="display:flex;"><span># pg_ctlcluster <span style="color:#ae81ff">10</span> main stop
|
||||
</span></span><span style="display:flex;"><span># tar -cvzpf var-lib-postgresql-10.tar.gz /var/lib/postgresql/10
|
||||
</span></span><span style="display:flex;"><span># tar -cvzpf etc-postgresql-10.tar.gz /etc/postgresql/10
|
||||
</span></span><span style="display:flex;"><span># pg_ctlcluster <span style="color:#ae81ff">12</span> main stop
|
||||
</span></span><span style="display:flex;"><span># pg_dropcluster <span style="color:#ae81ff">12</span> main
|
||||
</span></span><span style="display:flex;"><span># pg_upgradecluster <span style="color:#ae81ff">10</span> main
|
||||
</span></span><span style="display:flex;"><span># pg_ctlcluster <span style="color:#ae81ff">12</span> main start
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>After that I <a href="https://adamj.eu/tech/2021/04/13/reindexing-all-tables-after-upgrading-to-postgresql-13/">re-indexed the database indexes using a query</a>:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ su - postgres
|
||||
$ cat /tmp/generate-reindex.sql
|
||||
SELECT 'REINDEX TABLE CONCURRENTLY ' || quote_ident(relname) || ' /*' || pg_size_pretty(pg_total_relation_size(C.oid)) || '*/;'
|
||||
FROM pg_class C
|
||||
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
|
||||
WHERE nspname = 'public'
|
||||
AND C.relkind = 'r'
|
||||
AND nspname !~ '^pg_toast'
|
||||
ORDER BY pg_total_relation_size(C.oid) ASC;
|
||||
$ psql dspace < /tmp/generate-reindex.sql > /tmp/reindex.sql
|
||||
$ <trim the extra stuff from /tmp/reindex.sql>
|
||||
$ psql dspace < /tmp/reindex.sql
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ su - postgres
|
||||
</span></span><span style="display:flex;"><span>$ cat /tmp/generate-reindex.sql
|
||||
</span></span><span style="display:flex;"><span>SELECT 'REINDEX TABLE CONCURRENTLY ' || quote_ident(relname) || ' /*' || pg_size_pretty(pg_total_relation_size(C.oid)) || '*/;'
|
||||
</span></span><span style="display:flex;"><span>FROM pg_class C
|
||||
</span></span><span style="display:flex;"><span>LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
|
||||
</span></span><span style="display:flex;"><span>WHERE nspname = 'public'
|
||||
</span></span><span style="display:flex;"><span> AND C.relkind = 'r'
|
||||
</span></span><span style="display:flex;"><span> AND nspname !~ '^pg_toast'
|
||||
</span></span><span style="display:flex;"><span>ORDER BY pg_total_relation_size(C.oid) ASC;
|
||||
</span></span><span style="display:flex;"><span>$ psql dspace < /tmp/generate-reindex.sql > /tmp/reindex.sql
|
||||
</span></span><span style="display:flex;"><span>$ <trim the extra stuff from /tmp/reindex.sql>
|
||||
</span></span><span style="display:flex;"><span>$ psql dspace < /tmp/reindex.sql
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I saw that the index on <code>metadatavalue</code> shrunk by about 200MB!</li>
|
||||
<li>After testing a few things I dropped the old cluster:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console"># pg_dropcluster <span style="color:#ae81ff">10</span> main
|
||||
# dpkg -l | grep postgresql-10 | awk <span style="color:#e6db74">'{print $2}'</span> | xargs dpkg -r
|
||||
</code></pre></div><h2 id="2022-02-17">2022-02-17</h2>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span># pg_dropcluster <span style="color:#ae81ff">10</span> main
|
||||
</span></span><span style="display:flex;"><span># dpkg -l | grep postgresql-10 | awk <span style="color:#e6db74">'{print $2}'</span> | xargs dpkg -r
|
||||
</span></span></code></pre></div><h2 id="2022-02-17">2022-02-17</h2>
|
||||
<ul>
|
||||
<li>I updated my <code>migrate-fields.sh</code> script to use field names instead of IDs
|
||||
<ul>
|
||||
@ -582,25 +582,25 @@ $ psql dspace < /tmp/reindex.sql
|
||||
<ul>
|
||||
<li>Normalize the <code>text_lang</code> attributes of metadata on CGSpace:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">dspace=# SELECT DISTINCT text_lang, count(text_lang) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) GROUP BY text_lang ORDER BY count DESC;
|
||||
text_lang | count
|
||||
-----------+---------
|
||||
en_US | 2838588
|
||||
en | 1082
|
||||
| 801
|
||||
fr | 2
|
||||
vn | 2
|
||||
en_US. | 1
|
||||
sp | 1
|
||||
| 0
|
||||
(8 rows)
|
||||
dspace=# UPDATE metadatavalue SET text_lang='en_US' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('en', 'en_US.', '');
|
||||
UPDATE 1884
|
||||
dspace=# UPDATE metadatavalue SET text_lang='vi' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('vn');
|
||||
UPDATE 2
|
||||
dspace=# UPDATE metadatavalue SET text_lang='es' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('sp');
|
||||
UPDATE 1
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>dspace=# SELECT DISTINCT text_lang, count(text_lang) FROM metadatavalue WHERE dspace_object_id IN (SELECT uuid FROM item) GROUP BY text_lang ORDER BY count DESC;
|
||||
</span></span><span style="display:flex;"><span> text_lang | count
|
||||
</span></span><span style="display:flex;"><span>-----------+---------
|
||||
</span></span><span style="display:flex;"><span> en_US | 2838588
|
||||
</span></span><span style="display:flex;"><span> en | 1082
|
||||
</span></span><span style="display:flex;"><span> | 801
|
||||
</span></span><span style="display:flex;"><span> fr | 2
|
||||
</span></span><span style="display:flex;"><span> vn | 2
|
||||
</span></span><span style="display:flex;"><span> en_US. | 1
|
||||
</span></span><span style="display:flex;"><span> sp | 1
|
||||
</span></span><span style="display:flex;"><span> | 0
|
||||
</span></span><span style="display:flex;"><span>(8 rows)
|
||||
</span></span><span style="display:flex;"><span>dspace=# UPDATE metadatavalue SET text_lang='en_US' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('en', 'en_US.', '');
|
||||
</span></span><span style="display:flex;"><span>UPDATE 1884
|
||||
</span></span><span style="display:flex;"><span>dspace=# UPDATE metadatavalue SET text_lang='vi' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('vn');
|
||||
</span></span><span style="display:flex;"><span>UPDATE 2
|
||||
</span></span><span style="display:flex;"><span>dspace=# UPDATE metadatavalue SET text_lang='es' WHERE dspace_object_id IN (SELECT uuid FROM item) AND text_lang IN ('sp');
|
||||
</span></span><span style="display:flex;"><span>UPDATE 1
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>I then exported the entire repository and did some cleanup on DOIs
|
||||
<ul>
|
||||
<li>I found ~1,200 items with no <code>cg.identifier.doi</code>, but which had a DOI in their citation</li>
|
||||
@ -623,8 +623,8 @@ UPDATE 1
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">abs(diff(toDate(cells["issued"].value),toDate(cells["dcterms.issued[en_US]"].value), "days"))
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>abs(diff(toDate(cells["issued"].value),toDate(cells["dcterms.issued[en_US]"].value), "days"))
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>In <em>most</em> cases Crossref’s dates are more correct than ours, though there are a few odd cases where I don’t yet know what strategy I want to use</li>
|
||||
<li>Start a full harvest on AReS</li>
|
||||
</ul>
|
||||
@ -639,26 +639,26 @@ UPDATE 1
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">or(
|
||||
value.contains("10.1017"),
|
||||
value.contains("10.1007"),
|
||||
value.contains("10.1016"),
|
||||
value.contains("10.1098"),
|
||||
value.contains("10.1111"),
|
||||
value.contains("10.1002"),
|
||||
value.contains("10.1046"),
|
||||
value.contains("10.2135"),
|
||||
value.contains("10.1006"),
|
||||
value.contains("10.1177"),
|
||||
value.contains("10.1079"),
|
||||
value.contains("10.2298"),
|
||||
value.contains("10.1186"),
|
||||
value.contains("10.3835"),
|
||||
value.contains("10.1128"),
|
||||
value.contains("10.3732"),
|
||||
value.contains("10.2134")
|
||||
)
|
||||
</code></pre></div><ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>or(
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1017"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1007"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1016"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1098"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1111"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1002"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1046"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.2135"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1006"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1177"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1079"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.2298"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1186"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.3835"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.1128"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.3732"),
|
||||
</span></span><span style="display:flex;"><span>value.contains("10.2134")
|
||||
</span></span><span style="display:flex;"><span>)
|
||||
</span></span></code></pre></div><ul>
|
||||
<li>A great many of Crossref’s records are correct where we have no license, and in some cases they are more correct where we have a different license
|
||||
<ul>
|
||||
<li>I ran license updates on ~167 DOIs in the end on CGSpace</li>
|
||||
@ -669,11 +669,11 @@ value.contains("10.2134")
|
||||
<ul>
|
||||
<li>Update some audience metadata on CGSpace:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">dspace=# UPDATE metadatavalue SET text_value='Academics' WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=144 AND text_value = 'Academicians';
|
||||
UPDATE 354
|
||||
dspace=# UPDATE metadatavalue SET text_value='Scientists' WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=144 AND text_value = 'SCIENTISTS';
|
||||
UPDATE 2
|
||||
</code></pre></div><h2 id="2022-02-25">2022-02-25</h2>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>dspace=# UPDATE metadatavalue SET text_value='Academics' WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=144 AND text_value = 'Academicians';
|
||||
</span></span><span style="display:flex;"><span>UPDATE 354
|
||||
</span></span><span style="display:flex;"><span>dspace=# UPDATE metadatavalue SET text_value='Scientists' WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=144 AND text_value = 'SCIENTISTS';
|
||||
</span></span><span style="display:flex;"><span>UPDATE 2
|
||||
</span></span></code></pre></div><h2 id="2022-02-25">2022-02-25</h2>
|
||||
<ul>
|
||||
<li>A few days ago Gaia sent me her notes on the third batch of TAC/ICW documents (items 401–700 in the spreadsheet)
|
||||
<ul>
|
||||
@ -682,23 +682,23 @@ UPDATE 2
|
||||
</li>
|
||||
</ul>
|
||||
<pre tabindex="0"><code>or(
|
||||
isNotNull(value.match('405')),
|
||||
isNotNull(value.match('410')),
|
||||
isNotNull(value.match('412')),
|
||||
isNotNull(value.match('414')),
|
||||
isNotNull(value.match('419')),
|
||||
isNotNull(value.match('436')),
|
||||
isNotNull(value.match('448')),
|
||||
isNotNull(value.match('449')),
|
||||
isNotNull(value.match('450')),
|
||||
isNotNull(value.match('405')),
|
||||
isNotNull(value.match('410')),
|
||||
isNotNull(value.match('412')),
|
||||
isNotNull(value.match('414')),
|
||||
isNotNull(value.match('419')),
|
||||
isNotNull(value.match('436')),
|
||||
isNotNull(value.match('448')),
|
||||
isNotNull(value.match('449')),
|
||||
isNotNull(value.match('450')),
|
||||
...
|
||||
isNotNull(value.match('699'))
|
||||
isNotNull(value.match('699'))
|
||||
)
|
||||
</code></pre><ul>
|
||||
<li>Then I flagged all matching records, exported a CSV to use with SAFBuilder, and imported them on DSpace Test:</li>
|
||||
</ul>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4"><code class="language-console" data-lang="console">$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>fuuu@umm.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-25-tac-batch3-401to700.map
|
||||
</code></pre></div><h2 id="2022-02-26">2022-02-26</h2>
|
||||
<div class="highlight"><pre tabindex="0" style="color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;"><code class="language-console" data-lang="console"><span style="display:flex;"><span>$ JAVA_OPTS<span style="color:#f92672">=</span><span style="color:#e6db74">"-Xmx1024m -Dfile.encoding=UTF-8"</span> dspace import --add --eperson<span style="color:#f92672">=</span>fuuu@umm.com --source /tmp/SimpleArchiveFormat --mapfile<span style="color:#f92672">=</span>./2022-02-25-tac-batch3-401to700.map
|
||||
</span></span></code></pre></div><h2 id="2022-02-26">2022-02-26</h2>
|
||||
<ul>
|
||||
<li>Upgrade CGSpace (linode18) to Ubuntu 20.04</li>
|
||||
<li>Start a full AReS harvest</li>
|
||||
|
Reference in New Issue
Block a user