2023-12-02 08:38:09 +01:00
<!DOCTYPE html>
< html lang = "en" >
< head >
< meta charset = "utf-8" >
< meta name = "viewport" content = "width=device-width, initial-scale=1, shrink-to-fit=no" >
< meta property = "og:title" content = "December, 2023" / >
< meta property = "og:description" content = "2023-12-01 There is still high load on CGSpace and I don’t know why I don’t see a high number of sessions compared to previous days in the last few weeks $ for file in dspace.log.2023-11-[23]*; do echo &quot;$file&quot;; grep -a -oE 'session_id=[A-Z0-9]{32}' &quot;$file&quot; | sort | uniq | wc -l; done dspace.log.2023-11-20 22865 dspace.log.2023-11-21 20296 dspace.log.2023-11-22 19688 dspace.log.2023-11-23 17906 dspace.log.2023-11-24 18453 dspace.log.2023-11-25 17513 dspace.log.2023-11-26 19037 dspace.log.2023-11-27 21103 dspace.log.2023-11-28 23023 dspace.log.2023-11-29 23545 dspace." / >
< meta property = "og:type" content = "article" / >
< meta property = "og:url" content = "https://alanorth.github.io/cgspace-notes/2023-12/" / >
< meta property = "article:published_time" content = "2023-12-01T08:48:36+03:00" / >
2023-12-09 07:55:16 +01:00
< meta property = "article:modified_time" content = "2023-12-08T16:32:48+03:00" / >
2023-12-02 08:38:09 +01:00
< meta name = "twitter:card" content = "summary" / >
< meta name = "twitter:title" content = "December, 2023" / >
< meta name = "twitter:description" content = "2023-12-01 There is still high load on CGSpace and I don’t know why I don’t see a high number of sessions compared to previous days in the last few weeks $ for file in dspace.log.2023-11-[23]*; do echo &quot;$file&quot;; grep -a -oE 'session_id=[A-Z0-9]{32}' &quot;$file&quot; | sort | uniq | wc -l; done dspace.log.2023-11-20 22865 dspace.log.2023-11-21 20296 dspace.log.2023-11-22 19688 dspace.log.2023-11-23 17906 dspace.log.2023-11-24 18453 dspace.log.2023-11-25 17513 dspace.log.2023-11-26 19037 dspace.log.2023-11-27 21103 dspace.log.2023-11-28 23023 dspace.log.2023-11-29 23545 dspace." / >
2023-12-09 07:55:16 +01:00
< meta name = "generator" content = "Hugo 0.121.1" >
2023-12-02 08:38:09 +01:00
< script type = "application/ld+json" >
{
"@context": "http://schema.org",
"@type": "BlogPosting",
"headline": "December, 2023",
"url": "https://alanorth.github.io/cgspace-notes/2023-12/",
2023-12-09 07:55:16 +01:00
"wordCount": "695",
2023-12-02 08:38:09 +01:00
"datePublished": "2023-12-01T08:48:36+03:00",
2023-12-09 07:55:16 +01:00
"dateModified": "2023-12-08T16:32:48+03:00",
2023-12-02 08:38:09 +01:00
"author": {
"@type": "Person",
"name": "Alan Orth"
},
"keywords": "Notes"
}
< / script >
< link rel = "canonical" href = "https://alanorth.github.io/cgspace-notes/2023-12/" >
< title > December, 2023 | CGSpace Notes< / title >
<!-- combined, minified CSS -->
< link href = "https://alanorth.github.io/cgspace-notes/css/style.c6ba80bc50669557645abe05f86b73cc5af84408ed20f1551a267bc19ece8228.css" rel = "stylesheet" integrity = "sha256-xrqAvFBmlVdkWr4F+GtzzFr4RAjtIPFVGiZ7wZ7Ogig=" crossorigin = "anonymous" >
<!-- minified Font Awesome for SVG icons -->
< script defer src = "https://alanorth.github.io/cgspace-notes/js/fontawesome.min.f5072c55a0721857184db93a50561d7dc13975b4de2e19db7f81eb5f3fa57270.js" integrity = "sha256-9QcsVaByGFcYTbk6UFYdfcE5dbTeLhnbf4HrXz+lcnA=" crossorigin = "anonymous" > < / script >
<!-- RSS 2.0 feed -->
< / head >
< body >
< div class = "blog-masthead" >
< div class = "container" >
< nav class = "nav blog-nav" >
< a class = "nav-link " href = "https://alanorth.github.io/cgspace-notes/" > Home< / a >
< / nav >
< / div >
< / div >
< header class = "blog-header" >
< div class = "container" >
< h1 class = "blog-title" dir = "auto" > < a href = "https://alanorth.github.io/cgspace-notes/" rel = "home" > CGSpace Notes< / a > < / h1 >
< p class = "lead blog-description" dir = "auto" > Documenting day-to-day work on the < a href = "https://cgspace.cgiar.org" > CGSpace< / a > repository.< / p >
< / div >
< / header >
< div class = "container" >
< div class = "row" >
< div class = "col-sm-8 blog-main" >
< article class = "blog-post" >
< header >
< h2 class = "blog-post-title" dir = "auto" > < a href = "https://alanorth.github.io/cgspace-notes/2023-12/" > December, 2023< / a > < / h2 >
< p class = "blog-post-meta" >
< time datetime = "2023-12-01T08:48:36+03:00" > Fri Dec 01, 2023< / time >
in
< span class = "fas fa-folder" aria-hidden = "true" > < / span > < a href = "/categories/notes/" rel = "category tag" > Notes< / a >
< / p >
< / header >
< h2 id = "2023-12-01" > 2023-12-01< / h2 >
< ul >
< li > There is still high load on CGSpace and I don’t know why
< ul >
< li > I don’t see a high number of sessions compared to previous days in the last few weeks< / li >
< / ul >
< / li >
< / ul >
<!-- raw HTML omitted -->
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > $ < span style = "color:#66d9ef" > for< / span > file in dspace.log.2023-11-< span style = "color:#f92672" > [< / span > 23< span style = "color:#f92672" > ]< / span > *; < span style = "color:#66d9ef" > do< / span > echo < span style = "color:#e6db74" > "< / span > $file< span style = "color:#e6db74" > "< / span > ; grep -a -oE < span style = "color:#e6db74" > 'session_id=[A-Z0-9]{32}'< / span > < span style = "color:#e6db74" > "< / span > $file< span style = "color:#e6db74" > "< / span > | sort | uniq | wc -l; < span style = "color:#66d9ef" > done< / span >
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-20
< / span > < / span > < span style = "display:flex;" > < span > 22865
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-21
< / span > < / span > < span style = "display:flex;" > < span > 20296
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-22
< / span > < / span > < span style = "display:flex;" > < span > 19688
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-23
< / span > < / span > < span style = "display:flex;" > < span > 17906
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-24
< / span > < / span > < span style = "display:flex;" > < span > 18453
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-25
< / span > < / span > < span style = "display:flex;" > < span > 17513
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-26
< / span > < / span > < span style = "display:flex;" > < span > 19037
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-27
< / span > < / span > < span style = "display:flex;" > < span > 21103
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-28
< / span > < / span > < span style = "display:flex;" > < span > 23023
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-29
< / span > < / span > < span style = "display:flex;" > < span > 23545
< / span > < / span > < span style = "display:flex;" > < span > dspace.log.2023-11-30
< / span > < / span > < span style = "display:flex;" > < span > 21298
< / span > < / span > < / code > < / pre > < / div > < ul >
< li > Even the number of unique IPs is not very high compared to the last week or so:< / li >
< / ul >
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.1 | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 17023
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.2.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 17294
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.3.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 22057
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.4.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 32956
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.5.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 11415
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.6.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 15444
< / span > < / span > < span style = "display:flex;" > < span > # awk < span style = "color:#e6db74" > '{print $1}'< / span > /var/log/nginx/< span style = "color:#f92672" > {< / span > access,library-access,oai,rest< span style = "color:#f92672" > }< / span > .log.7.gz | sort | uniq | wc -l
< / span > < / span > < span style = "display:flex;" > < span > 12648
< / span > < / span > < / code > < / pre > < / div > < ul >
< li > It doesn’t make any sense so I think I’m going to restart the server…
< ul >
< li > After restarting the server the load went down to normal levels… who knows… < / li >
< / ul >
< / li >
< li > I started trying to see how I’m going to generate the fake statistics for the Alliance bitstream that was replaced
< ul >
< li > I exported all the statistics for the owningItem now:< / li >
< / ul >
< / li >
< / ul >
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > $ chrt -b < span style = "color:#ae81ff" > 0< / span > ./run.sh -s http://localhost:8081/solr/statistics -a export -o /tmp/stats-export.json -f < span style = "color:#e6db74" > 'owningItem:b5862bfa-9799-4167-b1cf-76f0f4ea1e18'< / span > -k uid
< / span > < / span > < / code > < / pre > < / div > < ul >
< li > Importing them into DSpace Test didn’t show the statistics in the Atmire module, but I see them in Solr… < / li >
< / ul >
< h2 id = "2023-12-02" > 2023-12-02< / h2 >
< ul >
< li > Export CGSpace to check for missing Initiative collection mappings< / li >
< li > Start a harvest on AReS< / li >
< / ul >
2023-12-06 07:55:57 +01:00
< h2 id = "2023-12-04" > 2023-12-04< / h2 >
< ul >
< li > Send a message to Altmetric support because the item IWMI highlighted last month still doesn’t show the attention score for the Handle after I tweeted it several times weeks ago< / li >
< li > Spent some time writing a Python script to fix the literal MaxMind City JSON objects in our Solr statistics
< ul >
< li > There are about 1.6 million of these, so I exported them using solr-import-export-json with the query < code > city:com*< / code > but ended up finding many that have missing bundles, container bitstreams, etc:< / li >
< / ul >
< / li >
< / ul >
< pre tabindex = "0" > < code > city:com* AND -bundleName:[* TO *] AND -containerBitstream:[* TO *] AND -file_id:[* TO *] AND -owningItem:[* TO *] AND -version_id:[* TO *]
< / code > < / pre > < ul >
< li > (Note the negation to find fields that are missing)< / li >
< li > I don’t know what I want to do with these yet< / li >
< / ul >
< h2 id = "2023-12-05" > 2023-12-05< / h2 >
< ul >
< li > I finished the < code > fix_maxmind_stats.py< / code > script and fixed 1.6 million records and imported them on CGSpace after testing on DSpace 7 Test< / li >
< li > Altmetric said there was a glitch regarding the Handle and DOI linking and they successfully re-scraped the item page and linked them
< ul >
< li > They sent me a list of current production IPs and I notice that some of them are in our nginx bot network list:< / li >
< / ul >
< / li >
< / ul >
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > $ < span style = "color:#66d9ef" > for< / span > network in < span style = "color:#66d9ef" > $(< / span > csvcut -c network /tmp/ips.csv | sed 1d | sort -u< span style = "color:#66d9ef" > )< / span > ; < span style = "color:#66d9ef" > do< / span > grepcidr $network ~/src/git/rmg-ansible-public/roles/dspace/files/nginx/bot-networks.conf; < span style = "color:#66d9ef" > done< / span >
< / span > < / span > < span style = "display:flex;" > < span > 108.128.0.0/13 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 46.137.0.0/16 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 52.208.0.0/13 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 52.48.0.0/13 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 54.194.0.0/15 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 54.216.0.0/14 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 54.220.0.0/15 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 54.228.0.0/15 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 63.32.242.35/32 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 63.32.0.0/14 'bot';
< / span > < / span > < span style = "display:flex;" > < span > 99.80.0.0/15 'bot'
< / span > < / span > < / code > < / pre > < / div > < ul >
< li > I will remove those for now so that Altmetric doesn’t have any unexpected issues harvesting< / li >
< / ul >
2023-12-08 14:32:48 +01:00
< h2 id = "2023-12-08" > 2023-12-08< / h2 >
< ul >
< li > Finalized the script to generate Solr statistics for Alliance researcher Mirjam
< ul >
< li > The script is < code > ilri/generate_solr_statistics.py< / code > < / li >
< li > I generated ~3,200 statistics based on her records of the download statistics of < a href = "https://hdl.handle.net/10568/131997" > that item< / a > and imported them on CGSpace< / li >
< / ul >
< / li >
2023-12-09 07:55:16 +01:00
< li > Did some work on the DSpace 7 submission form< / li >
2023-12-08 14:32:48 +01:00
< li > Peter asked for lists of affiliations, investors, and publishers to do some cleanups
< ul >
< li > I generated a list from a CSV export instead of doing it based on a SQL dump… < / li >
< / ul >
< / li >
< / ul >
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > $ csvcut -c < span style = "color:#e6db74" > 'cg.contributor.affiliation[en_US]'< / span > /tmp/initiatives.csv < span style = "color:#ae81ff" > \
< / span > < / span > < / span > < span style = "display:flex;" > < span > < span style = "color:#ae81ff" > < / span > | sed -e 1d -e 's/^"//' -e 's/"$//' -e 's/||/\n/g' -e '/^$/d' \
< / span > < / span > < span style = "display:flex;" > < span > | sort | uniq -c | sort -hr \
< / span > < / span > < span style = "display:flex;" > < span > | awk 'BEGIN { FS = "^[[:space:]]+[[:digit:]]+[[:space:]]+" } {print $2}' \
< / span > < / span > < span style = "display:flex;" > < span > | sed -e '1i cg.contributor.affiliation' -e 's/^\(.*\)$/"\1"/' \
< / span > < / span > < span style = "display:flex;" > < span > > /tmp/2023-12-08-initiatives-affiliations.csv
2023-12-09 07:55:16 +01:00
< / span > < / span > < / code > < / pre > < / div > < ul >
< li > Export a list of authors as well:< / li >
< / ul >
< div class = "highlight" > < pre tabindex = "0" style = "color:#f8f8f2;background-color:#272822;-moz-tab-size:4;-o-tab-size:4;tab-size:4;" > < code class = "language-console" data-lang = "console" > < span style = "display:flex;" > < span > localhost/dspace7= ☘ \COPY (SELECT DISTINCT text_value AS "dc.contributor.author", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 3 GROUP BY "dc.contributor.author" ORDER BY count DESC) to /tmp/2023-12-08-authors.csv WITH CSV HEADER;
< / span > < / span > < span style = "display:flex;" > < span > COPY 102435
2023-12-08 14:32:48 +01:00
< / span > < / span > < / code > < / pre > < / div > <!-- raw HTML omitted -->
2023-12-02 08:38:09 +01:00
< / article >
< / div > <!-- /.blog-main -->
< aside class = "col-sm-3 ml-auto blog-sidebar" >
< section class = "sidebar-module" >
< h4 > Recent Posts< / h4 >
< ol class = "list-unstyled" >
< li > < a href = "/cgspace-notes/2023-12/" > December, 2023< / a > < / li >
< li > < a href = "/cgspace-notes/2023-11/" > November, 2023< / a > < / li >
< li > < a href = "/cgspace-notes/2023-10/" > October, 2023< / a > < / li >
< li > < a href = "/cgspace-notes/2023-09/" > September, 2023< / a > < / li >
< li > < a href = "/cgspace-notes/2023-08/" > August, 2023< / a > < / li >
< / ol >
< / section >
< section class = "sidebar-module" >
< h4 > Links< / h4 >
< ol class = "list-unstyled" >
< li > < a href = "https://cgspace.cgiar.org" > CGSpace< / a > < / li >
< li > < a href = "https://dspacetest.cgiar.org" > DSpace Test< / a > < / li >
< li > < a href = "https://github.com/ilri/DSpace" > CGSpace @ GitHub< / a > < / li >
< / ol >
< / section >
< / aside >
< / div > <!-- /.row -->
< / div > <!-- /.container -->
< footer class = "blog-footer" >
< p dir = "auto" >
Blog template created by < a href = "https://twitter.com/mdo" > @mdo< / a > , ported to Hugo by < a href = 'https://twitter.com/mralanorth' > @mralanorth< / a > .
< / p >
< p >
< a href = "#" > Back to top< / a >
< / p >
< / footer >
< / body >
< / html >