From 682a2c21948714b80daa17e7bbc7d4a16babe469 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sat, 6 Apr 2019 11:47:45 +0300 Subject: [PATCH] Update notes for 2019-04-06 --- content/posts/2019-04.md | 50 ++++++++++++++++++++++++++++++ docs/2019-04/index.html | 67 ++++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 +++--- 3 files changed, 119 insertions(+), 8 deletions(-) diff --git a/content/posts/2019-04.md b/content/posts/2019-04.md index 57c961024..4cd0fbb13 100644 --- a/content/posts/2019-04.md +++ b/content/posts/2019-04.md @@ -109,4 +109,54 @@ statistics-2017: org.apache.solr.common.SolrException:org.apache.solr.common.Sol - I restarted it again and all the Solr cores came up properly... +## 2019-04-06 + +- Udana asked why item [10568/91278](https://cgspace.cgiar.org/handle/10568/91278) didn't have an Altmetric badge on CGSpace, but on the [WLE website](https://wle.cgiar.org/food-and-agricultural-innovation-pathways-prosperity) it does + - I looked and saw that the WLE website is using the Altmetric score associated with the DOI, and that the Handle has no score at all + - I tweeted the item and I assume this will link the Handle with the DOI in the system +- Linode sent an alert that there was high CPU usage this morning on CGSpace (linode18) and these were the top IPs in the webserver access logs around the time: + +``` +# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "06/Apr/2019:(06|07|08|09)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 222 18.195.78.144 + 245 207.46.13.58 + 303 207.46.13.194 + 328 66.249.79.33 + 564 207.46.13.210 + 566 66.249.79.62 + 575 40.77.167.66 + 1803 66.249.79.59 + 2834 2a01:4f8:140:3192::2 + 9623 45.5.184.72 +# zcat --force /var/log/nginx/{rest,oai}.log /var/log/nginx/{rest,oai}.log.1 | grep -E "06/Apr/2019:(06|07|08|09)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 31 66.249.79.62 + 41 207.46.13.210 + 42 
40.77.167.66 + 54 42.113.50.219 + 132 66.249.79.59 + 785 2001:41d0:d:1990:: + 1164 45.5.184.72 + 2014 50.116.102.77 + 4267 45.5.186.2 + 4893 205.186.128.185 +``` + +- `45.5.184.72` is in Colombia so it's probably CIAT, and I see they are indeed trying to crawl the Discover pages on CIAT's datasets collection: + +``` +GET /handle/10568/72970/discover?filtertype_0=type&filtertype_1=author&filter_relational_operator_1=contains&filter_relational_operator_0=equals&filter_1=&filter_0=Dataset&filtertype=dateIssued&filter_relational_operator=equals&filter=2014 +``` + +- Their user agent is the one I added to the badbots list in nginx last week: "GuzzleHttp/6.3.3 curl/7.47.0 PHP/7.0.30-0ubuntu0.16.04.1" +- They made 22,000 requests to Discover on this collection today alone (and it's only 11AM): + +``` +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "06/Apr/2019" | grep 45.5.184.72 | grep -oE '/handle/[0-9]+/[0-9]+/discover' | sort | uniq -c + 22077 /handle/10568/72970/discover +``` + +- I need to find a contact at CIAT to tell them to use the REST API rather than crawling Discover +- Maria from Bioversity recommended that we use the phrase "AGROVOC subject" instead of "Subject" in Listings and Reports + - I made a pull request to update this and merged it to the `5_x-prod` branch ([#418](https://github.com/ilri/DSpace/pull/418)) + diff --git a/docs/2019-04/index.html b/docs/2019-04/index.html index 9ebdf427c..9bc721731 100644 --- a/docs/2019-04/index.html +++ b/docs/2019-04/index.html @@ -38,7 +38,7 @@ $ ./delete-metadata-values.py -i /tmp/2019-02-21-delete-1-region.csv -db dspace - + @@ -81,9 +81,9 @@ $ ./delete-metadata-values.py -i /tmp/2019-02-21-delete-1-region.csv -db dspace "@type": "BlogPosting", "headline": "April, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-04/", - "wordCount": "661", + "wordCount": "980", "datePublished": "2019-04-01T09:00:43+03:00", - "dateModified": 
"2019-04-05T23:07:30+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -286,6 +286,67 @@ $ ./delete-metadata-values.py -i /tmp/2019-02-21-delete-1-region.csv -db dspace
  • I restarted it again and all the Solr cores came up properly…
  • +

    2019-04-06

    + + + +
    # zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "06/Apr/2019:(06|07|08|09)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
    +    222 18.195.78.144
    +    245 207.46.13.58
    +    303 207.46.13.194
    +    328 66.249.79.33
    +    564 207.46.13.210
    +    566 66.249.79.62
    +    575 40.77.167.66
    +   1803 66.249.79.59
    +   2834 2a01:4f8:140:3192::2
    +   9623 45.5.184.72
    +# zcat --force /var/log/nginx/{rest,oai}.log /var/log/nginx/{rest,oai}.log.1 | grep -E "06/Apr/2019:(06|07|08|09)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
    +     31 66.249.79.62
    +     41 207.46.13.210
    +     42 40.77.167.66
    +     54 42.113.50.219
    +    132 66.249.79.59
    +    785 2001:41d0:d:1990::
    +   1164 45.5.184.72
    +   2014 50.116.102.77
    +   4267 45.5.186.2
    +   4893 205.186.128.185
    +
    + + + +
    GET /handle/10568/72970/discover?filtertype_0=type&filtertype_1=author&filter_relational_operator_1=contains&filter_relational_operator_0=equals&filter_1=&filter_0=Dataset&filtertype=dateIssued&filter_relational_operator=equals&filter=2014
    +
    + + + +
    # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "06/Apr/2019" | grep 45.5.184.72 | grep -oE '/handle/[0-9]+/[0-9]+/discover' | sort | uniq -c 
    +  22077 /handle/10568/72970/discover
    +
    + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index a752c5c8f..35364f1e1 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-04/ - 2019-04-05T22:22:41+03:00 + 2019-04-05T23:07:30+03:00 @@ -219,7 +219,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-04-05T22:22:41+03:00 + 2019-04-05T23:07:30+03:00 0 @@ -230,7 +230,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-04-05T22:22:41+03:00 + 2019-04-05T23:07:30+03:00 0 @@ -242,13 +242,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2019-04-05T22:22:41+03:00 + 2019-04-05T23:07:30+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-04-05T22:22:41+03:00 + 2019-04-05T23:07:30+03:00 0