From 5acf4589373e924f9bb2f464bf27c8447076fb1d Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 29 Oct 2017 10:02:34 +0200 Subject: [PATCH] Add notes for 2017-10-29 --- content/post/2017-10.md | 26 ++++++++++++++++++++++++++ public/2017-10/index.html | 36 +++++++++++++++++++++++++++++++++--- public/sitemap.xml | 10 +++++----- 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/content/post/2017-10.md b/content/post/2017-10.md index a716d262e..26d46d174 100644 --- a/content/post/2017-10.md +++ b/content/post/2017-10.md @@ -198,3 +198,29 @@ http://library.cgiar.org/browse?value=Intellectual%20Assets%20Reports&type=subje ## 2017-10-28 - Linode alerted about high CPU usage again on CGSpace around 2AM this morning + +## 2017-10-29 + +- Linode alerted about high CPU usage again on CGSpace around 2AM and 4AM +- I'm still not sure why this started causing alerts so repeatedly the past week +- I don't see any telltale signs in the REST or OAI logs, so trying to do rudimentary analysis in DSpace logs: + +``` +# grep '2017-10-29 02:' dspace.log.2017-10-29 | grep -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l +2049 +``` + +- So there were 2049 unique sessions during the hour of 2AM +- Looking at my notes, the number of unique sessions was about the same during the same hour on other days when there were no alerts +- I think I'll need to enable access logging in nginx to figure out what's going on +- After enabling logging on requests to XMLUI on `/` I see some new bot I've never seen before: + +``` +137.108.70.6 - - [29/Oct/2017:07:39:49 +0000] "GET /discover?filtertype_0=type&filter_relational_operator_0=equals&filter_0=Internal+Document&filtertype=author&filter_relational_operator=equals&filter=CGIAR+Secretariat HTTP/1.1" 200 7776 "-" "Mozilla/5.0 (compatible; CORE/0.6; +http://core.ac.uk; http://core.ac.uk/intro/contact)" +``` + +- CORE seems to be some bot that is "Aggregating the world’s open access research papers" +- The contact address listed in 
their bot's user agent is incorrect, correct page is simply: https://core.ac.uk/contact +- I will check the logs in a few days to see if they are harvesting us regularly, then add their bot's user agent to the Tomcat Crawler Session Valve +- After browsing the CORE site it seems that the CGIAR Library is somehow a member of CORE, so they have probably only been harvesting CGSpace since we did the migration, as library.cgiar.org directs to us now +- For now I will just contact them to have them update their contact info in the bot's user agent, but eventually I think I'll tell them to swap out the CGIAR Library entry for CGSpace diff --git a/public/2017-10/index.html b/public/2017-10/index.html index 0cbc51a80..38526a81b 100644 --- a/public/2017-10/index.html +++ b/public/2017-10/index.html @@ -28,7 +28,7 @@ Add Katherine Lutz to the groups for content sumission and edit steps of the CGI - + @@ -66,9 +66,9 @@ Add Katherine Lutz to the groups for content sumission and edit steps of the CGI "@type": "BlogPosting", "headline": "October, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-10/", - "wordCount": "1566", + "wordCount": "1851", "datePublished": "2017-10-01T08:07:54+03:00", - "dateModified": "2017-10-26T17:50:10+03:00", + "dateModified": "2017-10-28T11:31:47+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -365,6 +365,36 @@ Add Katherine Lutz to the groups for content sumission and edit steps of the CGI
  • Linode alerted about high CPU usage again on CGSpace around 2AM this morning
  • +

    2017-10-29

    + + + +
    # grep '2017-10-29 02:' dspace.log.2017-10-29 | grep -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
    +2049
    +
    + + + +
    137.108.70.6 - - [29/Oct/2017:07:39:49 +0000] "GET /discover?filtertype_0=type&filter_relational_operator_0=equals&filter_0=Internal+Document&filtertype=author&filter_relational_operator=equals&filter=CGIAR+Secretariat HTTP/1.1" 200 7776 "-" "Mozilla/5.0 (compatible; CORE/0.6; +http://core.ac.uk; http://core.ac.uk/intro/contact)"
    +
    + + + diff --git a/public/sitemap.xml b/public/sitemap.xml index 329e25dfb..36f123594 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-10/ - 2017-10-26T17:50:10+03:00 + 2017-10-28T11:31:47+02:00 @@ -129,7 +129,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-10-26T17:50:10+03:00 + 2017-10-28T11:31:47+02:00 0 @@ -140,7 +140,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-10-26T17:50:10+03:00 + 2017-10-28T11:31:47+02:00 0 @@ -152,13 +152,13 @@ https://alanorth.github.io/cgspace-notes/post/ - 2017-10-26T17:50:10+03:00 + 2017-10-28T11:31:47+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-10-26T17:50:10+03:00 + 2017-10-28T11:31:47+02:00 0