From 97c4ce04f0f2c29bac2ec9ce8ed59a1ba896b324 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 6 Dec 2018 12:45:12 +0200 Subject: [PATCH] Add notes for 2018-12-06 --- content/posts/2018-12.md | 38 +++++++++++++++++++++++++++++++ docs/2018-12/index.html | 49 +++++++++++++++++++++++++++++++++++++--- docs/robots.txt | 2 +- docs/sitemap.xml | 20 ++++++++-------- 4 files changed, 95 insertions(+), 14 deletions(-) diff --git a/content/posts/2018-12.md b/content/posts/2018-12.md index cd41af389..ae6654dfd 100644 --- a/content/posts/2018-12.md +++ b/content/posts/2018-12.md @@ -308,4 +308,42 @@ $ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=78.46.79.71' dspace.log.2018-12-03 - Discuss RSS issues with IWMI and WLE people +## 2018-12-06 + +- Linode sent a message that the CPU usage of CGSpace (linode18) was too high last night +- I looked in the logs and there's nothing in particular going on: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "05/Dec/2018" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 1225 157.55.39.177 + 1240 207.46.13.12 + 1261 207.46.13.101 + 1411 207.46.13.157 + 1529 34.218.226.147 + 2085 50.116.102.77 + 3334 2a01:7e00::f03c:91ff:fe0a:d645 + 3733 66.249.70.27 + 3815 35.237.175.180 + 7669 54.70.40.11 +``` + +- `54.70.40.11` is some new bot with the following user agent: + +``` +Mozilla/5.0 (compatible) SemanticScholarBot (+https://www.semanticscholar.org/crawler) +``` + +- But Tomcat is forcing them to re-use their Tomcat sessions with the Crawler Session Manager valve: + +``` +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=54.70.40.11' dspace.log.2018-12-05 +6980 +$ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=54.70.40.11' dspace.log.2018-12-05 | sort | uniq | wc -l +1156 +``` + +- `2a01:7e00::f03c:91ff:fe0a:d645` appears to be the CKM dev server where Danny is testing harvesting via Drupal +- It seems they are hitting the XMLUI's OpenSearch a bit, but mostly on the REST API so no issues here yet +- `Drupal` is 
already in the Tomcat Crawler Session Manager Valve's regex so that's good! + diff --git a/docs/2018-12/index.html b/docs/2018-12/index.html index ed50c7886..67df7e03b 100644 --- a/docs/2018-12/index.html +++ b/docs/2018-12/index.html @@ -21,7 +21,7 @@ I noticed that there is another issue with PDF thumbnails on CGSpace, and I see " /> - + @@ -48,9 +48,9 @@ I noticed that there is another issue with PDF thumbnails on CGSpace, and I see "@type": "BlogPosting", "headline": "December, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-12/", - "wordCount": "1835", + "wordCount": "2009", "datePublished": "2018-12-02T02:09:30+02:00", - "dateModified": "2018-12-04T10:02:51+02:00", + "dateModified": "2018-12-05T17:20:48+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -457,6 +457,49 @@ $ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=78.46.79.71' dspace.log.2018-12-03
  • Discuss RSS issues with IWMI and WLE people
  • +

    2018-12-06

    + + + +
    # zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "05/Dec/2018" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
    +   1225 157.55.39.177
    +   1240 207.46.13.12
    +   1261 207.46.13.101
    +   1411 207.46.13.157
    +   1529 34.218.226.147
    +   2085 50.116.102.77
    +   3334 2a01:7e00::f03c:91ff:fe0a:d645
    +   3733 66.249.70.27
    +   3815 35.237.175.180
    +   7669 54.70.40.11
    +
    + + + +
    Mozilla/5.0 (compatible) SemanticScholarBot (+https://www.semanticscholar.org/crawler)
    +
    + + + +
    $ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=54.70.40.11' dspace.log.2018-12-05
    +6980
    +$ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=54.70.40.11' dspace.log.2018-12-05 | sort | uniq | wc -l
    +1156
    +
    + + + diff --git a/docs/robots.txt b/docs/robots.txt index e5d4b2f54..3d2efaa86 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -42,7 +42,7 @@ Disallow: /cgspace-notes/2015-12/ Disallow: /cgspace-notes/2015-11/ Disallow: /cgspace-notes/ Disallow: /cgspace-notes/categories/ -Disallow: /cgspace-notes/categories/notes/ Disallow: /cgspace-notes/tags/notes/ +Disallow: /cgspace-notes/categories/notes/ Disallow: /cgspace-notes/posts/ Disallow: /cgspace-notes/tags/ diff --git a/docs/sitemap.xml b/docs/sitemap.xml index ec16b6fc9..62efbbc80 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-12/ - 2018-12-04T10:02:51+02:00 + 2018-12-05T17:20:48+02:00 @@ -199,7 +199,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-12-04T10:02:51+02:00 + 2018-12-05T17:20:48+02:00 0 @@ -208,27 +208,27 @@ 0 + + https://alanorth.github.io/cgspace-notes/tags/notes/ + 2018-12-05T17:20:48+02:00 + 0 + + https://alanorth.github.io/cgspace-notes/categories/notes/ 2018-03-09T22:10:33+02:00 0 - - https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-12-04T10:02:51+02:00 - 0 - - https://alanorth.github.io/cgspace-notes/posts/ - 2018-12-04T10:02:51+02:00 + 2018-12-05T17:20:48+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-12-04T10:02:51+02:00 + 2018-12-05T17:20:48+02:00 0