From 323e968c2f13723738d6648c12e1ffaff6506aa8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 12 Jul 2018 08:35:39 +0300 Subject: [PATCH] Add notes for 2018-07-12 --- content/posts/2018-07.md | 58 +++++++++++++++++++++++++++++++++ docs/2018-07/index.html | 69 ++++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 +++--- 3 files changed, 129 insertions(+), 8 deletions(-) diff --git a/content/posts/2018-07.md b/content/posts/2018-07.md index e409ac6f4..404c41aef 100644 --- a/content/posts/2018-07.md +++ b/content/posts/2018-07.md @@ -257,4 +257,62 @@ $ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=95.108.181.88' dspace.log.2018-07- - Peter told Sisay to test this controlled vocabulary - Discuss meeting in Nairobi in October +## 2018-07-12 + +- Uptime Robot said that CGSpace went down a few times last night, around 10:45 PM and 12:30 AM +- Here are the top ten IPs from last night and this morning: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "11/Jul/2018:22" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 48 66.249.64.91 + 50 35.227.26.162 + 57 157.55.39.234 + 59 157.55.39.71 + 62 147.99.27.190 + 82 95.108.181.88 + 92 40.77.167.90 + 97 183.128.40.185 + 97 240e:f0:44:fa53:745a:8afe:d221:1232 + 3634 208.110.72.10 +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "12/Jul/2018:00" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 25 216.244.66.198 + 38 40.77.167.185 + 46 66.249.64.93 + 56 157.55.39.71 + 60 35.227.26.162 + 65 157.55.39.234 + 83 95.108.181.88 + 87 66.249.64.91 + 96 40.77.167.90 + 7075 208.110.72.10 +``` + +- We have never seen `208.110.72.10` before... so that's interesting! 
+- The user agent for these requests is: Pcore-HTTP/v0.44.0 +- A brief Google search doesn't turn up any information about what this bot is, but it does turn up lots of users complaining about it +- This bot does make a lot of requests all through the day, although it seems to re-use its Tomcat session: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 17098 208.110.72.10 +# grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=208.110.72.10' dspace.log.2018-07-11 +1161 +# grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=208.110.72.10' dspace.log.2018-07-12 +1885 +``` + +- I think the problem is that, despite the bot requesting `robots.txt`, it almost exclusively requests dynamic pages from `/discover`: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | grep -o -E "GET /(browse|discover|search-filter)" | sort -n | uniq -c | sort -rn + 13364 GET /discover + 993 GET /search-filter + 804 GET /browse +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | grep robots +208.110.72.10 - - [12/Jul/2018:00:22:28 +0000] "GET /robots.txt HTTP/1.1" 200 1301 "https://cgspace.cgiar.org/robots.txt" "Pcore-HTTP/v0.44.0 +``` + +- So this bot is just like Baiduspider, and I need to add it to the nginx rate limiting +- I'll also add it to Tomcat's Crawler Session Manager Valve to force the re-use of a common Tomcat session for all crawlers just in case + diff --git a/docs/2018-07/index.html b/docs/2018-07/index.html index 94cb05416..2ddf4c474 100644 --- a/docs/2018-07/index.html +++ b/docs/2018-07/index.html @@ -30,7 +30,7 @@ There is insufficient memory for the Java Runtime Environment to continue. - + @@ -71,9 +71,9 @@ There is insufficient memory for the Java Runtime Environment to continue. 
"@type": "BlogPosting", "headline": "July, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-07/", - "wordCount": "1740", + "wordCount": "2079", "datePublished": "2018-07-01T12:56:54+03:00", - "dateModified": "2018-07-10T17:34:30+03:00", + "dateModified": "2018-07-11T16:55:30+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -432,6 +432,69 @@ org.apache.solr.client.solrj.SolrServerException: IOException occured when talki +

2018-07-12

+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "11/Jul/2018:22" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+     48 66.249.64.91
+     50 35.227.26.162
+     57 157.55.39.234
+     59 157.55.39.71
+     62 147.99.27.190
+     82 95.108.181.88
+     92 40.77.167.90
+     97 183.128.40.185
+     97 240e:f0:44:fa53:745a:8afe:d221:1232
+   3634 208.110.72.10
+# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "12/Jul/2018:00" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+     25 216.244.66.198
+     38 40.77.167.185
+     46 66.249.64.93
+     56 157.55.39.71
+     60 35.227.26.162
+     65 157.55.39.234
+     83 95.108.181.88
+     87 66.249.64.91
+     96 40.77.167.90
+   7075 208.110.72.10
+
+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+  17098 208.110.72.10
+# grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=208.110.72.10' dspace.log.2018-07-11
+1161
+# grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=208.110.72.10' dspace.log.2018-07-12
+1885
+
+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | grep -o -E "GET /(browse|discover|search-filter)" | sort -n | uniq -c | sort -rn
+  13364 GET /discover
+    993 GET /search-filter
+    804 GET /browse
+# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "Pcore-HTTP" | grep robots
+208.110.72.10 - - [12/Jul/2018:00:22:28 +0000] "GET /robots.txt HTTP/1.1" 200 1301 "https://cgspace.cgiar.org/robots.txt" "Pcore-HTTP/v0.44.0
+
+ + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 2b7825fde..2948400dd 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-07/ - 2018-07-10T17:34:30+03:00 + 2018-07-11T16:55:30+03:00 @@ -174,7 +174,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-07-10T17:34:30+03:00 + 2018-07-11T16:55:30+03:00 0 @@ -185,7 +185,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-07-10T17:34:30+03:00 + 2018-07-11T16:55:30+03:00 0 @@ -197,13 +197,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-07-10T17:34:30+03:00 + 2018-07-11T16:55:30+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-07-10T17:34:30+03:00 + 2018-07-11T16:55:30+03:00 0