From ed623594e9f655785fe8fb9a129f633f0bd7638f Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 4 Nov 2018 12:18:52 +0200 Subject: [PATCH] Update notes for 2018-11-04 --- content/posts/2018-11.md | 67 ++++++++++++++++++++++++++++++++- docs/2018-11/index.html | 80 ++++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 ++--- 3 files changed, 147 insertions(+), 10 deletions(-) diff --git a/content/posts/2018-11.md b/content/posts/2018-11.md index ec2878a3f..6caebf544 100644 --- a/content/posts/2018-11.md +++ b/content/posts/2018-11.md @@ -125,6 +125,71 @@ $ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=78.46.89.18' /home/cgspace.cgiar.o - If they want to download all our metadata and PDFs they should use an API rather than scraping the XMLUI - I will add them to the list of bot IPs in nginx for now and think about enforcing rate limits in XMLUI later -- Also, this is the third (?) time a mysterious IP in Hetzner has done this... who is this? +- Also, this is the third (?) time a mysterious IP on Hetzner has done this... who is this? + +## 2018-11-04 + +- Forward Peter's information about CGSpace financials to Modi from ICRISAT +- Linode emailed about the CPU load and outgoing bandwidth on CGSpace (linode18) again +- Here are the top ten IPs active so far this morning: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 1083 2a03:2880:11ff:2::face:b00c + 1105 2a03:2880:11ff:d::face:b00c + 1111 2a03:2880:11ff:f::face:b00c + 1134 84.38.130.177 + 1893 50.116.102.77 + 2040 66.249.64.63 + 4210 66.249.64.61 + 4534 70.32.83.92 + 13036 78.46.89.18 + 20407 66.249.64.59 +``` + +- `78.46.89.18` is back... and still making tons of Tomcat sessions: + +``` +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=78.46.89.18' dspace.log.2018-11-04 | sort | uniq +8765 +``` + +- Also, now we have a ton of Facebook crawlers: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | grep "2a03:2880:11ff:" | awk '{print $1}' | sort | uniq -c | sort -n + 905 2a03:2880:11ff:b::face:b00c + 955 2a03:2880:11ff:5::face:b00c + 965 2a03:2880:11ff:e::face:b00c + 984 2a03:2880:11ff:8::face:b00c + 993 2a03:2880:11ff:3::face:b00c + 994 2a03:2880:11ff:7::face:b00c + 1006 2a03:2880:11ff:10::face:b00c + 1011 2a03:2880:11ff:4::face:b00c + 1023 2a03:2880:11ff:6::face:b00c + 1026 2a03:2880:11ff:9::face:b00c + 1039 2a03:2880:11ff:1::face:b00c + 1043 2a03:2880:11ff:c::face:b00c + 1070 2a03:2880:11ff::face:b00c + 1075 2a03:2880:11ff:a::face:b00c + 1093 2a03:2880:11ff:2::face:b00c + 1107 2a03:2880:11ff:d::face:b00c + 1116 2a03:2880:11ff:f::face:b00c +``` + +- They are really making shit tons of Tomcat sessions: + +``` +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=2a03:2880:11ff' dspace.log.2018-11-04 | sort | uniq +14368 +``` + +- Their user agent is: + +``` +facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php) +``` + +- I will add it to the Tomcat Crawler Session Manager valve diff --git a/docs/2018-11/index.html b/docs/2018-11/index.html index 2c3bc422b..5c836fd2e 100644 --- a/docs/2018-11/index.html +++ b/docs/2018-11/index.html @@ -23,7 +23,7 @@ Today these are the top 10 IPs: " /> - + @@ -52,9 +52,9 @@ Today these are the top 10 IPs: "@type": "BlogPosting", "headline": "November, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-11/", - "wordCount": "586", + "wordCount": "791", "datePublished": "2018-11-01T16:41:30+02:00", - "dateModified": "2018-11-03T18:13:49+02:00", + "dateModified": "2018-11-04T01:02:29+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -251,7 +251,79 @@ Today these are the top 10 IPs: + +

2018-11-04

+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+   1083 2a03:2880:11ff:2::face:b00c
+   1105 2a03:2880:11ff:d::face:b00c
+   1111 2a03:2880:11ff:f::face:b00c
+   1134 84.38.130.177
+   1893 50.116.102.77
+   2040 66.249.64.63
+   4210 66.249.64.61
+   4534 70.32.83.92
+  13036 78.46.89.18
+  20407 66.249.64.59
+
+ + + +
$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=78.46.89.18' dspace.log.2018-11-04 | sort | uniq
+8765
+
+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | grep "2a03:2880:11ff:" | awk '{print $1}' | sort | uniq -c | sort -n
+    905 2a03:2880:11ff:b::face:b00c
+    955 2a03:2880:11ff:5::face:b00c
+    965 2a03:2880:11ff:e::face:b00c
+    984 2a03:2880:11ff:8::face:b00c
+    993 2a03:2880:11ff:3::face:b00c
+    994 2a03:2880:11ff:7::face:b00c
+   1006 2a03:2880:11ff:10::face:b00c
+   1011 2a03:2880:11ff:4::face:b00c
+   1023 2a03:2880:11ff:6::face:b00c
+   1026 2a03:2880:11ff:9::face:b00c
+   1039 2a03:2880:11ff:1::face:b00c
+   1043 2a03:2880:11ff:c::face:b00c
+   1070 2a03:2880:11ff::face:b00c
+   1075 2a03:2880:11ff:a::face:b00c
+   1093 2a03:2880:11ff:2::face:b00c
+   1107 2a03:2880:11ff:d::face:b00c
+   1116 2a03:2880:11ff:f::face:b00c
+
+ + + +
$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=2a03:2880:11ff' dspace.log.2018-11-04 | sort | uniq
+14368
+
+ + + +
facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)
+
+ + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index e3b842c53..9e4687605 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-11/ - 2018-11-03T18:13:49+02:00 + 2018-11-04T01:02:29+02:00 @@ -194,7 +194,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-11-03T18:13:49+02:00 + 2018-11-04T01:02:29+02:00 0 @@ -205,7 +205,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-11-03T18:13:49+02:00 + 2018-11-04T01:02:29+02:00 0 @@ -217,13 +217,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-11-03T18:13:49+02:00 + 2018-11-04T01:02:29+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-11-03T18:13:49+02:00 + 2018-11-04T01:02:29+02:00 0