From 6f561ce4b5ff6064782c5e0436b4b4bf20c450a8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 4 Nov 2018 22:45:00 +0200 Subject: [PATCH] Update notes for 2018-11-04 --- content/posts/2018-11.md | 41 ++++++++++++++++++++++++++++++++ docs/2018-11/index.html | 50 +++++++++++++++++++++++++++++++++++++--- docs/sitemap.xml | 10 ++++---- 3 files changed, 93 insertions(+), 8 deletions(-) diff --git a/content/posts/2018-11.md b/content/posts/2018-11.md index 6caebf544..127196b38 100644 --- a/content/posts/2018-11.md +++ b/content/posts/2018-11.md @@ -191,5 +191,46 @@ facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php) ``` - I will add it to the Tomcat Crawler Session Manager valve +- Later in the evening... ok, this Facebook bot is getting super annoying: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | grep "2a03:2880:11ff:" | awk '{print $1}' | sort | uniq -c | sort -n + 1871 2a03:2880:11ff:3::face:b00c + 1885 2a03:2880:11ff:b::face:b00c + 1941 2a03:2880:11ff:8::face:b00c + 1942 2a03:2880:11ff:e::face:b00c + 1987 2a03:2880:11ff:1::face:b00c + 2023 2a03:2880:11ff:2::face:b00c + 2027 2a03:2880:11ff:4::face:b00c + 2032 2a03:2880:11ff:9::face:b00c + 2034 2a03:2880:11ff:10::face:b00c + 2050 2a03:2880:11ff:5::face:b00c + 2061 2a03:2880:11ff:c::face:b00c + 2076 2a03:2880:11ff:6::face:b00c + 2093 2a03:2880:11ff:7::face:b00c + 2107 2a03:2880:11ff::face:b00c + 2118 2a03:2880:11ff:d::face:b00c + 2164 2a03:2880:11ff:a::face:b00c + 2178 2a03:2880:11ff:f::face:b00c +``` + +- And still making shit tons of Tomcat sessions: + +``` +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=2a03:2880:11ff' dspace.log.2018-11-04 | sort | uniq +28470 +``` + +- And that's even using the Tomcat Crawler Session Manager valve! 
+- Maybe we need to limit more dynamic pages, like the "most popular" country, item, and author pages +- It seems these are popular too, and there is no fucking way Facebook needs that information, yet they are requesting thousands of them! + +``` +# grep 'face:b00c' /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -c 'most-popular/' +7033 +``` + +- I added the "most-popular" pages to the list of pages that return `X-Robots-Tag: none` to try to inform bots not to index or follow those pages +- Also, I implemented an nginx rate limit of twelve requests per minute on all dynamic pages... I figure a human user might legitimately request one every five seconds diff --git a/docs/2018-11/index.html b/docs/2018-11/index.html index 5c836fd2e..3174f1e8a 100644 --- a/docs/2018-11/index.html +++ b/docs/2018-11/index.html @@ -23,7 +23,7 @@ Today these are the top 10 IPs: " /> - + @@ -52,9 +52,9 @@ Today these are the top 10 IPs: "@type": "BlogPosting", "headline": "November, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-11/", - "wordCount": "791", + "wordCount": "992", "datePublished": "2018-11-01T16:41:30+02:00", - "dateModified": "2018-11-04T01:02:29+02:00", + "dateModified": "2018-11-04T12:18:52+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -324,6 +324,50 @@ Today these are the top 10 IPs: + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "04/Nov/2018" | grep "2a03:2880:11ff:" | awk '{print $1}' | sort | uniq -c | sort -n
+   1871 2a03:2880:11ff:3::face:b00c
+   1885 2a03:2880:11ff:b::face:b00c
+   1941 2a03:2880:11ff:8::face:b00c
+   1942 2a03:2880:11ff:e::face:b00c
+   1987 2a03:2880:11ff:1::face:b00c
+   2023 2a03:2880:11ff:2::face:b00c
+   2027 2a03:2880:11ff:4::face:b00c
+   2032 2a03:2880:11ff:9::face:b00c
+   2034 2a03:2880:11ff:10::face:b00c
+   2050 2a03:2880:11ff:5::face:b00c
+   2061 2a03:2880:11ff:c::face:b00c
+   2076 2a03:2880:11ff:6::face:b00c
+   2093 2a03:2880:11ff:7::face:b00c
+   2107 2a03:2880:11ff::face:b00c
+   2118 2a03:2880:11ff:d::face:b00c
+   2164 2a03:2880:11ff:a::face:b00c
+   2178 2a03:2880:11ff:f::face:b00c
+
+ + + +
$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=2a03:2880:11ff' dspace.log.2018-11-04 | sort | uniq
+28470
+
+ + + +
# grep 'face:b00c' /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -c 'most-popular/'
+7033
+
+ + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 9e4687605..fee4bd7fd 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-11/ - 2018-11-04T01:02:29+02:00 + 2018-11-04T12:18:52+02:00 @@ -194,7 +194,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-11-04T01:02:29+02:00 + 2018-11-04T12:18:52+02:00 0 @@ -205,7 +205,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-11-04T01:02:29+02:00 + 2018-11-04T12:18:52+02:00 0 @@ -217,13 +217,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-11-04T01:02:29+02:00 + 2018-11-04T12:18:52+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-11-04T01:02:29+02:00 + 2018-11-04T12:18:52+02:00 0