diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 382708db7..84a547f7e 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -253,7 +253,7 @@ $ grep -c 207.46.13.36 /var/log/nginx/access.log.1 - I think I will end up blocking Baidu as well... - Next is for me to look and see what was happening specifically at 3AM and 7AM when the server crashed - I should look in nginx access.log, rest.log, oai.log, and DSpace's dspace.log.2017-11-07 -- Here are the top IPs making requests to XMLUI from 2–8 AM: +- Here are the top IPs making requests to XMLUI from 2 to 8 AM: ``` # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail @@ -270,10 +270,10 @@ $ grep -c 207.46.13.36 /var/log/nginx/access.log.1 ``` - Of those, most are Google, Bing, Yahoo, etc, except 63.143.42.244 and 63.143.42.242 which are Uptime Robot -- Here are the top IPs making requests to REST from 2–8 AM: +- Here are the top IPs making requests to REST from 2 to 8 AM: ``` -# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail +# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail 8 207.241.229.237 10 66.249.66.90 16 104.196.152.243 @@ -465,9 +465,9 @@ proxy_set_header User-Agent $ua; - It seems that they rarely even bother checking `robots.txt`, but Google does multiple times per day! 
``` -# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt +# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt 14 -# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt +# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt 1134 ``` @@ -482,7 +482,7 @@ proxy_set_header User-Agent $ua; - Awesome, it seems my bot mapping stuff in nginx actually reduced the number of Tomcat sessions used by the CIAT scraper today, total requests and unique sessions: ``` -# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243 +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243 5769 $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 223 @@ -495,9 +495,9 @@ $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{3 10216 $ grep 104.196.152.243 dspace.log.2017-11-08 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 2592 -# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243 +# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243 8120 -$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l +$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 3506 ``` diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 3fb07be9e..5f0b88bc0 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "2877", + "wordCount": "2881", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": 
"2017-11-08T22:36:15+02:00", + "dateModified": "2017-11-09T17:41:14+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -433,7 +433,7 @@ $ grep 104.196.152.243 dspace.log.2017-11-01 | grep -o -E 'session_id=[A-Z0-9]{3
# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
@@ -451,10 +451,10 @@ $ grep 104.196.152.243 dspace.log.2017-11-01 | grep -o -E 'session_id=[A-Z0-9]{3
- Of those, most are Google, Bing, Yahoo, etc, except 63.143.42.244 and 63.143.42.242 which are Uptime Robot
-- Here are the top IPs making requests to REST from 2–8 AM:
+- Here are the top IPs making requests to REST from 2 to 8 AM:
-# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
+# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
8 207.241.229.237
10 66.249.66.90
16 104.196.152.243
@@ -664,9 +664,9 @@ proxy_set_header User-Agent $ua;
It seems that they rarely even bother checking `robots.txt`, but Google does multiple times per day!
-# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt
+# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt
14
-# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt
+# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt
1134
@@ -684,7 +684,7 @@ proxy_set_header User-Agent $ua;
Awesome, it seems my bot mapping stuff in nginx actually reduced the number of Tomcat sessions used by the CIAT scraper today. Here are the total requests and unique sessions:
-# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243
+# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243
5769
$ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
223
@@ -698,9 +698,9 @@ $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{3
10216
$ grep 104.196.152.243 dspace.log.2017-11-08 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
2592
-# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243
+# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243
8120
-$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
+$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
3506
diff --git a/public/sitemap.xml b/public/sitemap.xml
index bf181634a..df2368a69 100644
--- a/public/sitemap.xml
+++ b/public/sitemap.xml
@@ -4,7 +4,7 @@
https://alanorth.github.io/cgspace-notes/2017-11/
- 2017-11-08T22:36:15+02:00
+ 2017-11-09T17:41:14+02:00
@@ -134,7 +134,7 @@
https://alanorth.github.io/cgspace-notes/
- 2017-11-08T22:36:15+02:00
+ 2017-11-09T17:41:14+02:00
0
@@ -145,7 +145,7 @@
https://alanorth.github.io/cgspace-notes/tags/notes/
- 2017-11-08T22:36:15+02:00
+ 2017-11-09T17:41:14+02:00
0
@@ -157,13 +157,13 @@
https://alanorth.github.io/cgspace-notes/post/
- 2017-11-08T22:36:15+02:00
+ 2017-11-09T17:41:14+02:00
0
https://alanorth.github.io/cgspace-notes/tags/
- 2017-11-08T22:36:15+02:00
+ 2017-11-09T17:41:14+02:00
0