diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 382708db7..84a547f7e 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -253,7 +253,7 @@ $ grep -c 207.46.13.36 /var/log/nginx/access.log.1 - I think I will end up blocking Baidu as well... - Next is for me to look and see what was happening specifically at 3AM and 7AM when the server crashed - I should look in nginx access.log, rest.log, oai.log, and DSpace's dspace.log.2017-11-07 -- Here are the top IPs making requests to XMLUI from 2–8 AM: +- Here are the top IPs making requests to XMLUI from 2 to 8 AM: ``` # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail @@ -270,10 +270,10 @@ $ grep -c 207.46.13.36 /var/log/nginx/access.log.1 ``` - Of those, most are Google, Bing, Yahoo, etc, except 63.143.42.244 and 63.143.42.242 which are Uptime Robot -- Here are the top IPs making requests to REST from 2–8 AM: +- Here are the top IPs making requests to REST from 2 to 8 AM: ``` -# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail +# cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail 8 207.241.229.237 10 66.249.66.90 16 104.196.152.243 @@ -465,9 +465,9 @@ proxy_set_header User-Agent $ua; - It seems that they rarely even bother checking `robots.txt`, but Google does multiple times per day! ``` -# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt +# zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt 14 -# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt +# zgrep Googlebot /var/log/nginx/access.log* | grep -c robots.txt 1134 ``` @@ -482,7 +482,7 @@ proxy_set_header User-Agent $ua; - Awesome, it seems my bot mapping stuff in nginx actually reduced the number of Tomcat sessions used by the CIAT scraper today, total requests and unique sessions: ``` -# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243 +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243 5769 $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 223 @@ -495,9 +495,9 @@ $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{3 10216 $ grep 104.196.152.243 dspace.log.2017-11-08 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 2592 -# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243 +# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243 8120 -$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l +$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l 3506 ``` diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 3fb07be9e..5f0b88bc0 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "2877", + "wordCount": "2881", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-08T22:36:15+02:00", + "dateModified": "2017-11-09T17:41:14+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -433,7 +433,7 @@ $ grep 104.196.152.243 dspace.log.2017-11-01 | grep -o -E 'session_id=[A-Z0-9]{3
  • I think I will end up blocking Baidu as well…
  • Next is for me to look and see what was happening specifically at 3AM and 7AM when the server crashed
  • I should look in nginx access.log, rest.log, oai.log, and DSpace’s dspace.log.2017-11-07
  • -
  • Here are the top IPs making requests to XMLUI from 2–8 AM:
  • +
  • Here are the top IPs making requests to XMLUI from 2 to 8 AM:
  • # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
    @@ -451,10 +451,10 @@ $ grep 104.196.152.243 dspace.log.2017-11-01 | grep -o -E 'session_id=[A-Z0-9]{3
     
     
     
    -
    # cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail                                                                        
    +
    # cat /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep -E '07/Nov/2017:0[2-8]' | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
           8 207.241.229.237
          10 66.249.66.90
          16 104.196.152.243
    @@ -664,9 +664,9 @@ proxy_set_header User-Agent $ua;
     
  • It seems that they rarely even bother checking robots.txt, but Google does multiple times per day!
  • -
    # zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt 
    +
    # zgrep Baiduspider /var/log/nginx/access.log* | grep -c robots.txt
     14
    -# zgrep Googlebot  /var/log/nginx/access.log* | grep -c robots.txt 
    +# zgrep Googlebot  /var/log/nginx/access.log* | grep -c robots.txt
     1134
     
    @@ -684,7 +684,7 @@ proxy_set_header User-Agent $ua;
  • Awesome, it seems my bot mapping stuff in nginx actually reduced the number of Tomcat sessions used by the CIAT scraper today, total requests and unique sessions:
  • -
    # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243 
    +
    # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep '09/Nov/2017' | grep -c 104.196.152.243
     5769
     $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
     223
    @@ -698,9 +698,9 @@ $ grep 104.196.152.243 dspace.log.2017-11-09 | grep -o -E 'session_id=[A-Z0-9]{3
     10216
     $ grep 104.196.152.243 dspace.log.2017-11-08 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
     2592
    -# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243                                          
    +# zcat -f -- /var/log/nginx/access.log.2.gz /var/log/nginx/access.log.3.gz | grep '07/Nov/2017' | grep -c 104.196.152.243
     8120
    -$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l         
    +$ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
     3506
     
    diff --git a/public/sitemap.xml b/public/sitemap.xml index bf181634a..df2368a69 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-11/ - 2017-11-08T22:36:15+02:00 + 2017-11-09T17:41:14+02:00 @@ -134,7 +134,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-11-08T22:36:15+02:00 + 2017-11-09T17:41:14+02:00 0 @@ -145,7 +145,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-11-08T22:36:15+02:00 + 2017-11-09T17:41:14+02:00 0 @@ -157,13 +157,13 @@ https://alanorth.github.io/cgspace-notes/post/ - 2017-11-08T22:36:15+02:00 + 2017-11-09T17:41:14+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-11-08T22:36:15+02:00 + 2017-11-09T17:41:14+02:00 0