From f2ef00d1e93fc92f93fab81909d7d5d3f7db7013 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 12 Nov 2017 10:41:44 +0200 Subject: [PATCH] Update notes for 2017-11-12 --- content/post/2017-11.md | 41 ++++++++++++++++++++++++++++++++ public/2017-11/index.html | 49 ++++++++++++++++++++++++++++++++++++--- public/sitemap.xml | 10 ++++---- 3 files changed, 92 insertions(+), 8 deletions(-) diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 2aacbf202..c96fdb221 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -514,3 +514,44 @@ $ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{3 ## 2017-11-12 - Update the [Ansible infrastructure templates](https://github.com/ilri/rmg-ansible-public) to be a little more modular and flexible +- Looking at the top client IPs on CGSpace so far this morning, even though it's only been eight hours: + +``` +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "12/Nov/2017" | awk '{print $1}' | sort -n | uniq -c | sort -h | tail + 243 5.83.120.111 + 335 40.77.167.103 + 424 66.249.66.91 + 529 207.46.13.36 + 554 40.77.167.129 + 604 207.46.13.53 + 754 104.196.152.243 + 883 66.249.66.90 + 1150 95.108.181.88 + 1381 5.9.6.51 +``` + +- 5.9.6.51 seems to be a Russian bot: + +``` +# grep 5.9.6.51 /var/log/nginx/access.log | tail -n 1 +5.9.6.51 - - [12/Nov/2017:08:13:13 +0000] "GET /handle/10568/16515/recent-submissions HTTP/1.1" 200 5097 "-" "Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +http://megaindex.com/crawler)" +``` + +- What's amazing is that it seems to reuse its Java session across all requests: + +``` +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=5.9.6.51' /home/cgspace.cgiar.org/log/dspace.log.2017-11-12 +1558 +$ grep 5.9.6.51 /home/cgspace.cgiar.org/log/dspace.log.2017-11-12 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l +1 +``` + +- Bravo to MegaIndex.ru! +- The same cannot be said for 95.108.181.88, which appears to be YandexBot, even though Tomcat's Crawler Session Manager valve regex should match 'YandexBot': + +``` +# grep 95.108.181.88 /var/log/nginx/access.log | tail -n 1 +95.108.181.88 - - [12/Nov/2017:08:33:17 +0000] "GET /bitstream/handle/10568/57004/GenebankColombia_23Feb2015.pdf HTTP/1.1" 200 972019 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" +$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=95.108.181.88' /home/cgspace.cgiar.org/log/dspace.log.2017-11-12 +991 +``` diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 7477c51fd..5a18b3b86 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "2964", + "wordCount": "3150", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-10T13:52:33+02:00", + "dateModified": "2017-11-12T10:19:47+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -722,8 +722,51 @@ $ grep 104.196.152.243 dspace.log.2017-11-07 | grep -o -E 'session_id=[A-Z0-9]{3 +
# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "12/Nov/2017" | awk '{print $1}' | sort -n | uniq -c | sort -h | tail
+    243 5.83.120.111
+    335 40.77.167.103
+    424 66.249.66.91
+    529 207.46.13.36
+    554 40.77.167.129
+    604 207.46.13.53
+    754 104.196.152.243
+    883 66.249.66.90
+   1150 95.108.181.88
+   1381 5.9.6.51
+
+ + + +
# grep 5.9.6.51 /var/log/nginx/access.log | tail -n 1
+5.9.6.51 - - [12/Nov/2017:08:13:13 +0000] "GET /handle/10568/16515/recent-submissions HTTP/1.1" 200 5097 "-" "Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +http://megaindex.com/crawler)"
+
+ + + +
$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=5.9.6.51' /home/cgspace.cgiar.org/log/dspace.log.2017-11-12
+1558
+$ grep 5.9.6.51 /home/cgspace.cgiar.org/log/dspace.log.2017-11-12 | grep -o -E 'session_id=[A-Z0-9]{32}' | sort -n | uniq | wc -l
+1
+
+ + + +
# grep 95.108.181.88 /var/log/nginx/access.log | tail -n 1
+95.108.181.88 - - [12/Nov/2017:08:33:17 +0000] "GET /bitstream/handle/10568/57004/GenebankColombia_23Feb2015.pdf HTTP/1.1" 200 972019 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
+$ grep -c -E 'session_id=[A-Z0-9]{32}:ip_addr=95.108.181.88' /home/cgspace.cgiar.org/log/dspace.log.2017-11-12
+991
+
+ diff --git a/public/sitemap.xml b/public/sitemap.xml index 30c00c1dd..2ccafa5b4 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-11/ - 2017-11-10T13:52:33+02:00 + 2017-11-12T10:19:47+02:00 @@ -134,7 +134,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-11-10T13:52:33+02:00 + 2017-11-12T10:19:47+02:00 0 @@ -145,7 +145,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-11-10T13:52:33+02:00 + 2017-11-12T10:19:47+02:00 0 @@ -157,13 +157,13 @@ https://alanorth.github.io/cgspace-notes/post/ - 2017-11-10T13:52:33+02:00 + 2017-11-12T10:19:47+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-11-10T13:52:33+02:00 + 2017-11-12T10:19:47+02:00 0