diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 772e67324..65d509cc5 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -376,3 +376,29 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017- ``` - I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API +- About Baidu, I found a link to their [robots.txt tester tool](http://ziyuan.baidu.com/robots/) +- It seems like our robots.txt file is valid, and they claim to recognize that URLs like `/discover` should be forbidden: + +![Baidu robots.txt tester](/cgspace-notes/2017/11/baidu-robotstxt.png) + +- But they literally just made this request today: + +``` +180.76.15.136 - - [07/Nov/2017:06:25:11 +0000] "GET /discover?filtertype_0=crpsubject&filter_relational_operator_0=equals&filter_0=WATER%2C+LAND+AND+ECOSYSTEMS&filtertype=subject&filter_relational_operator=equals&filter=WATER+RESOURCES HTTP/1.1" 200 82265 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" +``` + +- Along with another thousand or so requests to URLs that are forbidden in robots.txt today alone: + +``` +# grep -c Baiduspider /var/log/nginx/access.log +3806 +# grep Baiduspider /var/log/nginx/access.log | grep -c -E "GET /(browse|discover|search-filter)" +1085 +``` + +- I will think about blocking their IPs but they have 164 of them! + +``` +# grep "Baiduspider/2.0" /var/log/nginx/access.log | awk '{print $1}' | sort -n | uniq | wc -l +164 +``` diff --git a/public/2017-11/index.html b/public/2017-11/index.html index a5111a0d4..527581e3c 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "1997", + "wordCount": "2084", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-07T17:03:49+02:00", + "dateModified": "2017-11-07T17:26:16+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -565,8 +565,29 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017- +

Baidu robots.txt tester

+ + + +
180.76.15.136 - - [07/Nov/2017:06:25:11 +0000] "GET /discover?filtertype_0=crpsubject&filter_relational_operator_0=equals&filter_0=WATER%2C+LAND+AND+ECOSYSTEMS&filtertype=subject&filter_relational_operator=equals&filter=WATER+RESOURCES HTTP/1.1" 200 82265 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
+
+ + + +
# grep -c Baiduspider /var/log/nginx/access.log
+3806
+# grep Baiduspider /var/log/nginx/access.log | grep -c -E "GET /(browse|discover|search-filter)"
+1085
+
+ diff --git a/public/2017/11/baidu-robotstxt.png b/public/2017/11/baidu-robotstxt.png new file mode 100644 index 000000000..29cb66563 Binary files /dev/null and b/public/2017/11/baidu-robotstxt.png differ diff --git a/public/robots.txt b/public/robots.txt index b90b33492..51a461e57 100644 --- a/public/robots.txt +++ b/public/robots.txt @@ -29,7 +29,7 @@ Disallow: /cgspace-notes/2015-12/ Disallow: /cgspace-notes/2015-11/ Disallow: /cgspace-notes/ Disallow: /cgspace-notes/categories/ -Disallow: /cgspace-notes/categories/notes/ Disallow: /cgspace-notes/tags/notes/ +Disallow: /cgspace-notes/categories/notes/ Disallow: /cgspace-notes/post/ Disallow: /cgspace-notes/tags/ diff --git a/public/sitemap.xml b/public/sitemap.xml index 403e64e7d..e653d34e3 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-11/ - 2017-11-07T17:03:49+02:00 + 2017-11-07T17:26:16+02:00 @@ -134,7 +134,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-11-07T17:03:49+02:00 + 2017-11-07T17:26:16+02:00 0 @@ -143,27 +143,27 @@ 0 + + https://alanorth.github.io/cgspace-notes/tags/notes/ + 2017-11-07T17:26:16+02:00 + 0 + + https://alanorth.github.io/cgspace-notes/categories/notes/ 2017-09-28T12:00:49+03:00 0 - - https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-11-07T17:03:49+02:00 - 0 - - https://alanorth.github.io/cgspace-notes/post/ - 2017-11-07T17:03:49+02:00 + 2017-11-07T17:26:16+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-11-07T17:03:49+02:00 + 2017-11-07T17:26:16+02:00 0 diff --git a/static/2017/11/baidu-robotstxt.png b/static/2017/11/baidu-robotstxt.png new file mode 100644 index 000000000..29cb66563 Binary files /dev/null and b/static/2017/11/baidu-robotstxt.png differ