From 6a60dfa9e43cdb211e6c692ceef6ab263e3ea713 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 7 Nov 2017 18:23:10 +0200 Subject: [PATCH] Update notes for 2017-11-07 --- content/post/2017-11.md | 2 +- public/2017-11/index.html | 16 ++++++++++++---- public/sitemap.xml | 10 +++++----- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 65d509cc5..3cc4e1609 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -377,7 +377,7 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017- - I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API - About Baidu, I found a link to their [robots.txt tester tool](http://ziyuan.baidu.com/robots/) -- It seems like our robots.txt file is valid, and they claim to recognize that URLs like `/discover` should be forbidden: +- It seems like our robots.txt file is valid, and they claim to recognize that URLs like `/discover` should be forbidden (不允许, aka "not allowed"): ![Baidu robots.txt tester](/cgspace-notes/2017/11/baidu-robotstxt.png) diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 527581e3c..c7ad416e1 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "2084", + "wordCount": "2118", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-07T17:26:16+02:00", + "dateModified": "2017-11-07T18:09:29+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -566,7 +566,7 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-

Baidu robots.txt tester

@@ -588,6 +588,14 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017- 1085 + + +
# grep "Baiduspider/2.0" /var/log/nginx/access.log | awk '{print $1}' | sort -n | uniq | wc -l
+164
+
+ diff --git a/public/sitemap.xml b/public/sitemap.xml index e653d34e3..8e35571fb 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-11/ - 2017-11-07T17:26:16+02:00 + 2017-11-07T18:09:29+02:00 @@ -134,7 +134,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-11-07T17:26:16+02:00 + 2017-11-07T18:09:29+02:00 0 @@ -145,7 +145,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-11-07T17:26:16+02:00 + 2017-11-07T18:09:29+02:00 0 @@ -157,13 +157,13 @@ https://alanorth.github.io/cgspace-notes/post/ - 2017-11-07T17:26:16+02:00 + 2017-11-07T18:09:29+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-11-07T17:26:16+02:00 + 2017-11-07T18:09:29+02:00 0