mirror of
https://github.com/alanorth/cgspace-notes.git
synced 2024-11-17 20:27:05 +01:00
Update notes for 2017-11-07
This commit is contained in:
parent
0ffe1f07b0
commit
6a60dfa9e4
@ -377,7 +377,7 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-
|
|||||||
|
|
||||||
- I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API
|
- I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API
|
||||||
- About Baidu, I found a link to their [robots.txt tester tool](http://ziyuan.baidu.com/robots/)
|
- About Baidu, I found a link to their [robots.txt tester tool](http://ziyuan.baidu.com/robots/)
|
||||||
- It seems like our robots.txt file is valid, and they claim to recognize that URLs like `/discover` should be forbidden:
|
- It seems like our robots.txt file is valid, and they claim to recognize that URLs like `/discover` should be forbidden (不允许, aka "not allowed"):
|
||||||
|
|
||||||
![Baidu robots.txt tester](/cgspace-notes/2017/11/baidu-robotstxt.png)
|
![Baidu robots.txt tester](/cgspace-notes/2017/11/baidu-robotstxt.png)
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ COPY 54701
|
|||||||
|
|
||||||
<meta property="article:published_time" content="2017-11-02T09:37:54+02:00"/>
|
<meta property="article:published_time" content="2017-11-02T09:37:54+02:00"/>
|
||||||
|
|
||||||
<meta property="article:modified_time" content="2017-11-07T17:26:16+02:00"/>
|
<meta property="article:modified_time" content="2017-11-07T18:09:29+02:00"/>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -86,9 +86,9 @@ COPY 54701
|
|||||||
"@type": "BlogPosting",
|
"@type": "BlogPosting",
|
||||||
"headline": "November, 2017",
|
"headline": "November, 2017",
|
||||||
"url": "https://alanorth.github.io/cgspace-notes/2017-11/",
|
"url": "https://alanorth.github.io/cgspace-notes/2017-11/",
|
||||||
"wordCount": "2084",
|
"wordCount": "2118",
|
||||||
"datePublished": "2017-11-02T09:37:54+02:00",
|
"datePublished": "2017-11-02T09:37:54+02:00",
|
||||||
"dateModified": "2017-11-07T17:26:16+02:00",
|
"dateModified": "2017-11-07T18:09:29+02:00",
|
||||||
"author": {
|
"author": {
|
||||||
"@type": "Person",
|
"@type": "Person",
|
||||||
"name": "Alan Orth"
|
"name": "Alan Orth"
|
||||||
@ -566,7 +566,7 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-
|
|||||||
<ul>
|
<ul>
|
||||||
<li>I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API</li>
|
<li>I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API</li>
|
||||||
<li>About Baidu, I found a link to their <a href="http://ziyuan.baidu.com/robots/">robots.txt tester tool</a></li>
|
<li>About Baidu, I found a link to their <a href="http://ziyuan.baidu.com/robots/">robots.txt tester tool</a></li>
|
||||||
<li>It seems like our robots.txt file is valid, and they claim to recognize that URLs like <code>/discover</code> should be forbidden:</li>
|
<li>It seems like our robots.txt file is valid, and they claim to recognize that URLs like <code>/discover</code> should be forbidden (不允许, aka “not allowed”):</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
<p><img src="/cgspace-notes/2017/11/baidu-robotstxt.png" alt="Baidu robots.txt tester" /></p>
|
<p><img src="/cgspace-notes/2017/11/baidu-robotstxt.png" alt="Baidu robots.txt tester" /></p>
|
||||||
@ -588,6 +588,14 @@ $ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-
|
|||||||
1085
|
1085
|
||||||
</code></pre>
|
</code></pre>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>I will think about blocking their IPs but they have 164 of them!</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<pre><code># grep "Baiduspider/2.0" /var/log/nginx/access.log | awk '{print $1}' | sort -n | uniq | wc -l
|
||||||
|
164
|
||||||
|
</code></pre>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/2017-11/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/2017-11/</loc>
|
||||||
<lastmod>2017-11-07T17:26:16+02:00</lastmod>
|
<lastmod>2017-11-07T18:09:29+02:00</lastmod>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
@ -134,7 +134,7 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/</loc>
|
||||||
<lastmod>2017-11-07T17:26:16+02:00</lastmod>
|
<lastmod>2017-11-07T18:09:29+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
@ -145,7 +145,7 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/tags/notes/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/tags/notes/</loc>
|
||||||
<lastmod>2017-11-07T17:26:16+02:00</lastmod>
|
<lastmod>2017-11-07T18:09:29+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
@ -157,13 +157,13 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/post/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/post/</loc>
|
||||||
<lastmod>2017-11-07T17:26:16+02:00</lastmod>
|
<lastmod>2017-11-07T18:09:29+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/tags/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/tags/</loc>
|
||||||
<lastmod>2017-11-07T17:26:16+02:00</lastmod>
|
<lastmod>2017-11-07T18:09:29+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user