mirror of
https://github.com/alanorth/cgspace-notes.git
synced 2024-11-29 09:58:22 +01:00
Update notes for 2019-03-26
This commit is contained in:
parent
9f7556a803
commit
28116d091e
@ -917,5 +917,20 @@ $ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=(18.195.78.144|18.196.196.108)' ds
|
|||||||
```
|
```
|
||||||
|
|
||||||
- I will add their IPs to the list of bot IPs in nginx so I can tag them as bots and let Tomcat's Crawler Session Manager Valve force them to re-use their session
|
- I will add their IPs to the list of bot IPs in nginx so I can tag them as bots and let Tomcat's Crawler Session Manager Valve force them to re-use their session
|
||||||
|
- Another user agent behaving badly in Colombia is "GuzzleHttp/6.3.3 curl/7.47.0 PHP/7.0.30-0ubuntu0.16.04.1"
|
||||||
|
- I will add curl to the Tomcat Crawler Session Manager Valve because anyone using curl is most likely making automated read-only requests
|
||||||
|
- I will add GuzzleHttp to the nginx badbots rate limiting, because it is making requests to dynamic Discovery pages
|
||||||
|
|
||||||
|
```
|
||||||
|
# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep 45.5.184.72 | grep -E "26/Mar/2019:" | grep -E '(discover|browse)' | wc -l
|
||||||
|
119
|
||||||
|
```
|
||||||
|
|
||||||
|
- What's strange is that I can't see any of their requests in the DSpace log...
|
||||||
|
|
||||||
|
```
|
||||||
|
$ grep -I -c 45.5.184.72 dspace.log.2019-03-26
|
||||||
|
0
|
||||||
|
```
|
||||||
|
|
||||||
<!-- vim: set sw=2 ts=2: -->
|
<!-- vim: set sw=2 ts=2: -->
|
||||||
|
@ -25,7 +25,7 @@ I think I will need to ask Udana to re-copy and paste the abstracts with more ca
|
|||||||
<meta property="og:type" content="article" />
|
<meta property="og:type" content="article" />
|
||||||
<meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2019-03/" />
|
<meta property="og:url" content="https://alanorth.github.io/cgspace-notes/2019-03/" />
|
||||||
<meta property="article:published_time" content="2019-03-01T12:16:30+01:00"/>
|
<meta property="article:published_time" content="2019-03-01T12:16:30+01:00"/>
|
||||||
<meta property="article:modified_time" content="2019-03-26T18:25:05+02:00"/>
|
<meta property="article:modified_time" content="2019-03-26T19:41:33+02:00"/>
|
||||||
|
|
||||||
<meta name="twitter:card" content="summary"/>
|
<meta name="twitter:card" content="summary"/>
|
||||||
<meta name="twitter:title" content="March, 2019"/>
|
<meta name="twitter:title" content="March, 2019"/>
|
||||||
@ -55,9 +55,9 @@ I think I will need to ask Udana to re-copy and paste the abstracts with more ca
|
|||||||
"@type": "BlogPosting",
|
"@type": "BlogPosting",
|
||||||
"headline": "March, 2019",
|
"headline": "March, 2019",
|
||||||
"url": "https://alanorth.github.io/cgspace-notes/2019-03/",
|
"url": "https://alanorth.github.io/cgspace-notes/2019-03/",
|
||||||
"wordCount": "5785",
|
"wordCount": "5878",
|
||||||
"datePublished": "2019-03-01T12:16:30+01:00",
|
"datePublished": "2019-03-01T12:16:30+01:00",
|
||||||
"dateModified": "2019-03-26T18:25:05+02:00",
|
"dateModified": "2019-03-26T19:41:33+02:00",
|
||||||
"author": {
|
"author": {
|
||||||
"@type": "Person",
|
"@type": "Person",
|
||||||
"name": "Alan Orth"
|
"name": "Alan Orth"
|
||||||
@ -1201,8 +1201,23 @@ $ ./delete-metadata-values.py -i /tmp/2019-03-26-AGROVOC-79-deletions.csv -db ds
|
|||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
<li>I will add their IPs to the list of bot IPs in nginx so I can tag them as bots and let Tomcat’s Crawler Session Manager Valve force them to re-use their session</li>
|
<li>I will add their IPs to the list of bot IPs in nginx so I can tag them as bots and let Tomcat’s Crawler Session Manager Valve force them to re-use their session</li>
|
||||||
|
<li>Another user agent behaving badly in Colombia is “GuzzleHttp/6.3.3 curl/7.47.0 PHP/7.0.30-0ubuntu0.16.04.1”</li>
|
||||||
|
<li>I will add curl to the Tomcat Crawler Session Manager Valve because anyone using curl is most likely making automated read-only requests</li>
|
||||||
|
<li>I will add GuzzleHttp to the nginx badbots rate limiting, because it is making requests to dynamic Discovery pages</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
<pre><code># zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep 45.5.184.72 | grep -E "26/Mar/2019:" | grep -E '(discover|browse)' | wc -l
|
||||||
|
119
|
||||||
|
</code></pre>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>What’s strange is that I can’t see any of their requests in the DSpace log…</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<pre><code>$ grep -I -c 45.5.184.72 dspace.log.2019-03-26
|
||||||
|
0
|
||||||
|
</code></pre>
|
||||||
|
|
||||||
<!-- vim: set sw=2 ts=2: -->
|
<!-- vim: set sw=2 ts=2: -->
|
||||||
|
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ Disallow: /cgspace-notes/2015-12/
|
|||||||
Disallow: /cgspace-notes/2015-11/
|
Disallow: /cgspace-notes/2015-11/
|
||||||
Disallow: /cgspace-notes/
|
Disallow: /cgspace-notes/
|
||||||
Disallow: /cgspace-notes/categories/
|
Disallow: /cgspace-notes/categories/
|
||||||
Disallow: /cgspace-notes/categories/notes/
|
|
||||||
Disallow: /cgspace-notes/tags/notes/
|
Disallow: /cgspace-notes/tags/notes/
|
||||||
|
Disallow: /cgspace-notes/categories/notes/
|
||||||
Disallow: /cgspace-notes/posts/
|
Disallow: /cgspace-notes/posts/
|
||||||
Disallow: /cgspace-notes/tags/
|
Disallow: /cgspace-notes/tags/
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/2019-03/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/2019-03/</loc>
|
||||||
<lastmod>2019-03-26T18:25:05+02:00</lastmod>
|
<lastmod>2019-03-26T19:41:33+02:00</lastmod>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
@ -214,7 +214,7 @@
|
|||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/</loc>
|
||||||
<lastmod>2019-03-26T18:25:05+02:00</lastmod>
|
<lastmod>2019-03-26T19:41:33+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
@ -223,27 +223,27 @@
|
|||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
|
<url>
|
||||||
|
<loc>https://alanorth.github.io/cgspace-notes/tags/notes/</loc>
|
||||||
|
<lastmod>2019-03-26T19:41:33+02:00</lastmod>
|
||||||
|
<priority>0</priority>
|
||||||
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/categories/notes/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/categories/notes/</loc>
|
||||||
<lastmod>2018-03-09T22:10:33+02:00</lastmod>
|
<lastmod>2018-03-09T22:10:33+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/tags/notes/</loc>
|
|
||||||
<lastmod>2019-03-26T18:25:05+02:00</lastmod>
|
|
||||||
<priority>0</priority>
|
|
||||||
</url>
|
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/posts/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/posts/</loc>
|
||||||
<lastmod>2019-03-26T18:25:05+02:00</lastmod>
|
<lastmod>2019-03-26T19:41:33+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
<url>
|
<url>
|
||||||
<loc>https://alanorth.github.io/cgspace-notes/tags/</loc>
|
<loc>https://alanorth.github.io/cgspace-notes/tags/</loc>
|
||||||
<lastmod>2019-03-26T18:25:05+02:00</lastmod>
|
<lastmod>2019-03-26T19:41:33+02:00</lastmod>
|
||||||
<priority>0</priority>
|
<priority>0</priority>
|
||||||
</url>
|
</url>
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user