diff --git a/content/posts/2019-03.md b/content/posts/2019-03.md index febbe0319..81f99352d 100644 --- a/content/posts/2019-03.md +++ b/content/posts/2019-03.md @@ -872,4 +872,50 @@ $ ./fix-metadata-values.py -i /tmp/2019-03-26-AGROVOC-89-corrections.csv -db dsp $ ./delete-metadata-values.py -i /tmp/2019-03-26-AGROVOC-79-deletions.csv -db dspace -u dspace -p 'fuuu' -m 57 -f dc.subject -d -n ``` +- UptimeRobot says CGSpace is down again, but it seems to just be slow, as the load is over 10.0 +- Looking at the nginx logs I don't see anything terribly abusive, but SemrushBot has made ~3,000 requests to Discovery and Browse pages today: + +``` +# grep SemrushBot /var/log/nginx/access.log | grep -E "26/Mar/2019" | grep -E '(discover|browse)' | wc -l +2931 +``` + +- So I'm adding it to the badbot rate limiting in nginx, and actually, I kinda feel like just blocking all user agents with "bot" in the name for a few days to see if things calm down... maybe not just yet +- Otherwise, these are the top users in the web and API logs the last hour (18–19): + +``` +# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "26/Mar/2019:(18|19)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 54 41.216.228.158 + 65 199.47.87.140 + 75 157.55.39.238 + 77 157.55.39.237 + 89 157.55.39.236 + 100 18.196.196.108 + 128 18.195.78.144 + 277 2a01:4f8:13b:1296::2 + 291 66.249.66.80 + 328 35.174.184.209 +# zcat --force /var/log/nginx/{oai,rest,statistics}.log /var/log/nginx/{oai,rest,statistics}.log.1 | grep -E "26/Mar/2019:(18|19)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 2 2409:4066:211:2caf:3c31:3fae:2212:19cc + 2 35.10.204.140 + 2 45.251.231.45 + 2 95.108.181.88 + 2 95.137.190.2 + 3 104.198.9.108 + 3 107.167.109.88 + 6 66.249.66.80 + 13 41.89.230.156 + 1860 45.5.184.2 +``` + +- For the XMLUI I see `18.195.78.144` and `18.196.196.108` requesting only CTA items and with no user agent +- They 
are responsible for almost 1,000 XMLUI sessions today: + +``` +$ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=(18.195.78.144|18.196.196.108)' dspace.log.2019-03-26 | sort | uniq | wc -l +937 +``` + +- I will add their IPs to the list of bot IPs in nginx so I can tag them as bots to let Tomcat's Crawler Session Manager Valve force them to re-use their session + diff --git a/docs/2019-03/index.html b/docs/2019-03/index.html index 90f7761e2..6eab9d923 100644 --- a/docs/2019-03/index.html +++ b/docs/2019-03/index.html @@ -25,7 +25,7 @@ I think I will need to ask Udana to re-copy and paste the abstracts with more ca - + @@ -55,9 +55,9 @@ I think I will need to ask Udana to re-copy and paste the abstracts with more ca "@type": "BlogPosting", "headline": "March, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-03/", - "wordCount": "5506", + "wordCount": "5785", "datePublished": "2019-03-01T12:16:30+01:00", - "dateModified": "2019-03-26T09:09:19+02:00", + "dateModified": "2019-03-26T18:25:05+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -1152,6 +1152,57 @@ $ grep -o -E 'session_id=[A-Z0-9]{32}' dspace.log.2019-03-22 | sort -u | wc -l $ ./delete-metadata-values.py -i /tmp/2019-03-26-AGROVOC-79-deletions.csv -db dspace -u dspace -p 'fuuu' -m 57 -f dc.subject -d -n + + +
# grep SemrushBot /var/log/nginx/access.log | grep -E "26/Mar/2019" | grep -E '(discover|browse)' | wc -l
+2931
+
+ + + +
# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "26/Mar/2019:(18|19)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 
+     54 41.216.228.158
+     65 199.47.87.140
+     75 157.55.39.238
+     77 157.55.39.237
+     89 157.55.39.236
+    100 18.196.196.108
+    128 18.195.78.144
+    277 2a01:4f8:13b:1296::2
+    291 66.249.66.80
+    328 35.174.184.209
+# zcat --force /var/log/nginx/{oai,rest,statistics}.log /var/log/nginx/{oai,rest,statistics}.log.1 | grep -E "26/Mar/2019:(18|19)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+      2 2409:4066:211:2caf:3c31:3fae:2212:19cc
+      2 35.10.204.140
+      2 45.251.231.45
+      2 95.108.181.88
+      2 95.137.190.2
+      3 104.198.9.108
+      3 107.167.109.88
+      6 66.249.66.80
+     13 41.89.230.156
+   1860 45.5.184.2
+
+ + + +
$ grep -o -E 'session_id=[A-Z0-9]{32}:ip_addr=(18.195.78.144|18.196.196.108)' dspace.log.2019-03-26 | sort | uniq | wc -l
+937
+
+ + + diff --git a/docs/robots.txt b/docs/robots.txt index 9c8f41429..2b88ae317 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -45,7 +45,7 @@ Disallow: /cgspace-notes/2015-12/ Disallow: /cgspace-notes/2015-11/ Disallow: /cgspace-notes/ Disallow: /cgspace-notes/categories/ -Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/categories/notes/ +Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/posts/ Disallow: /cgspace-notes/tags/ diff --git a/docs/sitemap.xml b/docs/sitemap.xml index bf11ca38d..0716957ae 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-03/ - 2019-03-26T09:09:19+02:00 + 2019-03-26T18:25:05+02:00 @@ -214,7 +214,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-03-26T09:09:19+02:00 + 2019-03-26T18:25:05+02:00 0 @@ -223,27 +223,27 @@ 0 - - https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-03-26T09:09:19+02:00 - 0 - - https://alanorth.github.io/cgspace-notes/categories/notes/ 2018-03-09T22:10:33+02:00 0 + + https://alanorth.github.io/cgspace-notes/tags/notes/ + 2019-03-26T18:25:05+02:00 + 0 + + https://alanorth.github.io/cgspace-notes/posts/ - 2019-03-26T09:09:19+02:00 + 2019-03-26T18:25:05+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-03-26T09:09:19+02:00 + 2019-03-26T18:25:05+02:00 0