diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 571e92e63..772e67324 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -364,3 +364,15 @@ $ grep -E '2017-11-07 0[2-8]' dspace.log.2017-11-07 | grep -o -E 'ip_addr=[0-9.] # grep -E "07/Nov/2017:1[234]:" /var/log/nginx/access.log | grep 104.196.152.243 | grep -c pdf 0 ``` + +- About CIAT, I think I need to encourage them to specify a user agent string for their requests, because they are not reusing their Tomcat session and they are creating thousands of sessions per day +- All CIAT requests vs unique ones: + +``` +$ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-11-07 | wc -l +3506 +$ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-11-07 | sort | uniq | wc -l +3506 +``` + +- I emailed CIAT about the session issue, user agent issue, and told them they should not scrape the HTML contents of communities, instead using the REST API diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 900f97e96..a5111a0d4 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "1905", + "wordCount": "1997", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-07T14:50:01+02:00", + "dateModified": "2017-11-07T17:03:49+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -552,6 +552,21 @@ $ grep 104.196.152.243 dspace.log.2017-11-01 | grep -o -E 'session_id=[A-Z0-9]{3 0 +
$ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-11-07 | wc -l
+3506
+$ grep -Io -E 'session_id=[A-Z0-9]{32}:ip_addr=104.196.152.243' dspace.log.2017-11-07 | sort | uniq | wc -l
+3506
+
+
+