From 9158b39c5f9733dacb6469067b1824789bef8817 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 6 Feb 2019 16:50:39 +0200 Subject: [PATCH] Add notes for 2019-02-06 --- content/posts/2019-02.md | 106 ++++++++++++++++++++++++++++++++++- docs/2019-02/index.html | 118 +++++++++++++++++++++++++++++++++++++-- docs/sitemap.xml | 10 ++-- 3 files changed, 224 insertions(+), 10 deletions(-) diff --git a/content/posts/2019-02.md b/content/posts/2019-02.md index 5e0a10395..f55d6da2c 100644 --- a/content/posts/2019-02.md +++ b/content/posts/2019-02.md @@ -190,6 +190,110 @@ $ export JAVA_OPTS="-Dfile.encoding=UTF-8 -Xmx1024m" $ time schedtool -D -e ionice -c2 -n7 nice -n19 dspace index-discovery -b ``` -- Peter had marked several terms with `||` to indicate multiple values in his corrections so I will have to go back and do those manually +- Peter had marked several terms with `||` to indicate multiple values in his corrections so I will have to go back and do those manually: + +``` +EMPODERAMENTO DE JOVENS,EMPODERAMENTO||JOVENS +ENVIRONMENTAL PROTECTION AND NATURAL RESOURCES MANAGEMENT,NATURAL RESOURCES MANAGEMENT||ENVIRONMENT +FISHERIES AND AQUACULTURE,FISHERIES||AQUACULTURE +MARKETING AND TRADE,MARKETING||TRADE +MARKETING ET COMMERCE,MARKETING||COMMERCE +NATURAL RESOURCES AND ENVIRONMENT,NATURAL RESOURCES MANAGEMENT||ENVIRONMENT +PÊCHES ET AQUACULTURE,PÊCHES||AQUACULTURE +PESCAS E AQUACULTURE,PISCICULTURA||AQUACULTURE +``` + +## 2019-02-06 + +- I dumped the CTA community so I can try to fix the subjects with multiple values that Peter indicated in his corrections: + +``` +$ dspace metadata-export -i 10568/42211 -f /tmp/cta.csv +``` + +- Then I used `csvcut` to get only the CTA subject columns: + +``` +$ csvcut -c "id,collection,cg.subject.cta,cg.subject.cta[],cg.subject.cta[en_US]" /tmp/cta.csv > /tmp/cta-subjects.csv +``` + +- After that I imported the CSV into OpenRefine where I could properly identify and edit the subjects as multiple values +- Then I imported it back 
into CGSpace: + +``` +$ dspace metadata-import -f /tmp/2019-02-06-CTA-multiple-subjects.csv +``` + +- Another day, another alert about high load on CGSpace (linode18) from Linode +- This time the load average was 370% and the top ten IPs before, during, and after that time were: + +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 689 35.237.175.180 + 1236 5.9.6.51 + 1305 34.218.226.147 + 1580 66.249.66.219 + 1939 50.116.102.77 + 2313 108.212.105.35 + 4666 205.186.128.185 + 4666 70.32.83.92 + 4950 85.25.237.71 + 5158 45.5.186.2 +``` + +- Looking closer at the top users, I see `45.5.186.2` is in Brazil and was making over 100 requests per minute to the REST API: + +``` +# zcat --force /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep 45.5.186.2 | grep -o -E '06/Feb/2019:0[0-9]:[0-9][0-9]' | uniq -c | sort -n | tail -n 10 + 118 06/Feb/2019:05:46 + 119 06/Feb/2019:05:37 + 119 06/Feb/2019:05:47 + 120 06/Feb/2019:05:43 + 120 06/Feb/2019:05:44 + 121 06/Feb/2019:05:38 + 122 06/Feb/2019:05:39 + 125 06/Feb/2019:05:42 + 126 06/Feb/2019:05:40 + 126 06/Feb/2019:05:41 +``` + +- I was thinking of rate limiting those because I assumed most of them would be errors, but actually most are HTTP 200 OK! 
+ +``` +# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E '06/Feb/2019' | grep 45.5.186.2 | awk '{print $9}' | sort | uniq -c + 10411 200 + 1 301 + 7 302 + 3 404 + 18 499 + 2 500 +``` + +- I should probably start looking at the top IPs for web (XMLUI) and for API (REST and OAI) separately: + +``` +# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 328 220.247.212.35 + 372 66.249.66.221 + 380 207.46.13.2 + 519 2a01:4f8:140:3192::2 + 572 5.143.231.8 + 689 35.237.175.180 + 771 108.212.105.35 + 1236 5.9.6.51 + 1554 66.249.66.219 + 4942 85.25.237.71 +# zcat --force /var/log/nginx/{oai,rest,statistics}.log /var/log/nginx/{oai,rest,statistics}.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10 + 10 66.249.66.221 + 26 66.249.66.219 + 69 5.143.231.8 + 340 45.5.184.72 + 1040 34.218.226.147 + 1542 108.212.105.35 + 1937 50.116.102.77 + 4661 205.186.128.185 + 4661 70.32.83.92 + 5102 45.5.186.2 +``` diff --git a/docs/2019-02/index.html b/docs/2019-02/index.html index 084f56a80..afb1b8165 100644 --- a/docs/2019-02/index.html +++ b/docs/2019-02/index.html @@ -42,7 +42,7 @@ sys 0m1.979s - + @@ -89,9 +89,9 @@ sys 0m1.979s "@type": "BlogPosting", "headline": "February, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-02/", - "wordCount": "1015", + "wordCount": "1435", "datePublished": "2019-02-01T21:37:30+02:00", - "dateModified": "2019-02-05T08:59:54+02:00", + "dateModified": "2019-02-05T09:22:06+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -372,9 +372,119 @@ $ time schedtool -D -e ionice -c2 -n7 nice -n19 dspace index-discovery -b +
EMPODERAMENTO DE JOVENS,EMPODERAMENTO||JOVENS
+ENVIRONMENTAL PROTECTION AND NATURAL RESOURCES MANAGEMENT,NATURAL RESOURCES MANAGEMENT||ENVIRONMENT
+FISHERIES AND AQUACULTURE,FISHERIES||AQUACULTURE
+MARKETING AND TRADE,MARKETING||TRADE
+MARKETING ET COMMERCE,MARKETING||COMMERCE
+NATURAL RESOURCES AND ENVIRONMENT,NATURAL RESOURCES MANAGEMENT||ENVIRONMENT
+PÊCHES ET AQUACULTURE,PÊCHES||AQUACULTURE
+PESCAS E AQUACULTURE,PISCICULTURA||AQUACULTURE
+
+ +

2019-02-06

+ + + +
$ dspace metadata-export -i 10568/42211 -f /tmp/cta.csv
+
+ + + +
$ csvcut -c "id,collection,cg.subject.cta,cg.subject.cta[],cg.subject.cta[en_US]" /tmp/cta.csv > /tmp/cta-subjects.csv
+
+ + + +
$ dspace metadata-import -f /tmp/2019-02-06-CTA-multiple-subjects.csv
+
+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+    689 35.237.175.180
+   1236 5.9.6.51
+   1305 34.218.226.147
+   1580 66.249.66.219
+   1939 50.116.102.77
+   2313 108.212.105.35
+   4666 205.186.128.185
+   4666 70.32.83.92
+   4950 85.25.237.71
+   5158 45.5.186.2
+
+ + + +
# zcat --force /var/log/nginx/rest.log /var/log/nginx/rest.log.1 | grep 45.5.186.2 | grep -o -E '06/Feb/2019:0[0-9]:[0-9][0-9]' | uniq -c | sort -n | tail -n 10
+    118 06/Feb/2019:05:46
+    119 06/Feb/2019:05:37
+    119 06/Feb/2019:05:47
+    120 06/Feb/2019:05:43
+    120 06/Feb/2019:05:44
+    121 06/Feb/2019:05:38
+    122 06/Feb/2019:05:39
+    125 06/Feb/2019:05:42
+    126 06/Feb/2019:05:40
+    126 06/Feb/2019:05:41
+
+ + + +
# zcat --force /var/log/nginx/*.log /var/log/nginx/*.log.1 | grep -E '06/Feb/2019' | grep 45.5.186.2 | awk '{print $9}' | sort | uniq -c
+  10411 200
+      1 301
+      7 302
+      3 404
+     18 499
+      2 500
+
+ + + +
# zcat --force /var/log/nginx/{access,error,library-access}.log /var/log/nginx/{access,error,library-access}.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+    328 220.247.212.35
+    372 66.249.66.221
+    380 207.46.13.2
+    519 2a01:4f8:140:3192::2
+    572 5.143.231.8
+    689 35.237.175.180
+    771 108.212.105.35
+   1236 5.9.6.51
+   1554 66.249.66.219
+   4942 85.25.237.71
+# zcat --force /var/log/nginx/{oai,rest,statistics}.log /var/log/nginx/{oai,rest,statistics}.log.1 | grep -E "06/Feb/2019:0(5|6|7|8|9)" | awk '{print $1}' | sort | uniq -c | sort -n | tail -n 10
+     10 66.249.66.221
+     26 66.249.66.219
+     69 5.143.231.8
+    340 45.5.184.72
+   1040 34.218.226.147
+   1542 108.212.105.35
+   1937 50.116.102.77
+   4661 205.186.128.185
+   4661 70.32.83.92
+   5102 45.5.186.2
+
+ diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 9c7ec3d5a..c18b78bbd 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-02/ - 2019-02-05T08:59:54+02:00 + 2019-02-05T09:22:06+02:00 @@ -209,7 +209,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-02-05T08:59:54+02:00 + 2019-02-05T09:22:06+02:00 0 @@ -220,7 +220,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-02-05T08:59:54+02:00 + 2019-02-05T09:22:06+02:00 0 @@ -232,13 +232,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2019-02-05T08:59:54+02:00 + 2019-02-05T09:22:06+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-02-05T08:59:54+02:00 + 2019-02-05T09:22:06+02:00 0