From e77e3a13ae7ed904eed7d05582c0fd8aa911cbd2 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 13 Nov 2017 12:04:41 +0200 Subject: [PATCH] Add notes for 2017-11-13 --- content/post/2017-11.md | 27 +++++++++++++++++++++++++++ public/2017-11/index.html | 39 ++++++++++++++++++++++++++++++++++++--- public/sitemap.xml | 10 +++++----- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/content/post/2017-11.md b/content/post/2017-11.md index 7375b667a..1ce50e6cd 100644 --- a/content/post/2017-11.md +++ b/content/post/2017-11.md @@ -596,3 +596,30 @@ Server: nginx - The first request works, second is denied with an HTTP 503! - I need to remember to check the Munin graphs for PostgreSQL and JVM next week to see how this affects them + +## 2017-11-13 + +- Just a few hours into the day and it really looks like the Baidu rate limiting is working, HTTP 200 vs 503: + +``` +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "13/Nov/2017" | grep "Baiduspider" | grep -c " 200 " +508 +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "13/Nov/2017" | grep "Baiduspider" | grep -c " 503 " +5462 +``` + +- Helping Sisay proof 47 records for IITA: https://dspacetest.cgiar.org/handle/10568/97029 +- From looking at the data in OpenRefine I found: + - Errors in `cg.authorship.types` + - Errors in `cg.coverage.country` (smart quote in "COTE D’IVOIRE", "HAWAII" is not a country) + - Whitespace issues in some `cg.contributor.affiliation` fields + - Whitespace issues in some `cg.identifier.doi` fields and most values are using HTTP instead of HTTPS + - Whitespace issues in some `dc.contributor.author` fields + - Issue with invalid `dc.date.issued` value "2011-3" + - Description fields are poorly copy–pasted + - Whitespace issues in `dc.description.sponsorship` + - Lots of inconsistency in `dc.format.extent` (mixed dash style, periods at the end of values) + - Whitespace errors in `dc.identifier.citation` + - Whitespace errors in `dc.subject` + - Whitespace 
errors in `dc.title` +- After uploading and looking at the data in DSpace Test I saw more errors with CRPs, subjects (one item had four copies of all of its subjects, another had a "." in it), affiliations, sponsors, etc. diff --git a/public/2017-11/index.html b/public/2017-11/index.html index 64e72cefd..7f67ba49e 100644 --- a/public/2017-11/index.html +++ b/public/2017-11/index.html @@ -38,7 +38,7 @@ COPY 54701 - + @@ -86,9 +86,9 @@ COPY 54701 "@type": "BlogPosting", "headline": "November, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-11/", - "wordCount": "3351", + "wordCount": "3544", "datePublished": "2017-11-02T09:37:54+02:00", - "dateModified": "2017-11-12T10:41:44+02:00", + "dateModified": "2017-11-12T18:48:52+02:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -812,6 +812,39 @@ Server: nginx
  • I need to remember to check the Munin graphs for PostgreSQL and JVM next week to see how this affects them
  • +

    2017-11-13

    + + + +
    # cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "13/Nov/2017" | grep "Baiduspider" | grep -c " 200 "
    +508
    +# cat /var/log/nginx/access.log /var/log/nginx/access.log.1 | grep "13/Nov/2017" | grep "Baiduspider" | grep -c " 503 "
    +5462
    +
    + + + diff --git a/public/sitemap.xml b/public/sitemap.xml index e5545389b..efcdff656 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2017-11/ - 2017-11-12T10:41:44+02:00 + 2017-11-12T18:48:52+02:00 @@ -134,7 +134,7 @@ https://alanorth.github.io/cgspace-notes/ - 2017-11-12T10:41:44+02:00 + 2017-11-12T18:48:52+02:00 0 @@ -145,7 +145,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-11-12T10:41:44+02:00 + 2017-11-12T18:48:52+02:00 0 @@ -157,13 +157,13 @@ https://alanorth.github.io/cgspace-notes/post/ - 2017-11-12T10:41:44+02:00 + 2017-11-12T18:48:52+02:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-11-12T10:41:44+02:00 + 2017-11-12T18:48:52+02:00 0