From cb038636478c803ef973a2231dd3d321c1da5804 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Mon, 10 Aug 2020 15:59:22 +0300 Subject: [PATCH] Update notes for 2020-08-10 --- content/posts/2020-08.md | 18 +++++++++++++ docs/2020-08/index.html | 34 ++++++++++++++++++++++--- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/sitemap.xml | 10 ++++---- 20 files changed, 70 insertions(+), 26 deletions(-) diff --git a/content/posts/2020-08.md b/content/posts/2020-08.md index b9e59284c..22f2070b7 100644 --- a/content/posts/2020-08.md +++ b/content/posts/2020-08.md @@ -350,4 +350,22 @@ $ wc -l /tmp/2020-08-09-orcid-identifiers-uniq.csv 1949 /tmp/2020-08-09-orcid-identifiers-uniq.csv ``` +- I looked into the strange Solr record above that had "{set=830}" in the communities and collections + - There are exactly 11724 records like this in the current CGSpace (DSpace 5.8) statistics-2018 Solr core + - None of them have an `id` or `type` field! + - I see 242,000 of them in the statistics-2017 core, 185,063 in the statistics-2016 core... all the way to 2010, but not in 2019 or the current statistics core + - I decided to purge all of these records from CGSpace right now so they don't even have a chance at being an issue on the real migration: + +``` +$ curl -s "http://localhost:8081/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>owningColl:/.*set.*/</query></delete>' +... 
+$ curl -s "http://localhost:8081/solr/statistics-2010/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>owningColl:/.*set.*/</query></delete>' +``` + +- I added `Googlebot` and `Twitterbot` to the list of explicitly allowed bots + - In Google's case, they were getting lumped in with all the other bad bots and then important links like the sitemaps were returning HTTP 503, but they generally respect `robots.txt` so we should just allow them (perhaps we can control the crawl rate in the webmaster console) + - In Twitter's case they were also getting lumped in with the bad bots too, but really they only make ~50 or so requests a day when someone posts a CGSpace link on Twitter +- I tagged the ISO 3166-1 Alpha2 country codes on all items on CGSpace using my [CountryCodeTagger](https://github.com/ilri/cgspace-java-helpers) curation task + - I still need to set up a cron job for it... + diff --git a/docs/2020-08/index.html b/docs/2020-08/index.html index fab61838d..91c1cb480 100644 --- a/docs/2020-08/index.html +++ b/docs/2020-08/index.html @@ -19,7 +19,7 @@ It is class based so I can easily add support for other vocabularies, and the te - + @@ -43,9 +43,9 @@ It is class based so I can easily add support for other vocabularies, and the te "@type": "BlogPosting", "headline": "August, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-08/", - "wordCount": "2049", + "wordCount": "2285", "datePublished": "2020-08-02T15:35:54+03:00", - "dateModified": "2020-08-07T19:55:21+03:00", + "dateModified": "2020-08-10T09:27:50+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -504,7 +504,33 @@ dspace=# \q $ grep -oE '[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}' /tmp/2020-08-09-orcid-identifiers.csv | sort | uniq > /tmp/2020-08-09-orcid-identifiers-uniq.csv $ wc -l /tmp/2020-08-09-orcid-identifiers-uniq.csv 1949 /tmp/2020-08-09-orcid-identifiers-uniq.csv - + +
$ curl -s "http://localhost:8081/solr/statistics-2018/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>owningColl:/.*set.*/</query></delete>'
+...
+$ curl -s "http://localhost:8081/solr/statistics-2010/update?softCommit=true" -H "Content-Type: text/xml" --data-binary '<delete><query>owningColl:/.*set.*/</query></delete>'
+
+ diff --git a/docs/categories/index.html b/docs/categories/index.html index 74a2297bd..365604a97 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 71bced4a0..4be8b4c3a 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 3c194c0ed..f66e47184 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 737076049..ad3a45e2c 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 7687aaccb..2b272b345 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 5504db17a..1b4717aae 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 8d34a3020..aee8e375e 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index c615ff8bb..08b4f5f2b 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index de66a7f9d..388186efd 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index a13a03435..0dc7e51f1 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/6/index.html 
b/docs/page/6/index.html index e44d3e974..f56a5a9a6 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 3a08c6c98..e59ea6e73 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 5956eec7d..a31e95ee1 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 671a9c0a0..c8f26e90b 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 37b6cafba..0cebf0bb5 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 53bbb03bd..1ddc9d8b4 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 9fd833436..e7cfe20b0 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index e6eea3447..0c1225f61 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,27 +4,27 @@ https://alanorth.github.io/cgspace-notes/2020-08/ - 2020-08-07T19:55:21+03:00 + 2020-08-10T09:27:50+03:00 https://alanorth.github.io/cgspace-notes/categories/ - 2020-08-07T19:55:21+03:00 + 2020-08-10T09:27:50+03:00 https://alanorth.github.io/cgspace-notes/ - 2020-08-07T19:55:21+03:00 + 2020-08-10T09:27:50+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2020-08-07T19:55:21+03:00 + 2020-08-10T09:27:50+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2020-08-07T19:55:21+03:00 + 2020-08-10T09:27:50+03:00