From 5b11434f0fe9ac7413d0337326c4c2022cce8db8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 1 Aug 2017 12:03:37 +0300 Subject: [PATCH] Update notes for 2017-08-01 --- content/post/2017-08.md | 2 ++ public/2015-11/index.html | 4 +-- public/2015-12/index.html | 4 +-- public/2016-01/index.html | 4 +-- public/2016-02/index.html | 4 +-- public/2016-03/index.html | 4 +-- public/2016-04/index.html | 4 +-- public/2016-05/index.html | 4 +-- public/2016-06/index.html | 4 +-- public/2016-07/index.html | 4 +-- public/2016-08/index.html | 4 +-- public/2016-09/index.html | 4 +-- public/2016-10/index.html | 4 +-- public/2016-11/index.html | 4 +-- public/2016-12/index.html | 4 +-- public/2017-01/index.html | 4 +-- public/2017-02/index.html | 4 +-- public/2017-03/index.html | 4 +-- public/2017-04/index.html | 4 +-- public/2017-05/index.html | 4 +-- public/2017-06/index.html | 4 +-- public/2017-07/index.html | 4 +-- public/2017-08/index.html | 12 ++++++--- public/index.html | 2 ++ public/index.xml | 2 ++ public/post/index.html | 2 ++ public/post/index.xml | 2 ++ public/sitemap.xml | 52 ++++++++++++++++++------------------ public/tags/notes/index.html | 2 ++ public/tags/notes/index.xml | 2 ++ 30 files changed, 91 insertions(+), 71 deletions(-) diff --git a/content/post/2017-08.md b/content/post/2017-08.md index f569ef86a..ecea52895 100644 --- a/content/post/2017-08.md +++ b/content/post/2017-08.md @@ -16,5 +16,7 @@ tags = ["Notes"] - /handle/10568/16510/browse - The `robots.txt` only blocks the top-level `/discover` and `/browse` URLs... we will need to find a way to forbid them from accessing these! - Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962 +- It turns out that we're already adding the `X-Robots-Tag "none"` HTTP header, but this only forbids the search engine from _indexing_ the page, not crawling it! +- Also, the bot has to successfully browse the page first so it can receive the HTTP header... diff --git a/public/2015-11/index.html b/public/2015-11/index.html index 3807393ac..7c87b0a2b 100644 --- a/public/2015-11/index.html +++ b/public/2015-11/index.html @@ -25,7 +25,7 @@ $ psql -c 'SELECT * from pg_stat_activity;' | grep idle | grep -c cgspac - + @@ -71,7 +71,7 @@ $ psql -c 'SELECT * from pg_stat_activity;' | grep idle | grep -c cgspac "url": "https://alanorth.github.io/cgspace-notes/2015-11/", "wordCount": "798", "datePublished": "2015-11-23T17:00:57+03:00", - "dateModified": "2015-11-23T17:00:57+03:00", + "dateModified": "2016-09-28T17:02:30+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2015-12/index.html b/public/2015-12/index.html index f0c34062b..b5e137590 100644 --- a/public/2015-12/index.html +++ b/public/2015-12/index.html @@ -26,7 +26,7 @@ Replace lzop with xz in log compression cron jobs on DSpace Test—it uses less - + @@ -73,7 +73,7 @@ Replace lzop with xz in log compression cron jobs on DSpace Test—it uses less "url": "https://alanorth.github.io/cgspace-notes/2015-12/", "wordCount": "753", "datePublished": "2015-12-02T13:18:00+03:00", - "dateModified": "2015-12-02T13:18:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-01/index.html b/public/2016-01/index.html index 473bfed55..0829d2646 100644 --- a/public/2016-01/index.html +++ b/public/2016-01/index.html @@ -21,7 +21,7 @@ Update GitHub wiki for documentation of maintenance tasks. - + @@ -63,7 +63,7 @@ Update GitHub wiki for documentation of maintenance tasks. "url": "https://alanorth.github.io/cgspace-notes/2016-01/", "wordCount": "466", "datePublished": "2016-01-13T13:18:00+03:00", - "dateModified": "2016-01-13T13:18:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-02/index.html b/public/2016-02/index.html index da2e33386..233e0f02f 100644 --- a/public/2016-02/index.html +++ b/public/2016-02/index.html @@ -28,7 +28,7 @@ Also, lots of things like “COTE D`LVOIRE” and “COTE D IVOIRE&r - + @@ -77,7 +77,7 @@ Also, lots of things like “COTE D`LVOIRE” and “COTE D IVOIRE&r "url": "https://alanorth.github.io/cgspace-notes/2016-02/", "wordCount": "1657", "datePublished": "2016-02-05T13:18:00+03:00", - "dateModified": "2016-02-05T13:18:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-03/index.html b/public/2016-03/index.html index b92e5fcf6..c0a3834a4 100644 --- a/public/2016-03/index.html +++ b/public/2016-03/index.html @@ -21,7 +21,7 @@ Reinstall my local (Mac OS X) DSpace stack with Tomcat 7, PostgreSQL 9.3, and Ja - + @@ -63,7 +63,7 @@ Reinstall my local (Mac OS X) DSpace stack with Tomcat 7, PostgreSQL 9.3, and Ja "url": "https://alanorth.github.io/cgspace-notes/2016-03/", "wordCount": "1581", "datePublished": "2016-03-02T16:50:00+03:00", - "dateModified": "2016-03-02T16:50:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-04/index.html b/public/2016-04/index.html index 5bf1dceb3..d8f3dddaf 100644 --- a/public/2016-04/index.html +++ b/public/2016-04/index.html @@ -23,7 +23,7 @@ Also, I noticed the checker log has some errors we should pay attention to: - + @@ -67,7 +67,7 @@ Also, I noticed the checker log has some errors we should pay attention to: "url": "https://alanorth.github.io/cgspace-notes/2016-04/", "wordCount": "2006", "datePublished": "2016-04-04T11:06:00+03:00", - "dateModified": "2016-04-04T11:06:00+03:00", + "dateModified": "2016-09-28T17:02:30+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-05/index.html b/public/2016-05/index.html index c4b1676d9..c1cce091c 100644 --- a/public/2016-05/index.html +++ b/public/2016-05/index.html @@ -25,7 +25,7 @@ There are 3,000 IPs accessing the REST API in a 24-hour period! - + @@ -71,7 +71,7 @@ There are 3,000 IPs accessing the REST API in a 24-hour period! "url": "https://alanorth.github.io/cgspace-notes/2016-05/", "wordCount": "1349", "datePublished": "2016-05-01T23:06:00+03:00", - "dateModified": "2016-05-01T23:06:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-06/index.html b/public/2016-06/index.html index dff8863c1..6dd36415c 100644 --- a/public/2016-06/index.html +++ b/public/2016-06/index.html @@ -24,7 +24,7 @@ Working on second phase of metadata migration, looks like this will work for mov - + @@ -69,7 +69,7 @@ Working on second phase of metadata migration, looks like this will work for mov "url": "https://alanorth.github.io/cgspace-notes/2016-06/", "wordCount": "1549", "datePublished": "2016-06-01T10:53:00+03:00", - "dateModified": "2016-06-01T10:53:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-07/index.html b/public/2016-07/index.html index 4796aea88..b32b73c6e 100644 --- a/public/2016-07/index.html +++ b/public/2016-07/index.html @@ -32,7 +32,7 @@ In this case the select query was showing 95 results before the update - + @@ -85,7 +85,7 @@ In this case the select query was showing 95 results before the update "url": "https://alanorth.github.io/cgspace-notes/2016-07/", "wordCount": "866", "datePublished": "2016-07-01T10:53:00+03:00", - "dateModified": "2016-07-01T10:53:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-08/index.html b/public/2016-08/index.html index e6945c028..895887e22 100644 --- a/public/2016-08/index.html +++ b/public/2016-08/index.html @@ -29,7 +29,7 @@ $ git rebase -i dspace-5.5 - + @@ -79,7 +79,7 @@ $ git rebase -i dspace-5.5 "url": "https://alanorth.github.io/cgspace-notes/2016-08/", "wordCount": "1514", "datePublished": "2016-08-01T15:53:00+03:00", - "dateModified": "2016-08-01T15:53:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-09/index.html b/public/2016-09/index.html index 463a497db..ed5bc1620 100644 --- a/public/2016-09/index.html +++ b/public/2016-09/index.html @@ -25,7 +25,7 @@ $ ldapsearch -x -H ldaps://svcgroot2.cgiarad.org:3269/ -b "dc=cgiarad,dc=or - + @@ -71,7 +71,7 @@ $ ldapsearch -x -H ldaps://svcgroot2.cgiarad.org:3269/ -b "dc=cgiarad,dc=or "url": "https://alanorth.github.io/cgspace-notes/2016-09/", "wordCount": "3298", "datePublished": "2016-09-01T15:53:00+03:00", - "dateModified": "2016-09-01T15:53:00+03:00", + "dateModified": "2017-01-09T16:18:07+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-10/index.html b/public/2016-10/index.html index 667de7713..468841f8b 100644 --- a/public/2016-10/index.html +++ b/public/2016-10/index.html @@ -29,7 +29,7 @@ I exported a random item’s metadata as CSV, deleted all columns except id - + @@ -79,7 +79,7 @@ I exported a random item’s metadata as CSV, deleted all columns except id "url": "https://alanorth.github.io/cgspace-notes/2016-10/", "wordCount": "1828", "datePublished": "2016-10-03T15:53:00+03:00", - "dateModified": "2016-10-03T15:53:00+03:00", + "dateModified": "2017-01-10T16:21:47+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-11/index.html b/public/2016-11/index.html index 8e229abd6..421134f23 100644 --- a/public/2016-11/index.html +++ b/public/2016-11/index.html @@ -21,7 +21,7 @@ Add dc.type to the output options for Atmire’s Listings and Reports module - + @@ -63,7 +63,7 @@ Add dc.type to the output options for Atmire’s Listings and Reports module "url": "https://alanorth.github.io/cgspace-notes/2016-11/", "wordCount": "2825", "datePublished": "2016-11-01T09:21:00+03:00", - "dateModified": "2016-11-01T09:21:00+03:00", + "dateModified": "2017-01-10T16:21:47+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2016-12/index.html b/public/2016-12/index.html index f35c43d03..c2eb6a186 100644 --- a/public/2016-12/index.html +++ b/public/2016-12/index.html @@ -33,7 +33,7 @@ Another worrying error from dspace.log is: - + @@ -87,7 +87,7 @@ Another worrying error from dspace.log is: "url": "https://alanorth.github.io/cgspace-notes/2016-12/", "wordCount": "4078", "datePublished": "2016-12-02T10:43:00+03:00", - "dateModified": "2016-12-02T10:43:00+03:00", + "dateModified": "2017-01-10T16:21:47+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-01/index.html b/public/2017-01/index.html index 17da54221..0c8680de2 100644 --- a/public/2017-01/index.html +++ b/public/2017-01/index.html @@ -21,7 +21,7 @@ I asked on the dspace-tech mailing list because it seems to be broken, and actua - + @@ -63,7 +63,7 @@ I asked on the dspace-tech mailing list because it seems to be broken, and actua "url": "https://alanorth.github.io/cgspace-notes/2017-01/", "wordCount": "1594", "datePublished": "2017-01-02T10:43:00+03:00", - "dateModified": "2017-01-02T10:43:00+03:00", + "dateModified": "2017-01-29T13:18:32+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-02/index.html b/public/2017-02/index.html index e3433514a..140d8babc 100644 --- a/public/2017-02/index.html +++ b/public/2017-02/index.html @@ -35,7 +35,7 @@ Looks like we’ll be using cg.identifier.ccafsprojectpii as the field name - + @@ -91,7 +91,7 @@ Looks like we’ll be using cg.identifier.ccafsprojectpii as the field name "url": "https://alanorth.github.io/cgspace-notes/2017-02/", "wordCount": "2028", "datePublished": "2017-02-07T07:04:52-08:00", - "dateModified": "2017-02-07T07:04:52-08:00", + "dateModified": "2017-02-28T22:58:29+02:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-03/index.html b/public/2017-03/index.html index f5f2c6e0d..f01cbead9 100644 --- a/public/2017-03/index.html +++ b/public/2017-03/index.html @@ -37,7 +37,7 @@ $ identify ~/Desktop/alc_contrastes_desafios.jpg - + @@ -95,7 +95,7 @@ $ identify ~/Desktop/alc_contrastes_desafios.jpg "url": "https://alanorth.github.io/cgspace-notes/2017-03/", "wordCount": "1538", "datePublished": "2017-03-01T17:08:52+02:00", - "dateModified": "2017-03-01T17:08:52+02:00", + "dateModified": "2017-03-31T05:36:10+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-04/index.html b/public/2017-04/index.html index 52581a1e0..75d5fecb3 100644 --- a/public/2017-04/index.html +++ b/public/2017-04/index.html @@ -30,7 +30,7 @@ $ [dspace]/bin/dspace filter-media -f -i 10568/16498 -p "ImageMagick PDF Th - + @@ -81,7 +81,7 @@ $ [dspace]/bin/dspace filter-media -f -i 10568/16498 -p "ImageMagick PDF Th "url": "https://alanorth.github.io/cgspace-notes/2017-04/", "wordCount": "2917", "datePublished": "2017-04-02T17:08:52+02:00", - "dateModified": "2017-04-02T17:08:52+02:00", + "dateModified": "2017-04-26T13:35:10+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-05/index.html b/public/2017-05/index.html index 2c0e58f67..0c7023d3d 100644 --- a/public/2017-05/index.html +++ b/public/2017-05/index.html @@ -13,7 +13,7 @@ - + @@ -47,7 +47,7 @@ "url": "https://alanorth.github.io/cgspace-notes/2017-05/", "wordCount": "2412", "datePublished": "2017-05-01T16:21:52+02:00", - "dateModified": "2017-05-01T16:21:52+02:00", + "dateModified": "2017-05-29T13:15:22+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-06/index.html b/public/2017-06/index.html index 019e22eae..9b127b8e8 100644 --- a/public/2017-06/index.html +++ b/public/2017-06/index.html @@ -13,7 +13,7 @@ - + @@ -47,7 +47,7 @@ "url": "https://alanorth.github.io/cgspace-notes/2017-06/", "wordCount": "1261", "datePublished": "2017-06-01T10:14:52+03:00", - "dateModified": "2017-06-01T10:14:52+03:00", + "dateModified": "2017-06-30T18:34:51+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-07/index.html b/public/2017-07/index.html index 0ec36b23b..a4fcfd0f3 100644 --- a/public/2017-07/index.html +++ b/public/2017-07/index.html @@ -27,7 +27,7 @@ We can use PostgreSQL’s extended output format (-x) plus sed to format the - + @@ -75,7 +75,7 @@ We can use PostgreSQL’s extended output format (-x) plus sed to format the "url": "https://alanorth.github.io/cgspace-notes/2017-07/", "wordCount": "1151", "datePublished": "2017-07-01T18:03:52+03:00", - "dateModified": "2017-07-01T18:03:52+03:00", + "dateModified": "2017-08-01T08:55:37+03:00", "author": { "@type": "Person", "name": "Alan Orth" diff --git a/public/2017-08/index.html b/public/2017-08/index.html index ebd4fa3ea..56be6ff1f 100644 --- a/public/2017-08/index.html +++ b/public/2017-08/index.html @@ -21,6 +21,8 @@ But many of the bots are browsing dynamic URLs like: The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these! Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962 +It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it! +Also, the bot has to successfully browse the page first so it can receive the HTTP header… " /> @@ -30,7 +32,7 @@ Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.dura - + @@ -65,6 +67,8 @@ But many of the bots are browsing dynamic URLs like: The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these! Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962 +It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it! +Also, the bot has to successfully browse the page first so it can receive the HTTP header… "/> @@ -79,9 +83,9 @@ Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.dura "@type": "BlogPosting", "headline": "August, 2017", "url": "https://alanorth.github.io/cgspace-notes/2017-08/", - "wordCount": "123", + "wordCount": "166", "datePublished": "2017-08-01T11:51:52+03:00", - "dateModified": "2017-08-01T11:51:52+03:00", + "dateModified": "2017-08-01T11:57:37+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -159,6 +163,8 @@ Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.dura
  • The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these!
  • Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962
  • +
  • It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it!
  • +
  • Also, the bot has to successfully browse the page first so it can receive the HTTP header…
  • diff --git a/public/index.html b/public/index.html index 9f28bb520..e471e24ef 100644 --- a/public/index.html +++ b/public/index.html @@ -119,6 +119,8 @@
  • The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these!
  • Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962
  • +
  • It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it!
  • +
  • Also, the bot has to successfully browse the page first so it can receive the HTTP header…
  • diff --git a/public/index.xml b/public/index.xml index ff750c0d6..d62071a91 100644 --- a/public/index.xml +++ b/public/index.xml @@ -32,6 +32,8 @@ </ul></li> <li>The <code>robots.txt</code> only blocks the top-level <code>/discover</code> and <code>/browse</code> URLs&hellip; we will need to find a way to forbid them from accessing these!</li> <li>Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): <a href="https://jira.duraspace.org/browse/DS-2962">https://jira.duraspace.org/browse/DS-2962</a></li> +<li>It turns out that we&rsquo;re already adding the <code>X-Robots-Tag &quot;none&quot;</code> HTTP header, but this only forbids the search engine from <em>indexing</em> the page, not crawling it!</li> +<li>Also, the bot has to successfully browse the page first so it can receive the HTTP header&hellip;</li> </ul> <p></p> diff --git a/public/post/index.html b/public/post/index.html index 35ab783ab..40292476c 100644 --- a/public/post/index.html +++ b/public/post/index.html @@ -119,6 +119,8 @@
  • The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these!
  • Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962
  • +
  • It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it!
  • +
  • Also, the bot has to successfully browse the page first so it can receive the HTTP header…
  • diff --git a/public/post/index.xml b/public/post/index.xml index 4bdce3a9e..e356464be 100644 --- a/public/post/index.xml +++ b/public/post/index.xml @@ -32,6 +32,8 @@ </ul></li> <li>The <code>robots.txt</code> only blocks the top-level <code>/discover</code> and <code>/browse</code> URLs&hellip; we will need to find a way to forbid them from accessing these!</li> <li>Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): <a href="https://jira.duraspace.org/browse/DS-2962">https://jira.duraspace.org/browse/DS-2962</a></li> +<li>It turns out that we&rsquo;re already adding the <code>X-Robots-Tag &quot;none&quot;</code> HTTP header, but this only forbids the search engine from <em>indexing</em> the page, not crawling it!</li> +<li>Also, the bot has to successfully browse the page first so it can receive the HTTP header&hellip;</li> </ul> <p></p> diff --git a/public/sitemap.xml b/public/sitemap.xml index 330b650cd..9e317c57e 100644 --- a/public/sitemap.xml +++ b/public/sitemap.xml @@ -4,117 +4,117 @@ https://alanorth.github.io/cgspace-notes/2017-08/ - 2017-08-01T11:51:52+03:00 + 2017-08-01T11:57:37+03:00 https://alanorth.github.io/cgspace-notes/2017-07/ - 2017-07-01T18:03:52+03:00 + 2017-08-01T08:55:37+03:00 https://alanorth.github.io/cgspace-notes/2017-06/ - 2017-06-01T10:14:52+03:00 + 2017-06-30T18:34:51+03:00 https://alanorth.github.io/cgspace-notes/2017-05/ - 2017-05-01T16:21:52+02:00 + 2017-05-29T13:15:22+03:00 https://alanorth.github.io/cgspace-notes/2017-04/ - 2017-04-02T17:08:52+02:00 + 2017-04-26T13:35:10+03:00 https://alanorth.github.io/cgspace-notes/2017-03/ - 2017-03-01T17:08:52+02:00 + 2017-03-31T05:36:10+03:00 https://alanorth.github.io/cgspace-notes/2017-02/ - 2017-02-07T07:04:52-08:00 + 2017-02-28T22:58:29+02:00 https://alanorth.github.io/cgspace-notes/2017-01/ - 2017-01-02T10:43:00+03:00 + 2017-01-29T13:18:32+02:00 https://alanorth.github.io/cgspace-notes/2016-12/ - 2016-12-02T10:43:00+03:00 + 2017-01-10T16:21:47+02:00 https://alanorth.github.io/cgspace-notes/2016-11/ - 2016-11-01T09:21:00+03:00 + 2017-01-10T16:21:47+02:00 https://alanorth.github.io/cgspace-notes/2016-10/ - 2016-10-03T15:53:00+03:00 + 2017-01-10T16:21:47+02:00 https://alanorth.github.io/cgspace-notes/2016-09/ - 2016-09-01T15:53:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-08/ - 2016-08-01T15:53:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-07/ - 2016-07-01T10:53:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-06/ - 2016-06-01T10:53:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-05/ - 2016-05-01T23:06:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-04/ - 2016-04-04T11:06:00+03:00 + 2016-09-28T17:02:30+03:00 https://alanorth.github.io/cgspace-notes/2016-03/ - 2016-03-02T16:50:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-02/ - 2016-02-05T13:18:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2016-01/ - 2016-01-13T13:18:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2015-12/ - 2015-12-02T13:18:00+03:00 + 2017-01-09T16:18:07+02:00 https://alanorth.github.io/cgspace-notes/2015-11/ - 2015-11-23T17:00:57+03:00 + 2016-09-28T17:02:30+03:00 https://alanorth.github.io/cgspace-notes/ - 2017-08-01T11:51:52+03:00 + 2017-08-01T11:57:37+03:00 0 @@ -125,19 +125,19 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2017-08-01T11:51:52+03:00 + 2017-08-01T11:57:37+03:00 0 https://alanorth.github.io/cgspace-notes/post/ - 2017-08-01T11:51:52+03:00 + 2017-08-01T11:57:37+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2017-08-01T11:51:52+03:00 + 2017-08-01T11:57:37+03:00 0 diff --git a/public/tags/notes/index.html b/public/tags/notes/index.html index 7d7df9792..f6fa181a4 100644 --- a/public/tags/notes/index.html +++ b/public/tags/notes/index.html @@ -119,6 +119,8 @@
  • The robots.txt only blocks the top-level /discover and /browse URLs… we will need to find a way to forbid them from accessing these!
  • Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): https://jira.duraspace.org/browse/DS-2962
  • +
  • It turns out that we’re already adding the X-Robots-Tag "none" HTTP header, but this only forbids the search engine from indexing the page, not crawling it!
  • +
  • Also, the bot has to successfully browse the page first so it can receive the HTTP header…
  • diff --git a/public/tags/notes/index.xml b/public/tags/notes/index.xml index d9546dabd..3551bbd75 100644 --- a/public/tags/notes/index.xml +++ b/public/tags/notes/index.xml @@ -32,6 +32,8 @@ </ul></li> <li>The <code>robots.txt</code> only blocks the top-level <code>/discover</code> and <code>/browse</code> URLs&hellip; we will need to find a way to forbid them from accessing these!</li> <li>Relevant issue from DSpace Jira (semi resolved in DSpace 6.0): <a href="https://jira.duraspace.org/browse/DS-2962">https://jira.duraspace.org/browse/DS-2962</a></li> +<li>It turns out that we&rsquo;re already adding the <code>X-Robots-Tag &quot;none&quot;</code> HTTP header, but this only forbids the search engine from <em>indexing</em> the page, not crawling it!</li> +<li>Also, the bot has to successfully browse the page first so it can receive the HTTP header&hellip;</li> </ul> <p></p>