From 00dc8241fc51df61d50491679e1e9514b5d56418 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 17 May 2018 13:14:29 +0300 Subject: [PATCH] Update notes for 2018-05-17 --- content/posts/2018-05.md | 1 + docs/2018-05/index.html | 7 ++++--- docs/sitemap.xml | 10 +++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/content/posts/2018-05.md b/content/posts/2018-05.md index d884686a1..dba84c7d3 100644 --- a/content/posts/2018-05.md +++ b/content/posts/2018-05.md @@ -286,3 +286,4 @@ ga('send', 'pageview', { - I'm not sure which method is better, perhaps the `solr.ASCIIFoldingFilterFactory` filter because it doesn't require copying the `mapping-FoldToASCII.txt` file - And actually I'm not entirely sure about the order of filtering before tokenizing, etc... - Ah, I see that `charFilter` must be before the tokenizer because it works on a stream, whereas `filter` operates on tokenized input so it must come after the tokenizer +- Regarding the use of the `charFilter` vs the `filter` class before and after the tokenizer, respectively, I think it's better to use the `charFilter` to normalize the input stream before tokenizing it as I have no idea what kinda stuff might get removed by the tokenizer diff --git a/docs/2018-05/index.html b/docs/2018-05/index.html index 834e1580b..54f530186 100644 --- a/docs/2018-05/index.html +++ b/docs/2018-05/index.html @@ -27,7 +27,7 @@ Also, I switched it to use OpenJDK instead of Oracle Java, as well as re-worked - + @@ -65,9 +65,9 @@ Also, I switched it to use OpenJDK instead of Oracle Java, as well as re-worked "@type": "BlogPosting", "headline": "May, 2018", "url": "https://alanorth.github.io/cgspace-notes/2018-05/", - "wordCount": "2267", + "wordCount": "2313", "datePublished": "2018-05-01T16:43:54+03:00", - "dateModified": "2018-05-17T10:51:46+03:00", + "dateModified": "2018-05-17T12:37:21+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -469,6 +469,7 @@ $ ./bin/post -c countries 
~/src/git/DSpace/2018-05-10-countries.csv
  • I’m not sure which method is better, perhaps the solr.ASCIIFoldingFilterFactory filter because it doesn’t require copying the mapping-FoldToASCII.txt file
  • And actually I’m not entirely sure about the order of filtering before tokenizing, etc…
  • Ah, I see that charFilter must be before the tokenizer because it works on a stream, whereas filter operates on tokenized input so it must come after the tokenizer
+ • Regarding the use of the charFilter vs the filter class before and after the tokenizer, respectively, I think it’s better to use the charFilter to normalize the input stream before tokenizing it as I have no idea what kinda stuff might get removed by the tokenizer
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 5cee460f8..5011ef0cc 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2018-05/ - 2018-05-17T10:51:46+03:00 + 2018-05-17T12:37:21+03:00
@@ -164,7 +164,7 @@ https://alanorth.github.io/cgspace-notes/ - 2018-05-17T10:51:46+03:00 + 2018-05-17T12:37:21+03:00 0
@@ -175,7 +175,7 @@ https://alanorth.github.io/cgspace-notes/tags/notes/ - 2018-05-17T10:51:46+03:00 + 2018-05-17T12:37:21+03:00 0
@@ -187,13 +187,13 @@ https://alanorth.github.io/cgspace-notes/posts/ - 2018-05-17T10:51:46+03:00 + 2018-05-17T12:37:21+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2018-05-17T10:51:46+03:00 + 2018-05-17T12:37:21+03:00 0