From 13cdfe298169c217799a78df5a1bb2c0e961c5c0 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 5 Jul 2020 21:52:01 +0300 Subject: [PATCH] Update notes for 2020-07-05 --- content/posts/2020-07.md | 28 +++++++++++++++++++++++ docs/2020-07/index.html | 30 +++++++++++++++++++++---- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/sitemap.xml | 10 ++++----- 20 files changed, 76 insertions(+), 26 deletions(-) diff --git a/content/posts/2020-07.md b/content/posts/2020-07.md index d1e53673f..3ccf1b2e3 100644 --- a/content/posts/2020-07.md +++ b/content/posts/2020-07.md @@ -232,5 +232,33 @@ $ ./run.sh -s http://localhost:8080/solr/statistics -a import -o ~/Downloads/sta - Mohammed Salem modified my [dspace-statistics-api](https://github.com/ilri/dspace-statistics-api) to query Solr directly so I started writing a script to benchmark it today - I will monitor the JVM memory and CPU usage in visualvm, just like I did in 2019-04 - I noticed an issue with his limit parameter so I sent him some feedback on that in the meantime +- I noticed that we have 20,000 distinct values for `dc.subject`, but there are at least 500 that are lower or mixed case that we should simply uppercase without further thought: + +``` +dspace=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE resource_type_id=2 AND metadata_field_id=57 AND text_value ~ '[[:lower:]]'; +``` + +- DSpace Test needs a different query because it is running DSpace 6 with 
UUIDs for everything: + +``` +dspace63=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=57 AND text_value ~ '[[:lower:]]'; +``` + +- Note the use of the POSIX character class :) +- I suggest that we generate a list of the top 5,000 values that don't match AGROVOC so that Sisay can correct them + - Start by getting the top 6,500 subjects (assuming that the top ~1,500 are valid from our previous work): + +``` +dspace=# \COPY (SELECT DISTINCT text_value, count(text_value) FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=57 GROUP BY text_value ORDER BY count DESC) TO /tmp/2020-07-05-subjects.csv WITH CSV; +COPY 19640 +dspace=# \q +$ csvcut -c1 /tmp/2020-07-05-subjects.csv | head -n 6500 > 2020-07-05-cgspace-subjects.txt +``` + +- Then start looking them up using `agrovoc-lookup.py`: + +``` +$ ./agrovoc-lookup.py -i 2020-07-05-cgspace-subjects.txt -om 2020-07-05-cgspace-subjects-matched.txt -or 2020-07-05-cgspace-subjects-rejected.txt -d +``` diff --git a/docs/2020-07/index.html b/docs/2020-07/index.html index 549df2ebf..576f0c062 100644 --- a/docs/2020-07/index.html +++ b/docs/2020-07/index.html @@ -20,7 +20,7 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f - + @@ -45,9 +45,9 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f "@type": "BlogPosting", "headline": "July, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-07/", - "wordCount": "1256", + "wordCount": "1435", "datePublished": "2020-07-01T10:53:54+03:00", - "dateModified": "2020-07-05T10:50:09+03:00", + "dateModified": "2020-07-05T16:29:04+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -353,8 +353,30 @@ $ ./run.sh -s http://localhost:8080/solr/statistics -a import -o ~/Downloads/sta
  • I noticed an issue with his limit parameter so I sent him some feedback on that in the meantime
  • +
  • I noticed that we have 20,000 distinct values for dc.subject, but there are at least 500 that are lower or mixed case that we should simply uppercase without further thought:
  • - +
    dspace=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE resource_type_id=2 AND metadata_field_id=57 AND text_value ~ '[[:lower:]]';
    +
    +
    dspace63=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=57 AND text_value ~ '[[:lower:]]';
    +
    +
    dspace=# \COPY (SELECT DISTINCT text_value, count(text_value) FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=57 GROUP BY text_value ORDER BY count DESC) TO /tmp/2020-07-05-subjects.csv WITH CSV;
    +COPY 19640
    +dspace=# \q
    +$ csvcut -c1 /tmp/2020-07-05-subjects.csv | head -n 6500 > 2020-07-05-cgspace-subjects.txt
    +
    +
    $ ./agrovoc-lookup.py -i 2020-07-05-cgspace-subjects.txt -om 2020-07-05-cgspace-subjects-matched.txt -or 2020-07-05-cgspace-subjects-rejected.txt -d
    +
    diff --git a/docs/categories/index.html b/docs/categories/index.html index 91461aca2..db955d225 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 064a9488a..e0d1653c3 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index be53a3015..99013bf2c 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index d0579abf7..0f06b9c63 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 258100c09..7b80beed0 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 8abbb584c..977bae58c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index e9836f415..c09eee9b1 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 200146081..10881db1f 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index c045ff69a..bdbd252d1 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 796ccbc63..826011b12 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git 
a/docs/page/6/index.html b/docs/page/6/index.html index 935952603..bcc69b933 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 5aca1293f..6c440d841 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 25e7756b5..6ea9a5576 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index d37b22cc4..a174f96a4 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index e4f4128b5..c60523fed 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 365a64906..399436097 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 9c621047c..59774b0a2 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index c42d6ec12..9a42f468f 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,27 +4,27 @@ https://alanorth.github.io/cgspace-notes/categories/ - 2020-07-05T10:50:09+03:00 + 2020-07-05T16:29:04+03:00 https://alanorth.github.io/cgspace-notes/ - 2020-07-05T10:50:09+03:00 + 2020-07-05T16:29:04+03:00 https://alanorth.github.io/cgspace-notes/2020-07/ - 2020-07-05T10:50:09+03:00 + 2020-07-05T16:29:04+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2020-07-05T10:50:09+03:00 + 2020-07-05T16:29:04+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2020-07-05T10:50:09+03:00 + 2020-07-05T16:29:04+03:00