diff --git a/content/posts/2020-07.md b/content/posts/2020-07.md index d1e53673f..3ccf1b2e3 100644 --- a/content/posts/2020-07.md +++ b/content/posts/2020-07.md @@ -232,5 +232,33 @@ $ ./run.sh -s http://localhost:8080/solr/statistics -a import -o ~/Downloads/sta - Mohammed Salem modified my [dspace-statistics-api](https://github.com/ilri/dspace-statistics-api) to query Solr directly so I started writing a script to benchmark it today - I will monitor the JVM memory and CPU usage in visualvm, just like I did in 2019-04 - I noticed an issue with his limit parameter so I sent him some feedback on that in the meantime +- I noticed that we have 20,000 distinct values for `dc.subject`, but there are at least 500 that are lower or mixed case that we should simply uppercase without further thought: + +``` +dspace=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE resource_type_id=2 AND metadata_field_id=57 AND text_value ~ '[[:lower:]]'; +``` + +- DSpace Test needs a different query because it is running DSpace 6 with UUIDs for everything: + +``` +dspace63=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=57 AND text_value ~ '[[:lower:]]'; +``` + +- Note the use of the POSIX character class :) +- I suggest that we generate a list of the top 5,000 values that don't match AGROVOC so that Sisay can correct them + - Start by getting the top 6,500 subjects (assuming that the top ~1,500 are valid from our previous work): + +``` +dspace=# \COPY (SELECT DISTINCT text_value, count(text_value) FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=57 GROUP BY text_value ORDER BY count DESC) TO /tmp/2020-07-05-subjects.csv WITH CSV; +COPY 19640 +dspace=# \q +$ csvcut -c1 /tmp/2020-07-05-subjects-upper.csv | head -n 6500 > 2020-07-05-cgspace-subjects.txt +``` + +- Then start looking them up using `agrovoc-lookup.py`: + +``` +$ ./agrovoc-lookup.py -i 2020-07-05-cgspace-subjects.txt -om 2020-07-05-cgspace-subjects-matched.txt -or 2020-07-05-cgspace-subjects-rejected.txt -d +``` diff --git a/docs/2020-07/index.html b/docs/2020-07/index.html index 549df2ebf..576f0c062 100644 --- a/docs/2020-07/index.html +++ b/docs/2020-07/index.html @@ -20,7 +20,7 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f - + @@ -45,9 +45,9 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f "@type": "BlogPosting", "headline": "July, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-07/", - "wordCount": "1256", + "wordCount": "1435", "datePublished": "2020-07-01T10:53:54+03:00", - "dateModified": "2020-07-05T10:50:09+03:00", + "dateModified": "2020-07-05T16:29:04+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -353,8 +353,30 @@ $ ./run.sh -s http://localhost:8080/solr/statistics -a import -o ~/Downloads/sta
dc.subject
, but there are at least 500 that are lower or mixed case that we should simply uppercase without further thought:dspace=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE resource_type_id=2 AND metadata_field_id=57 AND text_value ~ '[[:lower:]]';
+
dspace63=# UPDATE metadatavalue SET text_value=UPPER(text_value) WHERE dspace_object_id IN (SELECT uuid FROM item) AND metadata_field_id=57 AND text_value ~ '[[:lower:]]';
+
dspace=# \COPY (SELECT DISTINCT text_value, count(text_value) FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=57 GROUP BY text_value ORDER BY count DESC) TO /tmp/2020-07-05-subjects.csv WITH CSV;
+COPY 19640
+dspace=# \q
+$ csvcut -c1 /tmp/2020-07-05-subjects-upper.csv | head -n 6500 > 2020-07-05-cgspace-subjects.txt
+
agrovoc-lookup.py
:$ ./agrovoc-lookup.py -i 2020-07-05-cgspace-subjects.txt -om 2020-07-05-cgspace-subjects-matched.txt -or 2020-07-05-cgspace-subjects-rejected.txt -d
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index 91461aca2..db955d225 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index 064a9488a..e0d1653c3 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index be53a3015..99013bf2c 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index d0579abf7..0f06b9c63 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index 258100c09..7b80beed0 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index 8abbb584c..977bae58c 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index e9836f415..c09eee9b1 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index 200146081..10881db1f 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index c045ff69a..bdbd252d1 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index 796ccbc63..826011b12 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index 935952603..bcc69b933 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index 5aca1293f..6c440d841 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index 25e7756b5..6ea9a5576 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index d37b22cc4..a174f96a4 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index e4f4128b5..c60523fed 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index 365a64906..399436097 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index 9c621047c..59774b0a2 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -9,7 +9,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index c42d6ec12..9a42f468f 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -4,27 +4,27 @@