diff --git a/content/posts/2019-04.md b/content/posts/2019-04.md index 029fe758d..9af7ddd45 100644 --- a/content/posts/2019-04.md +++ b/content/posts/2019-04.md @@ -427,5 +427,34 @@ $ ./fix-metadata-values.py -i 2019-04-08-fix-13-affiliations.csv -db dspace -u d ``` - We should create a new list of affiliations to update our controlled vocabulary again +- I dumped a list of the top 1500 affiliations: + +``` +dspace=# \COPY (SELECT DISTINCT text_value, count(*) FROM metadatavalue WHERE metadata_field_id = 211 AND resource_type_id = 2 GROUP BY text_value ORDER BY count DESC LIMIT 1500) to /tmp/2019-04-08-top-1500-affiliations.csv WITH CSV HEADER; +COPY 1500 +``` + +- Fix a few more messed up affiliations that have return characters in them (use Ctrl-V Ctrl-M to re-create control character): + +``` +dspace=# UPDATE metadatavalue SET text_value='International Institute for Environment and Development' WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE 'International Institute^M%'; +dspace=# UPDATE metadatavalue SET text_value='Kenya Agriculture and Livestock Research Organization' WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE 'Kenya Agricultural and Livestock Research^M%'; +``` + +- I noticed a bunch of subjects and affiliations that use stylized apostrophes so I will export those and then batch update them: + +``` +dspace=# \COPY (SELECT DISTINCT text_value FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE '%’%') to /tmp/2019-04-08-affiliations-apostrophes.csv WITH CSV HEADER; +COPY 60 +dspace=# \COPY (SELECT DISTINCT text_value FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 57 AND text_value LIKE '%’%') to /tmp/2019-04-08-subject-apostrophes.csv WITH CSV HEADER; +COPY 20 +``` + +- I cleaned them up in OpenRefine and then applied the fixes on CGSpace and DSpace Test: + +``` +$ ./fix-metadata-values.py -i /tmp/2019-04-08-fix-60-affiliations-apostrophes.csv -db dspace -u dspace -p 'fuuu' -f cg.contributor.affiliation -m 211 -t correct -d +$ ./fix-metadata-values.py -i /tmp/2019-04-08-fix-20-subject-apostrophes.csv -db dspace -u dspace -p 'fuuu' -f dc.subject -m 57 -t correct -d +``` diff --git a/docs/2019-04/index.html b/docs/2019-04/index.html index e0d720935..4c02bd271 100644 --- a/docs/2019-04/index.html +++ b/docs/2019-04/index.html @@ -38,7 +38,7 @@ $ ./delete-metadata-values.py -i /tmp/2019-02-21-delete-1-region.csv -db dspace - + @@ -81,9 +81,9 @@ $ ./delete-metadata-values.py -i /tmp/2019-02-21-delete-1-region.csv -db dspace "@type": "BlogPosting", "headline": "April, 2019", "url": "https://alanorth.github.io/cgspace-notes/2019-04/", - "wordCount": "2397", + "wordCount": "2631", "datePublished": "2019-04-01T09:00:43+03:00", - "dateModified": "2019-04-07T21:17:16+03:00", + "dateModified": "2019-04-08T11:26:20+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -659,8 +659,39 @@ X-XSS-Protection: 1; mode=block +
dspace=# \COPY (SELECT DISTINCT text_value, count(*) FROM metadatavalue WHERE metadata_field_id = 211 AND resource_type_id = 2 GROUP BY text_value ORDER BY count DESC LIMIT 1500) to /tmp/2019-04-08-top-1500-affiliations.csv WITH CSV HEADER;
+COPY 1500
+
+ + + +
dspace=# UPDATE metadatavalue SET text_value='International Institute for Environment and Development' WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE 'International Institute^M%';
+dspace=# UPDATE metadatavalue SET text_value='Kenya Agriculture and Livestock Research Organization' WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE 'Kenya Agricultural  and Livestock  Research^M%';
+
+ + + +
dspace=# \COPY (SELECT DISTINCT text_value FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 211 AND text_value LIKE '%’%') to /tmp/2019-04-08-affiliations-apostrophes.csv WITH CSV HEADER;
+COPY 60
+dspace=# \COPY (SELECT DISTINCT text_value FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 57 AND text_value LIKE '%’%') to /tmp/2019-04-08-subject-apostrophes.csv WITH CSV HEADER;
+COPY 20
+
+ + + +
$ ./fix-metadata-values.py -i /tmp/2019-04-08-fix-60-affiliations-apostrophes.csv -db dspace -u dspace -p 'fuuu' -f cg.contributor.affiliation -m 211 -t correct -d
+$ ./fix-metadata-values.py -i /tmp/2019-04-08-fix-20-subject-apostrophes.csv -db dspace -u dspace -p 'fuuu' -f dc.subject -m 57 -t correct -d
+
+ diff --git a/docs/robots.txt b/docs/robots.txt index 9ea3f6b9d..fc9a8bbb4 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -46,7 +46,7 @@ Disallow: /cgspace-notes/2015-12/ Disallow: /cgspace-notes/2015-11/ Disallow: /cgspace-notes/ Disallow: /cgspace-notes/categories/ -Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/categories/notes/ +Disallow: /cgspace-notes/tags/notes/ Disallow: /cgspace-notes/posts/ Disallow: /cgspace-notes/tags/ diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 571531f8e..f261b355d 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ https://alanorth.github.io/cgspace-notes/2019-04/ - 2019-04-07T21:17:16+03:00 + 2019-04-08T11:26:20+03:00 @@ -219,7 +219,7 @@ https://alanorth.github.io/cgspace-notes/ - 2019-04-07T21:17:16+03:00 + 2019-04-08T11:26:20+03:00 0 @@ -228,27 +228,27 @@ 0 - - https://alanorth.github.io/cgspace-notes/tags/notes/ - 2019-04-07T21:17:16+03:00 - 0 - - https://alanorth.github.io/cgspace-notes/categories/notes/ 2018-03-09T22:10:33+02:00 0 + + https://alanorth.github.io/cgspace-notes/tags/notes/ + 2019-04-08T11:26:20+03:00 + 0 + + https://alanorth.github.io/cgspace-notes/posts/ - 2019-04-07T21:17:16+03:00 + 2019-04-08T11:26:20+03:00 0 https://alanorth.github.io/cgspace-notes/tags/ - 2019-04-07T21:17:16+03:00 + 2019-04-08T11:26:20+03:00 0