diff --git a/content/posts/2020-07.md b/content/posts/2020-07.md index 25cf21351..33c982f47 100644 --- a/content/posts/2020-07.md +++ b/content/posts/2020-07.md @@ -345,15 +345,70 @@ dc.contributor.author,correction dspace=# \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC) to /tmp/2020-07-08-affiliations.csv WITH CSV HEADER; ``` -- Then I stripped the header and quotes to make it a plain text file and ran `ror-lookup.py`: +- Then I stripped the CSV header and quotes to make it a plain text file and ran `ror-lookup.py`: ``` $ ./ror-lookup.py -i /tmp/2020-07-08-affiliations.txt -r ror.json -o 2020-07-08-affiliations-ror.csv -d -$ ./ror-lookup.py -i /tmp/2020-07-08-affiliations.txt -r ror.json -o 2020-07-08-affiliations-ror.csv -d -$ csvgrep -c 2 -m true 2020-07-08-affiliations-ror.csv | wc -l -1378 -$ csvgrep -c 2 -m false 2020-07-08-affiliations-ror.csv | wc -l -4490 +$ wc -l /tmp/2020-07-08-affiliations.txt +5866 /tmp/2020-07-08-affiliations.txt +$ csvgrep -c matched -m true 2020-07-08-affiliations-ror.csv | wc -l +1406 +$ csvgrep -c matched -m false 2020-07-08-affiliations-ror.csv | wc -l +4462 +``` + +- So, minus the CSV header, we have 1405 case-insensitive matches out of 5866 (24.0%) + + +## 2020-07-09 + +- Atmire responded to the ticket about DSpace 6 and Solr yesterday + - They said that the CUA issue is due to the "unmigrated" Solr records and that we should delete them + - I told them that [the "unmigrated" IDs are a known issue in DSpace 6](https://wiki.lyrasis.org/display/DSDOC6x/SOLR+Statistics+Maintenance) and we should rather figure out why they are unmigrated + - I didn't see any discussion on the dspace-tech mailing list or on DSpace Jira about unmigrated IDs, so I sent a mail to the mailing list to ask +- I updated `ror-lookup.py` to check aliases and acronyms as well and now the results are better for 
CGSpace's affiliation list: + +``` +$ wc -l /tmp/2020-07-08-affiliations.txt +5866 /tmp/2020-07-08-affiliations.txt +$ csvgrep -c matched -m true 2020-07-08-affiliations-ror.csv | wc -l +1516 +$ csvgrep -c matched -m false 2020-07-08-affiliations-ror.csv | wc -l +4352 +``` + +- So now our matching improves to 1515 out of 5866 (25.8%) +- Gabriela from CIP said that I should run the author corrections minus those that remove accent characters so I will run it on CGSpace: + +``` +$ ./fix-metadata-values.py -i /tmp/2020-07-09-fix-90-cip-authors.csv -db dspace -u dspace -p 'fuuu' -f dc.contributor.author -t correction -m 3 +``` + +- Apply 110 fixes and 90 deletions to sponsorships that Peter sent me a few days ago: + +``` +$ ./fix-metadata-values.py -i /tmp/2020-07-07-fix-110-sponsors.csv -db dspace -u dspace -p 'fuuu' -f dc.description.sponsorship -t 'correct/action' -m 29 +$ ./delete-metadata-values.py -i /tmp/2020-07-07-delete-90-sponsors.csv -db dspace -u dspace -p 'fuuu' -f dc.description.sponsorship -m 29 +``` + +- Start a full Discovery re-index on CGSpace: + +``` +$ time chrt -b 0 dspace index-discovery -b + +real 94m21.413s +user 9m40.364s +sys 2m37.246s +``` + +- I modified `crossref-funders-lookup.py` to be case insensitive and now CGSpace's sponsors match 173 out of 534 (32.4%): + +``` +$ ./crossref-funders-lookup.py -i 2020-07-09-cgspace-sponsors.txt -o 2020-07-09-cgspace-sponsors-crossref.csv -d -e a.orth@cgiar.org +$ wc -l 2020-07-09-cgspace-sponsors.txt +534 2020-07-09-cgspace-sponsors.txt +$ csvgrep -c matched -m true 2020-07-09-cgspace-sponsors-crossref.csv | wc -l +174 ``` diff --git a/docs/2020-07/index.html b/docs/2020-07/index.html index feb9a0c95..eb3d15231 100644 --- a/docs/2020-07/index.html +++ b/docs/2020-07/index.html @@ -20,7 +20,7 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f - + @@ -45,9 +45,9 @@ Since I was restarting Tomcat anyways I decided to redeploy the latest changes f "@type": "BlogPosting", 
"headline": "July, 2020", "url": "https://alanorth.github.io/cgspace-notes/2020-07/", - "wordCount": "2246", + "wordCount": "2550", "datePublished": "2020-07-01T10:53:54+03:00", - "dateModified": "2020-07-08T16:30:40+03:00", + "dateModified": "2020-07-09T09:35:58+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -474,14 +474,61 @@ dc.contributor.author,correction
dspace=# \COPY (SELECT DISTINCT text_value as "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE resource_type_id = 2 AND metadata_field_id = 211 GROUP BY text_value ORDER BY count DESC) to /tmp/2020-07-08-affiliations.csv WITH CSV HEADER;
 
$ ./ror-lookup.py -i /tmp/2020-07-08-affiliations.txt -r ror.json -o 2020-07-08-affiliations-ror.csv -d
-$ ./ror-lookup.py -i /tmp/2020-07-08-affiliations.txt -r ror.json -o 2020-07-08-affiliations-ror.csv -d
-$ csvgrep -c 2 -m true 2020-07-08-affiliations-ror.csv | wc -l 
-1378
-$ csvgrep -c 2 -m false 2020-07-08-affiliations-ror.csv | wc -l
-4490
+$ wc -l /tmp/2020-07-08-affiliations.txt 
+5866 /tmp/2020-07-08-affiliations.txt
+$ csvgrep -c matched -m true 2020-07-08-affiliations-ror.csv | wc -l 
+1406
+$ csvgrep -c matched -m false 2020-07-08-affiliations-ror.csv | wc -l
+4462
+
+

2020-07-09

+ +
$ wc -l /tmp/2020-07-08-affiliations.txt 
+5866 /tmp/2020-07-08-affiliations.txt
+$ csvgrep -c matched -m true 2020-07-08-affiliations-ror.csv | wc -l 
+1516
+$ csvgrep -c matched -m false 2020-07-08-affiliations-ror.csv | wc -l
+4352
+
+
$ ./fix-metadata-values.py -i /tmp/2020-07-09-fix-90-cip-authors.csv -db dspace -u dspace -p 'fuuu' -f dc.contributor.author -t correction -m 3
+
+
$ ./fix-metadata-values.py -i /tmp/2020-07-07-fix-110-sponsors.csv -db dspace -u dspace -p 'fuuu' -f dc.description.sponsorship -t 'correct/action' -m 29
+$ ./delete-metadata-values.py -i /tmp/2020-07-07-delete-90-sponsors.csv -db dspace -u dspace -p 'fuuu' -f dc.description.sponsorship -m 29
+
+
$ time chrt -b 0 dspace index-discovery -b
+
+real    94m21.413s
+user    9m40.364s
+sys     2m37.246s
+
+
$ ./crossref-funders-lookup.py -i 2020-07-09-cgspace-sponsors.txt -o 2020-07-09-cgspace-sponsors-crossref.csv -d -e a.orth@cgiar.org
+$ wc -l 2020-07-09-cgspace-sponsors.txt
+534 2020-07-09-cgspace-sponsors.txt
+$ csvgrep -c matched -m true 2020-07-09-cgspace-sponsors-crossref.csv | wc -l 
+174
 
diff --git a/docs/categories/index.html b/docs/categories/index.html index fcee507ee..4a8fcbc23 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 25be03d46..05bd59b95 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 59a5e0d0b..e1206f7f1 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 81712f775..d4aa72153 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index ce4dc4df1..bb38392a9 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 9a84453ef..b45afb379 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index e7f3bc9d0..6203fb735 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 01b09d621..14208c2d0 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 6f873b1e3..a1c98dd1d 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 662eec260..b989647be 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/page/6/index.html 
b/docs/page/6/index.html index a2318b44f..4e6f7d2a1 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index cae6cd884..d23f8fbd6 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index b51390c10..4085a7eeb 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 61ca08002..68dcd46da 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index c60fe8448..504830286 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index d18d521c0..5298848b3 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 6570cc035..75bf410c0 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -9,7 +9,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 94253d775..b04675bff 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,27 +4,27 @@ https://alanorth.github.io/cgspace-notes/categories/ - 2020-07-08T16:30:40+03:00 + 2020-07-09T09:35:58+03:00 https://alanorth.github.io/cgspace-notes/ - 2020-07-08T16:30:40+03:00 + 2020-07-09T09:35:58+03:00 https://alanorth.github.io/cgspace-notes/2020-07/ - 2020-07-08T16:30:40+03:00 + 2020-07-09T09:35:58+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2020-07-08T16:30:40+03:00 + 2020-07-09T09:35:58+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2020-07-08T16:30:40+03:00 + 2020-07-09T09:35:58+03:00