From ab8cb272ea56be76893b9bffca69df955775894c Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sat, 9 Oct 2021 22:00:59 +0300 Subject: [PATCH] Add notes for 2021-10-09 --- content/posts/2021-10.md | 27 +++++++++++++++++++++++++ docs/2021-10/index.html | 27 ++++++++++++++++++++++--- docs/categories/index.html | 2 +- docs/categories/notes/index.html | 2 +- docs/categories/notes/page/2/index.html | 2 +- docs/categories/notes/page/3/index.html | 2 +- docs/categories/notes/page/4/index.html | 2 +- docs/categories/notes/page/5/index.html | 2 +- docs/categories/notes/page/6/index.html | 2 +- docs/index.html | 2 +- docs/page/2/index.html | 2 +- docs/page/3/index.html | 2 +- docs/page/4/index.html | 2 +- docs/page/5/index.html | 2 +- docs/page/6/index.html | 2 +- docs/page/7/index.html | 2 +- docs/page/8/index.html | 2 +- docs/posts/index.html | 2 +- docs/posts/page/2/index.html | 2 +- docs/posts/page/3/index.html | 2 +- docs/posts/page/4/index.html | 2 +- docs/posts/page/5/index.html | 2 +- docs/posts/page/6/index.html | 2 +- docs/posts/page/7/index.html | 2 +- docs/posts/page/8/index.html | 2 +- docs/sitemap.xml | 10 ++++----- 26 files changed, 79 insertions(+), 31 deletions(-) diff --git a/content/posts/2021-10.md b/content/posts/2021-10.md index 0097172b4..50794a98c 100644 --- a/content/posts/2021-10.md +++ b/content/posts/2021-10.md @@ -248,5 +248,32 @@ if(cells['dcterms.subject[en_US]'].value == cells['dcterms.subject[en_Fu]'].valu - For these rows I starred them and then blanked out the original field so DSpace would see it as a removal, and add the new column - After these are uploaded I will normalize the `text_lang` fields in PostgreSQL again +- I did the same for CIAT but there were over 7,000 duplicate metadata values! Hard to believe: + +```console +$ grep -c 'Removing duplicate value' /tmp/out.log +7720 +``` + +- I applied these to the CIAT community, so in total that's over 8,000 duplicate metadata values removed in a handful of fields... + +## 2021-10-09 + +- I did similar metadata cleanups for CCAFS and IITA too, but there were only a few hundred duplicates there +- Also of note, there are some other fixes too, for example in IITA's community: + +```console +$ grep -c -E '(Fixing|Removing) (duplicate|excessive|invalid)' /tmp/out.log +249 +``` + +- I ran a full Discovery re-indexing on CGSpace +- Then I exported all of CGSpace and extracted the ISSNs and ISBNs: + +```console +$ csvcut -c 'id,cg.issn[en_US],dc.identifier.issn[en_US],cg.isbn[en_US],dc.identifier.isbn[en_US]' /tmp/cgspace.csv > /tmp/cgspace-issn-isbn.csv +``` + +- I did cleanups on about seventy items with invalid and mixed ISSNs/ISBNs diff --git a/docs/2021-10/index.html b/docs/2021-10/index.html index fb6d7ded5..6e2643a0b 100644 --- a/docs/2021-10/index.html +++ b/docs/2021-10/index.html @@ -25,7 +25,7 @@ So we have 1879/7100 (26.46%) matching already - + @@ -56,9 +56,9 @@ So we have 1879/7100 (26.46%) matching already "@type": "BlogPosting", "headline": "October, 2021", "url": "https://alanorth.github.io/cgspace-notes/2021-10/", - "wordCount": "1754", + "wordCount": "1882", "datePublished": "2021-10-01T11:14:07+03:00", - "dateModified": "2021-10-07T08:27:39+03:00", + "dateModified": "2021-10-08T17:15:17+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -390,6 +390,27 @@ $ csvjoin -c id /tmp/ilri-deduplicated-items.csv /tmp/ilri-deduplicated-items-cl
  • After these are uploaded I will normalize the text_lang fields in PostgreSQL again
  • +
  • I did the same for CIAT but there were over 7,000 duplicate metadata values! Hard to believe:
  • + +
    $ grep -c 'Removing duplicate value' /tmp/out.log
    +7720
    +
    +

    2021-10-09

    + +
    $ grep -c -E '(Fixing|Removing) (duplicate|excessive|invalid)' /tmp/out.log
    +249
    +
    +
    $ csvcut -c 'id,cg.issn[en_US],dc.identifier.issn[en_US],cg.isbn[en_US],dc.identifier.isbn[en_US]' /tmp/cgspace.csv > /tmp/cgspace-issn-isbn.csv
    +
    diff --git a/docs/categories/index.html b/docs/categories/index.html index 89ba2d206..1f2b0bdfd 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 1dbc47de6..1c4dac8ff 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 67b6e16c2..74442dbc1 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 9796727f6..41fbac0b8 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 8dd297991..4ab040d59 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index fb5c4666c..8c1946310 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 2c36b6ecf..dfca41797 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index e85b0987d..7261e0d7f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 1d5392725..940bfd49c 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 04506761a..2e5ace70d 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 8d6be6044..ed9dd1d48 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 78288e46e..5e1ba13f4 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 0f4f01b87..670cafb5b 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 6675623e8..99aafc62a 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index cbb211dc3..1dc7e8af4 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index 2a81e7cb5..fac1a4bb8 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 347d35c00..13a1e4da8 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 312ed9ea6..186bf5e97 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 587752e36..9795d4785 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index b4f2f2d19..f7eed27c6 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index b26714806..3c137302e 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index c247c6ec4..b74320e6a 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index 2f884d788..0a01d4216 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index b30504859..7cff60982 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2021-10-07T08:27:39+03:00 + 2021-10-08T17:15:17+03:00 https://alanorth.github.io/cgspace-notes/ - 2021-10-07T08:27:39+03:00 + 2021-10-08T17:15:17+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2021-10-07T08:27:39+03:00 + 2021-10-08T17:15:17+03:00 https://alanorth.github.io/cgspace-notes/2021-10/ - 2021-10-07T08:27:39+03:00 + 2021-10-08T17:15:17+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2021-10-07T08:27:39+03:00 + 2021-10-08T17:15:17+03:00 https://alanorth.github.io/cgspace-notes/2021-09/ 2021-10-04T11:10:54+03:00