diff --git a/content/posts/2023-02.md b/content/posts/2023-02.md index 2e710ffec..fec770645 100644 --- a/content/posts/2023-02.md +++ b/content/posts/2023-02.md @@ -328,4 +328,41 @@ org.apache.solr.client.solrj.impl.HttpSolrServer$RemoteSolrException: missing re 2023 200 ``` +- Start reviewing and fixing metadata for Sam's ~250 CAS publications from last year + - Both Abenet and Peter have already looked at them and Sam has been waiting for months on this + +## 2023-02-22 + +- Continue proofing CAS records for Sam + - I downloaded all the PDFs manually and checked the issue dates for each from the PDF, noting some that had licenses, ISBNs, etc + - I combined the title, abstract, and system subjects into one column to mine them for AGROVOC terms: + +```console +toLowercase(value) + toLowercase(cells["dcterms.abstract"].value) + toLowercase(cells["cg.subject.system"].value.replace("||", " ")) +``` + +- Then I extracted a list of AGROVOC terms the same way I did in [August, 2022]({{< relref "2022-08.md" >}}) and used this Jython code to extract matching terms: + +```python +import re + +with open(r"/tmp/agrovoc-subjects.txt",'r') as f : + terms = [name.rstrip().lower() for name in f] + +return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())]) +``` + +- Then I used [this cool Jython to remove duplicate metadata values](https://stackoverflow.com/questions/15419080/openrefine-remove-duplicates-from-list-with-jython): + +```python +deduped_list = list(set(value.split("||"))) +return '||'.join(map(str, deduped_list)) +``` + +- Then I did the same with countries, woooooo! +- I checked for duplicates and found forty-one +- I just stumbled upon UNTERM, which provides the official list of countries for the UN General Assembly, including a downloadable Excel with the short and formal names in all UN languages: https://unterm.un.org/unterm2/en/country +- I created a [pull request to add common names for Iran, Laos, and Syria on the Debian iso-codes package](https://salsa.debian.org/iso-codes-team/iso-codes/-/merge_requests/32) + - These are remarked upon in the ISO.org online browsing platform for ISO 3166-1 + diff --git a/docs/2022-08/index.html b/docs/2022-08/index.html index 28dd0e4d9..384562834 100644 --- a/docs/2022-08/index.html +++ b/docs/2022-08/index.html @@ -14,7 +14,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago - + @@ -34,9 +34,9 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago "@type": "BlogPosting", "headline": "August, 2022", "url": "https://alanorth.github.io/cgspace-notes/2022-08/", - "wordCount": "2706", + "wordCount": "2704", "datePublished": "2022-08-01T10:22:36+03:00", - "dateModified": "2022-09-27T14:35:26+03:00", + "dateModified": "2023-02-22T11:59:48+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -311,7 +311,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago -
with open(r"/tmp/cgspace-countries.txt",'r') as f : 
+
with open(r"/tmp/cgspace-countries.txt",'r') as f:
     countries = [name.rstrip().lower() for name in f]
 
 return "||".join([x for x in value.split(' ') if x.lower() in countries])
@@ -320,7 +320,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
 
 
import re
 
-with open(r"/tmp/agrovoc-subjects.txt",'r') as f : 
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f:
     terms = [name.rstrip().lower() for name in f]
 
 return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
diff --git a/docs/2023-02/index.html b/docs/2023-02/index.html
index 355689546..ce35cf7b8 100644
--- a/docs/2023-02/index.html
+++ b/docs/2023-02/index.html
@@ -18,7 +18,7 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
 
 
 
-
+
 
 
 
@@ -42,9 +42,9 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
   "@type": "BlogPosting",
   "headline": "February, 2023",
   "url": "https://alanorth.github.io/cgspace-notes/2023-02/",
-  "wordCount": "2333",
+  "wordCount": "2566",
   "datePublished": "2023-02-01T10:57:36+03:00",
-  "dateModified": "2023-02-15T19:47:13+03:00",
+  "dateModified": "2023-02-21T20:46:53+03:00",
   "author": {
     "@type": "Person",
     "name": "Alan Orth"
@@ -508,7 +508,48 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
 
 
# grep 'RTB website BOT' /var/log/nginx/rest.log | awk '{print $9}' | sort | uniq -c | sort -h
    2023 200
-
+
    +
  • Start reviewing and fixing metadata for Sam’s ~250 CAS publications from last year +
      +
    • Both Abenet and Peter have already looked at them and Sam has been waiting for months on this
    • +
    +
  • +
+

2023-02-22

+
    +
  • Continue proofing CAS records for Sam +
      +
    • I downloaded all the PDFs manually and checked the issue dates for each from the PDF, noting some that had licenses, ISBNs, etc
    • +
    • I combined the title, abstract, and system subjects into one column to mine them for AGROVOC terms:
    • +
    +
  • +
+
toLowercase(value) + toLowercase(cells["dcterms.abstract"].value) + toLowercase(cells["cg.subject.system"].value.replace("||", " "))
+
    +
  • Then I extracted a list of AGROVOC terms the same way I did in August, 2022 and used this Jython code to extract matching terms:
  • +
+
import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f : 
+    terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+
+
deduped_list = list(set(value.split("||")))
+return '||'.join(map(str, deduped_list))
+
+ diff --git a/docs/categories/index.html b/docs/categories/index.html index 912d07caf..b721670ba 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index ce23dabd3..b2d620f47 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index d46a2d839..3584c4a0d 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 15ac92603..954941368 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 64f00b47e..862ce954c 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 4cf1c6e57..4dcd814cd 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 5270c1ac1..5d531ce79 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html index fd22b8025..6070b487f 100644 --- a/docs/categories/notes/page/7/index.html +++ b/docs/categories/notes/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index 4616f1616..939beac72 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/10/index.html b/docs/page/10/index.html index 7c5c9ff30..52219c6d7 100644 --- a/docs/page/10/index.html +++ b/docs/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 0f3391a81..21d91f441 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index f6f19e485..1f9c39229 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 4f1670e89..a9b3c50d5 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index c3b13112f..67cfbea7c 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index e20b29065..5335cbb5d 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 9f70064d4..358bdb18a 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index 6a7eae01b..a94b9aa0f 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index fba1bed8c..cef2eb960 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index c4c1130e4..843d4df1f 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/10/index.html b/docs/posts/page/10/index.html index 3c05be58b..fcdc038be 100644 --- a/docs/posts/page/10/index.html +++ b/docs/posts/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index d7d1ea6ea..51d8c131f 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index b2b78ac0b..80b0b2b6f 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 07e56d283..75835b6cb 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 14ada67a9..1e292a746 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index c1be48781..f0a1d100e 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 9e98df96c..733c41afc 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index c1deaa208..269ea7c90 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index f5df103c8..08cb2cd52 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 779ffd1d6..d0805691f 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2023-02-15T19:47:13+03:00 + 2023-02-22T11:59:48+03:00 https://alanorth.github.io/cgspace-notes/ - 2023-02-15T19:47:13+03:00 + 2023-02-22T11:59:48+03:00 https://alanorth.github.io/cgspace-notes/2023-02/ - 2023-02-15T19:47:13+03:00 + 2023-02-21T20:46:53+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2023-02-15T19:47:13+03:00 + 2023-02-22T11:59:48+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2023-02-15T19:47:13+03:00 + 2023-02-22T11:59:48+03:00 https://alanorth.github.io/cgspace-notes/2023-01/ 2023-01-31T22:20:38+03:00 @@ -33,7 +33,7 @@ 2022-09-30T17:29:50+03:00 https://alanorth.github.io/cgspace-notes/2022-08/ - 2022-09-27T14:35:26+03:00 + 2023-02-22T11:59:48+03:00 https://alanorth.github.io/cgspace-notes/2022-07/ 2022-07-31T15:49:35+03:00