diff --git a/content/posts/2023-02.md b/content/posts/2023-02.md index 2e710ffec..fec770645 100644 --- a/content/posts/2023-02.md +++ b/content/posts/2023-02.md @@ -328,4 +328,41 @@ org.apache.solr.client.solrj.impl.HttpSolrServer$RemoteSolrException: missing re 2023 200 ``` +- Start reviewing and fixing metadata for Sam's ~250 CAS publications from last year + - Both Abenet and Peter have already looked at them and Sam has been waiting for months on this + +## 2023-02-22 + +- Continue proofing CAS records for Sam + - I downloaded all the PDFs manually and checked the issue dates for each from the PDF, noting some that had licenses, ISBNs, etc + - I combined the title, abstract, and system subjects into one column to mine them for AGROVOC terms: + +```console +toLowercase(value) + toLowercase(cells["dcterms.abstract"].value) + toLowercase(cells["cg.subject.system"].value.replace("||", " ")) +``` + +- Then I extracted a list of AGROVOC terms the same way I did in [August, 2022]({{< relref "2022-08.md" >}}) and used this Jython code to extract matching terms: + +```python +import re + +with open(r"/tmp/agrovoc-subjects.txt",'r') as f : + terms = [name.rstrip().lower() for name in f] + +return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())]) +``` + +- Then I used [this cool Jython to remove duplicate metadata values](https://stackoverflow.com/questions/15419080/openrefine-remove-duplicates-from-list-with-jython): + +```python +deduped_list = list(set(value.split("||"))) +return '||'.join(map(str, deduped_list)) +``` + +- Then I did the same with countries, woooooo! 
+- I checked for duplicates and found forty-one +- I just stumbled upon UNTERM, which provides the official list of countries for the UN General Assembly, including a downloadable Excel with the short and formal names in all UN languages: https://unterm.un.org/unterm2/en/country +- I created a [pull request to add common names for Iran, Laos, and Syria on the Debian iso-codes package](https://salsa.debian.org/iso-codes-team/iso-codes/-/merge_requests/32) + - These are remarked upon in the ISO.org online browsing platform for ISO 3166-1 + diff --git a/docs/2022-08/index.html b/docs/2022-08/index.html index 28dd0e4d9..384562834 100644 --- a/docs/2022-08/index.html +++ b/docs/2022-08/index.html @@ -14,7 +14,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago - + @@ -34,9 +34,9 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago "@type": "BlogPosting", "headline": "August, 2022", "url": "https://alanorth.github.io/cgspace-notes/2022-08/", - "wordCount": "2706", + "wordCount": "2704", "datePublished": "2022-08-01T10:22:36+03:00", - "dateModified": "2022-09-27T14:35:26+03:00", + "dateModified": "2023-02-22T11:59:48+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -311,7 +311,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago -
with open(r"/tmp/cgspace-countries.txt",'r') as f :
+with open(r"/tmp/cgspace-countries.txt",'r') as f:
countries = [name.rstrip().lower() for name in f]
return "||".join([x for x in value.split(' ') if x.lower() in countries])
@@ -320,7 +320,7 @@ Our request to add CC-BY-3.0-IGO to SPDX was approved a few weeks ago
import re
-with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f:
terms = [name.rstrip().lower() for name in f]
return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
diff --git a/docs/2023-02/index.html b/docs/2023-02/index.html
index 355689546..ce35cf7b8 100644
--- a/docs/2023-02/index.html
+++ b/docs/2023-02/index.html
@@ -18,7 +18,7 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
-
+
@@ -42,9 +42,9 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
"@type": "BlogPosting",
"headline": "February, 2023",
"url": "https://alanorth.github.io/cgspace-notes/2023-02/",
- "wordCount": "2333",
+ "wordCount": "2566",
"datePublished": "2023-02-01T10:57:36+03:00",
- "dateModified": "2023-02-15T19:47:13+03:00",
+ "dateModified": "2023-02-21T20:46:53+03:00",
"author": {
"@type": "Person",
"name": "Alan Orth"
@@ -508,7 +508,48 @@ I want to try to expand my use of their data to journals, publishers, volumes, i
# grep 'RTB website BOT' /var/log/nginx/rest.log | awk '{print $9}' | sort | uniq -c | sort -h
2023 200
-
+
+- Start reviewing and fixing metadata for Sam’s ~250 CAS publications from last year
+
+- Both Abenet and Peter have already looked at them and Sam has been waiting for months on this
+
+
+
+2023-02-22
+
+- Continue proofing CAS records for Sam
+
+- I downloaded all the PDFs manually and checked each record's issue date against its PDF, noting some that had licenses, ISBNs, etc.
+- I combined the title, abstract, and system subjects into one column to mine them for AGROVOC terms:
+
+
+
+toLowercase(value) + toLowercase(cells["dcterms.abstract"].value) + toLowercase(cells["cg.subject.system"].value.replace("||", " "))
+
+- Then I extracted a list of AGROVOC terms the same way I did in August, 2022 and used this Jython code to extract matching terms:
+
+import re
+
+with open(r"/tmp/agrovoc-subjects.txt",'r') as f :
+ terms = [name.rstrip().lower() for name in f]
+
+return "||".join([term for term in terms if re.match(r".*\b" + term + r"\b.*", value.lower())])
+
+- Then I used this cool Jython to remove duplicate metadata values:
+
+deduped_list = list(set(value.split("||")))
+return '||'.join(map(str, deduped_list))
+
+- Then I did the same with countries, woooooo!
+- I checked for duplicates and found forty-one
+- I just stumbled upon UNTERM, which provides the official list of countries for the UN General Assembly, including a downloadable Excel file with the short and formal names in all UN languages: https://unterm.un.org/unterm2/en/country
+- I created a pull request to add common names for Iran, Laos, and Syria on the Debian iso-codes package
+
+- These are remarked upon in the ISO.org online browsing platform for ISO 3166-1
+
+
+
+
diff --git a/docs/categories/index.html b/docs/categories/index.html
index 912d07caf..b721670ba 100644
--- a/docs/categories/index.html
+++ b/docs/categories/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html
index ce23dabd3..b2d620f47 100644
--- a/docs/categories/notes/index.html
+++ b/docs/categories/notes/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html
index d46a2d839..3584c4a0d 100644
--- a/docs/categories/notes/page/2/index.html
+++ b/docs/categories/notes/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html
index 15ac92603..954941368 100644
--- a/docs/categories/notes/page/3/index.html
+++ b/docs/categories/notes/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html
index 64f00b47e..862ce954c 100644
--- a/docs/categories/notes/page/4/index.html
+++ b/docs/categories/notes/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html
index 4cf1c6e57..4dcd814cd 100644
--- a/docs/categories/notes/page/5/index.html
+++ b/docs/categories/notes/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html
index 5270c1ac1..5d531ce79 100644
--- a/docs/categories/notes/page/6/index.html
+++ b/docs/categories/notes/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html
index fd22b8025..6070b487f 100644
--- a/docs/categories/notes/page/7/index.html
+++ b/docs/categories/notes/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/index.html b/docs/index.html
index 4616f1616..939beac72 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/10/index.html b/docs/page/10/index.html
index 7c5c9ff30..52219c6d7 100644
--- a/docs/page/10/index.html
+++ b/docs/page/10/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/2/index.html b/docs/page/2/index.html
index 0f3391a81..21d91f441 100644
--- a/docs/page/2/index.html
+++ b/docs/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/3/index.html b/docs/page/3/index.html
index f6f19e485..1f9c39229 100644
--- a/docs/page/3/index.html
+++ b/docs/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/4/index.html b/docs/page/4/index.html
index 4f1670e89..a9b3c50d5 100644
--- a/docs/page/4/index.html
+++ b/docs/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/5/index.html b/docs/page/5/index.html
index c3b13112f..67cfbea7c 100644
--- a/docs/page/5/index.html
+++ b/docs/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/6/index.html b/docs/page/6/index.html
index e20b29065..5335cbb5d 100644
--- a/docs/page/6/index.html
+++ b/docs/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/7/index.html b/docs/page/7/index.html
index 9f70064d4..358bdb18a 100644
--- a/docs/page/7/index.html
+++ b/docs/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/8/index.html b/docs/page/8/index.html
index 6a7eae01b..a94b9aa0f 100644
--- a/docs/page/8/index.html
+++ b/docs/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/page/9/index.html b/docs/page/9/index.html
index fba1bed8c..cef2eb960 100644
--- a/docs/page/9/index.html
+++ b/docs/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/index.html b/docs/posts/index.html
index c4c1130e4..843d4df1f 100644
--- a/docs/posts/index.html
+++ b/docs/posts/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/10/index.html b/docs/posts/page/10/index.html
index 3c05be58b..fcdc038be 100644
--- a/docs/posts/page/10/index.html
+++ b/docs/posts/page/10/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html
index d7d1ea6ea..51d8c131f 100644
--- a/docs/posts/page/2/index.html
+++ b/docs/posts/page/2/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html
index b2b78ac0b..80b0b2b6f 100644
--- a/docs/posts/page/3/index.html
+++ b/docs/posts/page/3/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html
index 07e56d283..75835b6cb 100644
--- a/docs/posts/page/4/index.html
+++ b/docs/posts/page/4/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html
index 14ada67a9..1e292a746 100644
--- a/docs/posts/page/5/index.html
+++ b/docs/posts/page/5/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html
index c1be48781..f0a1d100e 100644
--- a/docs/posts/page/6/index.html
+++ b/docs/posts/page/6/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html
index 9e98df96c..733c41afc 100644
--- a/docs/posts/page/7/index.html
+++ b/docs/posts/page/7/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html
index c1deaa208..269ea7c90 100644
--- a/docs/posts/page/8/index.html
+++ b/docs/posts/page/8/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html
index f5df103c8..08cb2cd52 100644
--- a/docs/posts/page/9/index.html
+++ b/docs/posts/page/9/index.html
@@ -10,7 +10,7 @@
-
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 779ffd1d6..d0805691f 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -3,19 +3,19 @@
xmlns:xhtml="http://www.w3.org/1999/xhtml">
https://alanorth.github.io/cgspace-notes/categories/
- 2023-02-15T19:47:13+03:00
+ 2023-02-22T11:59:48+03:00
https://alanorth.github.io/cgspace-notes/
- 2023-02-15T19:47:13+03:00
+ 2023-02-22T11:59:48+03:00
https://alanorth.github.io/cgspace-notes/2023-02/
- 2023-02-15T19:47:13+03:00
+ 2023-02-21T20:46:53+03:00
https://alanorth.github.io/cgspace-notes/categories/notes/
- 2023-02-15T19:47:13+03:00
+ 2023-02-22T11:59:48+03:00
https://alanorth.github.io/cgspace-notes/posts/
- 2023-02-15T19:47:13+03:00
+ 2023-02-22T11:59:48+03:00
https://alanorth.github.io/cgspace-notes/2023-01/
2023-01-31T22:20:38+03:00
@@ -33,7 +33,7 @@
2022-09-30T17:29:50+03:00
https://alanorth.github.io/cgspace-notes/2022-08/
- 2022-09-27T14:35:26+03:00
+ 2023-02-22T11:59:48+03:00
https://alanorth.github.io/cgspace-notes/2022-07/
2022-07-31T15:49:35+03:00