diff --git a/content/posts/2024-01.md b/content/posts/2024-01.md index 789adf0aa..b14d969b8 100644 --- a/content/posts/2024-01.md +++ b/content/posts/2024-01.md @@ -210,4 +210,166 @@ Time: 240.041 ms - I think we can move those to a new `cg.identifier.project` if we create one - The `cg.identifier.cpwfproject` field is similarly sparse, but the CCAFS ones are widely used +## 2024-01-12 + +- Export a list of affiliations to do some cleanup: + +```console +localhost/dspace7= ☘ \COPY (SELECT DISTINCT text_value AS "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 211 GROUP BY "cg.contributor.affiliation" ORDER BY count DESC) to /tmp/2024-01-affiliations.csv WITH CSV HEADER; +COPY 11719 +``` + +- I first did some clustering and editing in OpenRefine, then I'll import those back into CGSpace and then do another export +- Troubleshooting the statistics pages that aren't working on DSpace 7 + - On a hunch, I queried for Solr statistics documents that **did not have an `id` matching the 36-character UUID pattern**: + +```console +$ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&rows=0' +{ + "responseHeader":{ + "status":0, + "QTime":0, + "params":{ + "q":"-id:/.{36}/", + "rows":"0"}}, + "response":{"numFound":800167,"start":0,"numFoundExact":true,"docs":[] + }} +``` + +- They seem to come mostly from 2020, 2023, and 2024: + +```console +$ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&facet.range=time&facet=true&facet.range.start=2010-01-01T00:00:00Z&facet.range.end=NOW&facet.range.gap=%2B1YEAR&rows=0' +{ + "responseHeader":{ + "status":0, + "QTime":13, + "params":{ + "facet.range":"time", + "q":"-id:/.{36}/", + "facet.range.gap":"+1YEAR", + "rows":"0", + "facet":"true", + "facet.range.start":"2010-01-01T00:00:00Z", + "facet.range.end":"NOW"}}, + "response":{"numFound":800168,"start":0,"numFoundExact":true,"docs":[] + }, + 
"facet_counts":{ + "facet_queries":{}, + "facet_fields":{}, + "facet_ranges":{ + "time":{ + "counts":[ + "2010-01-01T00:00:00Z",0, + "2011-01-01T00:00:00Z",0, + "2012-01-01T00:00:00Z",0, + "2013-01-01T00:00:00Z",0, + "2014-01-01T00:00:00Z",0, + "2015-01-01T00:00:00Z",89, + "2016-01-01T00:00:00Z",11, + "2017-01-01T00:00:00Z",0, + "2018-01-01T00:00:00Z",0, + "2019-01-01T00:00:00Z",0, + "2020-01-01T00:00:00Z",1339, + "2021-01-01T00:00:00Z",0, + "2022-01-01T00:00:00Z",0, + "2023-01-01T00:00:00Z",653736, + "2024-01-01T00:00:00Z",144993], + "gap":"+1YEAR", + "start":"2010-01-01T00:00:00Z", + "end":"2025-01-01T00:00:00Z"}}, + "facet_intervals":{}, + "facet_heatmaps":{}}} +``` + +- They seem to come from 2023-08 until now (so way before we migrated to DSpace 7): + +```console +$ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&facet.range=time&facet=true&facet.range.start=2023-01-01T00:00:00Z&facet.range.end=NOW&facet.range.gap=%2B1MONTH&rows=0' +{ + "responseHeader":{ + "status":0, + "QTime":196, + "params":{ + "facet.range":"time", + "q":"-id:/.{36}/", + "facet.range.gap":"+1MONTH", + "rows":"0", + "facet":"true", + "facet.range.start":"2023-01-01T00:00:00Z", + "facet.range.end":"NOW"}}, + "response":{"numFound":800168,"start":0,"numFoundExact":true,"docs":[] + }, + "facet_counts":{ + "facet_queries":{}, + "facet_fields":{}, + "facet_ranges":{ + "time":{ + "counts":[ + "2023-01-01T00:00:00Z",1, + "2023-02-01T00:00:00Z",0, + "2023-03-01T00:00:00Z",0, + "2023-04-01T00:00:00Z",0, + "2023-05-01T00:00:00Z",0, + "2023-06-01T00:00:00Z",0, + "2023-07-01T00:00:00Z",0, + "2023-08-01T00:00:00Z",27621, + "2023-09-01T00:00:00Z",59165, + "2023-10-01T00:00:00Z",115338, + "2023-11-01T00:00:00Z",96147, + "2023-12-01T00:00:00Z",355464, + "2024-01-01T00:00:00Z",125429], + "gap":"+1MONTH", + "start":"2023-01-01T00:00:00Z", + "end":"2024-02-01T00:00:00Z"}}, + "facet_intervals":{}, + "facet_heatmaps":{}}} +``` + +- I see that we had 31,744 statistic events yesterday, 
and 799 have no `id`! +- I asked about this on Slack and will file an issue on GitHub if someone else also finds such records + - Several people said they have them, so it's a bug of some sort in DSpace, not our configuration + +## 2024-01-13 + +- Yesterday alone we had 37,000 unique IPs making requests to nginx + - I looked up the ASNs and found 6,000 IPs from this network in Amazon Singapore: 47.128.0.0/14 + +## 2024-01-15 + +- Investigating the CSS selector warning that I've seen in PM2 logs: + +```console +0|dspace-ui | 1 rules skipped due to selector errors: +0|dspace-ui | .custom-file-input:lang(en)~.custom-file-label -> unmatched pseudo-class :lang +``` + +- It seems to be a bug in Angular, as this selector comes from Bootstrap 4.6.x and is not invalid + - But that led me to a more interesting issue with `inlineCritical` optimization for styles in Angular SSR that might be responsible for causing high load in the frontend + - See: https://github.com/angular/angular/issues/42098 + - See: https://github.com/angular/universal/issues/2106 + - See: https://github.com/GoogleChromeLabs/critters/issues/78 +- Since the production site was flapping a lot I decided to try disabling inlineCriticalCss +- There have been on and off load issues with the Angular frontend today + - I think I will just block all data center network blocks for now + - In the last week I see almost 200,000 unique IPs: + +```console +# zcat -f /var/log/nginx/*access.log /var/log/nginx/*access.log.1 /var/log/nginx/*access.log.2.gz /var/log/nginx/*access.log.3.gz /var/log/nginx/*access.log.4.gz /var/log/nginx/*access.log.5.gz /var/log/nginx/*access.log.6.gz | awk '{print $1}' | sort -u | +tee /tmp/ips.txt | wc -l +196493 +``` + +- Looking these IPs up I see there are 18,000 coming from Comcast, 10,000 from AT&T, 4,110 from Charter, 3,500 from Cox and dozens of other residential IPs + - I highly doubt these are home users browsing CGSpace... 
seems super fishy + - Also, over 1,000 IPs from SpaceX Starlink in the last week. RIGHT + - I will temporarily add a few new datacenter ISP network blocks to our rate limit: + - 16509 Amazon-02 + - 701 UUNET + - 8075 Microsoft + - 15169 Google + - 14618 Amazon-AES + - 396982 Google Cloud + - The load on the server *immediately* dropped + diff --git a/docs/2024-01/index.html b/docs/2024-01/index.html index 317f27037..fa640728b 100644 --- a/docs/2024-01/index.html +++ b/docs/2024-01/index.html @@ -22,7 +22,7 @@ Work on IFPRI ISNAR archive cleanup - + @@ -50,9 +50,9 @@ Work on IFPRI ISNAR archive cleanup "@type": "BlogPosting", "headline": "January, 2024", "url": "https://alanorth.github.io/cgspace-notes/2024-01/", - "wordCount": "1306", + "wordCount": "1847", "datePublished": "2024-01-02T10:08:00+03:00", - "dateModified": "2024-01-10T08:34:16+03:00", + "dateModified": "2024-01-10T17:21:12+03:00", "author": { "@type": "Person", "name": "Alan Orth" @@ -340,6 +340,177 @@ Work on IFPRI ISNAR archive cleanup
  • I think we can move those to a new cg.identifier.project if we create one
  • The cg.identifier.cpwfproject field is similarly sparse, but the CCAFS ones are widely used
  • +

    2024-01-12

    + +
    localhost/dspace7= ☘ \COPY (SELECT DISTINCT text_value AS "cg.contributor.affiliation", count(*) FROM metadatavalue WHERE dspace_object_id in (SELECT dspace_object_id FROM item) AND metadata_field_id = 211 GROUP BY "cg.contributor.affiliation" ORDER BY count DESC) to /tmp/2024-01-affiliations.csv WITH CSV HEADER;
    +COPY 11719
    +
    +
    $ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&rows=0'
    +{
    +  "responseHeader":{
    +    "status":0,
    +    "QTime":0,
    +    "params":{
    +      "q":"-id:/.{36}/",
    +      "rows":"0"}},
    +  "response":{"numFound":800167,"start":0,"numFoundExact":true,"docs":[]
    +  }}
    +
    +
    $ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&facet.range=time&facet=true&facet.range.start=2010-01-01T00:00:00Z&facet.range.end=NOW&facet.range.gap=%2B1YEAR&rows=0'
    +{
    +  "responseHeader":{
    +    "status":0,
    +    "QTime":13,
    +    "params":{
    +      "facet.range":"time",
    +      "q":"-id:/.{36}/",
    +      "facet.range.gap":"+1YEAR",
    +      "rows":"0",
    +      "facet":"true",
    +      "facet.range.start":"2010-01-01T00:00:00Z",
    +      "facet.range.end":"NOW"}},
    +  "response":{"numFound":800168,"start":0,"numFoundExact":true,"docs":[]
    +  },
    +  "facet_counts":{
    +    "facet_queries":{},
    +    "facet_fields":{},
    +    "facet_ranges":{
    +      "time":{
    +        "counts":[
    +          "2010-01-01T00:00:00Z",0,
    +          "2011-01-01T00:00:00Z",0,
    +          "2012-01-01T00:00:00Z",0,
    +          "2013-01-01T00:00:00Z",0,
    +          "2014-01-01T00:00:00Z",0,
    +          "2015-01-01T00:00:00Z",89,
    +          "2016-01-01T00:00:00Z",11,
    +          "2017-01-01T00:00:00Z",0,
    +          "2018-01-01T00:00:00Z",0,
    +          "2019-01-01T00:00:00Z",0,
    +          "2020-01-01T00:00:00Z",1339,
    +          "2021-01-01T00:00:00Z",0,
    +          "2022-01-01T00:00:00Z",0,
    +          "2023-01-01T00:00:00Z",653736,
    +          "2024-01-01T00:00:00Z",144993],
    +        "gap":"+1YEAR",
    +        "start":"2010-01-01T00:00:00Z",
    +        "end":"2025-01-01T00:00:00Z"}},
    +    "facet_intervals":{},
    +    "facet_heatmaps":{}}}
    +
    +
    $ curl 'http://localhost:8983/solr/statistics/select?q=-id%3A%2F.\{36\}%2F&facet.range=time&facet=true&facet.range.start=2023-01-01T00:00:00Z&facet.range.end=NOW&facet.range.gap=%2B1MONTH&rows=0'
    +{
    +  "responseHeader":{
    +    "status":0,
    +    "QTime":196,
    +    "params":{
    +      "facet.range":"time",
    +      "q":"-id:/.{36}/",
    +      "facet.range.gap":"+1MONTH",
    +      "rows":"0",
    +      "facet":"true",
    +      "facet.range.start":"2023-01-01T00:00:00Z",
    +      "facet.range.end":"NOW"}},
    +  "response":{"numFound":800168,"start":0,"numFoundExact":true,"docs":[]
    +  },
    +  "facet_counts":{
    +    "facet_queries":{},
    +    "facet_fields":{},
    +    "facet_ranges":{
    +      "time":{
    +        "counts":[
    +          "2023-01-01T00:00:00Z",1,
    +          "2023-02-01T00:00:00Z",0,
    +          "2023-03-01T00:00:00Z",0,
    +          "2023-04-01T00:00:00Z",0,
    +          "2023-05-01T00:00:00Z",0,
    +          "2023-06-01T00:00:00Z",0,
    +          "2023-07-01T00:00:00Z",0,
    +          "2023-08-01T00:00:00Z",27621,
    +          "2023-09-01T00:00:00Z",59165,
    +          "2023-10-01T00:00:00Z",115338,
    +          "2023-11-01T00:00:00Z",96147,
    +          "2023-12-01T00:00:00Z",355464,
    +          "2024-01-01T00:00:00Z",125429],
    +        "gap":"+1MONTH",
    +        "start":"2023-01-01T00:00:00Z",
    +        "end":"2024-02-01T00:00:00Z"}},
    +    "facet_intervals":{},
    +    "facet_heatmaps":{}}}
    +
    +

    2024-01-13

    + +

    2024-01-15

    + +
    0|dspace-ui  | 1 rules skipped due to selector errors:
    +0|dspace-ui  |   .custom-file-input:lang(en)~.custom-file-label -> unmatched pseudo-class :lang
    +
    +
    # zcat -f /var/log/nginx/*access.log  /var/log/nginx/*access.log.1 /var/log/nginx/*access.log.2.gz /var/log/nginx/*access.log.3.gz /var/log/nginx/*access.log.4.gz /var/log/nginx/*access.log.5.gz /var/log/nginx/*access.log.6.gz | awk '{print $1}' | sort -u |
    +tee /tmp/ips.txt | wc -l
    +196493
    +
    diff --git a/docs/categories/index.html b/docs/categories/index.html index 05f6bd2a6..c19241b7c 100644 --- a/docs/categories/index.html +++ b/docs/categories/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/index.html b/docs/categories/notes/index.html index 71e632573..d5ae56049 100644 --- a/docs/categories/notes/index.html +++ b/docs/categories/notes/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/2/index.html b/docs/categories/notes/page/2/index.html index 72f5d57be..18e01a1e3 100644 --- a/docs/categories/notes/page/2/index.html +++ b/docs/categories/notes/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/3/index.html b/docs/categories/notes/page/3/index.html index 1b5c8a059..9f8462c1f 100644 --- a/docs/categories/notes/page/3/index.html +++ b/docs/categories/notes/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/4/index.html b/docs/categories/notes/page/4/index.html index 0a528dfe7..f9c84ba75 100644 --- a/docs/categories/notes/page/4/index.html +++ b/docs/categories/notes/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/5/index.html b/docs/categories/notes/page/5/index.html index 0da6e492a..033add2b7 100644 --- a/docs/categories/notes/page/5/index.html +++ b/docs/categories/notes/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/6/index.html b/docs/categories/notes/page/6/index.html index 03bacdc83..8177fb6b4 100644 --- a/docs/categories/notes/page/6/index.html +++ b/docs/categories/notes/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/7/index.html b/docs/categories/notes/page/7/index.html index 4051b42dd..b18d48ebc 100644 --- a/docs/categories/notes/page/7/index.html +++ b/docs/categories/notes/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/categories/notes/page/8/index.html b/docs/categories/notes/page/8/index.html index d98d64d2a..76955a0e6 100644 
--- a/docs/categories/notes/page/8/index.html +++ b/docs/categories/notes/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/index.html b/docs/index.html index a9c3a71b4..8fd286292 100644 --- a/docs/index.html +++ b/docs/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/10/index.html b/docs/page/10/index.html index 837965a69..203825c2d 100644 --- a/docs/page/10/index.html +++ b/docs/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/11/index.html b/docs/page/11/index.html index 7dbddcb21..c2c5065b8 100644 --- a/docs/page/11/index.html +++ b/docs/page/11/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/2/index.html b/docs/page/2/index.html index 445a3eb57..78afc6967 100644 --- a/docs/page/2/index.html +++ b/docs/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/3/index.html b/docs/page/3/index.html index 8278406e7..823821979 100644 --- a/docs/page/3/index.html +++ b/docs/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/4/index.html b/docs/page/4/index.html index 8fe3c9498..010a4dfc7 100644 --- a/docs/page/4/index.html +++ b/docs/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/5/index.html b/docs/page/5/index.html index 19caca806..a2440fbad 100644 --- a/docs/page/5/index.html +++ b/docs/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/6/index.html b/docs/page/6/index.html index 996138e49..d406d2f87 100644 --- a/docs/page/6/index.html +++ b/docs/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/7/index.html b/docs/page/7/index.html index 69f442168..fdb64d575 100644 --- a/docs/page/7/index.html +++ b/docs/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/8/index.html b/docs/page/8/index.html index d0973c5a4..63b8dc48f 100644 --- a/docs/page/8/index.html +++ b/docs/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/page/9/index.html b/docs/page/9/index.html index 56e049367..b23f337ee 100644 --- a/docs/page/9/index.html +++ b/docs/page/9/index.html @@ 
-10,7 +10,7 @@ - + diff --git a/docs/posts/index.html b/docs/posts/index.html index d32ccf122..57591ba3f 100644 --- a/docs/posts/index.html +++ b/docs/posts/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/10/index.html b/docs/posts/page/10/index.html index 35858ad75..7132ecef7 100644 --- a/docs/posts/page/10/index.html +++ b/docs/posts/page/10/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/11/index.html b/docs/posts/page/11/index.html index 5e154d789..8842f60a9 100644 --- a/docs/posts/page/11/index.html +++ b/docs/posts/page/11/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/2/index.html b/docs/posts/page/2/index.html index 8099ba973..6dcdd1465 100644 --- a/docs/posts/page/2/index.html +++ b/docs/posts/page/2/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/3/index.html b/docs/posts/page/3/index.html index 0f82d6270..cc7952675 100644 --- a/docs/posts/page/3/index.html +++ b/docs/posts/page/3/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/4/index.html b/docs/posts/page/4/index.html index 66e957917..90eaafd18 100644 --- a/docs/posts/page/4/index.html +++ b/docs/posts/page/4/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/5/index.html b/docs/posts/page/5/index.html index 3fbba1548..c7a92ffc2 100644 --- a/docs/posts/page/5/index.html +++ b/docs/posts/page/5/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/6/index.html b/docs/posts/page/6/index.html index 843b2cc88..94e2ef98c 100644 --- a/docs/posts/page/6/index.html +++ b/docs/posts/page/6/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/7/index.html b/docs/posts/page/7/index.html index 09892a075..da86ac8d8 100644 --- a/docs/posts/page/7/index.html +++ b/docs/posts/page/7/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/posts/page/8/index.html b/docs/posts/page/8/index.html index bf0b1ee41..9fed04a6f 100644 --- a/docs/posts/page/8/index.html +++ b/docs/posts/page/8/index.html @@ -10,7 +10,7 @@ - + diff --git 
a/docs/posts/page/9/index.html b/docs/posts/page/9/index.html index fc4083487..63d8e1be9 100644 --- a/docs/posts/page/9/index.html +++ b/docs/posts/page/9/index.html @@ -10,7 +10,7 @@ - + diff --git a/docs/sitemap.xml b/docs/sitemap.xml index abe7aeae0..200175e9f 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -3,19 +3,19 @@ xmlns:xhtml="http://www.w3.org/1999/xhtml"> https://alanorth.github.io/cgspace-notes/categories/ - 2024-01-10T08:34:16+03:00 + 2024-01-10T17:21:12+03:00 https://alanorth.github.io/cgspace-notes/ - 2024-01-10T08:34:16+03:00 + 2024-01-10T17:21:12+03:00 https://alanorth.github.io/cgspace-notes/2024-01/ - 2024-01-10T08:34:16+03:00 + 2024-01-10T17:21:12+03:00 https://alanorth.github.io/cgspace-notes/categories/notes/ - 2024-01-10T08:34:16+03:00 + 2024-01-10T17:21:12+03:00 https://alanorth.github.io/cgspace-notes/posts/ - 2024-01-10T08:34:16+03:00 + 2024-01-10T17:21:12+03:00 https://alanorth.github.io/cgspace-notes/2023-12/ 2023-12-29T12:08:57+03:00