Merge 56de42b090 into 2341c56c40

poetry.lock: run poetry update
Add fix for normalizing DOIs
2024-11-22 05:45:02 +01:00 · 2024-06-30 18:05:29 +00:00 · 2024-04-25 12:50:30 +03:00 · 2024-04-25 12:49:19 +03:00 · 2024-04-12 13:40:55 +03:00 · 2024-04-12 11:07:36 +03:00
8 changed files with 540 additions and 464 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## Unreleased
+### Added
+- Ability to normalize DOIs to https://doi.org URI format
+
 ### Fixed
 - Fixed regex so we don't run the invalid multi-value separator fix on
 `dcterms.bibliographicCitation` fields
--- a/README.md
+++ b/README.md
@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
 - Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
 - Remove duplicate metadata values
 - Check for duplicate items, using the title, type, and date issued as an indicator
+- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format

 ## Installation
 The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
@ -125,7 +126,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
 - Add tests for application invocation, ie `tests/test_app.py`?
 - Validate ISSNs or journal titles against CrossRef API?
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -141,6 +141,11 @@ def run(argv):
        # Fix: unnecessary Unicode
        df[column] = df[column].apply(fix.unnecessary_unicode)

+        # Fix: normalize DOIs
+        match = re.match(r"^.*?identifier\.doi.*$", column)
+        if match is not None:
+            df[column] = df[column].apply(fix.normalize_dois)
+
        # Fix: invalid and unnecessary multi-value separators. Skip the title
        # and abstract fields because "|" is used to indicate something like
        # a subtitle.
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -2,7 +2,7 @@

 import logging
 import re
-from datetime import datetime, timedelta
+from datetime import datetime

 import country_converter as coco
 import pandas as pd
@ -133,7 +133,7 @@ def suspicious_characters(field, field_name):
        return

    # List of suspicious characters, for example:  ́ˆ~`
-    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
+    suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]

    for character in suspicious_characters:
        # Find the position of the suspicious character in the string
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -395,3 +395,74 @@ def countries_match_regions(row, exclude):
                row[region_column_name] = "||".join(missing_regions)

    return row
+
+
+def normalize_dois(field):
+    """Normalize DOIs.
+
+    DOIs are meant to be globally unique identifiers. They are case insensitive,
+    but in order to compare them robustly they should be normalized to a common
+    format:
+
+        - strip leading and trailing whitespace
+        - lowercase the value
+        - convert all variations to https://doi.org/10.xxxx/xxxx URI format
+
+    The field may hold several values separated by "||". Each value is
+    normalized on its own and the values are joined back in their original
+    order.
+
+    Return string with normalized DOI(s), or None if the field is missing.
+
+    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Compile the patterns once per field instead of once per value
+    http_scheme = re.compile(r"^http://")
+    dx_host = re.compile(r"dx\.doi\.org")
+    # Matches a bare DOI ("10.3390/foods12010115") as well as the "doi:" label
+    # with any amount of whitespace after the colon ("doi:10.", "doi: 10.")
+    doi_prefix = re.compile(r"^(?:doi:\s*)?10\.")
+
+    # Holds the normalized values in their original order
+    new_values = []
+
+    # Iterate over all values (most items will only have one DOI)
+    for value in field.split("||"):
+        # Strip leading and trailing whitespace, then lowercase
+        new_value = value.strip().lower()
+
+        # Convert to HTTPS
+        new_value = http_scheme.sub("https://", new_value)
+
+        # Convert dx.doi.org to doi.org
+        new_value = dx_host.sub("doi.org", new_value)
+
+        # Replace values like doi: 10.11648/j.jps.20140201.14 and bare values
+        # like 10.3390/foods12010115
+        new_value = doi_prefix.sub("https://doi.org/10.", new_value)
+
+        # Report the original value whenever normalization changed it
+        if new_value != value:
+            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")
+
+        new_values.append(new_value)
+
+    return "||".join(new_values)
--- a/data/test.csv
+++ b/data/test.csv
@ -37,3 +37,6 @@ Mojibake,2021-03-18,,,,PublicaÃ§ao CIAT,,,,Report,,,,
 Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
 Country missing region,2021-12-08,,,,,Kenya,,,,,,,
 Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
+DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
+DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
+Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
--- a/poetry.lock
+++ b/poetry.lock
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
    series_correct = pd.Series(data=d_correct)

    pd.testing.assert_series_equal(result, series_correct)
+
+
+def test_fix_normalize_dois():
+    """A value labelled with "doi: " is rewritten as an https://doi.org URI."""
+
+    expected = "https://doi.org/10.11648/j.jps.20140201.14"
+    result = fix.normalize_dois("doi: 10.11648/j.jps.20140201.14")
+
+    assert result == expected
Author	SHA1	Message	Date
renovate[bot]	3329ce2535	Merge `56de42b090` into `2341c56c40`	2024-06-30 18:05:29 +00:00
Alan Orth	2341c56c40	poetry.lock: run poetry update	2024-04-25 12:50:30 +03:00
Alan Orth	5be2195325	Add fix for normalizing DOIs	2024-04-25 12:49:19 +03:00
Alan Orth	736948ed2c	csv_metadata_quality/check.py: run rye fmt	2024-04-12 13:40:55 +03:00
Alan Orth	ee0b448355	csv_metadata_quality/check.py: remove unused import	2024-04-12 11:07:36 +03:00