From 5be21953257cb825b750158b377a4a9e3630c043 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 25 Apr 2024 12:49:19 +0300 Subject: [PATCH] Add fix for normalizing DOIs --- CHANGELOG.md | 3 ++ README.md | 2 +- csv_metadata_quality/app.py | 5 +++ csv_metadata_quality/fix.py | 71 +++++++++++++++++++++++++++++++++++++ data/test.csv | 3 ++ tests/test_fix.py | 8 +++++ 6 files changed, 91 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8aab5a4..75504ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## Unreleased +### Added +- Ability to normalize DOIs to https://doi.org URI format + ### Fixed - Fixed regex so we don't run the invalid multi-value separator fix on `dcterms.bibliographicCitation` fields diff --git a/README.md b/README.md index a87ca5d..64fd061 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite: - Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`) - Remove duplicate metadata values - Check for duplicate items, using the title, type, and date issued as an indicator +- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format ## Installation The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org): @@ -125,7 +126,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib - Better logging, for example with INFO, WARN, and ERR levels - Verbose, debug, or quiet options - Warn if an author is shorter than 3 characters? -- Validate DOIs? Normalize to https://doi.org format? 
# Ordered rewrite rules applied to each stripped, lowercased DOI value. The
# order matters: the scheme and host are fixed first, then prefixed and bare
# DOIs are expanded to full URIs.
_DOI_REWRITE_RULES = (
    # Convert to HTTPS
    (re.compile(r"^http://"), "https://"),
    # Convert dx.doi.org to doi.org
    (re.compile(r"dx\.doi\.org"), "doi.org"),
    # Replace values like "doi: 10.11648/j.jps.20140201.14". Any amount of
    # whitespace (including none) is accepted after the colon so that
    # "doi:10.11648/..." is normalized as well.
    (re.compile(r"^doi:\s*10\."), "https://doi.org/10."),
    # Replace bare values like "10.3390/foods12010115"
    (re.compile(r"^10\."), "https://doi.org/10."),
)


def normalize_dois(field):
    """Normalize DOIs.

    DOIs are meant to be globally unique identifiers. They are case insensitive,
    but in order to compare them robustly they should be normalized to a common
    format:

        - strip leading and trailing whitespace
        - lowercase all ASCII characters
        - convert all variations to https://doi.org/10.xxxx/xxxx URI format

    The field may hold several DOIs separated by "||"; each one is normalized
    independently and the values are joined again with "||". A message is
    printed for every value that was changed.

    Return string with normalized DOI(s), or None if the field is missing.

    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Hold the normalized values, in their original order
    new_values = []

    # Iterate over all values (most items will only have one DOI)
    for value in field.split("||"):
        # Strip leading and trailing whitespace, then lowercase
        new_value = value.strip().lower()

        # A substitution is a no-op when its pattern does not match, so there
        # is no need to test for a match first.
        for pattern, replacement in _DOI_REWRITE_RULES:
            new_value = pattern.sub(replacement, new_value)

        if new_value != value:
            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")

        new_values.append(new_value)

    return "||".join(new_values)
def test_fix_normalize_dois():
    """Test normalizing DOIs.

    Covers each variation that has a fixture in data/test.csv: a "doi: "
    prefixed value, an HTTP dx.doi.org URL, and an upper case bare DOI.
    """

    cases = {
        "doi: 10.11648/j.jps.20140201.14": "https://doi.org/10.11648/j.jps.20140201.14",
        "http://dx.doi.org/10.1016/j.envc.2023.100794": "https://doi.org/10.1016/j.envc.2023.100794",
        "10.19103/AS.2018.0043.16": "https://doi.org/10.19103/as.2018.0043.16",
    }

    for value, expected in cases.items():
        assert fix.normalize_dois(value) == expected