diff --git a/README.md b/README.md
index 30ceeb9..e6364f4 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@ DSpace curation tasks and other Java-based helpers used on the [CGSpace](https:/
 - **CountryCodeTagger**: add ISO 3166-1 Alpha2 country codes to items based on their existing country metadata
 - **FixJpgJpgThumbnails**: fix low-quality ".jpg.jpg" thumbnails by replacing them with their originals
 - **FixLowQualityThumbnails**: remove low-quality thumbnails when PDF bitstreams are present
+- **NormalizeDOIs**: normalize DOIs by stripping whitespace, lowercasing, and converting to https://doi.org/ format
 
 Tested on DSpace 7.6. Read more about the [DSpace curation system](https://wiki.lyrasis.org/display/DSDOC7x/Curation+System).
 
diff --git a/src/main/java/io/github/ilri/cgspace/ctasks/NormalizeDOIs.java b/src/main/java/io/github/ilri/cgspace/ctasks/NormalizeDOIs.java
new file mode 100644
index 0000000..99e6234
--- /dev/null
+++ b/src/main/java/io/github/ilri/cgspace/ctasks/NormalizeDOIs.java
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2024 Alan Orth
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+package io.github.ilri.cgspace.ctasks;
+
+import org.dspace.content.DSpaceObject;
+import org.dspace.content.Item;
+import org.dspace.content.MetadataValue;
+import org.dspace.core.Constants;
+import org.dspace.curate.AbstractCurationTask;
+import org.dspace.curate.Curator;
+import org.dspace.curate.Suspendable;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Attempt to normalize DOIs by stripping whitespace, lower casing, and
+ * converting to https://doi.org format. The reason is that DOIs are case
+ * insensitive and must be unique, which we can only guarantee if they are
+ * normalized to the same format.
+ *
+ * See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
+ *
+ * TODO: set curation to failed if invalid DOI submitted (and configure to reject in workflow)
+ * TODO: allow operation on communities and collections (currently only works on items)
+ *
+ * @author Alan Orth for the International Livestock Research Institute
+ * @version 7.6.1.1
+ * @since 7.6.1.1
+ */
+@Suspendable
+public class NormalizeDOIs extends AbstractCurationTask {
+    /**
+     * Normalize the cg.identifier.doi values of an item in place.
+     *
+     * @param dso the object to curate; anything that is not an item is skipped
+     * @return Curator.CURATE_SKIP for non-items and for items without DOIs,
+     *         Curator.CURATE_SUCCESS otherwise
+     * @throws IOException declared by the overridden signature; not thrown by the code in this method
+     */
+    @Override
+    public int perform(DSpaceObject dso) throws IOException {
+        if (dso.getType() == Constants.ITEM) {
+            Item item = (Item) dso;
+            String result;
+
+            // Keep track of whether we change metadata, and how many
+            boolean metadataChanged = false;
+            int count = 0;
+
+            // Hard coding the metadata field for now since I can't figure out how to read the taskProperty
+            List<MetadataValue> itemDOIs = itemService.getMetadataByMetadataString(item, "cg.identifier.doi");
+
+            // skip items that don't have DOIs
+            if (itemDOIs.isEmpty()) {
+                setResult("No DOIs, skipping");
+                return Curator.CURATE_SKIP;
+            } else {
+                for (MetadataValue itemDOI : itemDOIs) {
+                    // A value without text cannot be normalized (and would throw in the helper)
+                    if (itemDOI.getValue() == null) {
+                        continue;
+                    }
+
+                    String newDOI = getNormalizedDOI(itemDOI);
+
+                    // Check if the normalized DOI is different than the original
+                    if (!newDOI.equals(itemDOI.getValue())) {
+                        itemDOI.setValue(newDOI);
+                        metadataChanged = true;
+                        count++;
+                    }
+                }
+            }
+            // NOTE(review): values are only set on the MetadataValue objects; confirm they get persisted (no itemService.update call is made here)
+            if (metadataChanged) {
+                result = "Normalized " + count + " DOI(s)";
+            } else {
+                result = "All DOIs already normalized";
+            }
+            report(result);
+            setResult(result);
+
+            return Curator.CURATE_SUCCESS;
+        } else {
+            setResult("Object skipped");
+            return Curator.CURATE_SKIP;
+        }
+    }
+
+    /**
+     * Build the normalized form of a DOI metadata value.
+     *
+     * @param itemDOI metadata value whose text is a non-null DOI
+     * @return the lower-cased, stripped value, rewritten to an https://doi.org/ URL where a known form is recognized
+     */
+    private static String getNormalizedDOI(MetadataValue itemDOI) {
+        // 1. Convert to lowercase (Locale.ROOT so the result does not depend on the JVM default locale)
+        String newDOI = itemDOI.getValue().toLowerCase(Locale.ROOT);
+        // 2. Strip leading and trailing whitespace
+        newDOI = newDOI.strip();
+        // 3. Convert to HTTPS
+        newDOI = newDOI.replace("http://", "https://");
+        // 4. Prefer doi.org to dx.doi.org
+        newDOI = newDOI.replace("dx.doi.org", "doi.org");
+        // 5. Replace values like doi: 10.11648/j.jps.20140201.14 (any amount of whitespace after the colon)
+        newDOI = newDOI.replaceAll("^doi:\\s*10\\.", "https://doi.org/10.");
+        // 6. Replace values like 10.3390/foods12010115
+        newDOI = newDOI.replaceAll("^10\\.", "https://doi.org/10.");
+
+        return newDOI;
+    }
+}
diff --git a/src/main/java/io/github/ilri/cgspace/ctasks/README.md b/src/main/java/io/github/ilri/cgspace/ctasks/README.md
index 996e6eb..8aa2d72 100644
--- a/src/main/java/io/github/ilri/cgspace/ctasks/README.md
+++ b/src/main/java/io/github/ilri/cgspace/ctasks/README.md
@@ -2,6 +2,7 @@
 DSpace curation tasks used on the [CGSpace](https://cgspace.cgiar.org) institutional repository:
 
 - **CountryCodeTagger**: add ISO 3166-1 Alpha2 country codes to items based on their existing country metadata
+- **NormalizeDOIs**: normalize DOIs by stripping whitespace, lowercasing, and converting to https://doi.org/ format
 
 Tested on DSpace 7.6. Read more about the [DSpace curation system](https://wiki.lyrasis.org/display/DSDOC5x/Curation+System).
 
@@ -39,6 +40,7 @@ Add the curation task to DSpace's `config/modules/curate.cfg`:
 ```
 plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger
 plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.CountryCodeTagger = countrycodetagger.force
+plugin.named.org.dspace.curate.CurationTask = io.github.ilri.cgspace.ctasks.NormalizeDOIs = normalizedois
 ```
 
 And then add the following variables to your `local.cfg` or some other [configuration file that is included](https://wiki.lyrasis.org/display/DSDOC6x/Configuration+Reference#ConfigurationReference-IncludingotherPropertyFiles):