From 92ff0ee51b44d666c200f4283a1e2d56f96bb7a8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 25 Jun 2024 11:54:09 +0300 Subject: [PATCH] Normalize DOIs with %2f These seem to be incorrectly URL encoded. --- csv_metadata_quality/fix.py | 7 +++++++ data/test.csv | 1 + 2 files changed, 8 insertions(+) diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 59e0b14..93a673e 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -451,6 +451,13 @@ def normalize_dois(field): if match: new_value = re.sub(pattern, "doi.org", new_value) + # Convert erroneous %2f to / + pattern = re.compile("%2f") + match = re.findall(pattern, new_value) + + if match: + new_value = re.sub(pattern, "/", new_value) + # Replace values like doi: 10.11648/j.jps.20140201.14 pattern = re.compile(r"^doi: 10\.") match = re.findall(pattern, new_value) diff --git a/data/test.csv b/data/test.csv index 86c9def..a1a54f3 100644 --- a/data/test.csv +++ b/data/test.csv @@ -40,3 +40,4 @@ Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,East DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,, DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,, Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,, +DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,