1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-21 11:42:20 +01:00

Normalize DOIs with %2f

These seem to be incorrectly URL encoded.
This commit is contained in:
Alan Orth 2024-06-25 11:54:09 +03:00
parent ae38a826ec
commit 92ff0ee51b
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 8 additions and 0 deletions

View File

@ -451,6 +451,13 @@ def normalize_dois(field):
if match:
new_value = re.sub(pattern, "doi.org", new_value)
# Convert erroneous %2f to /
pattern = re.compile("%2f")
match = re.findall(pattern, new_value)
if match:
new_value = re.sub(pattern, "/", new_value)
# Replace values like doi: 10.11648/j.jps.20140201.14
pattern = re.compile(r"^doi: 10\.")
match = re.findall(pattern, new_value)

View File

@ -40,3 +40,4 @@ Subregion field shouldnt trigger region checks,2022-12-07,,,,,Kenya,,,,,,East
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region cg.coverage.subregion
40 DOI with colon 2024-04-23 doi: 10.11648/j.jps.20140201.14
41 Upper case bare DOI 2024-04-23 10.19103/AS.2018.0043.16
42 DOI with %2f 2024-06-25 https://doi.org/10.1016%2fj.envc.2023.100794
43