mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 11:42:20 +01:00
Normalize DOIs with %2f
These seem to be incorrectly URL encoded.
This commit is contained in:
parent
ae38a826ec
commit
92ff0ee51b
@ -451,6 +451,13 @@ def normalize_dois(field):
|
||||
if match:
|
||||
new_value = re.sub(pattern, "doi.org", new_value)
|
||||
|
||||
# Convert erroneous %2f to /
|
||||
pattern = re.compile("%2f")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
||||
if match:
|
||||
new_value = re.sub(pattern, "/", new_value)
|
||||
|
||||
# Replace values like doi: 10.11648/j.jps.20140201.14
|
||||
pattern = re.compile(r"^doi: 10\.")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
@ -40,3 +40,4 @@ Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,East
|
||||
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
|
||||
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
|
||||
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
|
||||
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
|
||||
|
|
Loading…
Reference in New Issue
Block a user