1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-12 17:17:03 +01:00

Compare commits

..

4 Commits

Author SHA1 Message Date
92ff0ee51b
Normalize DOIs with %2f
These seem to be incorrectly URL encoded.
2024-06-25 11:54:09 +03:00
ae38a826ec
csv_metadata_quality/fix.py: minor update to DOI fix
Normalize www.doi.org to doi.org in DOI field.
2024-06-25 11:48:45 +03:00
c1f630c298
Bump dependencies
All tests passing...
2024-06-18 22:17:38 +03:00
82b056f0ea
Use py3langid v0.3.0
2024-06-18 21:51:32 +03:00
5 changed files with 33 additions and 18 deletions

View File

@@ -444,6 +444,20 @@ def normalize_dois(field):
if match:
new_value = re.sub(pattern, "doi.org", new_value)
# Convert www.doi.org to doi.org
pattern = re.compile(r"www\.doi\.org")
match = re.findall(pattern, new_value)
if match:
new_value = re.sub(pattern, "doi.org", new_value)
# Convert erroneous %2f to /
pattern = re.compile("%2f")
match = re.findall(pattern, new_value)
if match:
new_value = re.sub(pattern, "/", new_value)
# Replace values like doi: 10.11648/j.jps.20140201.14
pattern = re.compile(r"^doi: 10\.")
match = re.findall(pattern, new_value)

View File

@@ -40,3 +40,4 @@ Subregion field shouldnt trigger region checks,2022-12-07,,,,,Kenya,,,,,,East
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region cg.coverage.subregion
40 DOI with colon 2024-04-23 doi: 10.11648/j.jps.20140201.14
41 Upper case bare DOI 2024-04-23 10.19103/AS.2018.0043.16
42 DOI with %2f 2024-06-25 https://doi.org/10.1016%2fj.envc.2023.100794
43

View File

@@ -8,14 +8,14 @@ authors = [
license= { file = "LICENSE.txt" }
dependencies = [
"pandas[feather,performance]~=2.2",
"python-stdnum~=1.19",
"requests~=2.31",
"requests-cache~=1.2",
"python-stdnum~=1.20",
"requests~=2.32",
"requests-cache~=1.2.1",
"colorama~=0.4",
"ftfy~=6.1",
"ftfy~=6.2.0",
"country-converter~=1.2",
"pycountry~=23.12",
"py3langid~=0.2",
"pycountry~=24.6.1",
"py3langid~=0.3",
]
readme = "README.md"
requires-python = ">= 3.9"

View File

@@ -70,7 +70,7 @@ leather==0.4.0
# via agate
libcst==1.4.0
# via fixit
llvmlite==0.42.0
llvmlite==0.43.0
# via numba
markdown-it-py==3.0.0
# via rich
@@ -80,11 +80,11 @@ mdurl==0.1.2
# via markdown-it-py
moreorless==0.4.0
# via fixit
numba==0.59.1
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==1.26.4
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
@@ -122,11 +122,11 @@ ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
py3langid==0.2.2
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==23.12.11
pycountry==24.6.1
# via csv-metadata-quality
pygments==2.18.0
# via ipython
@@ -149,7 +149,7 @@ pyyaml==6.0.1
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.0
requests-cache==1.2.1
# via csv-metadata-quality
rich==13.7.1
# via pytest-clarity

View File

@@ -28,13 +28,13 @@ ftfy==6.2.0
# via csv-metadata-quality
idna==3.7
# via requests
llvmlite==0.42.0
llvmlite==0.43.0
# via numba
numba==0.59.1
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==1.26.4
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
@@ -46,11 +46,11 @@ pandas==2.2.2
# via csv-metadata-quality
platformdirs==4.2.2
# via requests-cache
py3langid==0.2.2
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==23.12.11
pycountry==24.6.1
# via csv-metadata-quality
python-dateutil==2.9.0.post0
# via pandas
@@ -61,7 +61,7 @@ pytz==2024.1
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.0
requests-cache==1.2.1
# via csv-metadata-quality
six==1.16.0
# via python-dateutil