mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-09 15:55:44 +01:00
Compare commits
5 Commits
272623d818
...
f7a3e8fbf6
Author | SHA1 | Date | |
---|---|---|---|
|
f7a3e8fbf6 | ||
2341c56c40 | |||
5be2195325 | |||
736948ed2c | |||
ee0b448355 |
@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
### Added
|
||||||
|
- Ability to normalize DOIs to https://doi.org URI format
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Fixed regex so we don't run the invalid multi-value separator fix on
|
- Fixed regex so we don't run the invalid multi-value separator fix on
|
||||||
`dcterms.bibliographicCitation` fields
|
`dcterms.bibliographicCitation` fields
|
||||||
|
@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
|
|||||||
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
||||||
- Remove duplicate metadata values
|
- Remove duplicate metadata values
|
||||||
- Check for duplicate items, using the title, type, and date issued as an indicator
|
- Check for duplicate items, using the title, type, and date issued as an indicator
|
||||||
|
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
||||||
@ -125,7 +126,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Better logging, for example with INFO, WARN, and ERR levels
|
- Better logging, for example with INFO, WARN, and ERR levels
|
||||||
- Verbose, debug, or quiet options
|
- Verbose, debug, or quiet options
|
||||||
- Warn if an author is shorter than 3 characters?
|
- Warn if an author is shorter than 3 characters?
|
||||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
|
||||||
- Warn if two items use the same file in `filename` column
|
- Warn if two items use the same file in `filename` column
|
||||||
- Add tests for application invocation, ie `tests/test_app.py`?
|
- Add tests for application invocation, ie `tests/test_app.py`?
|
||||||
- Validate ISSNs or journal titles against CrossRef API?
|
- Validate ISSNs or journal titles against CrossRef API?
|
||||||
|
@ -141,6 +141,11 @@ def run(argv):
|
|||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
|
# Fix: normalize DOIs
|
||||||
|
match = re.match(r"^.*?identifier\.doi.*$", column)
|
||||||
|
if match is not None:
|
||||||
|
df[column] = df[column].apply(fix.normalize_dois)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
# and abstract fields because "|" is used to indicate something like
|
# and abstract fields because "|" is used to indicate something like
|
||||||
# a subtitle.
|
# a subtitle.
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime
|
||||||
|
|
||||||
import country_converter as coco
|
import country_converter as coco
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -133,7 +133,7 @@ def suspicious_characters(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# List of suspicious characters, for example: ́ˆ~`
|
# List of suspicious characters, for example: ́ˆ~`
|
||||||
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
|
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]
|
||||||
|
|
||||||
for character in suspicious_characters:
|
for character in suspicious_characters:
|
||||||
# Find the position of the suspicious character in the string
|
# Find the position of the suspicious character in the string
|
||||||
|
@ -395,3 +395,74 @@ def countries_match_regions(row, exclude):
|
|||||||
row[region_column_name] = "||".join(missing_regions)
|
row[region_column_name] = "||".join(missing_regions)
|
||||||
|
|
||||||
return row
|
return row
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_dois(field):
    """Normalize DOIs.

    DOIs are meant to be globally unique identifiers. They are case insensitive,
    but in order to compare them robustly they should be normalized to a common
    format:

        - strip leading and trailing whitespace
        - lowercase all ASCII characters
        - convert all variations to https://doi.org/10.xxxx/xxxx URI format

    The field may hold several DOIs separated by "||"; each one is normalized
    independently and the values are re-joined with the same separator.

    Return string with normalized DOI(s), or None when the field is a missing
    value.

    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Compile the patterns once per call rather than once per value.
    http_scheme = re.compile(r"^http://")
    dx_host = re.compile(r"dx\.doi\.org")
    # Accept both "doi: 10.xxxx" and "doi:10.xxxx" (any whitespace after the
    # colon); requiring exactly one space left the compact form untouched.
    doi_label = re.compile(r"^doi:\s*10\.")
    bare_doi = re.compile(r"^10\.")

    # Holds the normalized values, in their original order
    new_values = []

    # Iterate over all values (most items will only have one DOI)
    for value in field.split("||"):
        # Strip leading and trailing whitespace, then lowercase
        new_value = value.strip().lower()

        # sub() returns the string unchanged when the pattern does not match,
        # so no separate "does it match" check is needed before each step.

        # Convert to HTTPS
        new_value = http_scheme.sub("https://", new_value)

        # Convert dx.doi.org to doi.org
        new_value = dx_host.sub("doi.org", new_value)

        # Replace values like doi: 10.11648/j.jps.20140201.14
        new_value = doi_label.sub("https://doi.org/10.", new_value)

        # Replace values like 10.3390/foods12010115
        new_value = bare_doi.sub("https://doi.org/10.", new_value)

        if new_value != value:
            # Reports the original (pre-normalization) value.
            # NOTE(review): Fore is not imported in the lines visible here;
            # presumably it comes from the module's imports -- confirm.
            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")

        new_values.append(new_value)

    return "||".join(new_values)
|
||||||
|
@ -37,3 +37,6 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
|
|||||||
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
||||||
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
||||||
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
||||||
|
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
|
||||||
|
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
|
||||||
|
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
|
||||||
|
|
908
poetry.lock
generated
908
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
|
|||||||
series_correct = pd.Series(data=d_correct)
|
series_correct = pd.Series(data=d_correct)
|
||||||
|
|
||||||
pd.testing.assert_series_equal(result, series_correct)
|
pd.testing.assert_series_equal(result, series_correct)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_normalize_dois():
    """Test normalizing DOIs written in each of the supported variants."""

    # "doi: " label prefix
    value = "doi: 10.11648/j.jps.20140201.14"

    assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"

    # HTTP scheme together with the legacy dx.doi.org host
    value = "http://dx.doi.org/10.1016/j.envc.2023.100794"

    assert fix.normalize_dois(value) == "https://doi.org/10.1016/j.envc.2023.100794"

    # Bare DOI containing upper-case characters, which should be lowercased
    value = "10.19103/AS.2018.0043.16"

    assert fix.normalize_dois(value) == "https://doi.org/10.19103/as.2018.0043.16"
|
||||||
|
Loading…
Reference in New Issue
Block a user