mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Add check for missing DOIs
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
Sometimes an editor includes a DOI in the citation field, but does not add a standalone DOI field.
This commit is contained in:
parent
831ce979c3
commit
8a27fb2589
@ -186,11 +186,16 @@ def run(argv):
|
||||
# column. For now it will have to do.
|
||||
##
|
||||
|
||||
if args.experimental_checks:
|
||||
# Transpose the DataFrame so we can consider each row as a column
|
||||
df_transposed = df.T
|
||||
# Transpose the DataFrame so we can consider each row as a column
|
||||
df_transposed = df.T
|
||||
|
||||
for column in df_transposed.columns:
|
||||
# Remember, here a "column" is an item (previously row). Perhaps I
|
||||
# should rename column in this for loop...
|
||||
for column in df_transposed.columns:
|
||||
# Check: citation DOI
|
||||
check.citation_doi(df_transposed[column])
|
||||
|
||||
if args.experimental_checks:
|
||||
experimental.correct_language(df_transposed[column])
|
||||
|
||||
# Write
|
||||
|
@ -368,3 +368,45 @@ def mojibake(field, field_name):
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
|
||||
def citation_doi(row):
    """Warn when an item's citation mentions a DOI but no DOI field is set.

    Walks the labels of ``row``. As soon as a populated field whose label
    contains ``doi`` is seen, the item is considered fine and the function
    returns without printing anything. Otherwise the value of the last
    populated field whose label contains ``citation`` or ``Citation`` is
    scanned for a ``doi:`` marker followed by whitespace, or for a ``doi.org``
    URL (doi.org, dx.doi.org, etc). The scan is case-insensitive and works
    across line breaks. A warning is printed when a DOI is found.

    Args:
        row: A pandas Series whose index holds the field names and whose
            values hold one item's metadata.

    Returns:
        None. The only effect is the warning printed to standard output.
    """
    # Assigned inside the loop below and read after it has finished.
    citation = ""

    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        # If a populated DOI field exists we don't need to check the citation
        if "doi" in label:
            return

        # Remember the citation; if several fields match, the last one wins
        if "citation" in label or "Citation" in label:
            # Coerce to text so that a non-string cell (for example a number)
            # cannot make the regular expression search below raise TypeError.
            citation = value if isinstance(value, str) else str(value)

    if not citation:
        return

    # Matches "doi: 10.1186/1743-422X-9-218" style markers as well as DOI URLs.
    # re.search is used instead of an anchored match so that a DOI on a later
    # line of a multi-line citation is still found.
    if re.search(r"doi:\s|doi\.org", citation, flags=re.IGNORECASE):
        print(
            f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}"
        )
|
||||
|
@ -1,35 +1,36 @@
|
||||
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type
|
||||
Leading space,2019-07-29,,,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,
|
||||
Invalid date,2019-07-260,,,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,
|
||||
Invalid language,2019-07-29,,,Span,,,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,
|
||||
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi
|
||||
Leading space,2019-07-29,,,,,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,
|
||||
Invalid date,2019-07-260,,,,,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,
|
||||
Invalid language,2019-07-29,,,Span,,,,,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,
|
||||
Newline (LF),2019-07-30,,,,"TANZA
|
||||
NIA",,,,
|
||||
Missing date,,,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,,,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,
|
||||
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,
|
||||
Duplicate Title,2021-03-17,,,,,,,,Report
|
||||
Duplicate Title,2021-03-17,,,,,,,,Report
|
||||
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report
|
||||
NIA",,,,,,
|
||||
Missing date,,,,,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,,,,,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,
|
||||
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,
|
||||
Duplicate Title,2021-03-17,,,,,,,,Report,,
|
||||
Duplicate Title,2021-03-17,,,,,,,,Report,,
|
||||
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,
|
||||
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218",
|
||||
|
|
@ -367,3 +367,44 @@ def test_check_mojibake(capsys):
|
||||
captured.out
|
||||
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
|
||||
)
|
||||
|
||||
|
||||
def test_check_doi_field():
    """Test an item that has a DOI field as well as a DOI in its citation.

    With the DOI field populated, the check is expected to return None.
    """

    doi = "https://doi.org/10.1186/1743-422X-9-218"
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # Emulate a column in a transposed dataframe (which is just a series), with
    # a populated DOI field and a citation that also mentions the DOI.
    d = {
        "cg.identifier.doi": doi,
        "dcterms.bibliographicCitation": citation,
    }
    series = pd.Series(data=d)

    result = check.citation_doi(series)

    # None is a singleton, so compare by identity rather than equality.
    assert result is None
|
||||
|
||||
|
||||
def test_check_doi_only_in_citation(capsys):
    """Check that a DOI found only in the citation produces a warning."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # Stand-in for one column of the transposed dataframe: a series whose DOI
    # field is missing and whose citation carries the DOI.
    item = pd.Series(
        data={
            "cg.identifier.doi": None,
            "dcterms.bibliographicCitation": citation,
        }
    )

    check.citation_doi(item)

    expected = f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"
    output = capsys.readouterr().out

    assert output == expected
|
||||
|
Loading…
Reference in New Issue
Block a user