mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-25 15:18:19 +01:00
Add check for missing DOIs
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
Sometimes an editor includes a DOI in the citation field, but does not add a standalone DOI field.
This commit is contained in:
parent
831ce979c3
commit
8a27fb2589
@ -186,11 +186,16 @@ def run(argv):
|
|||||||
# column. For now it will have to do.
|
# column. For now it will have to do.
|
||||||
##
|
##
|
||||||
|
|
||||||
if args.experimental_checks:
|
# Transpose the DataFrame so we can consider each row as a column
|
||||||
# Transpose the DataFrame so we can consider each row as a column
|
df_transposed = df.T
|
||||||
df_transposed = df.T
|
|
||||||
|
|
||||||
for column in df_transposed.columns:
|
# Remember, here a "column" is an item (previously row). Perhaps I
|
||||||
|
# should rename column in this for loop...
|
||||||
|
for column in df_transposed.columns:
|
||||||
|
# Check: citation DOI
|
||||||
|
check.citation_doi(df_transposed[column])
|
||||||
|
|
||||||
|
if args.experimental_checks:
|
||||||
experimental.correct_language(df_transposed[column])
|
experimental.correct_language(df_transposed[column])
|
||||||
|
|
||||||
# Write
|
# Write
|
||||||
|
@ -368,3 +368,45 @@ def mojibake(field, field_name):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def citation_doi(row):
    """Warn when an item's citation mentions a DOI but the item has no DOI field.

    Args:
        row: a pandas Series holding the metadata of one item; its index
            labels are the field names (for example one column of a
            transposed DataFrame).

    Returns:
        None. A warning is printed to standard output when a populated
        citation field mentions a DOI and no populated field has "doi" in
        its name.
    """
    # Declared before the loop so the value is still available after it.
    citation = ""

    # Walk the field names of this item. A populated DOI field ends the check
    # immediately; otherwise remember the citation so it can be inspected
    # once all fields have been seen.
    for label in row.axes[0]:
        # Skip fields with missing values
        if pd.isna(row[label]):
            continue

        # Labels are normally strings, but coerce defensively so a numeric
        # label cannot make the regular expression call raise TypeError.
        field_name = str(label)

        # If a DOI field exists we don't need to check the citation. The
        # match ignores case so a column named "DOI" counts as well.
        if re.search(r"doi", field_name, re.IGNORECASE):
            return

        # Remember the value of the citation field
        if re.search(r"[cC]itation", field_name):
            # Coerce for the same reason as the label: a non-string value
            # would otherwise raise TypeError in the DOI search below.
            citation = str(row[label])

    if citation == "":
        return

    # Look for "doi: 10.1186/1743-422X-9-218" or a DOI URL (doi.org,
    # dx.doi.org, etc). Searching case-insensitively, and with re.search
    # rather than an anchored re.match, also catches "DOI:" spellings and a
    # DOI that appears after a line break inside the citation.
    if re.search(r"doi:\s|doi\.org", citation, re.IGNORECASE):
        print(
            f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}"
        )

    return
|
||||||
|
@ -1,35 +1,36 @@
|
|||||||
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type
|
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi
|
||||||
Leading space,2019-07-29,,,,,,,,
|
Leading space,2019-07-29,,,,,,,,,,
|
||||||
Trailing space ,2019-07-29,,,,,,,,
|
Trailing space ,2019-07-29,,,,,,,,,,
|
||||||
Excessive space,2019-07-29,,,,,,,,
|
Excessive space,2019-07-29,,,,,,,,,,
|
||||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,
|
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,
|
||||||
Duplicate||Duplicate,2019-07-29,,,,,,,,
|
Duplicate||Duplicate,2019-07-29,,,,,,,,,,
|
||||||
Invalid ISSN,2019-07-29,2321-2302,,,,,,,
|
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,
|
||||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,
|
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,
|
||||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,
|
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,
|
||||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,
|
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,
|
||||||
Invalid date,2019-07-260,,,,,,,,
|
Invalid date,2019-07-260,,,,,,,,,,
|
||||||
Multiple dates,2019-07-26||2019-01-10,,,,,,,,
|
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,
|
||||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,
|
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,
|
||||||
Unnecessary Unicode,2019-07-29,,,,,,,,
|
Unnecessary Unicode,2019-07-29,,,,,,,,,,
|
||||||
Suspicious character||foreˆt,2019-07-29,,,,,,,,
|
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,
|
||||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,
|
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,
|
||||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,
|
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,
|
||||||
Invalid language,2019-07-29,,,Span,,,,,
|
Invalid language,2019-07-29,,,Span,,,,,,,
|
||||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,
|
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,
|
||||||
Newline (LF),2019-07-30,,,,"TANZA
|
Newline (LF),2019-07-30,,,,"TANZA
|
||||||
NIA",,,,
|
NIA",,,,,,
|
||||||
Missing date,,,,,,,,,
|
Missing date,,,,,,,,,,,
|
||||||
Invalid country,2019-08-01,,,,,KENYAA,,,
|
Invalid country,2019-08-01,,,,,KENYAA,,,,,
|
||||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,
|
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,
|
||||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,,
|
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,,,,
|
||||||
"Missing space,after comma",2019-08-27,,,,,,,,
|
"Missing space,after comma",2019-08-27,,,,,,,,,,
|
||||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,
|
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,
|
||||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,
|
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,
|
||||||
Composéd Unicode,2020-01-14,,,,,,,,
|
Composéd Unicode,2020-01-14,,,,,,,,,,
|
||||||
Decomposéd Unicode,2020-01-14,,,,,,,,
|
Decomposéd Unicode,2020-01-14,,,,,,,,,,
|
||||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,
|
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,
|
||||||
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,
|
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,
|
||||||
Duplicate Title,2021-03-17,,,,,,,,Report
|
Duplicate Title,2021-03-17,,,,,,,,Report,,
|
||||||
Duplicate Title,2021-03-17,,,,,,,,Report
|
Duplicate Title,2021-03-17,,,,,,,,Report,,
|
||||||
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report
|
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,
|
||||||
|
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218",
|
||||||
|
|
@ -367,3 +367,44 @@ def test_check_mojibake(capsys):
|
|||||||
captured.out
|
captured.out
|
||||||
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
|
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_doi_field():
    """Test that an item with a populated DOI field produces no warning."""
    # Imported locally so the test's signature stays unchanged.
    import contextlib
    import io

    doi = "https://doi.org/10.1186/1743-422X-9-218"
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # Emulate a column in a transposed dataframe (which is just a series), with
    # both a populated DOI field and a citation that mentions the DOI.
    d = {
        "cg.identifier.doi": doi,
        "dcterms.bibliographicCitation": citation,
    }
    series = pd.Series(data=d)

    # The check reports problems by printing, so capture standard output:
    # asserting on the return value alone could never fail.
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        result = check.citation_doi(series)

    assert result is None
    assert buffer.getvalue() == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_doi_only_in_citation(capsys):
    """Check the warning for a citation that holds a DOI while the DOI field is empty."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
    expected = f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"

    # A Series stands in for one column of the transposed dataframe: the DOI
    # field is present but empty, and only the citation mentions the DOI.
    item = pd.Series(
        data={
            "cg.identifier.doi": None,
            "dcterms.bibliographicCitation": citation,
        }
    )

    check.citation_doi(item)

    assert capsys.readouterr().out == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user