1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-07-15 09:11:19 +02:00

Add check for missing DOIs
All checks were successful
continuous-integration/drone/push Build is passing

Sometimes an editor includes a DOI in the citation field, but does
not add a standalone DOI field.
This commit is contained in:
Alan Orth 2021-10-06 21:25:39 +03:00
parent 831ce979c3
commit 8a27fb2589
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 127 additions and 38 deletions

View File

@ -186,11 +186,16 @@ def run(argv):
# column. For now it will have to do.
##
if args.experimental_checks:
# Transpose the DataFrame so we can consider each row as a column
df_transposed = df.T
# Transpose the DataFrame so we can consider each row as a column
df_transposed = df.T
for column in df_transposed.columns:
# Remember, here a "column" is an item (previously row). Perhaps I
# should rename column in this for loop...
for column in df_transposed.columns:
# Check: citation DOI
check.citation_doi(df_transposed[column])
if args.experimental_checks:
experimental.correct_language(df_transposed[column])
# Write

View File

@ -368,3 +368,45 @@ def mojibake(field, field_name):
)
return
def citation_doi(row):
    """Warn when an item's citation mentions a DOI but the item has no DOI field.

    A warning is printed when no populated field has "doi" in its name, yet a
    populated citation field contains either "doi: " or a doi.org URL.

    Args:
        row: a pandas Series holding one item's values, indexed by field
            name (for example one column of a transposed DataFrame).

    Returns:
        None in every case; the only effect is the optional printed warning.
    """
    # Function-local holder for the citation text, filled in by the loop
    # below and inspected after the loop has seen every field.
    citation = ""

    # Iterate over the labels of the current row's values to check if a DOI
    # exists. If not, then we extract the citation to see if there is a DOI
    # listed there.
    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        # Compare field names case-insensitively so that "DOI" or
        # "Citation" style headers are recognized as well.
        field_name = str(label).lower()

        # If a DOI field exists we don't need to check the citation
        if "doi" in field_name:
            return

        # Remember the value of the citation field
        if "citation" in field_name:
            citation = value

    # A citation that is not text (for example a purely numeric cell) cannot
    # contain a DOI, and passing it to the regex engine would raise TypeError.
    if not isinstance(citation, str) or citation == "":
        return

    # Look for "doi: 10.1186/1743-422X-9-218" or a DOI URL (doi.org,
    # dx.doi.org, etc). re.search is used instead of an anchored re.match so
    # that a citation containing line breaks is still scanned completely, and
    # the match ignores case because "DOI:" is a common spelling.
    if re.search(r"doi:\s|doi\.org", citation, flags=re.IGNORECASE) is not None:
        print(
            f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}"
        )

    return

View File

@ -1,35 +1,36 @@
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type
Leading space,2019-07-29,,,,,,,,
Trailing space ,2019-07-29,,,,,,,,
Excessive space,2019-07-29,,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,
Invalid date,2019-07-260,,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,
Invalid language,2019-07-29,,,Span,,,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi
Leading space,2019-07-29,,,,,,,,,,
Trailing space ,2019-07-29,,,,,,,,,,
Excessive space,2019-07-29,,,,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,,
Invalid date,2019-07-260,,,,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,
Newline (LF),2019-07-30,,,,"TANZA
NIA",,,,
Missing date,,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,
"Missing space,after comma",2019-08-27,,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,
Composéd Unicode,2020-01-14,,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,
Duplicate Title,2021-03-17,,,,,,,,Report
Duplicate Title,2021-03-17,,,,,,,,Report
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report
NIA",,,,,,
Missing date,,,,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,,,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,,
"Missing space,after comma",2019-08-27,,,,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,,
Composéd Unicode,2020-01-14,,,,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,,
Duplicate Title,2021-03-17,,,,,,,,Report,,
Duplicate Title,2021-03-17,,,,,,,,Report,,
Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,
"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218",

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 2019-07-29 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA
23 Uncommon filename extension 2019-08-10 file.pdf.lck
24 Unneccesary unicode (U+002D + U+00AD) 2019-08-10 978-­92-­9043-­823-­6
25 Missing space,after comma 2019-08-27
26 Incorrect ISO 639-1 language 2019-09-26 es
27 Incorrect ISO 639-3 language 2019-09-26 spa
28 Composéd Unicode 2020-01-14
29 Decomposéd Unicode 2020-01-14
30 Unnecessary multi-value separator 2021-01-03 0378-5955||
31 Invalid SPDX license identifier 2021-03-11 CC-BY
32 Duplicate Title 2021-03-17 Report
33 Duplicate Title 2021-03-17 Report
34 Mojibake 2021-03-18 Publicaçao CIAT Report
35 DOI in citation, but missing cg.identifier.doi 2021-10-06 Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218
36

View File

@ -367,3 +367,44 @@ def test_check_mojibake(capsys):
captured.out
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
)
def test_check_doi_field():
    """Test an item that has both a DOI field and a DOI in its citation."""

    doi = "https://doi.org/10.1186/1743-422X-9-218"
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # Emulate a column in a transposed dataframe (which is just a series), with
    # a populated DOI field and a citation that mentions the same DOI.
    d = {
        "cg.identifier.doi": doi,
        "dcterms.bibliographicCitation": citation,
    }

    series = pd.Series(data=d)

    result = check.citation_doi(series)

    # None is a singleton, so compare by identity rather than equality.
    assert result is None
def test_check_doi_only_in_citation(capsys):
    """A DOI that appears only in the citation should produce a warning."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # A series stands in for one column of a transposed dataframe: the DOI
    # field is present but empty, while the citation mentions a DOI.
    item = pd.Series(
        data={
            "cg.identifier.doi": None,
            "dcterms.bibliographicCitation": citation,
        }
    )

    check.citation_doi(item)

    # Everything the check printed to stdout must be exactly the warning line.
    printed = capsys.readouterr().out
    expected = f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"
    assert printed == expected