From 8a27fb2589b26544bf5884339cfd043a648a4bf2 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 6 Oct 2021 21:25:39 +0300 Subject: [PATCH] Add check for missing DOIs Sometimes an editor includes a DOI in the citation field, but does not add a standalone DOI field. --- csv_metadata_quality/app.py | 13 +++++-- csv_metadata_quality/check.py | 42 +++++++++++++++++++++ data/test.csv | 69 ++++++++++++++++++----------------- tests/test_check.py | 41 +++++++++++++++++++++ 4 files changed, 127 insertions(+), 38 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 82e6e1c..715921c 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -186,11 +186,16 @@ def run(argv): # column. For now it will have to do. ## - if args.experimental_checks: - # Transpose the DataFrame so we can consider each row as a column - df_transposed = df.T + # Transpose the DataFrame so we can consider each row as a column + df_transposed = df.T - for column in df_transposed.columns: + # Remember, here a "column" is an item (previously row). Perhaps I + # should rename column in this for loop... + for column in df_transposed.columns: + # Check: citation DOI + check.citation_doi(df_transposed[column]) + + if args.experimental_checks: experimental.correct_language(df_transposed[column]) # Write diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 91ea225..00dcad0 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -368,3 +368,45 @@ def mojibake(field, field_name): ) return + + +def citation_doi(row): + """Check for the scenario where an item has a DOI listed in its citation, + but does not have a cg.identifier.doi field. + + Function prints a warning if the DOI field is missing, but there is a DOI + in the citation. + """ + # Initialize the citation at function scope so that it is always defined + # after the loop below, even when the row has no citation field. 
+ citation = "" + + # Iterate over the labels of the current row's values to check if a DOI + # exists. If not, then we extract the citation to see if there is a DOI + # listed there. + for label in row.axes[0]: + # Skip fields with missing values + if pd.isna(row[label]): + continue + + # If a DOI field exists we don't need to check the citation + match = re.match(r"^.*?doi.*$", label) + if match is not None: + return + + # Get the name of the citation field + match = re.match(r"^.*?[cC]itation.*$", label) + if match is not None: + citation = row[label] + + if citation != "": + # Check the citation for "doi: 10.1186/1743-422X-9-218" + doi_match1 = re.match(r"^.*?doi:\s.*$", citation) + # Check the citation for a DOI URL (doi.org, dx.doi.org, etc) + doi_match2 = re.match(r"^.*?doi\.org.*$", citation) + if doi_match1 is not None or doi_match2 is not None: + print( + f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}" + ) + + return diff --git a/data/test.csv b/data/test.csv index 926634b..69206f0 100644 --- a/data/test.csv +++ b/data/test.csv @@ -1,35 +1,36 @@ -dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type - Leading space,2019-07-29,,,,,,,, -Trailing space ,2019-07-29,,,,,,,, -Excessive space,2019-07-29,,,,,,,, -Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,, -Duplicate||Duplicate,2019-07-29,,,,,,,, -Invalid ISSN,2019-07-29,2321-2302,,,,,,, -Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,, -Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,, -Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,, -Invalid date,2019-07-260,,,,,,,, -Multiple dates,2019-07-26||2019-01-10,,,,,,,, -Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,, -Unnecessary Unicode​,2019-07-29,,,,,,,, -Suspicious character||foreˆt,2019-07-29,,,,,,,, -Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,, -Invalid 
ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,, -Invalid language,2019-07-29,,,Span,,,,, -Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,, +dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license,dcterms.type,dcterms.bibliographicCitation,cg.identifier.doi + Leading space,2019-07-29,,,,,,,,,, +Trailing space ,2019-07-29,,,,,,,,,, +Excessive space,2019-07-29,,,,,,,,,, +Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,,,, +Duplicate||Duplicate,2019-07-29,,,,,,,,,, +Invalid ISSN,2019-07-29,2321-2302,,,,,,,,, +Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,,,, +Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,,,, +Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,,,, +Invalid date,2019-07-260,,,,,,,,,, +Multiple dates,2019-07-26||2019-01-10,,,,,,,,,, +Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,,,, +Unnecessary Unicode​,2019-07-29,,,,,,,,,, +Suspicious character||foreˆt,2019-07-29,,,,,,,,,, +Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,, +Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,, +Invalid language,2019-07-29,,,Span,,,,,,, +Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,, Newline (LF),2019-07-30,,,,"TANZA -NIA",,,, -Missing date,,,,,,,,, -Invalid country,2019-08-01,,,,,KENYAA,,, -Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,, -Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,, -"Missing space,after comma",2019-08-27,,,,,,,, -Incorrect ISO 639-1 language,2019-09-26,,,es,,,,, -Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,, -Composéd Unicode,2020-01-14,,,,,,,, -Decomposéd Unicode,2020-01-14,,,,,,,, -Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,, -Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY, -Duplicate Title,2021-03-17,,,,,,,,Report -Duplicate Title,2021-03-17,,,,,,,,Report -Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report 
+NIA",,,,,, +Missing date,,,,,,,,,,, +Invalid country,2019-08-01,,,,,KENYAA,,,,, +Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,,,, +Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,,,, +"Missing space,after comma",2019-08-27,,,,,,,,,, +Incorrect ISO 639-1 language,2019-09-26,,,es,,,,,,, +Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,,,, +Composéd Unicode,2020-01-14,,,,,,,,,, +Decomposéd Unicode,2020-01-14,,,,,,,,,, +Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,,, +Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,,, +Duplicate Title,2021-03-17,,,,,,,,Report,, +Duplicate Title,2021-03-17,,,,,,,,Report,, +Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,, +"DOI in citation, but missing cg.identifier.doi",2021-10-06,,,,,,,,,"Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218", diff --git a/tests/test_check.py b/tests/test_check.py index 485abee..375f8ad 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -367,3 +367,44 @@ def test_check_mojibake(capsys): captured.out == f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n" ) + + +def test_check_doi_field(): + """Test an item with a DOI field.""" + + doi = "https://doi.org/10.1186/1743-422X-9-218" + citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218" + + # Emulate a column in a transposed dataframe (which is just a series), with + # the citation and an empty DOI field. + d = { + "cg.identifier.doi": doi, + "dcterms.bibliographicCitation": citation + } + series = pd.Series(data=d) + + result = check.citation_doi(series) + + assert result == None + + +def test_check_doi_only_in_citation(capsys): + """Test an item with a DOI in its citation, but no DOI field.""" + + citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218" + + # Emulate a column in a transposed dataframe (which is just a series), with + # an empty DOI field and a citation containing a DOI. 
+ d = { + "cg.identifier.doi": None, + "dcterms.bibliographicCitation": citation + } + series = pd.Series(data=d) + + check.citation_doi(series) + + captured = capsys.readouterr() + assert ( + captured.out + == f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n" + )