2021-03-19 15:04:13 +01:00
|
|
|
|
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
|
|
2020-01-15 10:41:31 +01:00
|
|
|
|
import pandas as pd
|
2021-02-21 12:01:25 +01:00
|
|
|
|
from colorama import Fore
|
2020-01-15 10:41:31 +01:00
|
|
|
|
|
2019-07-27 01:10:13 +02:00
|
|
|
|
import csv_metadata_quality.check as check
|
2019-09-24 17:55:05 +02:00
|
|
|
|
import csv_metadata_quality.experimental as experimental
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
2019-07-28 16:47:28 +02:00
|
|
|
|
|
2019-07-27 01:10:13 +02:00
|
|
|
|
def test_check_invalid_issn(capsys):
    """Verify that an invalid ISSN is reported in red on stdout."""
    value = "2321-2302"
    expected = f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}\n"

    check.issn(value)

    # The test inspects what was printed, not the return value.
    assert capsys.readouterr().out == expected
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_issn():
    """Verify that the ISSN check returns None for a valid ISSN."""
    assert check.issn("0024-9319") is None
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_isbn(capsys):
    """Verify that an invalid ISBN is reported in red on stdout."""
    value = "99921-58-10-6"
    expected = f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}\n"

    check.isbn(value)

    # The test inspects what was printed, not the return value.
    assert capsys.readouterr().out == expected
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_isbn():
    """Verify that the ISBN check returns None for a valid ISBN."""
    assert check.isbn("99921-58-10-7") is None
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
2019-07-28 15:11:36 +02:00
|
|
|
|
def test_check_missing_date(capsys):
    """Verify that a missing (None) date is reported for the named field."""
    field_name = "dc.date.issued"
    expected = f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}\n"

    # None stands in for an empty date cell.
    check.date(None, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_multiple_dates(capsys):
    """Verify that a "||"-separated pair of dates is rejected."""
    value = "1990||1991"
    field_name = "dc.date.issued"
    expected = f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{value}\n"

    check.date(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_date(capsys):
    """Verify that a date that is not valid ISO 8601 is reported."""
    value = "1990-0"
    field_name = "dc.date.issued"
    expected = f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{value}\n"

    check.date(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_date():
    """Verify that the date check returns None for a valid ISO 8601 date."""
    assert check.date("1990", "dc.date.issued") is None
|
2019-07-29 16:08:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_suspicious_characters(capsys):
    """Verify that a suspicious character in a value triggers a warning."""
    value = "foreˆt"
    field_name = "dc.contributor.author"
    # The expected message shows only the tail of the value, starting at the
    # suspicious character, rather than the whole string.
    expected = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}ˆt\n"

    check.suspicious_characters(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-11 15:36:53 +02:00
|
|
|
|
def test_check_valid_iso639_1_language():
    """Verify that a valid two-letter (ISO 639-1) code returns None."""
    assert check.language("ja") is None
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-26 06:44:39 +02:00
|
|
|
|
def test_check_valid_iso639_3_language():
    """Verify that a valid three-letter (ISO 639-3) code returns None."""
    assert check.language("eng") is None
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-11 15:36:53 +02:00
|
|
|
|
def test_check_invalid_iso639_1_language(capsys):
    """Verify that an unknown two-letter code is flagged as invalid ISO 639-1."""
    value = "jp"
    expected = f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}\n"

    check.language(value)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-26 06:44:39 +02:00
|
|
|
|
def test_check_invalid_iso639_3_language(capsys):
    """Verify that an unknown three-letter code is flagged as invalid ISO 639-3."""
    value = "chi"
    expected = f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}\n"

    check.language(value)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_language(capsys):
    """Verify that a value that is neither 2 nor 3 letters is flagged generically."""
    value = "Span"
    expected = f"{Fore.RED}Invalid language: {Fore.RESET}{value}\n"

    check.language(value)

    assert capsys.readouterr().out == expected
|
2019-07-29 23:30:31 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_agrovoc(capsys):
    """Verify invalid AGROVOC reporting when dropping is disabled.

    The invalid term must be reported, and the value must come back
    unchanged because invalid terms are *not* dropped.
    """
    valid_agrovoc = "LIVESTOCK"
    invalid_agrovoc = "FOREST"
    field_name = "dcterms.subject"
    # Multi-value cell: one valid and one invalid term joined by "||".
    value = f"{valid_agrovoc}||{invalid_agrovoc}"
    expected = f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"

    # Third positional argument is the "drop" flag; False keeps invalid terms.
    returned = check.agrovoc(value, field_name, False)

    assert capsys.readouterr().out == expected
    assert returned == value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_agrovoc_dropped(capsys):
    """Verify invalid AGROVOC handling when dropping is enabled.

    The invalid term must be reported as dropped, and only the valid term
    must remain in the returned value.
    """
    valid_agrovoc = "LIVESTOCK"
    invalid_agrovoc = "FOREST"
    field_name = "dcterms.subject"
    # Multi-value cell: one valid and one invalid term joined by "||".
    value = f"{valid_agrovoc}||{invalid_agrovoc}"
    expected = f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"

    # Third positional argument is the "drop" flag; True removes invalid terms.
    returned = check.agrovoc(value, field_name, True)

    assert capsys.readouterr().out == expected
    assert returned == valid_agrovoc
|
2019-07-29 23:30:31 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_agrovoc():
    """Verify that a valid AGROVOC term is returned unchanged."""
    # Third positional argument is the "drop" flag, disabled here.
    assert check.agrovoc("FORESTS", "dcterms.subject", False) == "FORESTS"
|
2019-08-10 22:44:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_uncommon_filename_extension(capsys):
    """Verify that a filename with an uncommon extension triggers a warning."""
    value = "file.pdf.lck"
    expected = f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}\n"

    check.filename_extension(value)

    assert capsys.readouterr().out == expected
|
2019-08-10 22:44:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_common_filename_extension():
    """Verify that a filename with a common extension returns None."""
    assert check.filename_extension("file.pdf") is None
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_incorrect_iso_639_1_language(capsys):
    """Verify detection of a wrong ISO 639-1 language code.

    The item's declared language is compared with the language detected
    from its title; a mismatch must produce a warning on stdout.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "es"
    expected = f"{Fore.YELLOW}Possibly incorrect language {language} (detected en): {Fore.RESET}{title}\n"

    # A Series built from a dict stands in for one metadata row.
    series = pd.Series({"dc.title": title, "dc.language.iso": language})

    # Empty exclude list: no fields are skipped.
    experimental.correct_language(series, [])

    assert capsys.readouterr().out == expected
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_incorrect_iso_639_3_language(capsys):
    """Verify detection of a wrong ISO 639-3 language code.

    The item's declared language is compared with the language detected
    from its title; a mismatch must produce a warning on stdout.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "spa"
    expected = f"{Fore.YELLOW}Possibly incorrect language {language} (detected eng): {Fore.RESET}{title}\n"

    # A Series built from a dict stands in for one metadata row.
    series = pd.Series({"dc.title": title, "dc.language.iso": language})

    # Empty exclude list: no fields are skipped.
    experimental.correct_language(series, [])

    assert capsys.readouterr().out == expected
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_correct_iso_639_1_language():
    """Verify that a matching ISO 639-1 language code yields None.

    The item's declared language agrees with the language detected from its
    title, so the check is expected to return None.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"

    # A Series built from a dict stands in for one metadata row.
    series = pd.Series({"dc.title": title, "dc.language.iso": "en"})

    # Empty exclude list: no fields are skipped.
    assert experimental.correct_language(series, []) is None
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_correct_iso_639_3_language():
    """Verify that a matching ISO 639-3 language code yields None.

    The item's declared language agrees with the language detected from its
    title, so the check is expected to return None.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"

    # A Series built from a dict stands in for one metadata row.
    series = pd.Series({"dc.title": title, "dc.language.iso": "eng"})

    # Empty exclude list: no fields are skipped.
    assert experimental.correct_language(series, []) is None
|
2021-03-11 09:36:26 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_spdx_license_identifier():
    """Verify that a valid SPDX license identifier returns None."""
    assert check.spdx_license_identifier("CC-BY-SA-4.0") is None
|
2021-03-11 09:36:26 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_spdx_license_identifier(capsys):
    """Verify that a non-SPDX license identifier triggers a warning."""
    license = "CC-BY-SA"
    expected = f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license}\n"

    check.spdx_license_identifier(license)

    assert capsys.readouterr().out == expected
|
2021-03-17 08:54:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_duplicate_item(capsys):
    """Verify that two rows sharing title, type, and date are flagged."""
    item_title = "Title"
    expected = f"{Fore.YELLOW}Possible duplicate (dc.title): {Fore.RESET}{item_title}\n"

    # Two identical rows: every column repeats the same value twice.
    frame = pd.DataFrame(
        {
            "dc.title": [item_title] * 2,
            "dcterms.type": ["Report"] * 2,
            "dcterms.issued": ["2021-03-17"] * 2,
        }
    )

    check.duplicate_items(frame)

    assert capsys.readouterr().out == expected
|
2021-03-19 09:28:33 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_no_mojibake():
    """Verify that a correctly encoded string returns None from the mojibake check."""
    assert check.mojibake("CIAT Publicaçao", "dcterms.isPartOf") is None
|
2021-03-19 09:28:33 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_mojibake(capsys):
    """Test that a string containing mojibake triggers an encoding warning.

    The input here must differ from the clean string used in
    ``test_check_no_mojibake``; otherwise the two tests would make opposite
    assertions about the very same value and could never both pass.
    """
    # "ç" (U+00E7) encoded as UTF-8 and wrongly decoded as Latin-1 becomes
    # "Ã§" (U+00C3 U+00A7). Escapes are used so the deliberately broken
    # sequence is not silently "repaired" by editors or encoding fixers.
    field = "CIAT Publica\u00c3\u00a7ao"
    field_name = "dcterms.isPartOf"

    check.mojibake(field, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
    )
|
2021-10-06 20:25:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_doi_field():
    """Verify that an item with both a DOI field and a DOI in its citation returns None."""
    # One item as a Series: the DOI field is populated and the citation
    # also mentions the DOI.
    series = pd.Series(
        {
            "cg.identifier.doi": "https://doi.org/10.1186/1743-422X-9-218",
            "dcterms.bibliographicCitation": "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218",
        }
    )

    # Empty exclude list: no fields are skipped.
    assert check.citation_doi(series, []) is None
|
2021-10-06 20:25:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_doi_only_in_citation(capsys):
    """Verify the warning when a citation contains a DOI but the DOI field is empty."""
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
    expected = f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"

    # One item as a Series: the DOI field is None while the citation
    # still carries a DOI.
    series = pd.Series(
        {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
    )

    # Empty exclude list: no fields are skipped.
    check.citation_doi(series, [])

    assert capsys.readouterr().out == expected
|
2021-12-05 15:01:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_title_in_citation():
    """Verify that an item whose citation contains its title returns None."""
    # One item as a Series holding the title and a citation that includes it.
    series = pd.Series(
        {
            "dc.title": "Testing all the things",
            "dcterms.bibliographicCitation": "Orth, A. 2021. Testing all the things.",
        }
    )

    # Empty exclude list: no fields are skipped.
    assert check.title_in_citation(series, []) is None
|
2021-12-05 15:01:11 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_title_not_in_citation(capsys):
    """Verify the warning when an item's title does not appear in its citation."""
    title = "Testing all the things"
    expected = f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}\n"

    # The citation deliberately misspells "the" as "teh", so the title is
    # not a substring of it.
    series = pd.Series(
        {
            "dc.title": title,
            "dcterms.bibliographicCitation": "Orth, A. 2021. Testing all teh things.",
        }
    )

    # Empty exclude list: no fields are skipped.
    check.title_in_citation(series, [])

    assert capsys.readouterr().out == expected
|
2021-12-08 14:18:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_country_matches_region():
    """Verify that an item whose region matches its country returns None."""
    # One item as a Series with a country and its corresponding region.
    series = pd.Series(
        {"cg.coverage.country": "Kenya", "cg.coverage.region": "Eastern Africa"}
    )

    # Empty exclude list: no fields are skipped.
    assert check.countries_match_regions(series, []) is None
|
2021-12-08 14:18:50 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_country_not_matching_region(capsys):
    """Verify the warning when an item has a country but lacks its region."""
    title = "Testing an item with no matching region."
    country = "Kenya"
    missing_region = "Eastern Africa"
    expected = f"{Fore.YELLOW}Missing region ({country} → {missing_region}): {Fore.RESET}{title}\n"

    # One item as a Series; the region is left empty so the check must
    # report the region that the country implies.
    series = pd.Series(
        {
            "dc.title": title,
            "cg.coverage.country": country,
            "cg.coverage.region": "",
        }
    )

    # Empty exclude list: no fields are skipped.
    check.countries_match_regions(series, [])

    assert capsys.readouterr().out == expected
|