2020-01-15 10:41:31 +01:00
|
|
|
|
import pandas as pd
|
|
|
|
|
|
2019-07-27 01:10:13 +02:00
|
|
|
|
import csv_metadata_quality.check as check
|
2019-09-24 17:55:05 +02:00
|
|
|
|
import csv_metadata_quality.experimental as experimental
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
2019-07-28 16:47:28 +02:00
|
|
|
|
|
2019-07-27 01:10:13 +02:00
|
|
|
|
def test_check_invalid_issn(capsys):
    """Ensure check.issn() reports an invalid ISSN on stdout."""
    value = "2321-2302"
    expected = f"Invalid ISSN: {value}\n"

    check.issn(value)

    # Only stdout is inspected; stderr is ignored by this test.
    assert capsys.readouterr().out == expected
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_issn():
    """Ensure check.issn() returns a value equal to a valid ISSN input."""
    value = "0024-9319"

    assert check.issn(value) == value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_isbn(capsys):
    """Ensure check.isbn() reports an invalid ISBN on stdout."""
    value = "99921-58-10-6"
    expected = f"Invalid ISBN: {value}\n"

    check.isbn(value)

    # Only stdout is inspected; stderr is ignored by this test.
    assert capsys.readouterr().out == expected
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_isbn():
    """Ensure check.isbn() returns a value equal to a valid ISBN input."""
    value = "99921-58-10-7"

    assert check.isbn(value) == value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_separators(capsys):
    """Ensure check.separators() flags a single-pipe multi-value separator."""
    value = "Alan|Orth"
    field_name = "dc.contributor.author"
    expected = f"Invalid multi-value separator ({field_name}): {value}\n"

    check.separators(value, field_name)

    # The warning is expected on stdout and must name both field and value.
    assert capsys.readouterr().out == expected
|
2019-07-27 01:10:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_separators():
    """Ensure check.separators() returns a double-pipe separated value as given."""
    value = "Alan||Orth"
    field_name = "dc.contributor.author"

    assert check.separators(value, field_name) == value
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_missing_date(capsys):
    """Ensure check.date() reports a missing date when the value is None."""
    value = None
    field_name = "dc.date.issued"
    expected = f"Missing date ({field_name}).\n"

    check.date(value, field_name)

    # The message names the field only; there is no value to echo back.
    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_multiple_dates(capsys):
    """Ensure check.date() rejects a field holding two '||'-separated dates."""
    value = "1990||1991"
    field_name = "dc.date.issued"
    expected = f"Multiple dates not allowed ({field_name}): {value}\n"

    check.date(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_date(capsys):
    """Ensure check.date() reports a malformed ISO8601 date on stdout."""
    value = "1990-0"
    field_name = "dc.date.issued"
    expected = f"Invalid date ({field_name}): {value}\n"

    check.date(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-28 15:11:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_date():
    """Ensure check.date() returns a value equal to a valid ISO8601 date input."""
    value = "1990"
    field_name = "dc.date.issued"

    assert check.date(value, field_name) == value
|
2019-07-29 16:08:49 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_suspicious_characters(capsys):
    """Ensure check.suspicious_characters() reports a suspicious character."""
    value = "foreˆt"
    field_name = "dc.contributor.author"
    # The expected message shows the offending character together with the
    # character that follows it in the value, not the whole value.
    expected = f"Suspicious character ({field_name}): ˆt\n"

    check.suspicious_characters(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-11 15:36:53 +02:00
|
|
|
|
def test_check_valid_iso639_1_language():
    """Ensure check.language() accepts a valid ISO 639-1 (alpha 2) code."""
    value = "ja"

    assert check.language(value) == value
|
|
|
|
|
|
|
|
|
|
|
2019-09-26 06:44:39 +02:00
|
|
|
|
def test_check_valid_iso639_3_language():
    """Ensure check.language() accepts a valid ISO 639-3 (alpha 3) code."""
    value = "eng"

    assert check.language(value) == value
|
|
|
|
|
|
|
|
|
|
|
2019-09-11 15:36:53 +02:00
|
|
|
|
def test_check_invalid_iso639_1_language(capsys):
    """Ensure check.language() reports an invalid ISO 639-1 (alpha 2) code."""
    value = "jp"
    expected = f"Invalid ISO 639-1 language: {value}\n"

    check.language(value)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
2019-09-26 06:44:39 +02:00
|
|
|
|
def test_check_invalid_iso639_3_language(capsys):
    """Ensure check.language() reports an invalid ISO 639-3 (alpha 3) code."""
    value = "chi"
    expected = f"Invalid ISO 639-3 language: {value}\n"

    check.language(value)

    assert capsys.readouterr().out == expected
|
2019-07-29 17:59:42 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_language(capsys):
    """Ensure check.language() reports a value that is not a 2- or 3-letter code."""
    value = "Span"
    expected = f"Invalid language: {value}\n"

    check.language(value)

    # A four-letter value is expected to yield the generic message rather
    # than one of the ISO 639-1 / 639-3 specific ones.
    assert capsys.readouterr().out == expected
|
2019-07-29 23:30:31 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_invalid_agrovoc(capsys):
    """Ensure check.agrovoc() reports a subject that is not a valid AGROVOC term."""
    value = "FOREST"
    field_name = "dc.subject"
    expected = f"Invalid AGROVOC ({field_name}): {value}\n"

    check.agrovoc(value, field_name)

    assert capsys.readouterr().out == expected
|
2019-07-29 23:30:31 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_valid_agrovoc():
    """Ensure check.agrovoc() returns a value equal to a valid AGROVOC subject."""
    value = "FORESTS"
    field_name = "dc.subject"

    assert check.agrovoc(value, field_name) == value
|
2019-08-10 22:44:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_uncommon_filename_extension(capsys):
    """Ensure check.filename_extension() flags a file with an uncommon extension."""
    value = "file.pdf.lck"
    expected = f"Filename with uncommon extension: {value}\n"

    check.filename_extension(value)

    assert capsys.readouterr().out == expected
|
2019-08-10 22:44:13 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_common_filename_extension():
    """Ensure check.filename_extension() returns a common filename as given."""
    value = "file.pdf"

    assert check.filename_extension(value) == value
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_incorrect_iso_639_1_language(capsys):
    """Ensure a mismatched ISO 639-1 language is reported.

    The item declares its language as "es" while its title is in English;
    experimental.correct_language() is expected to print a warning naming
    "en" as the detected language.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "es"
    expected = f"Possibly incorrect language {language} (detected en): {title}\n"

    # Build a single metadata row as a Pandas series, keyed by field name.
    item = pd.Series({"dc.title": title, "dc.language.iso": language})

    experimental.correct_language(item)

    assert capsys.readouterr().out == expected
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_incorrect_iso_639_3_language(capsys):
    """Ensure a mismatched ISO 639-3 language is reported.

    The item declares its language as "spa" while its title is in English;
    experimental.correct_language() is expected to print a warning naming
    "eng" as the detected language.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "spa"
    expected = f"Possibly incorrect language {language} (detected eng): {title}\n"

    # Build a single metadata row as a Pandas series, keyed by field name.
    item = pd.Series({"dc.title": title, "dc.language.iso": language})

    experimental.correct_language(item)

    assert capsys.readouterr().out == expected
|
2019-09-24 17:55:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_correct_iso_639_1_language():
    """Ensure a matching ISO 639-1 language is returned.

    The item declares its language as "en" and its title is in English, so
    experimental.correct_language() is expected to return a value equal to
    the declared language.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "en"

    # Build a single metadata row as a Pandas series, keyed by field name.
    item = pd.Series({"dc.title": title, "dc.language.iso": language})

    assert experimental.correct_language(item) == language
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_check_correct_iso_639_3_language():
    """Ensure a matching ISO 639-3 language is returned.

    The item declares its language as "eng" and its title is in English, so
    experimental.correct_language() is expected to return a value equal to
    the declared language.
    """
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "eng"

    # Build a single metadata row as a Pandas series, keyed by field name.
    item = pd.Series({"dc.title": title, "dc.language.iso": language})

    assert experimental.correct_language(item) == language
|