# SPDX-License-Identifier: GPL-3.0-only
"""Tests for the ``check`` and ``experimental`` helpers of csv_metadata_quality.

Most tests either assert that a helper returns ``None`` for a valid value, or
capture stdout with pytest's ``capsys`` fixture and compare it with the exact
colourised message the helper is expected to print.
"""

import pandas as pd
from colorama import Fore

import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental

# English-language title shared by the language-detection tests below.
_ENGLISH_TITLE = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"


def test_check_invalid_issn(capsys):
    """Test checking invalid ISSN."""

    value = "2321-2302"

    check.issn(value)

    captured = capsys.readouterr()
    assert captured.out == f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}\n"


def test_check_valid_issn():
    """Test checking valid ISSN."""

    value = "0024-9319"

    result = check.issn(value)

    assert result is None


def test_check_invalid_isbn(capsys):
    """Test checking invalid ISBN."""

    value = "99921-58-10-6"

    check.isbn(value)

    captured = capsys.readouterr()
    assert captured.out == f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}\n"


def test_check_valid_isbn():
    """Test checking valid ISBN."""

    value = "99921-58-10-7"

    result = check.isbn(value)

    assert result is None


def test_check_missing_date(capsys):
    """Test checking missing date."""

    value = None
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}\n"


def test_check_multiple_dates(capsys):
    """Test checking multiple dates."""

    # "||" separates the two date values inside the single field.
    value = "1990||1991"
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{value}\n"
    )


def test_check_invalid_date(capsys):
    """Test checking invalid ISO8601 date."""

    value = "1990-0"
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out == f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{value}\n"
    )


def test_check_valid_date():
    """Test checking valid ISO8601 date."""

    value = "1990"
    field_name = "dc.date.issued"

    result = check.date(value, field_name)

    assert result is None


def test_check_suspicious_characters(capsys):
    """Test checking for suspicious characters."""

    # The value contains a stand-alone circumflex (U+02C6) rather than "ê".
    value = "foreˆt"
    field_name = "dc.contributor.author"

    check.suspicious_characters(value, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}ˆt\n"
    )


def test_check_valid_iso639_1_language():
    """Test valid ISO 639-1 (alpha 2) language."""

    value = "ja"

    result = check.language(value)

    assert result is None


def test_check_valid_iso639_3_language():
    """Test valid ISO 639-3 (alpha 3) language."""

    value = "eng"

    result = check.language(value)

    assert result is None


def test_check_invalid_iso639_1_language(capsys):
    """Test invalid ISO 639-1 (alpha 2) language."""

    value = "jp"

    check.language(value)

    captured = capsys.readouterr()
    assert (
        captured.out == f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}\n"
    )


def test_check_invalid_iso639_3_language(capsys):
    """Test invalid ISO 639-3 (alpha 3) language."""

    value = "chi"

    check.language(value)

    captured = capsys.readouterr()
    assert (
        captured.out == f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}\n"
    )


def test_check_invalid_language(capsys):
    """Test invalid language."""

    value = "Span"

    check.language(value)

    captured = capsys.readouterr()
    assert captured.out == f"{Fore.RED}Invalid language: {Fore.RESET}{value}\n"


def test_check_invalid_agrovoc(capsys):
    """Test invalid AGROVOC subject.

    Invalid values *will not* be dropped, so the returned value must equal
    the input.
    """

    valid_agrovoc = "LIVESTOCK"
    invalid_agrovoc = "FOREST"
    value = f"{valid_agrovoc}||{invalid_agrovoc}"
    field_name = "dcterms.subject"
    drop = False

    new_value = check.agrovoc(value, field_name, drop)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
    )
    assert new_value == value


def test_check_invalid_agrovoc_dropped(capsys):
    """Test invalid AGROVOC subjects.

    Invalid values *will* be dropped, so only the valid subject is returned.
    """

    valid_agrovoc = "LIVESTOCK"
    invalid_agrovoc = "FOREST"
    value = f"{valid_agrovoc}||{invalid_agrovoc}"
    field_name = "dcterms.subject"
    drop = True

    new_value = check.agrovoc(value, field_name, drop)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
    )
    assert new_value == valid_agrovoc


def test_check_valid_agrovoc():
    """Test valid AGROVOC subject."""

    value = "FORESTS"
    field_name = "dcterms.subject"
    drop = False

    result = check.agrovoc(value, field_name, drop)

    # A valid subject is returned unchanged.
    assert result == value


def test_check_uncommon_filename_extension(capsys):
    """Test uncommon filename extension."""

    value = "file.pdf.lck"

    check.filename_extension(value)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}\n"
    )


def test_check_common_filename_extension():
    """Test common filename extension."""

    value = "file.pdf"

    result = check.filename_extension(value)

    assert result is None


def test_check_incorrect_iso_639_1_language(capsys):
    """Test incorrect ISO 639-1 language.

    The item's language field is compared with the language predicted from
    the item's title.
    """

    title = _ENGLISH_TITLE
    language = "es"
    exclude = []

    # Build the Series from a plain dict holding the title and language fields.
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    experimental.correct_language(series, exclude)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possibly incorrect language {language} (detected en): {Fore.RESET}{title}\n"
    )


def test_check_incorrect_iso_639_3_language(capsys):
    """Test incorrect ISO 639-3 language.

    The item's language field is compared with the language predicted from
    the item's title.
    """

    title = _ENGLISH_TITLE
    language = "spa"
    exclude = []

    # Build the Series from a plain dict holding the title and language fields.
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    experimental.correct_language(series, exclude)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possibly incorrect language {language} (detected eng): {Fore.RESET}{title}\n"
    )


def test_check_correct_iso_639_1_language():
    """Test correct ISO 639-1 language.

    The item's language field is compared with the language predicted from
    the item's title.
    """

    title = _ENGLISH_TITLE
    language = "en"
    exclude = []

    # Build the Series from a plain dict holding the title and language fields.
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    result = experimental.correct_language(series, exclude)

    assert result is None


def test_check_correct_iso_639_3_language():
    """Test correct ISO 639-3 language.

    The item's language field is compared with the language predicted from
    the item's title.
    """

    title = _ENGLISH_TITLE
    language = "eng"
    exclude = []

    # Build the Series from a plain dict holding the title and language fields.
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    result = experimental.correct_language(series, exclude)

    assert result is None


def test_check_valid_spdx_license_identifier():
    """Test valid SPDX license identifier."""

    # Named license_id so the ``license`` builtin is not shadowed.
    license_id = "CC-BY-SA-4.0"

    result = check.spdx_license_identifier(license_id)

    assert result is None


def test_check_invalid_spdx_license_identifier(capsys):
    """Test invalid SPDX license identifier."""

    # Named license_id so the ``license`` builtin is not shadowed.
    license_id = "CC-BY-SA"

    check.spdx_license_identifier(license_id)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license_id}\n"
    )


def test_check_duplicate_item(capsys):
    """Test item with duplicate title, type, and date."""

    item_title = "Title"
    item_type = "Report"
    item_date = "2021-03-17"

    # Two rows that are identical in all three columns.
    d = {
        "dc.title": [item_title, item_title],
        "dcterms.type": [item_type, item_type],
        "dcterms.issued": [item_date, item_date],
    }
    df = pd.DataFrame(data=d)

    check.duplicate_items(df)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possible duplicate (dc.title): {Fore.RESET}{item_title}\n"
    )


def test_check_no_mojibake():
    """Test string with no mojibake."""

    field = "CIAT Publicaçao"
    field_name = "dcterms.isPartOf"

    result = check.mojibake(field, field_name)

    assert result is None


def test_check_mojibake(capsys):
    """Test string with mojibake."""

    # "ç" encoded as UTF-8 and wrongly decoded as Latin-1 becomes "Ã§".
    # Escapes are used so editors and text-repair tools cannot silently turn
    # the literal back into the clean string used by test_check_no_mojibake.
    field = "CIAT Publica\u00c3\u00a7ao"
    field_name = "dcterms.isPartOf"

    check.mojibake(field, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
    )


def test_check_doi_field():
    """Test an item with a DOI field."""

    doi = "https://doi.org/10.1186/1743-422X-9-218"
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the citation and a DOI field.
    d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)
    exclude = []

    result = check.citation_doi(series, exclude)

    assert result is None


def test_check_doi_only_in_citation(capsys):
    """Test an item with a DOI in its citation, but no DOI field."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # an empty DOI field and a citation containing a DOI.
    d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

    check.citation_doi(series, exclude)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}\n"
    )


def test_title_in_citation():
    """Test an item with its title in the citation."""

    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all the things."
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
    d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

    result = check.title_in_citation(series, exclude)

    assert result is None


def test_title_not_in_citation(capsys):
    """Test an item with its title missing from the citation."""

    title = "Testing all the things"
    # "teh" is deliberate: the citation must not contain the exact title.
    citation = "Orth, A. 2021. Testing all teh things."
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
    d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)

    check.title_in_citation(series, exclude)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}\n"
    )


def test_country_matches_region():
    """Test an item with regions matching its country list."""

    country = "Kenya"
    region = "Eastern Africa"
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {"cg.coverage.country": country, "cg.coverage.region": region}
    series = pd.Series(data=d)

    result = check.countries_match_regions(series, exclude)

    assert result is None


def test_country_not_matching_region(capsys):
    """Test an item with regions not matching its country list."""

    title = "Testing an item with no matching region."
    country = "Kenya"
    region = ""
    # Region the warning is expected to name for the country above.
    missing_region = "Eastern Africa"
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
        "dc.title": title,
        "cg.coverage.country": country,
        "cg.coverage.region": region,
    }
    series = pd.Series(data=d)

    check.countries_match_regions(series, exclude)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Missing region ({country} → {missing_region}): {Fore.RESET}{title}\n"
    )