1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-25 23:28:18 +01:00

Add mojibake to data/test.csv and tests

This commit is contained in:
Alan Orth 2021-03-19 10:28:33 +02:00
parent 898bb412c3
commit 39a4b1a487
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 36 additions and 0 deletions

View File

@ -32,3 +32,4 @@ Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,
Duplicate Title,2021-03-17,,,,,,,,Report
Duplicate Title,2021-03-17,,,,,,,,Report
Mojibake,2021-03-18,,,,CIAT Publicaçao,,,,Report

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type
32 Duplicate Title 2021-03-17 Report
33 Duplicate Title 2021-03-17 Report
34 Mojibake 2021-03-18 CIAT Publicaçao Report
35

View File

@ -339,3 +339,29 @@ def test_check_duplicate_item(capsys):
captured.out
== f"{Fore.YELLOW}Possible duplicate (dc.title): {Fore.RESET}{item_title}\n"
)
def test_check_no_mojibake():
"""Test string with no mojibake."""
field = "CIAT Publicaçao"
field_name = "dcterms.isPartOf"
result = check.mojibake(field, field_name)
assert result == None
def test_check_mojibake(capsys):
"""Test string with mojibake."""
field = "CIAT Publicaçao"
field_name = "dcterms.isPartOf"
result = check.mojibake(field, field_name)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
)

View File

@ -108,3 +108,12 @@ def test_fix_decomposed_unicode():
field_name = "dc.contributor.author"
assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"
def test_fix_mojibake():
"""Test string with no mojibake."""
field = "CIAT Publicaçao"
field_name = "dcterms.isPartOf"
assert fix.mojibake(field, field_name) == "CIAT Publicaçao"