1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-10-31 19:43:00 +01:00
csv-metadata-quality/tests/test_fix.py

123 lines
2.7 KiB
Python
Raw Normal View History

# SPDX-License-Identifier: GPL-3.0-only
import csv_metadata_quality.fix as fix
2019-07-28 16:47:28 +02:00
def test_fix_leading_whitespace():
    """Check that whitespace before a value is removed."""
    padded_author = " Alan"

    cleaned = fix.whitespace(padded_author, "dc.contributor.author")

    assert cleaned == "Alan"
2019-07-28 16:47:28 +02:00
def test_fix_trailing_whitespace():
    """Check that whitespace after a value is removed."""
    padded_author = "Alan "

    cleaned = fix.whitespace(padded_author, "dc.contributor.author")

    assert cleaned == "Alan"
2019-07-28 16:47:28 +02:00
def test_fix_excessive_whitespace():
    """Test fixing excessive whitespace.

    The input must differ from the expected output, otherwise the test
    would also pass if ``fix.whitespace`` left the value untouched.
    """
    # NOTE: there are deliberately TWO spaces between the names; do not let
    # an editor or formatter collapse them.
    value = "Alan  Orth"
    field_name = "dc.contributor.author"

    assert fix.whitespace(value, field_name) == "Alan Orth"
def test_fix_invalid_separators():
    """Check that a single-pipe separator is rewritten as a double pipe."""
    author_field = "dc.contributor.author"
    single_pipe = "Alan|Orth"

    repaired = fix.separators(single_pipe, author_field)

    assert repaired == "Alan||Orth"
def test_fix_unnecessary_separators():
    """Check that a dangling trailing multi-value separator is dropped."""
    author_field = "dc.contributor.author"
    trailing_separator = "Alan||Orth||"

    repaired = fix.separators(trailing_separator, author_field)

    assert repaired == "Alan||Orth"
def test_fix_unnecessary_unicode():
    """Test fixing unnecessary Unicode.

    The input carries a ZERO WIDTH SPACE (U+200B) after "Alan"; the expected
    output is the same text without it.
    """
    # Written as an escape on purpose: a literal U+200B is invisible in most
    # editors and is easily lost when the file is copied or reformatted,
    # which would silently turn this into a no-op test.
    value = "Alan\u200b Orth"

    assert fix.unnecessary_unicode(value) == "Alan Orth"
2019-07-29 17:05:03 +02:00
def test_fix_duplicates():
    """Check that a repeated entry in a multi-value field is kept only once."""
    author_field = "dc.contributor.author"
    repeated = "Kenya||Kenya"

    deduplicated = fix.duplicates(repeated, author_field)

    assert deduplicated == "Kenya"
def test_fix_newlines():
    """Check that a line feed embedded in a value is removed."""
    subject_field = "dcterms.subject"
    # Same string as a triple-quoted literal broken across two lines.
    broken_value = "Ken\nya"

    joined = fix.newlines(broken_value, subject_field)

    assert joined == "Kenya"
def test_fix_comma_space():
    """Check that a space is inserted after a comma that lacks one."""
    author_field = "dc.contributor.author"
    cramped_name = "Orth,Alan S."

    spaced_name = fix.comma_space(cramped_name, author_field)

    assert spaced_name == "Orth, Alan S."
def test_fix_normalized_unicode():
    """Test fixing a string that is already in its normalized (NFC) Unicode form.

    The value is expected to come back unchanged.
    """
    # "é" as the single precomposed code point U+00E9 (NFC). Spelled as an
    # escape so the normalization form is visible in the source and cannot be
    # altered by an editor that re-normalizes text on save.
    value = "Ou\u00e9draogo, Mathieu"
    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
def test_fix_decomposed_unicode():
    """Test fixing a string that contains a decomposed Unicode character.

    The input spells "é" as "e" followed by COMBINING ACUTE ACCENT (U+0301);
    the expected output uses the single precomposed code point U+00E9.
    """
    # Both forms render identically, so they are written as escapes: with
    # literal characters the input and the expected value are visually
    # indistinguishable and a re-normalizing editor would make them equal.
    value = "Oue\u0301draogo, Mathieu"
    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
def test_fix_mojibake():
    """Test fixing a string that contains mojibake.

    Also checks that a string with no mojibake is returned unchanged.
    """
    field_name = "dcterms.isPartOf"
    expected = "CIAT Publica\u00e7ao"

    # "ç" (U+00E7) is C3 A7 in UTF-8; read back as Latin-1 those two bytes
    # become "Ã" (U+00C3) and "§" (U+00A7). Escapes keep the broken sequence
    # intact even if this file is passed through an encoding-repair tool.
    field = "CIAT Publica\u00c3\u00a7ao"
    assert fix.mojibake(field, field_name) == expected

    # Clean input must pass through untouched.
    assert fix.mojibake(expected, field_name) == expected