2019-07-27 00:36:40 +03:00
|
|
|
|
import csv_metadata_quality.fix as fix
|
|
|
|
|
|
2019-07-28 17:47:28 +03:00
|
|
|
|
|
2019-07-27 00:36:40 +03:00
|
|
|
|
def test_fix_leading_whitespace():
    """Check that whitespace in front of a value is removed."""

    padded = " Alan"
    cleaned = fix.whitespace(padded, "dc.contributor.author")

    assert cleaned == "Alan"
|
2019-07-27 00:36:40 +03:00
|
|
|
|
|
2019-07-28 17:47:28 +03:00
|
|
|
|
|
2019-07-27 00:36:40 +03:00
|
|
|
|
def test_fix_trailing_whitespace():
    """Check that whitespace at the end of a value is removed."""

    padded = "Alan "
    cleaned = fix.whitespace(padded, "dc.contributor.author")

    assert cleaned == "Alan"
|
2019-07-27 00:36:40 +03:00
|
|
|
|
|
2019-07-28 17:47:28 +03:00
|
|
|
|
|
2019-07-27 00:36:40 +03:00
|
|
|
|
def test_fix_excessive_whitespace():
    """Test fixing excessive whitespace.

    The input has a run of two spaces between the names; the assertion
    expects ``fix.whitespace`` to hand it back with a single space there.
    """

    # Two consecutive spaces on purpose: with only one, the input would equal
    # the expected output and the test could never fail.
    value = "Alan  Orth"

    field_name = "dc.contributor.author"

    assert fix.whitespace(value, field_name) == "Alan Orth"
|
2019-07-28 22:53:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_invalid_separators():
    """Check that a single pipe between values becomes a double pipe."""

    single_pipe = "Alan|Orth"
    repaired = fix.separators(single_pipe, "dc.contributor.author")

    assert repaired == "Alan||Orth"
|
2019-07-29 16:38:10 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_unnecessary_unicode():
    """Test fixing unnecessary Unicode.

    The input hides a zero-width space (U+200B) right after "Alan"; the
    assertion expects that character to be absent from the result.
    """

    # Spelled as an escape because the character is invisible and is easily
    # dropped by editors and copy/paste, which would leave nothing to fix.
    # NOTE(review): assumes U+200B is among the characters that
    # fix.unnecessary_unicode removes -- confirm against the implementation.
    value = "Alan\u200b Orth"

    assert fix.unnecessary_unicode(value) == "Alan Orth"
|
2019-07-29 18:05:03 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_duplicates():
    """Check that a value repeated within one field is kept only once."""

    repeated = "Kenya||Kenya"
    deduplicated = fix.duplicates(repeated, "dc.contributor.author")

    assert deduplicated == "Kenya"
|
2019-07-30 20:05:12 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_newlines():
    """Test fixing newlines.

    The input is "Kenya" broken by a line feed after "Ken"; the assertion
    expects the two halves to be joined back together.
    """

    # Single-line literal with an explicit "\n" so the fixture cannot pick up
    # stray indentation or other characters the way a multi-line literal can.
    value = "Ken\nya"

    assert fix.newlines(value) == "Kenya"
|
2019-08-28 00:08:56 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_comma_space():
    """Check that a space is inserted after a comma that lacks one."""

    cramped = "Orth,Alan S."
    spaced = fix.comma_space(cramped, "dc.contributor.author")

    assert spaced == "Orth, Alan S."
|
2020-01-15 11:37:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_normalized_unicode():
    """Check that a value already in normalized (NFC) form comes back unchanged."""

    # U+00E9 is the single precomposed code point for the accented e
    already_nfc = "Ou\u00e9draogo, Mathieu"
    result = fix.normalize_unicode(already_nfc, "dc.contributor.author")

    assert result == "Ou\u00e9draogo, Mathieu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_decomposed_unicode():
    """Test fixing a string that contains a decomposed Unicode character.

    The input spells the accented e as two code points (a plain "e" followed
    by the combining acute accent U+0301); the assertion expects the single
    precomposed code point U+00E9 in the result.
    """

    # Both forms render identically, so they are written as escapes: a literal
    # accented character can be silently renormalized by an editor or other
    # tooling, which would make input and expected output the same string.
    value = "Oue\u0301draogo, Mathieu"

    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
|