1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-22 22:05:03 +01:00
csv-metadata-quality/tests/test_fix.py
Alan Orth 49e3543878
Add Unicode normalization
This will check all strings for un-normalized Unicode characters.
Normalization is done using NFC. This includes tests and updated
sample data (data/test.csv).

See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html
2020-01-15 11:37:54 +02:00

91 lines
1.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv_metadata_quality.fix as fix
def test_fix_leading_whitespace():
    """Check that whitespace in front of a value is stripped."""
    cleaned = fix.whitespace(" Alan")
    assert cleaned == "Alan"
def test_fix_trailing_whitespace():
    """Check that whitespace after a value is stripped."""
    cleaned = fix.whitespace("Alan ")
    assert cleaned == "Alan"
def test_fix_excessive_whitespace():
    """Test fixing excessive whitespace inside a value.

    The input must contain a run of more than one space between the two
    names; with a single space the test would pass even if the fixer did
    nothing at all.
    """
    # NOTE: there are exactly TWO spaces between "Alan" and "Orth" here.
    value = "Alan  Orth"
    # Expected result has the run collapsed to a single space.
    assert fix.whitespace(value) == "Alan Orth"
def test_fix_invalid_separators():
    """Check that a single-pipe multi-value separator is rewritten as a double pipe."""
    repaired = fix.separators("Alan|Orth")
    assert repaired == "Alan||Orth"
def test_fix_unnecessary_unicode():
    """Test fixing unnecessary Unicode.

    The input carries an invisible ZERO WIDTH SPACE (U+200B) right after
    "Alan"; the expected output is the same text without it.
    """
    # Written as an escape sequence on purpose: a literal U+200B is invisible
    # in editors and diffs and is easily dropped by copy/paste or tooling,
    # which would silently turn this into a test that checks nothing.
    value = "Alan\u200b Orth"
    assert fix.unnecessary_unicode(value) == "Alan Orth"
def test_fix_duplicates():
    """Check that a repeated entry in a multi-value field is reduced to one."""
    deduplicated = fix.duplicates("Kenya||Kenya")
    assert deduplicated == "Kenya"
def test_fix_newlines():
    """Check that a line feed embedded in a value is removed."""
    # Same string as a two-line literal: "Ken", a line feed, then "ya".
    broken = "Ken\nya"
    joined = fix.newlines(broken)
    assert joined == "Kenya"
def test_fix_comma_space():
    """Check that a space is inserted after a comma in an author name."""
    repaired = fix.comma_space("Orth,Alan S.", "dc.contributor.author")
    assert repaired == "Orth, Alan S."
def test_fix_normalized_unicode():
    """Test fixing a string that is already in its normalized (NFC) Unicode form.

    The value should come back unchanged.
    """
    # U+00E9 LATIN SMALL LETTER E WITH ACUTE: the precomposed (NFC) form of é.
    # Spelled as an escape so the composed form cannot be confused with, or
    # silently converted to, the visually identical decomposed sequence.
    value = "Ou\u00e9draogo, Mathieu"
    field_name = "dc.contributor.author"
    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
def test_fix_decomposed_unicode():
    """Test fixing a string that contains a decomposed Unicode character.

    The input spells é as two code points (base letter plus combining
    accent); the expected output uses the single precomposed code point.
    """
    # "e" followed by U+0301 COMBINING ACUTE ACCENT: the decomposed form of é.
    # Escapes are used because the two forms render identically, so a literal
    # would be impossible to review and easy to corrupt with editor or VCS
    # normalization.
    value = "Oue\u0301draogo, Mathieu"
    field_name = "dc.contributor.author"
    # U+00E9 is the precomposed (NFC) é.
    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"