2021-03-19 15:04:13 +01:00
|
|
|
|
# SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
|
|
2022-07-28 15:52:43 +02:00
|
|
|
|
import pandas as pd
|
|
|
|
|
|
2019-07-26 23:36:40 +02:00
|
|
|
|
import csv_metadata_quality.fix as fix
|
|
|
|
|
|
2019-07-28 16:47:28 +02:00
|
|
|
|
|
2019-07-26 23:36:40 +02:00
|
|
|
|
def test_fix_leading_whitespace():
    """Test fixing leading whitespace."""

    # A leading space before the name should be stripped by the fixer.
    assert fix.whitespace(" Alan", "dc.contributor.author") == "Alan"
|
2019-07-26 23:36:40 +02:00
|
|
|
|
|
2019-07-28 16:47:28 +02:00
|
|
|
|
|
2019-07-26 23:36:40 +02:00
|
|
|
|
def test_fix_trailing_whitespace():
    """Test fixing trailing whitespace."""

    # A trailing space after the name should be stripped by the fixer.
    assert fix.whitespace("Alan ", "dc.contributor.author") == "Alan"
|
2019-07-26 23:36:40 +02:00
|
|
|
|
|
2019-07-28 16:47:28 +02:00
|
|
|
|
|
2019-07-26 23:36:40 +02:00
|
|
|
|
def test_fix_excessive_whitespace():
    """Test fixing excessive whitespace.

    The input deliberately contains a double space so the fixer has
    something to collapse; with a single space the input would already
    equal the expected output and the assertion would be vacuous.
    """

    # NOTE(review): the input previously read "Alan Orth" (single space),
    # identical to the expected value — the double space restores a real test.
    value = "Alan  Orth"

    field_name = "dc.contributor.author"

    assert fix.whitespace(value, field_name) == "Alan Orth"
|
2019-07-28 21:53:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_invalid_separators():
    """Test fixing invalid multi-value separators."""

    # A single "|" is not a valid separator; it should become "||".
    assert fix.separators("Alan|Orth", "dc.contributor.author") == "Alan||Orth"
|
2019-07-29 15:38:10 +02:00
|
|
|
|
|
|
|
|
|
|
2021-01-03 14:37:18 +01:00
|
|
|
|
def test_fix_unnecessary_separators():
    """Test fixing unnecessary multi-value separators."""

    # Renamed from "field" to "value" for consistency with the sibling tests.
    # The trailing "||" has no value after it and should be removed.
    value = "Alan||Orth||"

    field_name = "dc.contributor.author"

    assert fix.separators(value, field_name) == "Alan||Orth"
|
|
|
|
|
|
|
|
|
|
|
2019-07-29 15:38:10 +02:00
|
|
|
|
def test_fix_unnecessary_unicode():
    """Test fixing unnecessary Unicode.

    The input contains a zero-width space (U+200B), written as an escape
    so the invisible character cannot be silently dropped by editors or
    copy/paste — which would make the assertion vacuous (input == expected).
    """

    # NOTE(review): the literal previously looked identical to the expected
    # output, suggesting the invisible character was lost at some point —
    # confirm U+200B is the character fix.unnecessary_unicode removes here.
    value = "Alan\u200b Orth"

    assert fix.unnecessary_unicode(value) == "Alan Orth"
|
2019-07-29 17:05:03 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_duplicates():
    """Test fixing duplicate metadata values."""

    # Two identical multi-value entries should collapse to one.
    assert fix.duplicates("Kenya||Kenya", "dc.contributor.author") == "Kenya"
|
2019-07-30 19:05:12 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_newlines():
    """Test fixing newlines."""

    # A value split across two lines by an embedded newline.
    value = "Ken\nya"
    field_name = "dcterms.subject"

    assert fix.newlines(value, field_name) == "Kenya"
|
2019-08-27 23:08:56 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_comma_space():
    """Test adding space after comma."""

    # "Surname,Given" should gain a space after the comma.
    assert fix.comma_space("Orth,Alan S.", "dc.contributor.author") == "Orth, Alan S."
|
2020-01-15 10:37:54 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_normalized_unicode():
    """Test fixing a string that is already in its normalized (NFC) Unicode form.

    The codepoint is written as an escape because precomposed and decomposed
    forms of é render identically, so the test would otherwise depend on an
    invisible property of the source file's encoding.
    """

    # \u00e9 is the precomposed (NFC) form of é.
    value = "Ou\u00e9draogo, Mathieu"

    field_name = "dc.contributor.author"

    # Already normalized, so it must come back unchanged.
    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_decomposed_unicode():
    """Test fixing a string in decomposed (NFD) Unicode form.

    The decomposed form is written as explicit escapes because editors and
    copy/paste routinely normalize it to NFC silently, which would make the
    test vacuous without any visible change to the source.
    """

    # "e" followed by the combining acute accent U+0301 — the NFD form of é.
    value = "Oue\u0301draogo, Mathieu"

    field_name = "dc.contributor.author"

    # Expect the precomposed (NFC) form, \u00e9.
    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"
|
2021-03-19 09:28:33 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_mojibake():
    """Test string with no mojibake."""

    # Renamed from "field" to "value" for consistency with the sibling tests.
    # "Publicaçao" is legitimate text, not mis-decoded bytes, so the fixer
    # must pass it through unchanged.
    value = "CIAT Publicaçao"

    field_name = "dcterms.isPartOf"

    assert fix.mojibake(value, field_name) == "CIAT Publicaçao"
|
2022-07-28 15:52:43 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_country_not_matching_region():
    """Test an item with regions not matching its country list."""

    title = "Testing an item with no matching region."
    country = "Kenya"
    missing_region = "Eastern Africa"
    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series);
    # the region field starts out empty.
    series = pd.Series(
        {
            "dc.title": title,
            "cg.coverage.country": country,
            "cg.coverage.region": "",
        }
    )

    result = fix.countries_match_regions(series, exclude)

    # Emulate the correct series we are expecting: the region inferred
    # from the country has been filled in.
    series_correct = pd.Series(
        {
            "dc.title": title,
            "cg.coverage.country": country,
            "cg.coverage.region": missing_region,
        }
    )

    pd.testing.assert_series_equal(result, series_correct)
|
2024-04-25 11:49:19 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_fix_normalize_dois():
    """Test normalizing a DOI."""

    # A "doi:" prefixed identifier should be rewritten as an https URI.
    doi = "doi: 10.11648/j.jps.20140201.14"

    assert fix.normalize_dois(doi) == "https://doi.org/10.11648/j.jps.20140201.14"
|