csv-metadata-quality/tests/test_fix.py

# SPDX-License-Identifier: GPL-3.0-only

import pandas as pd

import csv_metadata_quality.fix as fix


def test_fix_leading_whitespace():
    """Check that a space at the start of a value is stripped."""

    padded = " Alan"
    expected = "Alan"

    result = fix.whitespace(padded, "dc.contributor.author")

    assert result == expected


def test_fix_trailing_whitespace():
    """Check that a space at the end of a value is stripped."""

    padded = "Alan "
    expected = "Alan"

    result = fix.whitespace(padded, "dc.contributor.author")

    assert result == expected


def test_fix_excessive_whitespace():
    """Check that a run of two spaces inside a value collapses to one."""

    # Two consecutive spaces between the words.
    doubled = "Alan  Orth"
    expected = "Alan Orth"

    result = fix.whitespace(doubled, "dc.contributor.author")

    assert result == expected


def test_fix_invalid_separators():
    """Check that a single "|" separator is rewritten as "||"."""

    single_pipe = "Alan|Orth"
    expected = "Alan||Orth"

    result = fix.separators(single_pipe, "dc.contributor.author")

    assert result == expected


def test_fix_unnecessary_separators():
    """Check that a trailing "||" with no value after it is removed."""

    value = "Alan||Orth||"
    expected = "Alan||Orth"

    result = fix.separators(value, "dc.contributor.author")

    assert result == expected


def test_fix_unnecessary_unicode():
    """Test fixing unnecessary Unicode.

    The invisible character is written as an escape sequence so that the
    input is visibly different from the expected output and cannot be lost
    when the file passes through an editor or other tooling.
    """

    # U+200B ZERO WIDTH SPACE between "Alan" and the regular space.
    # NOTE(review): assumes fix.unnecessary_unicode removes U+200B; confirm
    # against the fix module.
    value = "Alan\u200b Orth"

    assert fix.unnecessary_unicode(value) == "Alan Orth"


def test_fix_duplicates():
    """Check that a value repeated across "||" is kept only once."""

    repeated = "Kenya||Kenya"
    expected = "Kenya"

    result = fix.duplicates(repeated, "dc.contributor.author")

    assert result == expected


def test_fix_newlines():
    """Check that a line feed embedded in a value is removed."""

    # A single line feed (U+000A) splits the word in two.
    broken = "Ken\nya"
    expected = "Kenya"

    result = fix.newlines(broken, "dcterms.subject")

    assert result == expected


def test_fix_comma_space():
    """Check that a space is inserted after a comma that lacks one."""

    cramped = "Orth,Alan S."
    expected = "Orth, Alan S."

    result = fix.comma_space(cramped, "dc.contributor.author")

    assert result == expected


def test_fix_normalized_unicode():
    """Test fixing a string that is already in its normalized (NFC) Unicode form.

    The value must come back unchanged. The accented letter is written as an
    escape sequence because the precomposed and decomposed forms look the
    same in source and can be silently converted by editors or tooling.
    """

    # U+00E9 LATIN SMALL LETTER E WITH ACUTE: the precomposed (NFC) form of é
    value = "Ou\u00e9draogo, Mathieu"

    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == "Ou\u00e9draogo, Mathieu"


def test_fix_decomposed_unicode():
    """Test fixing a string that contains a decomposed Unicode character.

    The input spells é as a base letter plus a combining accent; the expected
    result uses the single precomposed code point. Both are written as escape
    sequences because the two forms look the same in source and can be
    silently converted by editors or tooling.
    """

    # "e" followed by U+0301 COMBINING ACUTE ACCENT: the decomposed form of é
    value = "Oue\u0301draogo, Mathieu"

    # U+00E9 LATIN SMALL LETTER E WITH ACUTE: the precomposed (NFC) form of é
    expected = "Ou\u00e9draogo, Mathieu"

    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == expected


def test_fix_mojibake():
    """Test fixing a string that contains mojibake."""

    # U+00C3 U+00A7 ("Ã§") is what the UTF-8 bytes of ç (0xC3 0xA7) look like
    # when they are decoded as Latin-1/Windows-1252 instead of UTF-8.
    value = "CIAT Publica\u00c3\u00a7ao"

    # U+00E7 LATIN SMALL LETTER C WITH CEDILLA (ç)
    expected = "CIAT Publica\u00e7ao"

    field_name = "dcterms.isPartOf"

    assert fix.mojibake(value, field_name) == expected


def test_fix_country_not_matching_region():
    """Check that an item with a country but an empty region gets the region filled in."""

    item_title = "Testing an item with no matching region."
    item_country = "Kenya"

    # One item as seen when walking a transposed dataframe: a Series whose
    # labels are the metadata field names.
    item = pd.Series(
        data={
            "dc.title": item_title,
            "cg.coverage.country": item_country,
            "cg.coverage.region": "",
        }
    )

    # No fields are excluded from the fix.
    fixed = fix.countries_match_regions(item, [])

    # Same item, with the region the fix is expected to add.
    expected = pd.Series(
        data={
            "dc.title": item_title,
            "cg.coverage.country": item_country,
            "cg.coverage.region": "Eastern Africa",
        }
    )

    pd.testing.assert_series_equal(fixed, expected)
-												Add SPDX short license identifier to all Python files

See: https://spdx.github.io/spdx-spec/appendix-V-using-SPDX-short-identifiers-in-source-files/

											
										
										
											2021-03-19 15:04:13 +01:00
+								# SPDX-License-Identifier: GPL-3.0-only
-												Add unsafe check to add missing regions

											
										
										
											2022-07-28 15:52:43 +02:00
+								import pandas as pd
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
+								import csv_metadata_quality.fix as fix
-												Fix whitespace errors found by flake8

											
										
										
											2019-07-28 16:47:28 +02:00
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
+								def test_fix_leading_whitespace():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing leading whitespace."""
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = " Alan"
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Output field name for more fixes and checks

This helps identify which field has the error.

											
										
										
											2020-01-16 11:35:11 +01:00
+								    field_name = "dc.contributor.author"
 								    assert fix.whitespace(value, field_name) == "Alan"
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Fix whitespace errors found by flake8

											
										
										
											2019-07-28 16:47:28 +02:00
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
+								def test_fix_trailing_whitespace():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing trailing whitespace."""
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Alan "
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Output field name for more fixes and checks

This helps identify which field has the error.

											
										
										
											2020-01-16 11:35:11 +01:00
+								    field_name = "dc.contributor.author"
 								    assert fix.whitespace(value, field_name) == "Alan"
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Fix whitespace errors found by flake8

											
										
										
											2019-07-28 16:47:28 +02:00
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
+								def test_fix_excessive_whitespace():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing excessive whitespace."""
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Alan  Orth"
-												Add initial tests

For now only test fixes because they return changed data. I'm not
sure how to test the checks, because they don't return data and I
can't modify them to return boolean values without breaking the app.

											
										
										
											2019-07-26 23:36:40 +02:00
-												Output field name for more fixes and checks

This helps identify which field has the error.

											
										
										
											2020-01-16 11:35:11 +01:00
+								    field_name = "dc.contributor.author"
 								    assert fix.whitespace(value, field_name) == "Alan Orth"
-												Add "unsafe fixes" runtime option

In this case it fixes occurrences of invalid multi-value separators.
DSpace uses "||" to separate multiple values in one field, but our
editors sometimes give us files with mistakes like "|". We can fix
these to be correct multi-value separators if we are sure that the
metadata is not actually using "|" for some legitimate purpose.

											
										
										
											2019-07-28 21:53:39 +02:00
 								def test_fix_invalid_separators():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing invalid multi-value separators."""
-												Add "unsafe fixes" runtime option

In this case it fixes occurrences of invalid multi-value separators.
DSpace uses "||" to separate multiple values in one field, but our
editors sometimes give us files with mistakes like "|". We can fix
these to be correct multi-value separators if we are sure that the
metadata is not actually using "|" for some legitimate purpose.

											
										
										
											2019-07-28 21:53:39 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Alan|Orth"
-												Add "unsafe fixes" runtime option

In this case it fixes occurrences of invalid multi-value separators.
DSpace uses "||" to separate multiple values in one field, but our
editors sometimes give us files with mistakes like "|". We can fix
these to be correct multi-value separators if we are sure that the
metadata is not actually using "|" for some legitimate purpose.

											
										
										
											2019-07-28 21:53:39 +02:00
-												Output field name for more fixes and checks

This helps identify which field has the error.

											
										
										
											2020-01-16 11:35:11 +01:00
+								    field_name = "dc.contributor.author"
 								    assert fix.separators(value, field_name) == "Alan||Orth"
-												Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters,
etc that add nothing to the metadata and often cause errors during
parsing or displaying in a UI.

											
										
										
											2019-07-29 15:38:10 +02:00
-												Add tests for unnecessary multi-value separators

											
										
										
											2021-01-03 14:37:18 +01:00
+								def test_fix_unnecessary_separators():
 								    """Test fixing unnecessary multi-value separators."""
 								    field = "Alan||Orth||"
 								    field_name = "dc.contributor.author"
 								    assert fix.separators(field, field_name) == "Alan||Orth"
-												Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters,
etc that add nothing to the metadata and often cause errors during
parsing or displaying in a UI.

											
										
										
											2019-07-29 15:38:10 +02:00
+								def test_fix_unnecessary_unicode():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing unnecessary Unicode."""
-												Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters,
etc that add nothing to the metadata and often cause errors during
parsing or displaying in a UI.

											
										
										
											2019-07-29 15:38:10 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Alan Orth"
-												Add support for fixing "unnecessary" Unicode

These are things like non-breaking spaces, "replacement" characters,
etc that add nothing to the metadata and often cause errors during
parsing or displaying in a UI.

											
										
										
											2019-07-29 15:38:10 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    assert fix.unnecessary_unicode(value) == "Alan Orth"
-												Add fix for duplicate metadata values

											
										
										
											2019-07-29 17:05:03 +02:00
 								def test_fix_duplicates():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing duplicate metadata values."""
-												Add fix for duplicate metadata values

											
										
										
											2019-07-29 17:05:03 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Kenya||Kenya"
-												Add fix for duplicate metadata values

											
										
										
											2019-07-29 17:05:03 +02:00
-												Output field name for more fixes and checks

This helps identify which field has the error.

											
										
										
											2020-01-16 11:35:11 +01:00
+								    field_name = "dc.contributor.author"
 								    assert fix.duplicates(value, field_name) == "Kenya"
-												Add support for removing newlines

This was tricky because of the nature of newlines. In actuality we
are removing Unix line feeds here (U+000A) because Windows carriage
returns are actually already removed by the string stripping in the
whitespace fix.

Creating the test case in Vim was difficult because I couldn't fig-
ure out how to manually enter a line feed character. In the end I
used a search and replace on a known pattern like "ALAN", replacing
it with \r. Neither entering the Unicode code point (U+000A) direc-
tly or typing an "Enter" character after ^V worked. Grrr.

											
										
										
											2019-07-30 19:05:12 +02:00
 								def test_fix_newlines():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test fixing newlines."""
-												Add support for removing newlines

This was tricky because of the nature of newlines. In actuality we
are removing Unix line feeds here (U+000A) because Windows carriage
returns are actually already removed by the string stripping in the
whitespace fix.

Creating the test case in Vim was difficult because I couldn't fig-
ure out how to manually enter a line feed character. In the end I
used a search and replace on a known pattern like "ALAN", replacing
it with \r. Neither entering the Unicode code point (U+000A) direc-
tly or typing an "Enter" character after ^V worked. Grrr.

											
										
										
											2019-07-30 19:05:12 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = """Ken
 								ya"""
-												Add field name to fix.newlines output

											
										
										
											2021-10-08 13:36:23 +02:00
+								    field_name = "dcterms.subject"
-												Add support for removing newlines

This was tricky because of the nature of newlines. In actuality we
are removing Unix line feeds here (U+000A) because Windows carriage
returns are actually already removed by the string stripping in the
whitespace fix.

Creating the test case in Vim was difficult because I couldn't fig-
ure out how to manually enter a line feed character. In the end I
used a search and replace on a known pattern like "ALAN", replacing
it with \r. Neither entering the Unicode code point (U+000A) direc-
tly or typing an "Enter" character after ^V worked. Grrr.

											
										
										
											2019-07-30 19:05:12 +02:00
-												Add field name to fix.newlines output

											
										
										
											2021-10-08 13:36:23 +02:00
+								    assert fix.newlines(value, field_name) == "Kenya"
-												tests/test_fix.py: Add test for missing space after comma

											
										
										
											2019-08-27 23:08:56 +02:00
 								def test_fix_comma_space():
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    """Test adding space after comma."""
-												tests/test_fix.py: Add test for missing space after comma

											
										
										
											2019-08-27 23:08:56 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    value = "Orth,Alan S."
-												tests/test_fix.py: Add test for missing space after comma

											
										
										
											2019-08-27 23:08:56 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    field_name = "dc.contributor.author"
-												tests/test_fix.py: Add test for missing space after comma

											
										
										
											2019-08-27 23:08:56 +02:00
-												Reformat tests with black

											
										
										
											2019-09-26 13:02:51 +02:00
+								    assert fix.comma_space(value, field_name) == "Orth, Alan S."
-												Add Unicode normalization

This will check all strings for un-normalized Unicode characters.
Normalization is done using NFC. This includes tests and updated
sample data (data/test.csv).

See: https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html

											
										
										
											2020-01-15 10:37:54 +01:00
 								def test_fix_normalized_unicode():
 								    """Test fixing a string that is already in its normalized (NFC) Unicode form."""
 								    # string using the normalized canonical form of é
 								    value = "Ouédraogo, Mathieu"
 								    field_name = "dc.contributor.author"
 								    assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"
 								def test_fix_decomposed_unicode():
 								    """Test fixing a string that contains Unicode string."""
 								    # string using the decomposed form of é
 								    value = "Ouédraogo, Mathieu"
 								    field_name = "dc.contributor.author"
 								    assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"
-												Add mojibake to data/test.csv and tests

											
										
										
											2021-03-19 09:28:33 +01:00
 								def test_fix_mojibake():
 								    """Test string with no mojibake."""
 								    field = "CIAT PublicaÃ§ao"
 								    field_name = "dcterms.isPartOf"
 								    assert fix.mojibake(field, field_name) == "CIAT Publicaçao"
-												Add unsafe check to add missing regions

											
										
										
											2022-07-28 15:52:43 +02:00
 								def test_fix_country_not_matching_region():
 								    """Test an item with regions not matching its country list."""
 								    title = "Testing an item with no matching region."
 								    country = "Kenya"
 								    region = ""
 								    missing_region = "Eastern Africa"
-												tests: apply fixes from fixit

RewriteToLiteral: It's slower to call list() than using the empty literal

											
										
										
											2023-12-09 10:20:35 +01:00
+								    exclude = []
-												Add unsafe check to add missing regions

											
										
										
											2022-07-28 15:52:43 +02:00
 								    # Emulate a column in a transposed dataframe (which is just a series)
 								    d = {
 								        "dc.title": title,
 								        "cg.coverage.country": country,
 								        "cg.coverage.region": region,
 								    }
 								    series = pd.Series(data=d)
-												Improve exclude function

When a user explicitly requests that a field be excluded with -x we
skip that field in most checks. Up until now that did not include
the item-based checks using a transposed dataframe because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.

											
										
										
											2022-09-02 14:59:22 +02:00
+								    result = fix.countries_match_regions(series, exclude)
-												Add unsafe check to add missing regions

											
										
										
											2022-07-28 15:52:43 +02:00
 								    # Emulate the correct series we are expecting
 								    d_correct = {
 								        "dc.title": title,
 								        "cg.coverage.country": country,
 								        "cg.coverage.region": missing_region,
 								    }
 								    series_correct = pd.Series(data=d_correct)
 								    pd.testing.assert_series_equal(result, series_correct)