csv-metadata-quality/csv_metadata_quality/fix.py

import re

import pandas as pd


def whitespace(field):
    """Normalize whitespace in a (possibly multi-value) metadata field.

    The field is split on the "||" multi-value separator. Each value has its
    leading and trailing whitespace stripped, and every run of two or more
    whitespace characters inside it is collapsed to a single space. A message
    is printed for each value that contained such a run.

    Return the cleaned values re-joined with "||", or None when the field is
    missing.
    """

    # Nothing to clean when the value is missing
    if pd.isna(field):
        return

    # Two or more consecutive whitespace characters; compiled once and reused
    # for every value of the field
    excessive = re.compile(r"\s{2,}")

    cleaned = []

    for part in field.split("||"):
        # Trim the edges first so only interior runs are reported below
        part = part.strip()

        if excessive.search(part):
            print(f"Excessive whitespace: {part}")
            part = excessive.sub(" ", part)

        cleaned.append(part)

    # Reassemble the multi-value field
    return "||".join(cleaned)


def separators(field):
    """Repair invalid multi-value separators (a single "|").

    The field is first split on the valid "||" separator. Any "|" character
    still present in a resulting value is replaced with "||", and a message
    is printed for each value that needed the repair.

    Return the values re-joined with "||", or None when the field is missing.
    """

    # Nothing to repair when the value is missing
    if pd.isna(field):
        return

    # A literal pipe left over after splitting on the valid separator
    stray_pipe = re.compile(r"\|")

    repaired = []

    for part in field.split("||"):
        if stray_pipe.search(part):
            print(f"Fixing invalid multi-value separator: {part}")

            part = stray_pipe.sub("||", part)

        repaired.append(part)

    # Reassemble the multi-value field
    return "||".join(repaired)


def unnecessary_unicode(field):
    """Strip or substitute Unicode characters that add nothing to metadata.

    Deleted outright:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)
        - No-break space (U+00A0)

    Substituted:
        - Soft hyphen (U+00AD), together with any normal hyphens directly in
          front of it → a single hyphen

    The fixes are applied in the order listed, and a message showing the
    field as it was before that fix is printed whenever one applies.

    Return the cleaned string, or None when the field is missing.
    """

    # Nothing to clean when the value is missing
    if pd.isna(field):
        return

    # Zero-width spaces (U+200B) are deleted
    cleaned, hits = re.subn(r"\u200B", "", field)

    if hits:
        print(f"Removing unnecessary Unicode (U+200B): {field}")
        field = cleaned

    # Replacement characters (U+FFFD) are deleted
    cleaned, hits = re.subn(r"\uFFFD", "", field)

    if hits:
        print(f"Removing unnecessary Unicode (U+FFFD): {field}")
        field = cleaned

    # No-break spaces (U+00A0) are deleted, not turned into normal spaces
    cleaned, hits = re.subn(r"\u00A0", "", field)

    if hits:
        print(f"Removing unnecessary Unicode (U+00A0): {field}")
        field = cleaned

    # Soft hyphens (U+00AD), sometimes preceded by normal hyphens, become
    # exactly one normal hyphen
    cleaned, hits = re.subn(r"\u002D*?\u00AD", "-", field)

    if hits:
        print(f"Replacing unnecessary Unicode (U+00AD): {field}")
        field = cleaned

    return field


def duplicates(field):
    """Remove duplicate metadata values from a multi-value field.

    The field is split on the "||" separator and only the first occurrence of
    each value is kept, in its original position. Comparison is exact, so
    values differing in case or whitespace are not treated as duplicates. A
    message is printed for every dropped value.

    Return the de-duplicated values re-joined with "||", or None when the
    field is missing.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # The set gives constant-time membership tests; the list preserves the
    # order in which values were first seen
    seen = set()
    new_values = []

    for value in field.split("||"):
        if value in seen:
            print(f"Dropping duplicate value: {value}")
            continue

        seen.add(value)
        new_values.append(value)

    # Create a new field consisting of all values joined with "||"
    return "||".join(new_values)


def newlines(field):
    """Remove line feeds from a metadata value.

    Single metadata values should not span multiple lines because this is not
    rendered properly in DSpace's XMLUI and even causes issues during import.

    Implementation note: only the Unix line feed (0x0a) is detected and
    removed here, ie what is inserted when a user presses "Enter" to move to
    the next line. Other characters, such as the carriage return, are left
    untouched by this function. Line feeds are deleted, not replaced with a
    space.

    Return the string with line feeds removed, or None when the field is
    missing.
    """

    # Nothing to fix when the value is missing
    if pd.isna(field):
        return

    # No Unix line feed (LF) present: hand the value back unchanged
    if re.search(r"\n", field) is None:
        return field

    print(f"Removing newline: {field}")

    return field.replace("\n", "")


def comma_space(field, field_name):
    """Insert the missing space after a comma, for example:

    Orth,Alan S.

    becomes "Orth, Alan S.". This is a very common mistake in author and
    citation fields. Every comma immediately followed by a word character is
    affected; field_name is only used in the printed message.

    Return the string with spaces added, or None when the field is missing.
    """

    # Nothing to fix when the value is missing
    if pd.isna(field):
        return

    # Substitute in one pass and use the replacement count to decide whether
    # anything needs to be reported
    spaced, count = re.subn(r",(\w)", r", \1", field)

    if count:
        print(f"Adding space after comma ({field_name}): {field}")
        field = spaced

    return field
Refactor as package with subpackages This makes it cleaner for introducing checks, fixes, tests, docs, and tests in the future. Currently can be run like this: python -m csv_metadata_quality CSV input and output paths are still hard coded. See: https://dev.to/codemouse92/dead-simple-python-project-structure-and-imports-38c6 2019-07-26 21:11:10 +02:00			`import re`

Sort imports with isort See: https://sourcery.ai/blog/python-best-practices/ 2019-08-29 00:15:04 +02:00			`import pandas as pd`

Fix whitespace errors found by flake8 2019-07-28 16:47:28 +02:00
Refactor as package with subpackages This makes it cleaner for introducing checks, fixes, tests, docs, and tests in the future. Currently can be run like this: python -m csv_metadata_quality CSV input and output paths are still hard coded. See: https://dev.to/codemouse92/dead-simple-python-project-structure-and-imports-38c6 2019-07-26 21:11:10 +02:00			`def whitespace(field):`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00			`"""Fix whitespace issues.`

			`Return string with leading, trailing, and consecutive whitespace trimmed.`
			`"""`

fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Skip fields with missing values`
			`if pd.isna(field):`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00			`return`

fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Initialize an empty list to hold the cleaned values`
			`values = list()`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Try to split multi-value field on "\|\|" separator`
Format with black 2019-08-29 00:10:39 +02:00			`for value in field.split("\|\|"):`
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Strip leading and trailing whitespace`
			`value = value.strip()`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Replace excessive whitespace (>2) with one space`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\s{2,}")`
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`match = re.findall(pattern, value)`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
Test Python regular expression matches directly Match objects always have a boolean value of True. See: https://docs.python.org/3.7/library/re.html 2019-07-29 15:16:30 +02:00			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Excessive whitespace: {value}")`
			`value = re.sub(pattern, " ", value)`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Save cleaned value`
			`values.append(value)`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`# Create a new field consisting of all values joined with "\|\|"`
Format with black 2019-08-29 00:10:39 +02:00			`new_field = "\|\|".join(values)`
Add fix.py Initial working version of metadata cleaning script that fixes lea- ding and trailing whitespace (even in DSpace multi-value fields). 2019-07-26 18:08:28 +02:00
fix.py: Massive improvements Use Python's str.strip() instead of kludgy regular expressions and use split/join to handle multi-value fields more cleanly. 2019-07-26 18:31:55 +02:00			`return new_field`
Add "unsafe fixes" runtime option In this case it fixes occurrences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00

			`def separators(field):`
			`"""Fix for invalid multi-value separators (ie "\|")."""`

			`# Skip fields with missing values`
			`if pd.isna(field):`
			`return`

			`# Initialize an empty list to hold the cleaned values`
			`values = list()`

			`# Try to split multi-value field on "\|\|" separator`
Format with black 2019-08-29 00:10:39 +02:00			`for value in field.split("\|\|"):`
Add "unsafe fixes" runtime option In this case it fixes occurences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00			`# After splitting, see if there are any remaining "\|" characters`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\\|")`
Add "unsafe fixes" runtime option In this case it fixes occurences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00			`match = re.findall(pattern, value)`

Test Python regular expression matches directly Match objects always have a boolean value of True. See: https://docs.python.org/3.7/library/re.html 2019-07-29 15:16:30 +02:00			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Fixing invalid multi-value separator: {value}")`
Add "unsafe fixes" runtime option In this case it fixes occurences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00
Format with black 2019-08-29 00:10:39 +02:00			`value = re.sub(pattern, "\|\|", value)`
Add "unsafe fixes" runtime option In this case it fixes occurences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00
			`# Save cleaned value`
			`values.append(value)`

			`# Create a new field consisting of all values joined with "\|\|"`
Format with black 2019-08-29 00:10:39 +02:00			`new_field = "\|\|".join(values)`
Add "unsafe fixes" runtime option In this case it fixes occurences of invalid multi-value separators. DSpace uses "\|\|" to separate multiple values in one field, but our editors sometimes give us files with mistakes like "\|". We can fix these to be correct multi-value separators if we are sure that the metadata is not actually using "\|" for some legitimate purpose. 2019-07-28 21:53:39 +02:00
			`return new_field`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00

			`def unnecessary_unicode(field):`
csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. 2019-08-10 23:07:21 +02:00			`"""Remove and replace unnecessary Unicode characters.`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00
			`Removes unnecessary Unicode characters like:`
			`- Zero-width space (U+200B)`
			`- Replacement character (U+FFFD)`
			`- No-break space (U+00A0)`

csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. 2019-08-10 23:07:21 +02:00			`Replaces unnecessary Unicode characters like:`
			`- Soft hyphen (U+00AD) → hyphen`

			`Return string with characters removed or replaced.`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00			`"""`

			`# Skip fields with missing values`
			`if pd.isna(field):`
			`return`

			`# Check for zero-width space characters (U+200B)`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\u200B")`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00			`match = re.findall(pattern, field)`

			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Removing unnecessary Unicode (U+200B): {field}")`
			`field = re.sub(pattern, "", field)`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00
			`# Check for replacement characters (U+FFFD)`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\uFFFD")`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00			`match = re.findall(pattern, field)`

			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Removing unnecessary Unicode (U+FFFD): {field}")`
			`field = re.sub(pattern, "", field)`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00
			`# Check for no-break spaces (U+00A0)`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\u00A0")`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00			`match = re.findall(pattern, field)`

			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Removing unnecessary Unicode (U+00A0): {field}")`
			`field = re.sub(pattern, "", field)`
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00
csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. 2019-08-10 23:07:21 +02:00			`# Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen`
Format with black 2019-08-29 00:10:39 +02:00			`pattern = re.compile(r"\u002D*?\u00AD")`
csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. 2019-08-10 23:07:21 +02:00			`match = re.findall(pattern, field)`

			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Replacing unnecessary Unicode (U+00AD): {field}")`
			`field = re.sub(pattern, "-", field)`
csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail. 2019-08-10 23:07:21 +02:00
Add support for fixing "unnecessary" Unicode These are things like non-breaking spaces, "replacement" characters, etc that add nothing to the metadata and often cause errors during parsing or displaying in a UI. 2019-07-29 15:38:10 +02:00			`return field`
Add fix for duplicate metadata values 2019-07-29 17:05:03 +02:00

			`def duplicates(field):`
			`"""Remove duplicate metadata values."""`

			`# Skip fields with missing values`
			`if pd.isna(field):`
			`return`

			`# Try to split multi-value field on "\|\|" separator`
Format with black 2019-08-29 00:10:39 +02:00			`values = field.split("\|\|")`
Add fix for duplicate metadata values 2019-07-29 17:05:03 +02:00
			`# Initialize an empty list to hold the de-duplicated values`
			`new_values = list()`

			`# Iterate over all values`
			`for value in values:`
			`# Check if each value exists in our list of values already`
			`if value not in new_values:`
			`new_values.append(value)`
			`else:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Dropping duplicate value: {value}")`
Add fix for duplicate metadata values 2019-07-29 17:05:03 +02:00
			`# Create a new field consisting of all values joined with "\|\|"`
Format with black 2019-08-29 00:10:39 +02:00			`new_field = "\|\|".join(new_values)`
Add fix for duplicate metadata values 2019-07-29 17:05:03 +02:00
			`return new_field`
Add support for removing newlines This was tricky because of the nature of newlines. In actuality we are removing Unix line feeds here (U+000A) because Windows carriage returns are actually already removed by the string stripping in the whitespace fix. Creating the test case in Vim was difficult because I couldn't figure out how to manually enter a line feed character. In the end I used a search and replace on a known pattern like "ALAN", replacing it with \r. Neither entering the Unicode code point (U+000A) directly or typing an "Enter" character after ^V worked. Grrr. 2019-07-30 19:05:12 +02:00

			`def newlines(field):`
			`"""Fix newlines.`

			`Single metadata values should not span multiple lines because this is not`
			`rendered properly in DSpace's XMLUI and even causes issues during import.`

			`Implementation note: this currently only detects Unix line feeds (0x0a).`
			`This is essentially when a user presses "Enter" to move to the next line.`
			`Other newlines like the Windows carriage return are already handled with`
			`the string stipping performed in the whitespace fixes.`

			`Confusingly, in Vim '\n' matches a line feed when searching, but you must`
			`use '\r' to insert a line feed, ie in a search and replace expression.`

			`Return string with newlines removed.`
			`"""`

			`# Skip fields with missing values`
			`if pd.isna(field):`
			`return`

			`# Check for Unix line feed (LF)`
Format with black 2019-08-29 00:10:39 +02:00			`match = re.findall(r"\n", field)`
Add support for removing newlines This was tricky because of the nature of newlines. In actuality we are removing Unix line feeds here (U+000A) because Windows carriage returns are actually already removed by the string stripping in the whitespace fix. Creating the test case in Vim was difficult because I couldn't fig- ure out how to manually enter a line feed character. In the end I used a search and replace on a known pattern like "ALAN", replacing it with \r. Neither entering the Unicode code point (U+000A) direc- tly or typing an "Enter" character after ^V worked. Grrr. 2019-07-30 19:05:12 +02:00
			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Removing newline: {field}")`
			`field = field.replace("\n", "")`
Add support for removing newlines This was tricky because of the nature of newlines. In actuality we are removing Unix line feeds here (U+000A) because Windows carriage returns are actually already removed by the string stripping in the whitespace fix. Creating the test case in Vim was difficult because I couldn't fig- ure out how to manually enter a line feed character. In the end I used a search and replace on a known pattern like "ALAN", replacing it with \r. Neither entering the Unicode code point (U+000A) direc- tly or typing an "Enter" character after ^V worked. Grrr. 2019-07-30 19:05:12 +02:00
			`return field`
Add fix for missing space after commas This happens in names very often, for example in the contributor and citation fields. I will limit this to those fields for now and hide this fix behind the "unsafe fixes" option until I test it more. 2019-08-27 23:05:52 +02:00

			`def comma_space(field, field_name):`
			`"""Fix occurrences of commas missing a trailing space, for example:`

			`Orth,Alan S.`

			`This is a very common mistake in author and citation fields.`

			`Return string with a space added.`
			`"""`

			`# Skip fields with missing values`
			`if pd.isna(field):`
			`return`

			`# Check for comma followed by a word character`
Format with black 2019-08-29 00:10:39 +02:00			`match = re.findall(r",\w", field)`
Add fix for missing space after commas This happens in names very often, for example in the contributor and citation fields. I will limit this to those fields for now and hide this fix behind the "unsafe fixes" option until I test it more. 2019-08-27 23:05:52 +02:00
			`if match:`
Format with black 2019-08-29 00:10:39 +02:00			`print(f"Adding space after comma ({field_name}): {field}")`
			`field = re.sub(r",(\w)", r", \1", field)`
Add fix for missing space after commas This happens in names very often, for example in the contributor and citation fields. I will limit this to those fields for now and hide this fix behind the "unsafe fixes" option until I test it more. 2019-08-27 23:05:52 +02:00
			`return field`