csv-metadata-quality/csv_metadata_quality/check.py

# SPDX-License-Identifier: GPL-3.0-only

import os
import re
from datetime import datetime, timedelta

import pandas as pd
import requests
import requests_cache
import spdx_license_list
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn
from stdnum import issn as stdnum_issn

from csv_metadata_quality.util import is_mojibake


def issn(field):
    """Validate every ISSN in a (possibly multi-value) field.

    The field is split on the "||" separator and each resulting value is
    checked with python-stdnum. Each value that fails validation is printed.

    stdnum's is_valid() function never raises an exception, so no error
    handling is needed around it.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
    """

    # Nothing to validate when the cell is empty
    if pd.isna(field):
        return

    # Lazily select the values that stdnum rejects so each one is reported
    # as soon as it is found
    rejected = (
        candidate
        for candidate in field.split("||")
        if not stdnum_issn.is_valid(candidate)
    )

    for candidate in rejected:
        print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{candidate}")


def isbn(field):
    """Validate every ISBN in a (possibly multi-value) field.

    The field is split on the "||" separator and each resulting value is
    checked with python-stdnum. Each value that fails validation is printed.

    stdnum's is_valid() function never raises an exception, so no error
    handling is needed around it.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
    """

    # Nothing to validate when the cell is empty
    if pd.isna(field):
        return

    # Lazily select the values that stdnum rejects so each one is reported
    # as soon as it is found
    rejected = (
        candidate
        for candidate in field.split("||")
        if not stdnum_isbn.is_valid(candidate)
    )

    for candidate in rejected:
        print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{candidate}")


def date(field, field_name):
    """Check that a date field holds exactly one well-formed date.

    Accepted formats, tried in this order, are YYYY, YYYY-MM, YYYY-MM-DD and
    YYYY-MM-DDTHH:MM:SSZ. A missing value, a multi-value field (containing
    the "||" separator) or a value matching none of the formats is reported
    by printing a message that includes the field name.
    """

    if pd.isna(field):
        print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}")

        return

    # Splitting on the multi-value separator yields a single element when
    # only one date is present; anything else means several dates were given
    if len(field.split("||")) != 1:
        print(
            f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
        )

        return

    accepted_formats = ("%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")

    for date_format in accepted_formats:
        try:
            datetime.strptime(field, date_format)
        except ValueError:
            # Not this format, move on to the next candidate
            continue

        # The value parsed cleanly, so the date is valid
        return

    # None of the accepted formats matched
    print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")


def suspicious_characters(field, field_name):
    """Warn about standalone accent-like characters in a value.

    Such characters can indicate encoding or copy/paste errors in languages
    with accents. For example: foreˆt should be forêt.

    For each suspicious character present, prints a preview of the value
    starting at the first occurrence of that character.
    """

    # Nothing to inspect when the cell is empty
    if pd.isna(field):
        return

    # Acute accent, modifier circumflex, tilde and grave accent
    markers = ("\u00B4", "\u02C6", "\u007E", "\u0060")

    for marker in markers:
        # partition() splits around the first occurrence; the middle element
        # is empty when the marker does not occur at all
        _, found, remainder = field.partition(marker)

        if not found:
            continue

        # Preview begins at the suspicious character itself
        preview = found + remainder

        # The ".80" precision truncates the whole message (colour codes
        # included) so it does not wrap in an 80 column terminal.
        warning = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{preview}"
        print(f"{warning:1.80}")


def language(field):
    """Check language codes against ISO 639-1 (alpha 2) and ISO 639-3 (alpha 3).

    The field is split on the "||" separator. Two-character values are looked
    up as alpha 2 codes, three-character values as alpha 3 codes, and values
    of any other length are rejected outright. Each invalid value is printed.
    """

    # Nothing to validate when the cell is empty
    if pd.isna(field):
        return

    # TODO: values such as "Other" are not handled yet

    for code in field.split("||"):
        code_length = len(code)

        if code_length == 2:
            # Two characters: must be a known ISO 639-1 code
            if languages.get(alpha_2=code):
                continue

            print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{code}")
        elif code_length == 3:
            # Three characters: must be a known ISO 639-3 code
            if languages.get(alpha_3=code):
                continue

            print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{code}")
        else:
            # Any other length cannot be one of the supported code types
            print(f"{Fore.RED}Invalid language: {Fore.RESET}{code}")


def agrovoc(field, field_name):
    """Check subject terms against the AGROVOC REST API.

    The field name is taken alongside the field because several fields can
    be validated against AGROVOC and the message should tell the user which
    one holds the invalid term.

    Logic copied from agrovoc-lookup.py.

    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

    Each term of the "||" separated field is searched for; a term whose
    successful search response contains no results is printed as invalid.
    Responses with a non-OK status code are ignored.
    """

    # Nothing to look up when the cell is empty
    if pd.isna(field):
        return

    # The cache location can be overridden in case the current working
    # directory is not writable (for example from csv-metadata-quality-web).
    cache_dir = os.environ.get("REQUESTS_CACHE_DIR", ".")

    # Transparently cache responses, expiring them after thirty days
    requests_cache.install_cache(
        f"{cache_dir}/agrovoc-response-cache", expire_after=timedelta(days=30)
    )

    # Drop cache entries that have already expired
    requests_cache.core.remove_expired_responses()

    search_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"

    for term in field.split("||"):
        response = requests.get(search_url, params={"query": term})

        # Only a successful response can tell us anything about the term
        if response.status_code != requests.codes.ok:
            continue

        # An empty result list means AGROVOC does not know the term
        if len(response.json()["results"]) == 0:
            print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{term}")


def filename_extension(field):
    """Warn about filenames with uncommon extensions.

    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
    tool, which creates a Simple Archive Format bundle for importing metadata
    with accompanying PDFs or other files into DSpace.

    Each value of the "||" separated field that does not end in one of
    .pdf, .doc(x), .ppt(x) or .xls(x) (case insensitive) is printed.
    """

    # Nothing to check when the cell is empty
    if pd.isna(field):
        return

    expected_extensions = (
        ".pdf",
        ".doc",
        ".docx",
        ".ppt",
        ".pptx",
        ".xls",
        ".xlsx",
    )

    for name in field.split("||"):
        # any() stops at the first extension found at the end of the name
        has_common_extension = any(
            re.search(re.escape(extension) + r"$", name, re.IGNORECASE)
            for extension in expected_extensions
        )

        if not has_common_extension:
            print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{name}")


def spdx_license_identifier(field):
    """Check that each license value is a known SPDX identifier.

    The field is split on the "||" separator and every value that is not
    present in the SPDX license list is printed.
    """

    # Nothing to check when the cell is empty
    if pd.isna(field):
        return

    for identifier in field.split("||"):
        # Known identifiers need no further attention
        if identifier in spdx_license_list.LICENSES:
            continue

        print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{identifier}")


def duplicate_items(df):
    """Attempt to identify duplicate items.

    First the total number of titles is compared with the number of unique
    titles. Only if there are fewer unique titles than total titles is every
    row examined: a key made of the item's title, type and date is built and
    compared with the keys seen so far. A repeated key is a very good
    indicator that the items are duplicates, and the title is printed.

    If the title, type or date column cannot be found in the DataFrame the
    check is skipped with a message rather than raising IndexError.

    Args:
        df: pandas DataFrame holding the metadata, one item per row.

    Returns:
        None. Findings are printed.
    """

    # Column names may carry a qualifier, e.g. "dcterms.title[en_US]", so we
    # filter the columns by pattern and take the first match of each.
    #
    # NOTE(review): the date pattern accepts dc.date.accessioned rather than
    # dc.date.issued although the check is described in terms of the date
    # issued; confirm this is intended.
    column_patterns = (
        ("title", r"dcterms\.title|dc\.title"),
        ("type", r"dcterms\.type|dc\.type"),
        ("date", r"dcterms\.issued|dc\.date\.accessioned"),
    )

    column_names = {}

    for label, pattern in column_patterns:
        matching_columns = df.filter(regex=pattern).columns

        # Without all three columns no meaningful key can be built
        if len(matching_columns) == 0:
            print(
                f"{Fore.YELLOW}Skipping duplicate check, no {label} column found.{Fore.RESET}"
            )

            return

        column_names[label] = matching_columns[0]

    title_column_name = column_names["title"]
    titles = df[title_column_name]

    # Cheap pre-check: when every (non-missing) title is unique there can be
    # no duplicates, so the row-by-row comparison is unnecessary.
    if titles.nunique() >= titles.count():
        return

    # Keys already encountered; a set keeps each membership test O(1)
    seen_keys = set()

    rows = zip(titles, df[column_names["type"]], df[column_names["date"]])

    for title, item_type, item_date in rows:
        # Stringify so that missing values compare equal to each other, and
        # keep the parts separate so adjacent values cannot run together.
        key = (str(title), str(item_type), str(item_date))

        if key in seen_keys:
            print(
                f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{title}"
            )
        else:
            seen_keys.add(key)


def mojibake(field, field_name):
    """Check for mojibake (text that was encoded in one encoding and decoded
    in another, perhaps multiple times). See util.py.

    Prints the value together with its field name when is_mojibake() flags
    it; does nothing for missing values.
    """

    # Missing values and clean strings both need no report
    if pd.isna(field) or not is_mojibake(field):
        return

    print(f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}")
-												Add SPDX short license identifier to all Python files

See: https://spdx.github.io/spdx-spec/appendix-V-using-SPDX-short-identifiers-in-source-files/

											
										
										
											2021-03-19 15:04:13 +01:00
+								# SPDX-License-Identifier: GPL-3.0-only
-												csv_metadata_quality/check.py: requests cache

Allow overriding the directory for the requests cache. In the case
of csv-metadata-quality-web, which currently runs on Google's App
Engine, we can only write to /tmp.

											
										
										
											2021-03-14 08:07:35 +01:00
+								import os
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								import re
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								from datetime import datetime, timedelta
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								import pandas as pd
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								import requests
 								import requests_cache
-												Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.

											
										
										
											2021-03-11 09:33:16 +01:00
+								import spdx_license_list
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								from colorama import Fore
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								from pycountry import languages
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								from stdnum import isbn as stdnum_isbn
 								from stdnum import issn as stdnum_issn
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												Add checks and unsafe fixes for mojibake

This detects whether text has likely been encoded in one encoding
and decoded in another, perhaps multiple times. This often results
in display of "mojibake" characters.

For example, a file encoded in UTF-8 is opened as CP-1252 (Windows
Latin codepage) in Microsoft Excel, and saved again as UTF-8. You
will see strings like this in the resulting file:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

The correct version of these in UTF-8 would be:

    - CIAT Publicaçao
    - CIAT Publicación

I use a code snippet from Martijn Pieters on StackOverflow to de-
tect whether a string is "weird" as determined by the excellent
"fixes text for you" (ftfy) Python library, then check if a weird
string encodes as CP-1252 or not. If so, I can try to fix it.

See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

											
										
										
											2021-03-19 09:22:21 +01:00
+								from csv_metadata_quality.util import is_mojibake
-												Fix whitespace errors found by flake8

											
										
										
											2019-07-28 16:47:28 +02:00
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								def issn(field):
 								    """Check if an ISSN is valid.
 								    Prints the ISSN if invalid.
 								    stdnum's is_valid() function never raises an exception.
 								    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								        if not stdnum_issn.is_valid(value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												csv_metadata_quality/check.py: Always return field

We always need to return the field back so apply doesn't set it to
null when creating the new data frame.

											
										
										
											2019-07-27 00:28:08 +02:00
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
 								def isbn(field):
 								    """Check if an ISBN is valid.
 								    Prints the ISBN if invalid.
 								    stdnum's is_valid() function never raises an exception.
 								    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
 								    """
-												csv_metadata_quality/check.py: Add check for missing isbn values

											
										
										
											2019-07-26 22:44:58 +02:00
+								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								        if not stdnum_isbn.is_valid(value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
-												Add check for invalid multi-value separators

											
										
										
											2019-07-26 22:48:24 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												csv_metadata_quality/check.py: Always return field

We always need to return the field back so apply doesn't set it to
null when creating the new data frame.

											
										
										
											2019-07-27 00:28:08 +02:00
-												Add check for invalid multi-value separators

											
										
										
											2019-07-26 22:48:24 +02:00
-												Add column name to output in date checks

This makes it easier to understand where the error is in case a CSV
has multiple date fields, for example:

    Missing date (dc.date.issued).
    Missing date (dc.date.issued[]).

If you have 126 items and you get 126 "Missing date" messages then
it's likely that 100 of the items have dates in one field, and the
others have dates in other field.

											
										
										
											2019-08-21 14:31:12 +02:00
+								def date(field, field_name):
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    """Check if a date is valid.
 								    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
 								    could technically even include time as long as it is ISO8601.
 								    Also checks for other invalid cases like missing and multiple dates.
 								    Prints the date if invalid.
 								    """
 								    if pd.isna(field):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								        return
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    multiple_dates = field.split("||")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								    # We don't allow multi-value date fields
 								    if len(multiple_dates) > 1:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(
 								            f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
 								        )
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								    try:
 								        # Check if date is valid YYYY format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y-%m")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM-DD format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y-%m-%d")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												csv_metadata_quality/check.py: More date formats

We should also allow ISO 8601 extended in combined date and time
format. DSpace does not have a problem with dates in this format
and I have found some metadata that uses this date format.

For example: 2020-08-31T11:04:56Z

See: https://en.wikipedia.org/wiki/ISO_8601

											
										
										
											2021-02-04 20:39:14 +01:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
 								        datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actuall
y need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a v
ariable returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												csv_metadata_quality/check.py: Return date even if it is invalid

Otherwise it is missing from the final CSV and then we can't even
fix it. :)

											
										
										
											2019-07-29 16:40:14 +02:00
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Improve suspicious character detection

Now it will print just the part of the metadata value that contains
the suspicious character (up to 80 characters, so we don't make the
line break on terminals that use 80 character width by default).

Also, print the name of the field in which the metadata value is so
that it is easier for the user to locate.

											
										
										
											2019-08-09 00:22:59 +02:00
+								def suspicious_characters(field, field_name):
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
+								    """Warn about suspicious characters.
 								    Look for standalone characters that could indicate encoding or copy/paste
 								    errors for languages with accents. For example: foreˆt should be forêt.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # List of suspicious characters, for example:  ́ˆ~`
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
 								    for character in suspicious_characters:
-												Improve suspicious character detection

Now it will print just the part of the metadata value that contains
the suspicious character (up to 80 characters, so we don't make the
line break on terminals that use 80 character width by default).

Also, print the name of the field in which the metadata value is so
that it is easier for the user to locate.

											
										
										
											2019-08-09 00:22:59 +02:00
+								        # Find the position of the suspicious character in the string
 								        suspicious_character_position = field.find(character)
 								        # Python returns -1 if there is no match
 								        if suspicious_character_position != -1:
 								            # Create a temporary new string starting from the position of the
 								            # suspicious character
 								            field_subset = field[suspicious_character_position:]
 								            # Print part of the metadata value starting from the suspicious
 								            # character and spanning enough of the rest to give a preview,
 								            # but not too much to cause the line to break in terminals with
 								            # a default of 80 characters width.
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								            print(f"{suspicious_character_msg:1.80}")
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								def language(field):
-												More ISO 639-1 and ISO 639-3 fixes

ISO 639-1 uses two-letter codes and ISO 639-3 uses three-letter codes.
Technically there are ISO 639-2/T and ISO 639-2/B, which also use
three-letter codes, but those are not supported by the pycountry
library, so I won't even worry about them.

See: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

											
										
										
											2019-09-26 06:44:39 +02:00
+								    """Check if a language is valid ISO 639-1 (alpha 2) or ISO 639-3 (alpha 3).
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								    Prints the value if it is invalid.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # need to handle "Other" values here...
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								        # After splitting, check if language value is 2 or 3 characters so we
-												More ISO 639-1 and ISO 639-3 fixes

ISO 639-1 uses two-letter codes and ISO 639-3 uses three-letter codes.
Technically there are ISO 639-2/T and ISO 639-2/B, which also use
three-letter codes, but those are not supported by the pycountry
library, so I won't even worry about them.

See: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

											
										
										
											2019-09-26 06:44:39 +02:00
+								        # can check it against ISO 639-1 or ISO 639-3 accordingly.
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
+								        if len(value) == 2:
-												Use pycountry instead of iso-639 for languages

The latter is a fork that hasn't been updated since 2016 and the
original still seems to be well maintained, with recent database
updates as well as tests for Python 3.7.

Also, pycountry supports ISO 3166-2 (administrative zones), which
we could eventually use for sub regions.

											
										
										
											2019-07-30 15:39:26 +02:00
+								            if not languages.get(alpha_2=value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								                print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								                pass
 								        elif len(value) == 3:
-												Use pycountry instead of iso-639 for languages

The latter is a fork that hasn't been updated since 2016 and the
original still seems to be well maintained, with recent database
updates as well as tests for Python 3.7.

Also, pycountry supports ISO 3166-2 (administrative zones), which
we could eventually use for sub regions.

											
										
										
											2019-07-30 15:39:26 +02:00
+								            if not languages.get(alpha_3=value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								                print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								                pass
 								        else:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
-												Rework AGROVOC validation

AGROVOC validation is now disabled by default, but can be enabled
on a field-by-field basis. For example, countries and regions are
also present in AGROVOC. Fields with these values can be enabled
using the new `--agrovoc-fields` option.

I reworked the script output to show the field name when printing
an invalid term so that the user knows in which field the term is.

											
										
										
											2019-08-01 22:51:58 +02:00
+								def agrovoc(field, field_name):
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    """Check subject terms against AGROVOC REST API.
-												Rework AGROVOC validation

AGROVOC validation is now disabled by default, but can be enabled
on a field-by-field basis. For example, countries and regions are
also present in AGROVOC. Fields with these values can be enabled
using the new `--agrovoc-fields` option.

I reworked the script output to show the field name when printing
an invalid term so that the user knows in which field the term is.

											
										
										
											2019-08-01 22:51:58 +02:00
+								    Function constructor expects the field as well as the field name because
 								    many fields can now be validated against AGROVOC and we want to be able
 								    to inform the user in which field the invalid term is.
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    Logic copied from agrovoc-lookup.py.
 								    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
 								    Prints a warning if the value is invalid.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
-												csv_metadata_quality/check.py: Prune requests cache once

We only need to prune the requests cache once before using it, not
for every value we check.

											
										
										
											2020-07-06 12:41:51 +02:00
+								    # enable transparent request cache with thirty days expiry
 								    expire_after = timedelta(days=30)
-												csv_metadata_quality/check.py: requests cache

Allow overriding the directory for the requests cache. In the case
of csv-metadata-quality-web, which currently runs on Google's App
Engine, we can only write to /tmp.

											
										
										
											2021-03-14 08:07:35 +01:00
+								    # Allow overriding the location of the requests cache, just in case we are
 								    # running in an environment where we can't write to the current working di-
 								    # rectory (for example from csv-metadata-quality-web).
 								    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
-												csv_metadata_quality/check.py: Reformat with black

											
										
										
											2021-03-16 15:12:33 +01:00
+								    requests_cache.install_cache(
 								        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
 								    )
-												csv_metadata_quality/check.py: Prune requests cache once

We only need to prune the requests cache once before using it, not
for every value we check.

											
										
										
											2020-07-06 12:41:51 +02:00
 								    # prune old cache entries
 								    requests_cache.core.remove_expired_responses()
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												csv_metadata_quality/check.py: Parameterize AGROVOC request

											
										
										
											2020-07-06 12:44:46 +02:00
+								        request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
 								        request_params = {"query": value}
-												csv_metadata_quality/check.py: Simplify AGROVOC check

I recycled this code from a separate agrovoc-lookup.py script that
checks lines in a text file to see if they are valid AGROVOC terms
or not. There I was concerned about skipping comments or something
I think, but we don't need to check that here. We simply check the
term that is in the field and inform the user if it's valid or not.

											
										
										
											2019-08-21 15:35:29 +02:00
-												csv_metadata_quality/check.py: Parameterize AGROVOC request

											
										
										
											2020-07-06 12:44:46 +02:00
+								        request = requests.get(request_url, params=request_params)
-												csv_metadata_quality/check.py: Simplify AGROVOC check

I recycled this code from a separate agrovoc-lookup.py script that
checks lines in a text file to see if they are valid AGROVOC terms
or not. There I was concerned about skipping comments or something
I think, but we don't need to check that here. We simply check the
term that is in the field and inform the user if it's valid or not.

											
										
										
											2019-08-21 15:35:29 +02:00
 								        if request.status_code == requests.codes.ok:
 								            data = request.json()
 								            # check if there are any results
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								            if len(data["results"]) == 0:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								                print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
 								def filename_extension(field):
 								    """Check filename extension.
 								    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
 								    tool, which creates a Simple Archive Format bundle for importing metadata
 								    with accompanying PDFs or other files into DSpace.
 								    This check warns if a filename has an uncommon extension (that is, other
 								    than .pdf, .xls(x), .doc(x), ppt(x), case insensitive).
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    values = field.split("||")
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
 								    # List of common filename extensions
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    common_filename_extensions = [
 								        ".pdf",
 								        ".doc",
 								        ".docx",
 								        ".ppt",
 								        ".pptx",
 								        ".xls",
 								        ".xlsx",
 								    ]
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
 								    # Iterate over all values
 								    for value in values:
 								        # Assume filename extension does not match
 								        filename_extension_match = False
 								        for filename_extension in common_filename_extensions:
 								            # Check for extension at the end of the filename
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								            pattern = re.escape(filename_extension) + r"$"
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
+								            match = re.search(pattern, value, re.IGNORECASE)
 								            if match is not None:
 								                # Register the match and stop checking for this filename
 								                filename_extension_match = True
 								                break
-												csv_metadata_quality/check.py: Fix test for False

											
										
										
											2019-08-10 22:52:53 +02:00
+								        if filename_extension_match is False:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.

											
										
										
											2021-03-11 09:33:16 +01:00
 								def spdx_license_identifier(field):
 								    """Warn about license values that are not SPDX identifiers.
 								    A multi-value field is split on the "||" separator and every part is
 								    looked up in spdx_license_list.LICENSES. Missing values are skipped.
 								    Prints each part that is not found there.
 								    """
 								    # Only fields that actually hold a value are inspected
 								    if not pd.isna(field):
 								        for identifier in field.split("||"):
 								            # Known SPDX identifiers need no warning
 								            if identifier in spdx_license_list.LICENSES:
 								                continue
 								            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{identifier}")
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when
determining if an item already exists in the data set.

											
										
										
											2021-03-17 08:53:07 +01:00
 								def duplicate_items(df):
 								    """Attempt to identify duplicate items.
 								    First we compare the number of titles with the number of unique titles.
 								    Only if some title occurs more than once do we build a key (of sorts) for
 								    each item from its title, type, and date, and look for repeated keys. An
 								    item whose key was already seen on an earlier row is reported as a
 								    possible duplicate.
 								    Parameters:
 								        df: pandas DataFrame with one row per item. It must have a column
 								            matching dcterms.title or dc.title and a column matching
 								            dcterms.issued or dc.date.accessioned; an IndexError is raised
 								            otherwise. A column matching dcterms.type or dc.type is used
 								            for the key when present.
 								    Returns None. Possible duplicates are printed, one line per item.
 								    """
 								    # Extract the names of the title, type, and date columns so we can
 								    # reference them later. First we filter columns by likely patterns, then
 								    # we extract the name from the first item of the resulting object, ie:
 								    #
 								    #   Index(['dcterms.title[en_US]'], dtype='object')
 								    #
 								    title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
 								    date_column_name = df.filter(
 								        regex=r"dcterms\.issued|dc\.date\.accessioned"
 								    ).columns[0]
 								    # The type column used to be looked up with the title pattern, which made
 								    # the key title + title + date and reported items that differ only in
 								    # their type as duplicates. Look up the real type column instead. When the
 								    # data has no type column, leave the type out of the key so that input
 								    # which was accepted before still is.
 								    type_columns = df.filter(regex=r"dcterms\.type|dc\.type").columns
 								    type_column_name = type_columns[0] if len(type_columns) > 0 else None
 								    items_count_total = df[title_column_name].count()
 								    items_count_unique = df[title_column_name].nunique()
 								    if items_count_unique < items_count_total:
 								        # Keys seen so far; a set keeps every membership test constant time
 								        seen_items = set()
 								        for _, row in df.iterrows():
 								            item_type = "" if type_column_name is None else row[type_column_name]
 								            item_title_type_date = f"{row[title_column_name]}{item_type}{row[date_column_name]}"
 								            if item_title_type_date in seen_items:
 								                print(
 								                    f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}"
 								                )
 								            else:
 								                seen_items.add(item_title_type_date)
-												Add checks and unsafe fixes for mojibake

This detects whether text has likely been encoded in one encoding
and decoded in another, perhaps multiple times. This often results
in display of "mojibake" characters.

For example, a file encoded in UTF-8 is opened as CP-1252 (Windows
Latin codepage) in Microsoft Excel, and saved again as UTF-8. You
will see strings like this in the resulting file:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

The correct version of these in UTF-8 would be:

    - CIAT Publicaçao
    - CIAT Publicación

I use a code snippet from Martijn Pieters on StackOverflow to de-
tect whether a string is "weird" as determined by the excellent
"fixes text for you" (ftfy) Python library, then check if a weird
string encodes as CP-1252 or not. If so, I can try to fix it.

See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

											
										
										
											2021-03-19 09:22:21 +01:00
 								def mojibake(field, field_name):
 								    """Warn when a value looks like mojibake.
 								    Mojibake is text that was encoded in one encoding and decoded in another,
 								    perhaps multiple times. Detection is delegated to is_mojibake(), see
 								    util.py. Missing values are skipped.
 								    Prints the value, labelled with its field name, if mojibake is suspected.
 								    """
 								    # is_mojibake() is only consulted for fields that actually hold a value
 								    if not pd.isna(field) and is_mojibake(field):
 								        print(
 								            f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
 								        )
 								    return