# csv-metadata-quality/csv_metadata_quality/check.py

# SPDX-License-Identifier: GPL-3.0-only

import logging
import os
import re
from datetime import datetime, timedelta

import country_converter as coco
import pandas as pd
import requests
import requests_cache
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn
from stdnum import issn as stdnum_issn

from csv_metadata_quality.util import is_mojibake, load_spdx_licenses


def issn(field):
    """Validate every ISSN in a (possibly multi-value) field.

    Each value that python-stdnum does not accept as a valid ISSN is printed
    to the screen. Nothing is returned.

    stdnum's is_valid() function never raises an exception, so there is no
    error handling around it.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
    """

    # Nothing to validate when the value is missing
    if pd.isna(field):
        return

    # Multi-value fields use "||" as the separator. Collect the values that
    # fail validation first, then report them in their original order.
    invalid_values = [
        value for value in field.split("||") if not stdnum_issn.is_valid(value)
    ]

    for value in invalid_values:
        print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")

    return


def isbn(field):
    """Validate every ISBN in a (possibly multi-value) field.

    Each value that python-stdnum does not accept as a valid ISBN is printed
    to the screen. Nothing is returned.

    stdnum's is_valid() function never raises an exception, so there is no
    error handling around it.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
    """

    # Nothing to validate when the value is missing
    if pd.isna(field):
        return

    # Multi-value fields use "||" as the separator. Collect the values that
    # fail validation first, then report them in their original order.
    invalid_values = [
        value for value in field.split("||") if not stdnum_isbn.is_valid(value)
    ]

    for value in invalid_values:
        print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")

    return


def date(field, field_name):
    """Validate a date field and print a message if it is not acceptable.

    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
    could technically even include time as long as it is ISO8601. The formats
    accepted here are YYYY, YYYY-MM, YYYY-MM-DD and YYYY-MM-DDTHH:MM:SSZ.

    Missing dates and multi-value dates (separated by "||") are reported as
    well. The field name is included in every message so the user can tell
    which column the problem is in.
    """

    if pd.isna(field):
        print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}")

        return

    # Splitting on the "||" separator yields more than one part only when
    # several dates were supplied, which we don't allow.
    if len(field.split("||")) > 1:
        print(
            f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
        )

        return

    # Accepted formats, tried from least to most specific
    accepted_formats = ("%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")

    for date_format in accepted_formats:
        try:
            datetime.strptime(field, date_format)
        except ValueError:
            # Not this format, try the next one
            continue
        else:
            # The date parsed cleanly, so it is valid
            return

    # None of the accepted formats matched
    print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")

    return


def suspicious_characters(field, field_name):
    """Warn about characters that hint at encoding or copy/paste errors.

    Standalone accent characters often appear when accented text was mangled
    somewhere along the way. For example: foreˆt should be forêt.

    For each suspicious character found, prints a preview of the value that
    starts at the first occurrence of that character.
    """

    # Nothing to check when the value is missing
    if pd.isna(field):
        return

    # Acute accent, modifier circumflex, tilde and grave accent:  ́ˆ~`
    for character in ("\u00B4", "\u02C6", "\u007E", "\u0060"):
        if character not in field:
            continue

        # Everything from the first occurrence of the character onwards
        field_subset = field[field.index(character) :]

        # The ".80" precision in the format spec truncates the message to 80
        # characters so the preview doesn't wrap in terminals with the
        # default width.
        suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
        print(f"{suspicious_character_msg:1.80}")

    return


def language(field):
    """Validate language codes as ISO 639-1 (alpha 2) or ISO 639-3 (alpha 3).

    Every value in the (possibly multi-value) field that is not a known code
    of the matching length is printed. Values that are neither two nor three
    characters long are always reported as invalid.
    """

    # Nothing to check when the value is missing
    if pd.isna(field):
        return

    # need to handle "Other" values here...

    # Multi-value fields use "||" as the separator
    for value in field.split("||"):
        code_length = len(value)

        # Only two- and three-character codes can be looked up at all
        if code_length not in (2, 3):
            print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")

            continue

        # The length decides which ISO 639 part we look the code up in
        if code_length == 2 and not languages.get(alpha_2=value):
            print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}")
        elif code_length == 3 and not languages.get(alpha_3=value):
            print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}")

    return


def agrovoc(field, field_name, drop):
    """Check subject terms against AGROVOC REST API.

    Function constructor expects the field as well as the field name because
    many fields can now be validated against AGROVOC and we want to be able
    to inform the user in which field the invalid term is.

    Logic copied from agrovoc-lookup.py.

    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

    Prints a warning for each value with no AGROVOC match. If drop is true
    those values are left out of the result, otherwise they are kept.

    A value is only ever removed when the API positively reported no match.
    If the API answers with a non-OK HTTP status the value cannot be checked,
    so a warning is printed and the value is kept as-is.

    Returns None for missing fields, otherwise the remaining values joined
    with "||".
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # enable transparent request cache with thirty days expiry
    expire_after = timedelta(days=30)
    # Allow overriding the location of the requests cache, just in case we are
    # running in an environment where we can't write to the current working di-
    # rectory (for example from csv-metadata-quality-web).
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
    requests_cache.install_cache(
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
    )

    # prune old cache entries
    # requests_cache.remove_expired_responses()

    # The endpoint is the same for every value, only the query differs
    request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"

    # Values that will make up the returned field
    values = []

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        request = requests.get(request_url, params={"query": value})

        if request.status_code != requests.codes.ok:
            # We could not validate this value, which is not the same as it
            # being invalid. Keep it so an API error never loses metadata.
            print(
                f"{Fore.YELLOW}Could not validate AGROVOC ({field_name}): {Fore.RESET}{value}"
            )
            values.append(value)

            continue

        data = request.json()

        if data["results"]:
            # value is valid AGROVOC so save it
            values.append(value)
        elif drop:
            print(
                f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
            )
        else:
            print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")

            # value is invalid AGROVOC, but we are not dropping
            values.append(value)

    # Create a new field consisting of all values joined with "||"
    new_field = "||".join(values)

    return new_field


def filename_extension(field):
    """Warn about filenames with uncommon extensions.

    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
    tool, which creates a Simple Archive Format bundle for importing metadata
    with accompanying PDFs or other files into DSpace.

    A warning is printed for every filename whose extension is not one of
    .pdf, .xls(x), .doc(x) or .ppt(x), compared case insensitively.
    """

    # Nothing to check when the value is missing
    if pd.isna(field):
        return

    # Extensions we expect to see on uploaded files
    common_filename_extensions = (
        ".pdf",
        ".doc",
        ".docx",
        ".ppt",
        ".pptx",
        ".xls",
        ".xlsx",
    )

    # Multi-value fields use "||" as the separator
    for value in field.split("||"):
        # SAF Bundler descriptions are appended to the filename, for example:
        # Annual_Report_2020.pdf__description:Report. Keep only the part
        # before the first "__".
        if "__description" in value:
            value = value.partition("__")[0]

        # True as soon as one of the extensions is found at the end of the
        # filename
        filename_extension_match = any(
            re.search(re.escape(extension) + r"$", value, re.IGNORECASE) is not None
            for extension in common_filename_extensions
        )

        if not filename_extension_match:
            print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")

    return


def spdx_license_identifier(field):
    """Warn about license values that are not SPDX identifiers.

    Missing values are skipped, as are fields whose whole value is one of a
    few well-known non-SPDX license statements. Every other value in the
    (possibly multi-value) field is printed if it is not in the SPDX license
    data returned by load_spdx_licenses().
    """

    # Nothing to check when the value is missing
    if pd.isna(field):
        return

    # List of common non-SPDX licenses to ignore
    # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
    ignore_licenses = {
        "All rights reserved; no re-use allowed",
        "All rights reserved; self-archive copy only",
        "Copyrighted; Non-commercial educational use only",
        "Copyrighted; Non-commercial use only",
        "Copyrighted; all rights reserved",
        "Other",
    }

    # The comparison is against the whole field, before any splitting
    if field in ignore_licenses:
        return

    spdx_licenses = load_spdx_licenses()

    # Multi-value fields use "||" as the separator
    unknown_licenses = [
        value for value in field.split("||") if value not in spdx_licenses
    ]

    for value in unknown_licenses:
        print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return


def duplicate_items(df):
    """Attempt to identify duplicate items.

    First we check the total number of titles and compare it with the number of
    unique titles. If there are less unique titles than total titles we expand
    the search by creating a key (of sorts) for each item that includes their
    title, type, and date issued, and compare it with all the others. If there
    are multiple occurrences of the same title, type, date string then it's a
    very good indicator that the items are duplicates.

    Prints a warning with the title of every item whose key was already seen
    on an earlier row. Nothing is returned.

    Raises IndexError if the data frame has no title, type, or date column
    matching the patterns below.
    """

    # Extract the names of the title, type, and date issued columns so we can
    # reference them later. First we filter columns by likely patterns, then
    # we extract the name from the first item of the resulting object, ie:
    #
    #   Index(['dcterms.title[en_US]'], dtype='object')
    #
    # But, we need to consider that dc.title.alternative might come before the
    # main title in the CSV, so use a negative lookahead to eliminate that.
    #
    # See: https://regex101.com/r/elyXkW/1
    title_column_name = df.filter(
        regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
    ).columns[0]
    type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
    date_column_name = df.filter(
        regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
    ).columns[0]

    items_count_total = df[title_column_name].count()
    items_count_unique = df[title_column_name].nunique()

    # Only bother building keys when at least one title is repeated
    if items_count_unique < items_count_total:
        # Keys seen so far. A set keeps each membership test constant time,
        # whereas a list made the whole check quadratic in the number of rows.
        items = set()

        for _, row in df.iterrows():
            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"

            if item_title_type_date in items:
                print(
                    f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}"
                )
            else:
                items.add(item_title_type_date)


def mojibake(field, field_name):
    """Warn when a field appears to contain mojibake.

    Mojibake is text that was encoded in one encoding and decoded in another,
    perhaps multiple times. The detection itself lives in is_mojibake(), see
    util.py.

    Prints the string if it contains suspected mojibake.
    """

    # Missing values are skipped: the "and" short-circuits, so is_mojibake()
    # is only called for fields that actually have a value.
    if not pd.isna(field) and is_mojibake(field):
        print(
            f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
        )

    return


def citation_doi(row, exclude):
    """Warn when a citation mentions a DOI but the item has no DOI field.

    The check is skipped entirely when the user excluded any field with "doi"
    in its name, and for rows that have a value in such a field. Otherwise
    the row's citation (unless that field is excluded) is searched for either
    "doi: " or a doi.org URL, and a warning is printed if one is found.
    """
    doi_label_pattern = r"^.*?doi.*$"

    # If the user asked us to skip a DOI field there is nothing to do
    if any(re.match(doi_label_pattern, field) is not None for field in exclude):
        return

    # Stays empty unless the row has a citation we are allowed to look at
    citation = ""

    for label in row.index:
        # Fields without a value can't tell us anything
        if pd.isna(row[label]):
            continue

        # The item already has a DOI, so the citation doesn't matter
        if re.match(doi_label_pattern, label) is not None:
            return

        # Remember the citation, unless the user excluded that field
        is_citation_label = re.match(r"^.*?[cC]itation.*$", label) is not None
        if is_citation_label and label not in exclude:
            citation = row[label]

    if citation == "":
        return

    # A DOI written like "doi: 10.1186/1743-422X-9-218"
    doi_match1 = re.match(r"^.*?doi:\s.*$", citation)
    # A DOI written as a URL (doi.org, dx.doi.org, etc)
    doi_match2 = re.match(r"^.*?doi\.org.*$", citation)

    if not (doi_match1 is None and doi_match2 is None):
        print(
            f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation}"
        )

    return


def title_in_citation(row, exclude):
    """Check for the scenario where an item's title is missing from its cita-
    tion. This could mean that it is missing entirely, or perhaps just exists
    in a different format (whitespace, accents, etc).

    Only the main title is compared. Alternative titles (dc.title.alternative)
    are ignored, otherwise an alternative title column coming after the main
    title would replace it and cause a false warning.

    Fields listed in exclude are not used as the title or the citation.

    Function prints a warning if the title does not appear in the citation.
    """
    # Initialize these before the loop so that we can set them inside it and
    # still be able to access them afterwards.
    title = ""
    citation = ""

    # Iterate over the labels of the current row's values to find the title
    # and the citation. Then we check if the title is present in the citation.
    for label in row.axes[0]:
        # Skip fields with missing values
        if pd.isna(row[label]):
            continue

        # Find the title column. The negative lookahead keeps us from picking
        # up dc.title.alternative.
        match = re.match(r"^(dc|dcterms)\.title(?!\.alternative).*$", label)
        if match is not None and label not in exclude:
            title = row[label]

        # Find the citation column
        match = re.match(r"^.*?[cC]itation.*$", label)
        if match is not None and label not in exclude:
            citation = row[label]

    # Without a citation there is nothing to compare the title against
    if citation != "" and title not in citation:
        print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")

    return


def countries_match_regions(row, exclude):
    """Check for the scenario where an item has country coverage metadata, but
    does not have the corresponding region metadata. For example, an item that
    has country coverage "Kenya" should also have region "Eastern Africa" acc-
    ording to the UN M.49 classification scheme.

    See: https://unstats.un.org/unsd/methodology/m49/

    The check is skipped if the country, region, or title column is listed in
    exclude, if the row has no country or no region column, or if the row has
    no value for the country.

    Function prints a warning if the appropriate region is not present.
    """
    # Initialize these before the loop so that we can set them inside it and
    # still be able to access them afterwards.
    country_column_name = ""
    region_column_name = ""
    title_column_name = ""

    # Set logging to ERROR so country_converter's convert() doesn't print the
    # "not found in regex" warning message to the screen.
    logging.basicConfig(level=logging.ERROR)

    # Iterate over the labels of the current row's values to get the names of
    # the country, region, and title columns.
    for label in row.axes[0]:
        # Find the name of the country column
        match = re.match(r"^.*?country.*$", label)
        if match is not None:
            country_column_name = label

        # Find the name of the region column, but make sure it's not subregion!
        match = re.match(r"^.*?region.*$", label)
        if match is not None and "sub" not in label:
            region_column_name = label

        # Find the name of the title column
        match = re.match(r"^(dc|dcterms)\.title.*$", label)
        if match is not None:
            title_column_name = label

    # Make sure the user has not asked to exclude any metadata fields. If so, we
    # should return immediately.
    column_names = [country_column_name, region_column_name, title_column_name]
    if any(field in column_names for field in exclude):
        return

    # Make sure we found the country and region columns
    if country_column_name == "" or region_column_name == "":
        return

    # If we don't have any countries then we should return early before
    # suggesting regions. Missing values are tested with pd.isna() because
    # pandas represents them as NaN rather than None, and NaN has no split().
    if pd.isna(row[country_column_name]):
        return

    countries = row[country_column_name].split("||")

    # A missing region is fine, it just means every country's region is missing
    if pd.isna(row[region_column_name]):
        regions = []
    else:
        regions = row[region_column_name].split("||")

    # Instantiate a CountryConverter() object here. According to the docs it is
    # more performant to do that as opposed to calling coco.convert() directly
    # because we don't need to re-load the country data with each iteration.
    # We only do this once we know there are countries to look up.
    cc = coco.CountryConverter()

    for country in countries:
        # Look up the UN M.49 regions for this country code. CoCo seems to
        # only list the direct region, ie Western Africa, rather than all
        # the parent regions ("Sub-Saharan Africa", "Africa", "World")
        un_region = cc.convert(names=country, to="UNRegion")

        if un_region != "not found" and un_region not in regions:
            print(
                f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
            )

    return
-												Add SPDX short license identifier to all Python files

See: https://spdx.github.io/spdx-spec/appendix-V-using-SPDX-short-identifiers-in-source-files/

											
										
										
											2021-03-19 15:04:13 +01:00
+								# SPDX-License-Identifier: GPL-3.0-only
-												csv_metadata_quality/check.py: missing region fixes

Port over the recent fixes and logic improvements to regions from
fix.py.

											
										
										
											2022-09-01 15:38:35 +02:00
+								import logging
-												csv_metadata_quality/check.py: requests cache

Allow overriding the directory for the requests cache. In the case
of csv-metadata-quality-web, which currently runs on Google's App
Engine, we can only write to /tmp.

											
										
										
											2021-03-14 08:07:35 +01:00
+								import os
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								import re
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								from datetime import datetime, timedelta
-												Add check for countries without matching regions

If we have country "Kenya" we should have region "Eastern Africa"
according to the UN M.49 geolocation scheme.

											
										
										
											2021-12-08 14:02:20 +01:00
+								import country_converter as coco
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								import pandas as pd
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								import requests
 								import requests_cache
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								from colorama import Fore
-												csv_metadata_quality: Move scoped imports to global

According to PEP8 we should avoid scoped imports unless you have a
good reason. Here there are two cases where we do (issn and isbn),
but I will move the others to the global scope.

											
										
										
											2020-10-06 16:11:39 +02:00
+								from pycountry import languages
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								from stdnum import isbn as stdnum_isbn
 								from stdnum import issn as stdnum_issn
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												Use licenses.json from SPDX instead of spdx-license-list

spdx-license-list has been deprecated[1] and already has outdated
information compared to recent SPDX data releases. Now I use the
JSON license data directly from SPDX[2] (currently version 3.19).

The JSON file is loaded from the package's data directory using
Python 3's stdlib functions from importlib[3], though we now need
Python 3.9 as a minimum for importlib.resources.files[4].

Also note that the data directory is not properly packaged via
setuptools, so this only works for local installs, and not via
versions published to pypi, for example (I'm currently not doing
this anyways). If I want to publish this in the future I will
need to modify setup.py/pyproject.toml to include the data files.

[1] https://gitlab.com/uniqx/spdx-license-list
[2] https://github.com/spdx/license-list-data/blob/main/json/licenses.json
[3] https://copdips.com/2022/09/adding-data-files-to-python-package-with-setup-py.html
[4] https://docs.python.org/3/library/importlib.resources.html#importlib.resources.files

											
										
										
											2022-12-13 08:31:21 +01:00
+								from csv_metadata_quality.util import is_mojibake, load_spdx_licenses
-												Add checks and unsafe fixes for mojibake

This detects whether text has likely been encoded in one encoding
and decoded in another, perhaps multiple times. This often results
in display of "mojibake" characters.

For example, a file encoded in UTF-8 is opened as CP-1252 (Windows
Latin codepage) in Microsoft Excel, and saved again as UTF-8. You
will see strings like this in the resulting file:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

The correct version of these in UTF-8 would be:

    - CIAT Publicaçao
    - CIAT Publicación

I use a code snippet from Martijn Pieters on StackOverflow to de-
tect whether a string is "weird" as determined by the excellent
"fixes text for you" (ftfy) Python library, then check if a weird
string encodes as CP-1252 or not. If so, I can try to fix it.

See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

											
										
										
											2021-03-19 09:22:21 +01:00
-												Fix whitespace errors found by flake8

											
										
										
											2019-07-28 16:47:28 +02:00
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								def issn(field):
 								    """Check if an ISSN is valid.
 								    Prints the ISSN if invalid.
 								    stdnum's is_valid() function never raises an exception.
 								    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								        if not stdnum_issn.is_valid(value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print a
warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												csv_metadata_quality/check.py: Always return field

We always need to return the field back so apply doesn't set it to
null when creating the new data frame.

											
										
										
											2019-07-27 00:28:08 +02:00
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
 								def isbn(field):
 								    """Check if an ISBN is valid.
 								    Prints the ISBN if invalid.
 								    stdnum's is_valid() function never raises an exception.
 								    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
 								    """
-												csv_metadata_quality/check.py: Add check for missing isbn values

											
										
										
											2019-07-26 22:44:58 +02:00
+								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
-												Add ISSN and ISBN checks using python-stdnum

											
										
										
											2019-07-26 22:14:10 +02:00
+								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												csv_metadata_quality/check.py: Move all imports to top of file

PEP8 recommends keeping imports at the top of the file. Also, I had
to re-work the issn/isbn so they didn't conflict with the functions
in check.py (flake8 warned about them being redefined).

Imports sorted with isort.

See: https://www.python.org/dev/peps/pep-0008/#imports

											
										
										
											2021-03-11 09:52:20 +01:00
+								        if not stdnum_isbn.is_valid(value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
-												Add check for invalid multi-value separators

											
										
										
											2019-07-26 22:48:24 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print a
warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												csv_metadata_quality/check.py: Always return field

We always need to return the field back so apply doesn't set it to
null when creating the new data frame.

											
										
										
											2019-07-27 00:28:08 +02:00
-												Add check for invalid multi-value separators

											
										
										
											2019-07-26 22:48:24 +02:00
-												Add column name to output in date checks

This makes it easier to understand where the error is in case a CSV
has multiple date fields, for example:

    Missing date (dc.date.issued).
    Missing date (dc.date.issued[]).

If you have 126 items and you get 126 "Missing date" messages then
it's likely that 100 of the items have dates in one field, and the
others have dates in other field.

											
										
										
											2019-08-21 14:31:12 +02:00
+								def date(field, field_name):
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    """Check if a date is valid.
 								    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
 								    could technically even include time as long as it is ISO8601.
 								    Also checks for other invalid cases like missing and multiple dates.
 								    Prints the date if invalid.
 								    """
 								    if pd.isna(field):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								        return
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    multiple_dates = field.split("||")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								    # We don't allow multi-value date fields
 								    if len(multiple_dates) > 1:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(
 								            f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
 								        )
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
 								    try:
 								        # Check if date is valid YYYY format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y-%m")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM-DD format
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								        datetime.strptime(field, "%Y-%m-%d")
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												csv_metadata_quality/check.py: More date formats

We should also allow ISO 8601 extended in combined date and time
format. DSpace does not have a problem with dates in this format
and I have found some metadata that uses this date format.

For example: 2020-08-31T11:04:56Z

See: https://en.wikipedia.org/wiki/ISO_8601

											
										
										
											2021-02-04 20:39:14 +01:00
+								    except ValueError:
 								        pass
 								    try:
 								        # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
 								        datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.

											
										
										
											2019-07-28 15:11:36 +02:00
+								    except ValueError:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								        print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								        return
-												csv_metadata_quality/check.py: Return date even if it is invalid

Otherwise it is missing from the final CSV and then we can't even
fix it. :)

											
										
										
											2019-07-29 16:40:14 +02:00
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Improve suspicious character detection

Now it will print just the part of the metadata value that contains
the suspicious character (up to 80 characters, so we don't make the
line break on terminals that use 80 character width by default).

Also, print the name of the field in which the metadata value is so
that it is easier for the user to locate.

											
										
										
											2019-08-09 00:22:59 +02:00
+								def suspicious_characters(field, field_name):
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
+								    """Warn about suspicious characters.
 								    Look for standalone characters that could indicate encoding or copy/paste
 								    errors for languages with accents. For example: foreˆt should be forêt.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # List of suspicious characters, for example:  ́ˆ~`
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
 								    for character in suspicious_characters:
-												Improve suspicious character detection

Now it will print just the part of the metadata value that contains
the suspicious character (up to 80 characters, so we don't make the
line break on terminals that use 80 character width by default).

Also, print the name of the field in which the metadata value is so
that it is easier for the user to locate.

											
										
										
											2019-08-09 00:22:59 +02:00
+								        # Find the position of the suspicious character in the string
 								        suspicious_character_position = field.find(character)
 								        # Python returns -1 if there is no match
 								        if suspicious_character_position != -1:
 								            # Create a temporary new string starting from the position of the
 								            # suspicious character
 								            field_subset = field[suspicious_character_position:]
 								            # Print part of the metadata value starting from the suspicious
 								            # character and spanning enough of the rest to give a preview,
 								            # but not too much to cause the line to break in terminals with
 								            # a default of 80 characters width.
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								            print(f"{suspicious_character_msg:1.80}")
-												Add check for "suspicious" characters

These standalone characters often indicate issues with encoding or
copy/paste in languages with accents like French and Spanish. For
example: foreˆt should be forêt.

It is not possible to fix these issues automatically, but this will
print a warning so you can notify the owner of the data.

											
										
										
											2019-07-29 16:08:49 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								def language(field):
-												More ISO 639-1 and ISO 639-3 fixes

ISO 639-1 uses two-letter codes and ISO 639-3 uses three-letter codes.
Technically there are ISO 639-2/T and ISO 639-2/B, which also use
three-letter codes, but those are not supported by the pycountry library
so I won't even worry about them.

See: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

											
										
										
											2019-09-26 06:44:39 +02:00
+								    """Check if a language is valid ISO 639-1 (alpha 2) or ISO 639-3 (alpha 3).
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
 								    Prints the value if it is invalid.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
 								    # need to handle "Other" values here...
 								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
+								        # After splitting, check if language value is 2 or 3 characters so we
-												More ISO 639-1 and ISO 639-3 fixes

ISO 639-1 uses two-letter codes and ISO 639-3 uses three-letter codes.
Technically there are ISO 639-2/T and ISO 639-2/B, which also use
three-letter codes, but those are not supported by the pycountry library
so I won't even worry about them.

See: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

											
										
										
											2019-09-26 06:44:39 +02:00
+								        # can check it against ISO 639-1 or ISO 639-3 accordingly.
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
+								        if len(value) == 2:
-												Use pycountry instead of iso-639 for languages

The latter is a fork that hasn't been updated since 2016 and the
original still seems to be well maintained, with recent database
updates as well as tests for Python 3.7.

Also, pycountry supports ISO 3166-2 (administrative zones), which
we could eventually use for sub regions.

											
										
										
											2019-07-30 15:39:26 +02:00
+								            if not languages.get(alpha_2=value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								                print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
+								        elif len(value) == 3:
-												Use pycountry instead of iso-639 for languages

The latter is a fork that hasn't been updated since 2016 and the
original still seems to be well maintained, with recent database
updates as well as tests for Python 3.7.

Also, pycountry supports ISO 3166-2 (administrative zones), which
we could eventually use for sub regions.

											
										
										
											2019-07-30 15:39:26 +02:00
+								            if not languages.get(alpha_3=value):
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								                print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
+								        else:
-												Colorize output

Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes

											
										
										
											2021-02-21 12:01:25 +01:00
+								            print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
-												Add support for validating languages

Will validate against ISO 639-2 or ISO 639-3 depending on how long
the language field is. Otherwise will return that the language is
invalid.

Does not currently have any support for generic values like "Other".

											
										
										
											2019-07-29 17:59:42 +02:00
-												Don't unnecessarily rewrite DataFrames for checks

By using df[column] = df[column].apply(check...) we were re-writing
the DataFrame every time we returned from a check. We don't actually
need to return a value at all, as the point of checks is to print
a warning to the screen. In Python a "return" statement without a
value returns None.

I haven't measured the impact of this, but I assume it will mean we
are faster and use less memory.

											
										
										
											2021-03-16 15:04:19 +01:00
+								    return
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
-												Add support for dropping invalid AGROVOC terms

Requires --agrovoc-fields <field.name> to do the actual validation,
and -d to drop invalid ones.

											
										
										
											2021-12-23 11:43:10 +01:00
+								def agrovoc(field, field_name, drop):
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    """Check subject terms against AGROVOC REST API.
-												Rework AGROVOC validation

AGROVOC validation is now disabled by default, but can be enabled
on a field-by-field basis. For example, countries and regions are
also present in AGROVOC. Fields with these values can be enabled
using the new `--agrovoc-fields` option.

I reworked the script output to show the field name when printing
an invalid term so that the user knows in which field the term is.

											
										
										
											2019-08-01 22:51:58 +02:00
+								    Function constructor expects the field as well as the field name because
 								    many fields can now be validated against AGROVOC and we want to be able
 								    to inform the user in which field the invalid term is.
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    Logic copied from agrovoc-lookup.py.
 								    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
 								    Prints a warning if the value is invalid.
 								    """
 								    # Skip fields with missing values
 								    if pd.isna(field):
 								        return
-												csv_metadata_quality/check.py: Prune requests cache once

We only need to prune the requests cache once before using it, not
for every value we check.

											
										
										
											2020-07-06 12:41:51 +02:00
+								    # enable transparent request cache with thirty days expiry
 								    expire_after = timedelta(days=30)
-												csv_metadata_quality/check.py: requests cache

Allow overriding the directory for the requests cache. In the case
of csv-metadata-quality-web, which currently runs on Google's App
Engine, we can only write to /tmp.

											
										
										
											2021-03-14 08:07:35 +01:00
+								    # Allow overriding the location of the requests cache, just in case we are
 								    # running in an environment where we can't write to the current working di-
 								    # rectory (for example from csv-metadata-quality-web).
 								    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
-												csv_metadata_quality/check.py: Reformat with black

											
										
										
											2021-03-16 15:12:33 +01:00
+								    requests_cache.install_cache(
 								        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
 								    )
-												csv_metadata_quality/check.py: Prune requests cache once

We only need to prune the requests cache once before using it, not
for every value we check.

											
										
										
											2020-07-06 12:41:51 +02:00
 								    # prune old cache entries
-												csv_metadata_quality/check.py: missing region fixes

Port over the recent fixes and logic improvements to regions from
fix.py.

											
										
										
											2022-09-01 15:38:35 +02:00
+								    # requests_cache.remove_expired_responses()
-												csv_metadata_quality/check.py: Prune requests cache once

We only need to prune the requests cache once before using it, not
for every value we check.

											
										
										
											2020-07-06 12:41:51 +02:00
-												Add support for dropping invalid AGROVOC terms

Requires --agrovoc-fields <field.name> to do the actual validation,
and -d to drop invalid ones.

											
										
										
											2021-12-23 11:43:10 +01:00
+								    # Initialize an empty list to hold the validated AGROVOC values
 								    values = list()
-												Add support for validating subjects against AGROVOC

Checks values in the dc.subject or dcterms.subject field against the
AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py.

See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py

											
										
										
											2019-07-29 23:30:31 +02:00
+								    # Try to split multi-value field on "||" separator
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								    for value in field.split("||"):
-												csv_metadata_quality/check.py: Parameterize AGROVOC request

											
										
										
											2020-07-06 12:44:46 +02:00
+								        request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
 								        request_params = {"query": value}
-												csv_metadata_quality/check.py: Simplify AGROVOC check

I recycled this code from a separate agrovoc-lookup.py script that
checks lines in a text file to see if they are valid AGROVOC terms
or not. There I was concerned about skipping comments or something
I think, but we don't need to check that here. We simply check the
term that is in the field and inform the user if it's valid or not.

											
										
										
											2019-08-21 15:35:29 +02:00
-												csv_metadata_quality/check.py: Parameterize AGROVOC request

											
										
										
											2020-07-06 12:44:46 +02:00
+								        request = requests.get(request_url, params=request_params)
-												csv_metadata_quality/check.py: Simplify AGROVOC check

I recycled this code from a separate agrovoc-lookup.py script that
checks lines in a text file to see if they are valid AGROVOC terms
or not. There I was concerned about skipping comments or something
I think, but we don't need to check that here. We simply check the
term that is in the field and inform the user if it's valid or not.

											
										
										
											2019-08-21 15:35:29 +02:00
 								        if request.status_code == requests.codes.ok:
 								            data = request.json()
 								            # check if there are any results
-												Format with black

											
										
										
											2019-08-29 00:10:39 +02:00
+								            if len(data["results"]) == 0:
-												Add support for dropping invalid AGROVOC terms

Requires --agrovoc-fields <field.name> to do the actual validation,
and -d to drop invalid ones.

											
										
										
											2021-12-23 11:43:10 +01:00
+								                if drop:
 								                    print(
 								                        f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
 								                    )
 								                else:
 								                    print(
 								                        f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
 								                    )
 								                    # value is invalid AGROVOC, but we are not dropping
 								                    values.append(value)
 								            else:
 								                # value is valid AGROVOC so save it
 								                values.append(value)
 								    # Create a new field consisting of all values joined with "||"
 								    new_field = "||".join(values)
 								    return new_field
-												Add check for uncommon filenames

Generally we want people to upload documents in accessible formats
like PDF, Word, Excel, and PowerPoint. This check warns if a file
is using an uncommon extension.

											
										
										
											2019-08-10 22:41:16 +02:00
 								def filename_extension(field):
 								    """Warn about filenames that do not use a common document extension.

 								    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
 								    tool, which creates a Simple Archive Format bundle for importing metadata
 								    with accompanying PDFs or other files into DSpace.

 								    Each "||"-separated value is checked, case insensitively, for one of
 								    .pdf, .doc(x), .ppt(x) or .xls(x) at the end of the name. Values that do
 								    not match are printed as a warning. Nothing is returned.
 								    """
 								    # Nothing to check when the cell is empty
 								    if pd.isna(field):
 								        return
 								    known_suffixes = (".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx")
 								    # A single alternation anchored at the end of the name stands in for
 								    # testing every extension one after another.
 								    suffix_pattern = re.compile(
 								        "(?:" + "|".join(re.escape(suffix) for suffix in known_suffixes) + ")$",
 								        re.IGNORECASE,
 								    )
 								    for filename in field.split("||"):
 								        # SAF Bundler descriptions trail the real name, for example:
 								        # Annual_Report_2020.pdf__description:Report
 								        if "__description" in filename:
 								            filename = filename.split("__")[0]
 								        if suffix_pattern.search(filename) is None:
 								            print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{filename}")
 								    return
-												Add validation of SPDX license identifiers

Currently this only checks the dcterms.license field and the result
will only be a warning.

											
										
										
											2021-03-11 09:33:16 +01:00
 								def spdx_license_identifier(field):
 								    """Warn about license values that are not SPDX identifiers.

 								    Missing values and a fixed set of well-known free-text licenses are
 								    skipped. Every other "||"-separated value is looked up in the data
 								    returned by load_spdx_licenses() and printed when it is not found there.
 								    Nothing is returned.
 								    """
 								    # Free-text licenses that are not meant to be SPDX, so not worth warning on
 								    # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt
 								    non_spdx_allowlist = {
 								        "All rights reserved; no re-use allowed",
 								        "All rights reserved; self-archive copy only",
 								        "Copyrighted; Non-commercial educational use only",
 								        "Copyrighted; Non-commercial use only",
 								        "Copyrighted; all rights reserved",
 								        "Other",
 								    }
 								    # Empty cell: nothing to validate
 								    if pd.isna(field):
 								        return
 								    # The whole cell is one of the tolerated free-text licenses
 								    if field in non_spdx_allowlist:
 								        return
 								    known_identifiers = load_spdx_licenses()
 								    # Collect the offending values of a multi-value field first, then report
 								    unrecognized = [
 								        candidate
 								        for candidate in field.split("||")
 								        if candidate not in known_identifiers
 								    ]
 								    for candidate in unrecognized:
 								        print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{candidate}")
 								    return
-												Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when
determining if an item already exists in the data set.

											
										
										
											2021-03-17 08:53:07 +01:00
 								def duplicate_items(df):
 								    """Attempt to identify duplicate items in a DataFrame.

 								    First the total number of titles is compared with the number of unique
 								    titles. Only when some title repeats is every row reduced to a key made
 								    of its title, type and date values; a key that was already seen earlier
 								    is reported as a possible duplicate.

 								    Args:
 								        df: DataFrame with one item per row. It must contain a column
 								            matching each of the title, type and date patterns below.

 								    Returns:
 								        None. Possible duplicates are printed as warnings.

 								    Raises:
 								        IndexError: if no column matches one of the three patterns.
 								    """
 								    # Extract the names of the title, type, and date columns so we can
 								    # reference them later. df.filter() returns the matching columns and we
 								    # take the name of the first one, ie:
 								    #
 								    #   Index(['dcterms.title[en_US]'], dtype='object')
 								    #
 								    # dc.title.alternative might come before the main title in the CSV, so a
 								    # negative lookahead eliminates it.
 								    #
 								    # See: https://regex101.com/r/elyXkW/1
 								    title_column_name = df.filter(
 								        regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
 								    ).columns[0]
 								    type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
 								    date_column_name = df.filter(
 								        regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
 								    ).columns[0]
 								    items_count_total = df[title_column_name].count()
 								    items_count_unique = df[title_column_name].nunique()
 								    # Every title is unique, so there cannot be any duplicates
 								    if items_count_unique >= items_count_total:
 								        return
 								    # A set gives constant-time membership tests, and a tuple keeps the three
 								    # parts apart so that, for example, type "x1" + date "2" is not confused
 								    # with type "x" + date "12" as it would be in one concatenated string.
 								    seen_items = set()
 								    for _, row in df.iterrows():
 								        item_key = (
 								            f"{row[title_column_name]}",
 								            f"{row[type_column_name]}",
 								            f"{row[date_column_name]}",
 								        )
 								        if item_key in seen_items:
 								            print(
 								                f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}"
 								            )
 								        else:
 								            seen_items.add(item_key)
-												Add checks and unsafe fixes for mojibake

This detects whether text has likely been encoded in one encoding
and decoded in another, perhaps multiple times. This often results
in display of "mojibake" characters.

For example, a file encoded in UTF-8 is opened as CP-1252 (Windows
Latin codepage) in Microsoft Excel, and saved again as UTF-8. You
will see strings like this in the resulting file:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

The correct version of these in UTF-8 would be:

    - CIAT Publicaçao
    - CIAT Publicación

I use a code snippet from Martijn Pieters on StackOverflow to de-
tect whether a string is "weird" as determined by the excellent
"fixes text for you" (ftfy) Python library, then check if a weird
string encodes as CP-1252 or not. If so, I can try to fix it.

See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

											
										
										
											2021-03-19 09:22:21 +01:00
 								def mojibake(field, field_name):
 								    """Warn when a value looks like mojibake.

 								    Mojibake is text that was encoded in one encoding and decoded in another,
 								    perhaps multiple times. Detection is delegated to is_mojibake() (see
 								    util.py). A suspect value is printed together with its field name.
 								    Nothing is returned.
 								    """
 								    # Missing values are skipped before the detector is consulted
 								    if not pd.isna(field) and is_mojibake(field):
 								        print(
 								            f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
 								        )
 								    return
-												Add check for missing DOIs

Sometimes an editor includes a DOI in the citation field, but does
not add a standalone DOI field.

											
										
										
											2021-10-06 20:25:39 +02:00
-												Improve exclude function

When a user explicitly requests that a field be excluded with -x we
skip that field in most checks. Up until now that did not include
the item-based checks using a transposed dataframe because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.

											
										
										
											2022-09-02 14:59:22 +02:00
def citation_doi(row, exclude):
    """Warn when a citation mentions a DOI but the item has no DOI field.

    The row's labels are scanned in order. A non-empty field whose name
    contains "doi" ends the check immediately, as does any excluded field
    name containing "doi". Otherwise the last non-empty, non-excluded field
    whose name contains "citation" or "Citation" is inspected for "doi: "
    or "doi.org", and a warning is printed when either is found.
    Nothing is returned.
    """
    doi_name = re.compile(r"^.*?doi.*$")
    citation_name = re.compile(r"^.*?[cC]itation.*$")

    # The user asked us to skip a DOI field, so there is nothing to do
    if any(doi_name.match(excluded) is not None for excluded in exclude):
        return

    citation_text = ""
    for column in row.axes[0]:
        cell = row[column]
        # Empty cells count neither as a DOI nor as a citation
        if pd.isna(cell):
            continue
        # A populated DOI field means the citation needs no checking
        if doi_name.match(column) is not None:
            return
        # Remember the citation unless the user excluded that field
        if citation_name.match(column) is not None and column not in exclude:
            citation_text = cell

    if citation_text == "":
        return

    # Matches text like "doi: 10.1186/1743-422X-9-218"
    has_doi_prefix = re.match(r"^.*?doi:\s.*$", citation_text) is not None
    # Matches a DOI URL (doi.org, dx.doi.org, etc)
    has_doi_url = re.match(r"^.*?doi\.org.*$", citation_text) is not None
    if has_doi_prefix or has_doi_url:
        print(
            f"{Fore.YELLOW}DOI in citation, but missing a DOI field: {Fore.RESET}{citation_text}"
        )
    return
-												Add check for title in citation

This checks if the item title exists in the citation. If it is not
present it could just be missing, or could have minor differences
in the whitespace, accents, etc.

											
										
										
											2021-12-05 14:52:42 +01:00
-												Improve exclude function

When a user explicitly requests that a field be excluded with -x we
skip that field in most checks. Up until now that did not include
the item-based checks using a transposed dataframe because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.

											
										
										
											2022-09-02 14:59:22 +02:00
def title_in_citation(row, exclude):
    """Warn when an item's title does not appear in its citation.

    The title could be missing from the citation entirely, or only differ
    in whitespace, accents, etc.

    Args:
        row: Series holding one item's values, labelled by field name.
        exclude: field names the user asked to skip; an excluded title or
            citation field is ignored.

    Returns:
        None. A warning is printed when a citation exists and the title is
        not a substring of it.
    """
    # Initialize these before the loop so they are still available after it,
    # even when the row has no title or no citation.
    title = ""
    citation = ""

    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        # Find the title, but not dc.title.alternative: it would otherwise
        # replace the main title and cause a false warning. This is the same
        # negative lookahead that duplicate_items() uses.
        match = re.match(r"^(dc|dcterms)\.title(?!\.alternative).*$", label)
        if match is not None and label not in exclude:
            title = value

        # Find the citation
        match = re.match(r"^.*?[cC]itation.*$", label)
        if match is not None and label not in exclude:
            citation = value

    # Without a citation there is nothing to compare the title against
    if citation != "" and title not in citation:
        print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")

    return
-												Add check for countries without matching regions

If we have country "Kenya" we should have region "Eastern Africa"
according to the UN M.49 geolocation scheme.

											
										
										
											2021-12-08 14:02:20 +01:00
-												Improve exclude function

When a user explicitly requests that a field be excluded with -x we
skip that field in most checks. Up until now that did not include
the item-based checks using a transposed dataframe because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.

											
										
										
											2022-09-02 14:59:22 +02:00
def countries_match_regions(row, exclude):
    """Warn when an item has a country without its matching region.

    For example, an item that has country coverage "Kenya" should also have
    region "Eastern Africa" according to the UN M.49 classification scheme.

    See: https://unstats.un.org/unsd/methodology/m49/

    Args:
        row: Series holding one item's values, labelled by field name.
        exclude: field names the user asked to skip. If the country, region
            or title field is among them the check is not performed.

    Returns:
        None. A warning is printed for every country whose UN M.49 region is
        not listed in the item's region field.
    """
    # Initialize these before the loop so they are still available after it,
    # even when the row lacks one of the columns.
    country_column_name = ""
    region_column_name = ""
    title_column_name = ""

    # Iterate over the labels of the current row's values to get the names of
    # the country, region, and title columns.
    for label in row.axes[0]:
        # Find the name of the country column
        if re.match(r"^.*?country.*$", label) is not None:
            country_column_name = label

        # Find the name of the region column, but make sure it's not subregion!
        if re.match(r"^.*?region.*$", label) is not None and "sub" not in label:
            region_column_name = label

        # Find the name of the title column, ignoring dc.title.alternative
        # like duplicate_items() does.
        if re.match(r"^(dc|dcterms)\.title(?!\.alternative).*$", label) is not None:
            title_column_name = label

    # Make sure the user has not asked to exclude any of these metadata
    # fields. If so, we should return immediately.
    column_names = [country_column_name, region_column_name, title_column_name]
    if any(field in column_names for field in exclude):
        return

    # Make sure we found the country and region columns
    if country_column_name == "" or region_column_name == "":
        return

    # If we don't have any countries then we should return early before
    # suggesting regions. pd.isna() covers both None and NaN, which is how
    # the other checks in this file detect missing values.
    if pd.isna(row[country_column_name]):
        return
    countries = row[country_column_name].split("||")

    if pd.isna(row[region_column_name]):
        regions = []
    else:
        regions = row[region_column_name].split("||")

    # The title only labels the warning, so a missing title column must not
    # abort the check.
    title = row[title_column_name] if title_column_name != "" else ""

    # Set logging to ERROR so country_converter's convert() doesn't print the
    # "not found in regex" warning message to the screen.
    logging.basicConfig(level=logging.ERROR)

    # Use one CountryConverter() object for all of this item's countries
    # instead of calling coco.convert() for each one. It is created only now
    # so that items returning early above do not pay for it.
    cc = coco.CountryConverter()

    for country in countries:
        # Look up the UN M.49 region for this country. CoCo seems to only
        # list the direct region, ie Western Africa, rather than all the
        # parent regions ("Sub-Saharan Africa", "Africa", "World")
        un_region = cc.convert(names=country, to="UNRegion")

        if un_region != "not found" and un_region not in regions:
            print(
                f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{title}"
            )

    return