poetry.lock: run poetry update

Add fix for normalizing DOIs
csv_metadata_quality/check.py: run rye fmt
2024-04-25 12:50:30 +03:00 · 2024-04-25 12:49:19 +03:00 · 2024-04-12 13:40:55 +03:00 · 2024-04-12 11:07:36 +03:00 · 2024-03-02 10:39:00 +03:00 · 2024-03-02 10:38:27 +03:00
20 changed files with 3568 additions and 1394 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -1,3 +1,33 @@
+---
+kind: pipeline
+type: docker
+name: python311
+
+steps:
+- name: test
+  image: python:3.11-slim
+  commands:
+  - id
+  - python -V
+  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
+  - python -m pip install poetry
+  - poetry install
+  - poetry run pytest
+  # Basic test
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+  # Basic test with unsafe fixes
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+  # Geography test
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
+  # Geography test with unsafe fixes
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
+  # Test with experimental checks
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+  # Test with AGROVOC validation
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+  # Test with AGROVOC validation (and dropping invalid)
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+
 ---
 kind: pipeline
 type: docker
@ -10,23 +40,23 @@ steps:
  - id
  - python -V
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
-  - pip install -r requirements-dev.txt
-  - pytest
-  - python setup.py install
+  - python -m pip install poetry
+  - poetry install
+  - poetry run pytest
  # Basic test
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
  # Basic test with unsafe fixes
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
  # Test with AGROVOC validation (and dropping invalid)
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

 ---
 kind: pipeline
@ -40,22 +70,22 @@ steps:
  - id
  - python -V
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
-  - pip install -r requirements-dev.txt
-  - pytest
-  - python setup.py install
+  - python -m pip install poetry
+  - poetry install
+  - poetry run pytest
  # Basic test
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
  # Basic test with unsafe fixes
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
  # Test with AGROVOC validation (and dropping invalid)
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

 # vim: ts=2 sw=2 et
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@ -15,37 +15,31 @@ jobs:
    runs-on: ubuntu-22.04

    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - name: Install poetry
+      run: pipx install poetry
+    - uses: actions/setup-python@v5
      with:
-        python-version: '3.10'
-        cache: 'pip'
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-        if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
+        python-version: '3.11'
+        cache: 'poetry'
+    - run: poetry install
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
-      run: |
-        pytest
+      run: poetry run pytest
    - name: Test CLI
      run: |
-        python setup.py install
        # Basic test
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
        # Test with unsafe fixes
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
        # Test with experimental checks
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
        # Test with AGROVOC validation
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
        # Test with AGROVOC validation (and dropping invalid)
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## Unreleased
+### Added
+- Ability to normalize DOIs to https://doi.org URI format
+
+### Fixed
+- Fixed regex so we don't run the invalid multi-value separator fix on
+`dcterms.bibliographicCitation` fields
+- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
+fields
+- Don't crash the country/region checker/fixer when a title field is missing
+
+### Changed
+- Don't run newline fix on description fields
+- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
+- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
+
+### Updated
+- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
+- SPDX license list
+
 ## [0.6.1] - 2023-02-23
 ### Fixed
 - Missing region check should ignore subregion field, if it exists
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1 @@
+include csv_metadata_quality/data/licenses.json
--- a/README.md
+++ b/README.md
@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
 - Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
 - Remove duplicate metadata values
 - Check for duplicate items, using the title, type, and date issued as an indicator
+- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format

 ## Installation
 The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
-- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
-- Add an option to drop invalid AGROVOC subjects?
 - Add tests for application invocation, ie `tests/test_app.py`?
 - Validate ISSNs or journal titles against CrossRef API?
 - Add configurable field validation, like specify a field name and a validation file?
@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
  - Warn if item is Open Access, but missing a license
  - Warn if item has an ISSN but no journal title
  - Update journal titles from ISSN
-- Migrate to https://github.com/spdx/license-list-data
+- Migrate from Pandas to Polars

 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-3.0-only

 import argparse
+import os
 import re
 import signal
 import sys
+from datetime import timedelta

 import pandas as pd
+import requests_cache
 from colorama import Fore

 import csv_metadata_quality.check as check
@ -74,7 +77,7 @@ def run(argv):
    signal.signal(signal.SIGINT, signal_handler)

    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
-    df = pd.read_csv(args.input_file, dtype=str)
+    df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")

    # Check if the user requested to skip any fields
    if args.exclude_fields:
@ -82,7 +85,20 @@ def run(argv):
        # user should be careful to no include spaces here.
        exclude = args.exclude_fields.split(",")
    else:
-        exclude = list()
+        exclude = []
+
+    # enable transparent request cache with thirty days expiry
+    expire_after = timedelta(days=30)
+    # Allow overriding the location of the requests cache, just in case we are
+    # running in an environment where we can't write to the current working di-
+    # rectory (for example from csv-metadata-quality-web).
+    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
+    requests_cache.install_cache(
+        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
+    )
+
+    # prune old cache entries
+    requests_cache.delete()

    for column in df.columns:
        if column in exclude:
@ -91,7 +107,9 @@ def run(argv):
            continue

        if args.unsafe_fixes:
-            match = re.match(r"^.*?abstract.*$", column)
+            # Skip whitespace and newline fixes on abstracts and descriptions
+            # because there are too many with legitimate multi-line metadata.
+            match = re.match(r"^.*?(abstract|description).*$", column)
            if match is None:
                # Fix: whitespace
                df[column] = df[column].apply(fix.whitespace, field_name=column)
@ -102,7 +120,7 @@ def run(argv):
        # Fix: missing space after comma. Only run on author and citation
        # fields for now, as this problem is mostly an issue in names.
        if args.unsafe_fixes:
-            match = re.match(r"^.*?(author|citation).*$", column)
+            match = re.match(r"^.*?(author|[Cc]itation).*$", column)
            if match is not None:
                df[column] = df[column].apply(fix.comma_space, field_name=column)

@ -123,10 +141,15 @@ def run(argv):
        # Fix: unnecessary Unicode
        df[column] = df[column].apply(fix.unnecessary_unicode)

+        # Fix: normalize DOIs
+        match = re.match(r"^.*?identifier\.doi.*$", column)
+        if match is not None:
+            df[column] = df[column].apply(fix.normalize_dois)
+
        # Fix: invalid and unnecessary multi-value separators. Skip the title
        # and abstract fields because "|" is used to indicate something like
        # a subtitle.
-        match = re.match(r"^.*?(abstract|title).*$", column)
+        match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
        if match is None:
            df[column] = df[column].apply(fix.separators, field_name=column)
            # Run whitespace fix again after fixing invalid separators
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -1,14 +1,12 @@
 # SPDX-License-Identifier: GPL-3.0-only

 import logging
-import os
 import re
-from datetime import datetime, timedelta
+from datetime import datetime

 import country_converter as coco
 import pandas as pd
 import requests
-import requests_cache
 from colorama import Fore
 from pycountry import languages
 from stdnum import isbn as stdnum_isbn
@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
        return

    # List of suspicious characters, for example:  ́ˆ~`
-    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
+    suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]

    for character in suspicious_characters:
        # Find the position of the suspicious character in the string
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
    if pd.isna(field):
        return

-    # enable transparent request cache with thirty days expiry
-    expire_after = timedelta(days=30)
-    # Allow overriding the location of the requests cache, just in case we are
-    # running in an environment where we can't write to the current working di-
-    # rectory (for example from csv-metadata-quality-web).
-    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
-    requests_cache.install_cache(
-        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
-    )
-
-    # prune old cache entries
-    # requests_cache.remove_expired_responses()
-
    # Initialize an empty list to hold the validated AGROVOC values
-    values = list()
+    values = []

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
-        request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
+        request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
        request_params = {"query": value}

        request = requests.get(request_url, params=request_params)
@ -373,7 +358,7 @@ def duplicate_items(df):

    if items_count_unique < items_count_total:
        # Create a list to hold our items while we check for duplicates
-        items = list()
+        items = []

        for index, row in df.iterrows():
            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
        if row[region_column_name] is not None:
            regions = row[region_column_name].split("||")
        else:
-            regions = list()
+            regions = []

        for country in countries:
            # Look up the UN M.49 regions for this country code. CoCo seems to
@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
            un_region = cc.convert(names=country, to="UNRegion")

            if un_region != "not found" and un_region not in regions:
-                print(
-                    f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
-                )
+                try:
+                    print(
+                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
+                    )
+                except KeyError:
+                    print(
+                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
+                    )

    return
--- a/csv_metadata_quality/data/licenses.json
+++ b/csv_metadata_quality/data/licenses.json
--- a/csv_metadata_quality/experimental.py
+++ b/csv_metadata_quality/experimental.py
@ -2,8 +2,8 @@

 import re

-import langid
 import pandas as pd
+import py3langid as langid
 from colorama import Fore
 from pycountry import languages

@ -20,7 +20,7 @@ def correct_language(row, exclude):
    # Initialize some variables at global scope so that we can set them in the
    # loop scope below and still be able to access them afterwards.
    language = ""
-    sample_strings = list()
+    sample_strings = []
    title = None

    # Iterate over the labels of the current row's values. Before we transposed
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@ -23,7 +23,7 @@ def whitespace(field, field_name):
        return

    # Initialize an empty list to hold the cleaned values
-    values = list()
+    values = []

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
@ -64,7 +64,7 @@ def separators(field, field_name):
        return

    # Initialize an empty list to hold the cleaned values
-    values = list()
+    values = []

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
@ -175,7 +175,7 @@ def duplicates(field, field_name):
    values = field.split("||")

    # Initialize an empty list to hold the de-duplicated values
-    new_values = list()
+    new_values = []

    # Iterate over all values
    for value in values:
@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
        if row[region_column_name] is not None:
            regions = row[region_column_name].split("||")
        else:
-            regions = list()
+            regions = []

        # An empty list for our regions so we can keep track for all countries
-        missing_regions = list()
+        missing_regions = []

        for country in countries:
            # Look up the UN M.49 regions for this country code. CoCo seems to
@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
            # it doesn't already exist in regions.
            if un_region != "not found" and un_region not in regions:
                if un_region not in missing_regions:
-                    print(
-                        f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
-                    )
+                    try:
+                        print(
+                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
+                        )
+                    except KeyError:
+                        # If there is no title column in the CSV we will print
+                        # the fix without the title instead of crashing.
+                        print(
+                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
+                        )
+
                    missing_regions.append(un_region)

        if len(missing_regions) > 0:
@ -387,3 +395,74 @@ def countries_match_regions(row, exclude):
                row[region_column_name] = "||".join(missing_regions)

    return row
+
+
+def normalize_dois(field):
+    """Normalize DOIs.
+
+    DOIs are meant to be globally unique identifiers. They are case insensitive,
+    but in order to compare them robustly they should be normalized to a common
+    format:
+
+        - strip leading and trailing whitespace
+        - lowercase all ASCII characters
+        - convert all variations to https://doi.org/10.xxxx/xxxx URI format
+
+    Return string with normalized DOI.
+
+    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
+    """
+
+    # Skip fields with missing values (the bare return yields None)
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    values = field.split("||")
+
+    # Initialize an empty list to hold the normalized values
+    new_values = []
+
+    # Iterate over all values (most items will only have one DOI)
+    for value in values:
+        # Strip leading and trailing whitespace
+        new_value = value.strip()
+        # Lowercase the value (the docstring notes DOIs are case insensitive)
+        new_value = new_value.lower()
+
+        # Convert a leading http:// scheme to https://
+        pattern = re.compile(r"^http://")
+        match = re.findall(pattern, new_value)
+
+        if match:
+            new_value = re.sub(pattern, "https://", new_value)
+
+        # Convert dx.doi.org to doi.org (matched anywhere in the value)
+        pattern = re.compile(r"dx\.doi\.org")
+        match = re.findall(pattern, new_value)
+
+        if match:
+            new_value = re.sub(pattern, "doi.org", new_value)
+
+        # Replace values like doi: 10.11648/j.jps.20140201.14 (prefix plus one space)
+        pattern = re.compile(r"^doi: 10\.")
+        match = re.findall(pattern, new_value)
+
+        if match:
+            new_value = re.sub(pattern, "https://doi.org/10.", new_value)
+
+        # Replace bare values like 10.3390/foods12010115
+        pattern = re.compile(r"^10\.")
+        match = re.findall(pattern, new_value)
+
+        if match:
+            new_value = re.sub(pattern, "https://doi.org/10.", new_value)
+
+        if new_value != value:
+            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")
+
+        new_values.append(new_value)
+
+    new_field = "||".join(new_values)
+
+    return new_field
--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@ -2,7 +2,7 @@


 import json
-from importlib.resources import files
+import os

 from ftfy.badness import is_bad

@ -58,7 +58,7 @@ def is_mojibake(field):
 def load_spdx_licenses():
    """Returns a Python list of SPDX short license identifiers."""

-    with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
+    with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
        licenses = json.load(f)

    # List comprehension to extract the license ID for each license
--- a/data/test.csv
+++ b/data/test.csv
@ -37,3 +37,6 @@ Mojibake,2021-03-18,,,,PublicaÃ§ao CIAT,,,,Report,,,,
 Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
 Country missing region,2021-12-08,,,,,Kenya,,,,,,,
 Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
+DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
+DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
+Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,26 +12,25 @@ csv-metadata-quality = 'csv_metadata_quality.__main__:main'

 [tool.poetry.dependencies]
 python = "^3.9"
-pandas = "^1.5.2"
+pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
 python-stdnum = "^1.18"
 requests = "^2.28.2"
-requests-cache = "^0.9.8"
-langid = "^1.1.6"
+requests-cache = "^1.0.0"
 colorama = "^0.4.6"
 ftfy = "^6.1.1"
-country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
-pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
+country-converter = "~1.1.0"
+pycountry = "^23.12.7"
+py3langid = "^0.2.2"

-[tool.poetry.dev-dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = "^7.2.1"
-flake8 = "^6.0.0"
+flake8 = "^7.0.0"
 pytest-clarity = "^1.0.1"
 black = "^23.1.0"
 isort = "^5.12.0"
 csvkit = "^1.1.0"
-
-[tool.poetry.group.dev.dependencies]
 ipython = "^8.10.0"
+fixit = "^2.1.0"

 [build-system]
 requires = ["poetry>=0.12"]
--- a/renovate.json
+++ b/renovate.json
@ -0,0 +1,9 @@
+{
+  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+  "extends": [
+    "config:base"
+  ],
+  "pip_requirements": {
+      "enabled": false
+  }
+}
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -5,28 +5,28 @@ agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
 appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
 appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
 asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
-attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
-babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
+attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
+babel==2.12.1 ; python_version >= "3.9" and python_version < "4.0"
 backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
-black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
+black==23.3.0 ; python_version >= "3.9" and python_version < "4.0"
 cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
-certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
-charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
+certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
+charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
-country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
+country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
 dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
 decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
 et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
-exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
+exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
 executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
 flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
 ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
-greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
-idna==3.4 ; python_version >= "3.9" and python_version < "4"
+greenlet==2.0.2 ; python_version >= "3.9" and platform_machine == "aarch64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "ppc64le" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "x86_64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "amd64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "AMD64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "win32" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "WIN32" and python_version < "4.0"
+idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
 iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
-ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
+ipython==8.13.1 ; python_version >= "3.9" and python_version < "4.0"
 isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
 jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
@ -37,44 +37,46 @@ matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
 mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
 mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
 mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
-numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
+numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
 olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
-openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
-packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
-pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
+openpyxl==3.1.2 ; python_version >= "3.9" and python_version < "4.0"
+packaging==23.1 ; python_version >= "3.9" and python_version < "4.0"
+pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
 parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
 parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
-pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
+pathspec==0.11.1 ; python_version >= "3.9" and python_version < "4.0"
 pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
-platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
+platformdirs==3.5.0 ; python_version >= "3.9" and python_version < "4.0"
 pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
-prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
+prompt-toolkit==3.0.38 ; python_version >= "3.9" and python_version < "4.0"
 ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
+pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
 pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
-pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
+pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
 pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
-pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
+pygments==2.15.1 ; python_version >= "3.9" and python_version < "4.0"
 pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
-pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
+pytest==7.3.1 ; python_version >= "3.9" and python_version < "4.0"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
-python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
+python-slugify==8.0.1 ; python_version >= "3.9" and python_version < "4.0"
 python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
 pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
-pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
+pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
-requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
-rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
+requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
+rich==13.3.5 ; python_version >= "3.9" and python_version < "4.0"
 six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
-sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
+sqlalchemy==1.4.48 ; python_version >= "3.9" and python_version < "4.0"
 stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
 text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
 tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
 traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
+tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
-urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
+urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
 wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
 xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,23 +1,25 @@
 appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
-attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
+attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
-certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
-charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
+certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
+charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
-country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
-exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
+country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
+exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
 ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
-idna==3.4 ; python_version >= "3.9" and python_version < "4"
+idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
 langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
-numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
-pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
-pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
+numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
+pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
+pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
+pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
 python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
-pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
+pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
-requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
+requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
 six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
+tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
-urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
+urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
 wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
--- a/setup.py
+++ b/setup.py
@ -1,36 +0,0 @@
-import setuptools
-
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-
-install_requires = [
-    "pandas",
-    "python-stdnum",
-    "requests",
-    "requests-cache",
-    "pycountry",
-    "langid",
-]
-
-setuptools.setup(
-    name="csv-metadata-quality",
-    version="0.6.1",
-    author="Alan Orth",
-    author_email="aorth@mjanja.ch",
-    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
-    license="GPLv3",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/alanorth/csv-metadata-quality",
-    classifiers=[
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
-        "Operating System :: OS Independent",
-    ],
-    packages=["csv_metadata_quality"],
-    entry_points={
-        "console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
-    },
-    install_requires=install_requires,
-)
--- a/tests/test_check.py
+++ b/tests/test_check.py
@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "es"
-    exclude = list()
+    exclude = []

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "spa"
-    exclude = list()
+    exclude = []

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "en"
-    exclude = list()
+    exclude = []

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "eng"
-    exclude = list()
+    exclude = []

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@ -407,7 +407,7 @@ def test_check_doi_field():
    # the citation and a DOI field.
    d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)
-    exclude = list()
+    exclude = []

    result = check.citation_doi(series, exclude)

@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
    """Test an item with a DOI in its citation, but no DOI field."""

    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # an empty DOI field and a citation containing a DOI.
@ -439,7 +439,7 @@ def test_title_in_citation():

    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all the things."
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):

    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all teh things."
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
@ -477,7 +477,7 @@ def test_country_matches_region():

    country = "Kenya"
    region = "Eastern Africa"
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {"cg.coverage.country": country, "cg.coverage.region": region}
@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
-    exclude = list()
+    exclude = []

    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
    series_correct = pd.Series(data=d_correct)

    pd.testing.assert_series_equal(result, series_correct)
+
+
+def test_fix_normalize_dois():
+    """Test normalizing a DOI."""
+
+    value = "doi: 10.11648/j.jps.20140201.14"
+
+    assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"
Author	SHA1	Message	Date
Alan Orth	2341c56c40	poetry.lock: run poetry update	2024-04-25 12:50:30 +03:00
Alan Orth	5be2195325	Add fix for normalizing DOIs	2024-04-25 12:49:19 +03:00
Alan Orth	736948ed2c	csv_metadata_quality/check.py: run rye fmt	2024-04-12 13:40:55 +03:00
Alan Orth	ee0b448355	csv_metadata_quality/check.py: remove unused import	2024-04-12 11:07:36 +03:00
Alan Orth	4f3174a543	CHANGELOG.md: add note about SPDX license list continuous-integration/drone/push Build is passing Details	2024-03-02 10:39:00 +03:00
Alan Orth	d5c25f82fa	Update SPDX license list From: https://github.com/spdx/license-list-data/blob/main/json/licenses.json	2024-03-02 10:38:27 +03:00
Alan Orth	7b3e2b4e68	Merge pull request #43 from ilri/renovate/pytest-7.x-lockfile continuous-integration/drone/push Build is passing Details chore(deps): update dependency pytest to v7.4.4	2024-01-05 16:40:13 +03:00
Alan Orth	f92b2fe206	Merge pull request #44 from ilri/renovate/flake8-7.x chore(deps): update dependency flake8 to v7	2024-01-05 16:25:22 +03:00
renovate[bot]	df040b70c7	chore(deps): update dependency flake8 to v7 continuous-integration/drone/push Build is passing Details	2024-01-05 00:58:28 +00:00
renovate[bot]	10bc8f3e14	chore(deps): update dependency pytest to v7.4.4 continuous-integration/drone/push Build is passing Details	2023-12-31 13:47:46 +00:00
Alan Orth	7e6e92ecaa	poetry.lock: run poetry lock continuous-integration/drone/push Build is passing Details	2023-12-28 14:12:03 +03:00
Alan Orth	a21ffb0fa8	Use py3langid instead of langid Faster and more modern code for Python 3 as a drop-in replacement. See: https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html	2023-12-28 14:11:21 +03:00
Alan Orth	fb341dd9fa	Merge pull request #37 from ilri/renovate/actions-setup-python-5.x chore(deps): update actions/setup-python action to v5	2023-12-28 09:02:41 +03:00
Alan Orth	2e943ee4db	Merge pull request #39 from ilri/renovate/isort-5.x-lockfile chore(deps): update dependency isort to v5.13.2	2023-12-28 09:01:48 +03:00
Alan Orth	6d3a9870d6	Merge pull request #41 from ilri/renovate/pycountry-23.x-lockfile fix(deps): update dependency pycountry to v23.12.11	2023-12-28 09:01:21 +03:00
Alan Orth	82ecf7119a	Merge pull request #42 from ilri/renovate/black-23.x-lockfile chore(deps): update dependency black to v23.12.1	2023-12-28 09:00:39 +03:00
renovate[bot]	1db21cf275	chore(deps): update dependency black to v23.12.1 continuous-integration/drone/push Build is passing Details	2023-12-23 00:35:13 +00:00
renovate[bot]	bcd1408798	chore(deps): update dependency isort to v5.13.2 continuous-integration/drone/push Build is passing Details	2023-12-13 22:21:38 +00:00
renovate[bot]	ee8d255811	fix(deps): update dependency pycountry to v23.12.11 continuous-integration/drone/push Build is passing Details	2023-12-11 21:50:09 +00:00
Alan Orth	2cc2dbe952	tests: apply fixes from fixit continuous-integration/drone/push Build is passing Details RewriteToLiteral: It's slower to call list() than using the empty literal	2023-12-09 12:20:35 +03:00
Alan Orth	940a325d61	poetry.lock: run poetry lock	2023-12-09 12:05:26 +03:00
Alan Orth	59b3b307c9	pyproject.toml: use official pycountry The project is moving again and has all the latest data from the iso-codes project.	2023-12-09 12:04:14 +03:00
Alan Orth	b305da3f0b	poetry.lock: run poetry update continuous-integration/drone/push Build is failing Details	2023-12-07 17:10:01 +03:00
renovate[bot]	96a486471c	Update actions/setup-python action to v5 continuous-integration/drone/push Build is passing Details	2023-12-06 13:13:11 +00:00
Alan Orth	530cd5863b	poetry.lock: run poetry update continuous-integration/drone/push Build is passing Details	2023-11-22 22:07:30 +03:00
Alan Orth	f6018c51b6	Apply fixes from fixit Apply recommended fix from fixit: RewriteToLiteral: It's slower to call list() than using the empty literal, because the name list must be looked up in the global scope in case it has been rebound.	2023-11-22 21:54:50 +03:00
Alan Orth	80c3f5b45a	Add fixit to dev dependencies	2023-11-22 21:54:09 +03:00
Alan Orth	ba4637ea34	Merge pull request #31 from ilri/renovate/black-23.x-lockfile continuous-integration/drone/push Build is passing Details Update dependency black to v23.11.0	2023-11-20 21:41:43 +03:00
Alan Orth	355428a691	Merge pull request #32 from ilri/renovate/country-converter-1.x Update dependency country-converter to ~1.1.0	2023-11-20 21:39:36 +03:00
renovate[bot]	58d4de973e	Update dependency country-converter to ~1.1.0 continuous-integration/drone/push Build is failing Details	2023-11-20 18:37:44 +00:00
Alan Orth	e1216dae3c	Merge pull request #33 from ilri/renovate/pandas-2.x-lockfile Update dependency pandas to v2.1.3	2023-11-20 21:36:20 +03:00
renovate[bot]	6b650ff1b3	Update dependency pandas to v2.1.3 continuous-integration/drone/push Build is failing Details	2023-11-20 18:33:42 +00:00
Alan Orth	fa7bde6fc0	Merge pull request #34 from ilri/renovate/requests-cache-1.x-lockfile Update dependency requests-cache to v1.1.1	2023-11-20 21:32:50 +03:00
renovate[bot]	f89159fe32	Update dependency requests-cache to v1.1.1 continuous-integration/drone/push Build is passing Details	2023-11-19 09:26:49 +00:00
renovate[bot]	02058c5a65	Update dependency black to v23.11.0 continuous-integration/drone/push Build is passing Details	2023-11-08 07:49:15 +00:00
Alan Orth	8fed6b71ff	Merge pull request #30 from ilri/renovate/ipython-8.x-lockfile continuous-integration/drone/push Build is passing Details Update dependency ipython to v8.17.2	2023-10-31 22:15:50 +03:00
Alan Orth	b005b28cbe	Merge pull request #29 from ilri/renovate/pandas-2.x-lockfile Update dependency pandas to v2.1.2	2023-10-31 22:15:27 +03:00
renovate[bot]	c626290599	Update dependency ipython to v8.17.2 continuous-integration/drone/push Build is passing Details	2023-10-31 13:47:08 +00:00
renovate[bot]	1a06470b64	Update dependency pandas to v2.1.2 continuous-integration/drone/push Build is passing Details	2023-10-26 23:01:25 +00:00
Alan Orth	d46a81672e	Merge pull request #28 from ilri/renovate/pytest-7.x-lockfile continuous-integration/drone/push Build is passing Details Update dependency pytest to v7.4.3	2023-10-25 12:08:23 +03:00
Alan Orth	2a50e75082	Merge pull request #27 from ilri/renovate/csvkit-1.x-lockfile Update dependency csvkit to v1.3.0	2023-10-25 12:08:05 +03:00
Alan Orth	0d45e73983	Merge pull request #25 from ilri/renovate/black-23.x-lockfile Update dependency black to v23.10.1	2023-10-25 12:07:15 +03:00
renovate[bot]	3611aab425	Update dependency pytest to v7.4.3 continuous-integration/drone/push Build is passing Details	2023-10-24 22:36:05 +00:00
renovate[bot]	5c4ad0eb41	Update dependency black to v23.10.1 continuous-integration/drone/push Build is passing Details	2023-10-23 20:03:53 +00:00
renovate[bot]	f1f39722f6	Update dependency csvkit to v1.3.0 continuous-integration/drone/push Build is passing Details	2023-10-18 07:56:03 +00:00
Alan Orth	1c03999582	Merge pull request #24 from ilri/renovate/actions-checkout-4.x continuous-integration/drone/push Build is passing Details Update actions/checkout action to v4	2023-10-15 23:39:45 +03:00
Alan Orth	1f637f32cd	Rework requests-cache We should only be running this once per invocation, not for every row we check. This should be more efficient, but it means that we don't cache responses when running via pytest, which is actually probably a good thing.	2023-10-15 23:37:38 +03:00
Alan Orth	b8241e919d	poetry.lock: run poetry update	2023-10-15 23:22:48 +03:00
Alan Orth	b8dc19cc3f	csv_metadata_quality/check.py: enable requests-cache This was disabled at some point. We also need to use the new delete method instead.	2023-10-15 23:21:58 +03:00
Alan Orth	93c9b739ac	csv_metadata_quality/check.py: use HTTPS Use HTTPS for AGROVOC REST API.	2023-10-15 22:38:45 +03:00
Alan Orth	4ed2786703	pyproject.toml: update pycountry Use the latest branch in my fork that has iso-codes 4.15.0.	2023-10-15 21:53:09 +03:00
renovate[bot]	8728789183	Update actions/checkout action to v4 continuous-integration/drone/push Build is passing Details	2023-09-04 14:26:25 +00:00
Alan Orth	bf90464809	poetry.lock: run poetry update continuous-integration/drone/push Build is failing Details continuous-integration/drone Build is passing Details	2023-08-08 09:55:41 +02:00
Alan Orth	1878002391	poetry.lock: run poetry update continuous-integration/drone/push Build is passing Details	2023-06-12 10:42:50 +03:00
Alan Orth	d21d2621e3	csv_metadata_quality/app.py: read fields as strings I suspect this undermines the PyArrow backend performance gains in recent Pandas 2.0.0, but we are dealing with messy data sometimes and we must rely on data being strings.	2023-06-12 10:42:50 +03:00
Alan Orth	f3fb1ff7fb	Don't crash when title is missing We shouldn't crash the country/region checker/fixer when the title field is missing, since we only use it to show status to the user.	2023-06-12 10:42:50 +03:00
Alan Orth	1fa81f7558	Merge pull request #13 from ilri/renovate/ipython-8.x-lockfile continuous-integration/drone/push Build is passing Details Update dependency ipython to v8.14.0	2023-06-03 17:09:21 +03:00
renovate[bot]	7409193b6b	Update dependency ipython to v8.14.0 continuous-integration/drone/push Build is passing Details	2023-06-02 15:58:34 +00:00
Alan Orth	a84fcf0b7b	.drone.yml: try to use poetry instead of pip continuous-integration/drone/push Build is passing Details	2023-05-30 11:39:08 +03:00
Alan Orth	25ac290df4	.github: update Python actions continuous-integration/drone/push Build is failing Details We don't need to use `python setup.py install` anymore. We can use poetry directly in CI. See: https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md	2023-05-29 22:58:01 +03:00
Alan Orth	3f52bad1e3	Remove setup.py As far as I understand this is deprecated.	2023-05-29 22:41:37 +03:00
Alan Orth	0208ad0ade	Merge pull request #12 from ilri/renovate/requests-cache-1.x Update dependency requests-cache to v1	2023-05-29 22:37:23 +03:00
renovate[bot]	3632ae0fc9	Update dependency requests-cache to v1 continuous-integration/drone/push Build is passing Details	2023-05-29 19:25:58 +00:00
Alan Orth	17d089cc6e	poetry.lock: run poetry update continuous-integration/drone/push Build is passing Details	2023-05-29 22:24:22 +03:00
Alan Orth	bc470a4343	pyproject.toml: rework pandas and pyarrow We don't explicitly depend on PyArrow. It should come as a pandas extra. I installed it like this: $ poetry add pandas=="^2.0.2[feather,performance]" See: https://pandas.pydata.org/docs/getting_started/install.html#other-data-sources	2023-05-29 22:24:04 +03:00
Alan Orth	be609a809d	setup.py: add Python 3.11 classifier	2023-05-29 21:32:59 +03:00
Alan Orth	de3387ded7	Use Python 3.11 in Drone CI and GitHub Actions	2023-05-29 21:31:03 +03:00
Alan Orth	f343e87f0c	renovate.json: fix json	2023-05-29 21:26:03 +03:00
Alan Orth	7d3524fbd5	renovate.json: disable requirements.txt support Poetry is used to manage dependencies. The requirements.txt files are generated manually by exporting from Poetry.	2023-05-29 21:11:48 +03:00
Alan Orth	c614b71a52	Merge pull request #5 from ilri/renovate/configure Configure Renovate	2023-05-29 21:02:16 +03:00
renovate[bot]	d159a839f3	Add renovate.json	2023-05-29 17:40:33 +00:00
Alan Orth	36e2ebe5f4	poetry.lock: run poetry update continuous-integration/drone/push Build is passing Details	2023-05-10 15:06:41 +03:00
Alan Orth	33f67b7a7c	Update requirements continuous-integration/drone/push Build is passing Details Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-05-03 14:29:12 +03:00
Alan Orth	c0e1448439	poetry.lock: run poetry update	2023-05-03 14:28:47 +03:00
Alan Orth	5d0804a08f	Update requirements continuous-integration/drone/push Build is failing Details Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-04-22 12:44:54 -07:00
Alan Orth	f01c9edf17	poetry.lock: run poetry update	2023-04-22 12:44:16 -07:00
Alan Orth	8d4295b2b3	CHANGELOG.md: add note about description field	2023-04-22 12:17:44 -07:00
Alan Orth	e2d46e9495	csv_metadata_quality/app.py: skip newline fix on description The description field often has free-form text like the abstract and there are too many legitimate newlines here to be correcting them automatically.	2023-04-22 12:16:13 -07:00
Alan Orth	1491e1edb0	Fix path to data/licenses.json continuous-integration/drone/push Build is passing Details When we install and run this from CI, this file needs to exist in the package's folder inside site-packages. Then we can use __file__ to get the path relative to the package. See: https://python-packaging.readthedocs.io/en/latest/non-code-files.html	2023-04-05 15:28:21 +03:00
Alan Orth	34142c3e6b	Update requirements continuous-integration/drone/push Build is failing Details Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-04-05 12:51:56 +03:00
Alan Orth	0c88b96e8d	poetry.lock: run poetry update	2023-04-05 12:51:19 +03:00
Alan Orth	2e55b4d6e3	pyproject.toml: add pyarrow explicitly CI was failing because pyarrow is not an extra provided by pandas. Indeed, according to the docs the named extras installing pyarrow are actually feather and parquet, so we need to install pyarrow explicitly. See: https://pandas.pydata.org/pandas-docs/version/2.0/getting_started/install.html#install-dependencies	2023-04-05 12:49:40 +03:00
Alan Orth	c90aad29f0	Use poetry dev group This is the new syntax since Poetry 1.2.0. See: https://python-poetry.org/docs/managing-dependencies/#installing-group-dependencies	2023-04-05 12:37:03 +03:00
Alan Orth	6fd1e1377f	Add pyarrow extra to Python Pandas deps	2023-04-05 11:40:22 +03:00
Alan Orth	c64b7eb1f1	CHANGELOG.md: add note about Pandas 2.0.0	2023-04-05 11:17:48 +03:00
Alan Orth	29cbc4f3a3	Update requirements Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-04-05 11:17:06 +03:00
Alan Orth	307af1acfc	poetry.lock: run poetry update	2023-04-05 11:15:55 +03:00
Alan Orth	b5106de9df	pyproject.toml: Pandas 2.0.0	2023-04-05 11:15:40 +03:00
Alan Orth	9eeadfc44e	poetry.lock: after adding pandas 2.0.0rc1 continuous-integration/drone/push Build is failing Details This is going to be an issue on the master branch if I update any dependencies in the mean time...	2023-03-22 12:17:26 +03:00
Alan Orth	d4aed378cf	Switch to pandas 2.0.0rc1 Seems to work fine with the new PyArrow datatypes.	2023-03-22 12:16:56 +03:00
Alan Orth	20a2cce34b	CHANGELOG.md: add fixes continuous-integration/drone/push Build is failing Details	2023-03-10 16:17:20 +03:00
Alan Orth	d661ffe439	Check comma space on bibliographicCitation too The regex was only matching `dc.identifier.citation`, but we need to match `dcterms.bibliographicCitation` too.	2023-03-10 16:13:16 +03:00
Alan Orth	45a310387a	Don't fix multi-value separators on citations	2023-03-10 16:12:30 +03:00
Alan Orth	47b03c49ba	README.md: Update TODOs continuous-integration/drone/push Build is failing Details	2023-03-07 10:45:04 +03:00
Alan Orth	986b81cbf4	Update requirements continuous-integration/drone/push Build is failing Details Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-03-04 07:35:36 +03:00
Alan Orth	d43a47ae32	poetry.lock: run poetry update	2023-03-04 07:34:50 +03:00
Alan Orth	ede37569f1	pyproject.toml: use pycountry with iso-codes 4.13.0	2023-03-04 07:33:48 +03:00
Alan Orth	0c53efe60a	Update requirements Generated with poetry export: $ poetry export --without-hashes -f requirements.txt > requirements.txt $ poetry export --without-hashes --with dev -f requirements.txt > requirements-dev.txt I am trying `--without-hashes` to work around an error on pip install when running in CI: ERROR: In --require-hashes mode, all requirements must have their versions pinned with ==	2023-03-04 06:54:34 +03:00
Alan Orth	5f0e25b818	poetry.lock: run poetry update	2023-03-04 06:53:55 +03:00
Alan Orth	4776154d6c	pyproject.toml: switch back to upstream country_converter Version 1.0.0 incorporates my change to Myanmar. See: https://github.com/IndEcol/country_converter/releases/tag/v1.0.0	2023-03-04 06:52:56 +03:00
				`@ -0,0 +1 @@`
				`include csv_metadata_quality/data/licenses.json`