.github/workflows: use rye in CI

Use rye instead of poetry in CI.
requirements-dev.lock: run rye sync
2025-09-11 06:07:04 +02:00 · 2024-08-21 18:56:09 +03:00 · 2024-08-21 17:41:49 +03:00 · 2024-08-21 17:41:36 +03:00 · 2024-07-29 19:58:42 -07:00 · 2024-06-25 11:54:09 +03:00
23 changed files with 3849 additions and 1481 deletions
--- a/.drone.yml
+++ b/.drone.yml
@@ -1,3 +1,33 @@
 ---
 kind: pipeline
 type: docker
 name: python311
 steps:
 - name: test
  image: python:3.11-slim
  commands:
  - id
  - python -V
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
  - python -m pip install poetry
  - poetry install
  - poetry run pytest
  # Basic test
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
  # Basic test with unsafe fixes
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
  # Geography test
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
  # Geography test with unsafe fixes
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
  # Test with AGROVOC validation (and dropping invalid)
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 ---
 kind: pipeline
 type: docker
@@ -10,23 +40,23 @@ steps:
  - id
  - python -V
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
-  - pip install -r requirements-dev.txt
+  - python -m pip install poetry
-  - pytest
+  - poetry install
-  - python setup.py install
+  - poetry run pytest
  # Basic test
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
  # Basic test with unsafe fixes
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
  # Test with AGROVOC validation (and dropping invalid)
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 ---
 kind: pipeline
@@ -40,22 +70,22 @@ steps:
  - id
  - python -V
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
-  - pip install -r requirements-dev.txt
+  - python -m pip install poetry
-  - pytest
+  - poetry install
-  - python setup.py install
+  - poetry run pytest
  # Basic test
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
  # Basic test with unsafe fixes
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
  # Geography test
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
  # Geography test with unsafe fixes
-  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
+  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
  # Test with experimental checks
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
  # Test with AGROVOC validation
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
  # Test with AGROVOC validation (and dropping invalid)
-  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 # vim: ts=2 sw=2 et
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -15,37 +15,27 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
-    - name: Set up Python 3.10
+    - name: Install rye
-      uses: actions/setup-python@v4
+      uses: eifinger/setup-rye@v4
      with:
-        python-version: '3.10'
+        version: 'latest'
-        cache: 'pip'
+    - run: rye sync
-    - name: Install dependencies
+    - name: Lint
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
        if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        rye lint
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
-      run: |
+      run: rye test
        pytest
    - name: Test CLI
      run: |
        python setup.py install
        # Basic test
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv
+        rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
        # Test with unsafe fixes
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
+        rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
        # Test with experimental checks
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+        rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
        # Test with AGROVOC validation
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
+        rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
        # Test with AGROVOC validation (and dropping invalid)
-        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
+        rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
--- a/.python-version
+++ b/.python-version
@@ -0,0 +1 @@
 3.12
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## Unreleased
 ### Added
 - Ability to normalize DOIs to https://doi.org URI format
 ### Fixed
 - Fixed regex so we don't run the invalid multi-value separator fix on
 `dcterms.bibliographicCitation` fields
 - Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
 fields
 - Don't crash the country/region checker/fixer when a title field is missing
 ### Changed
 - Don't run newline fix on description fields
 - Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
 - Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
 ### Updated
 - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
 - SPDX license list
 ## [0.6.1] - 2023-02-23
 ### Fixed
 - Missing region check should ignore subregion field, if it exists
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1 @@
 include csv_metadata_quality/data/licenses.json
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
 - Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
 - Remove duplicate metadata values
 - Check for duplicate items, using the title, type, and date issued as an indicator
 - [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format
 ## Installation
 The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
@@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
 - Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 - Warn if two items use the same file in `filename` column
 - Add an option to drop invalid AGROVOC subjects?
 - Add tests for application invocation, ie `tests/test_app.py`?
 - Validate ISSNs or journal titles against CrossRef API?
 - Add configurable field validation, like specify a field name and a validation file?
@@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
  - Warn if item is Open Access, but missing a license
  - Warn if item has an ISSN but no journal title
  - Update journal titles from ISSN
- Migrate to https://github.com/spdx/license-list-data
+- Migrate from Pandas to Polars
 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-3.0-only
 import argparse
 import os
 import re
 import signal
 import sys
 from datetime import timedelta
 import pandas as pd
 import requests_cache
 from colorama import Fore
 import csv_metadata_quality.check as check
@@ -74,7 +77,7 @@ def run(argv):
    signal.signal(signal.SIGINT, signal_handler)
    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
-    df = pd.read_csv(args.input_file, dtype=str)
+    df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
    # Check if the user requested to skip any fields
    if args.exclude_fields:
@@ -82,7 +85,20 @@ def run(argv):
        # user should be careful to not include spaces here.
        exclude = args.exclude_fields.split(",")
    else:
-        exclude = list()
+        exclude = []
    # enable transparent request cache with thirty days expiry
    expire_after = timedelta(days=30)
    # Allow overriding the location of the requests cache, just in case we are
    # running in an environment where we can't write to the current working di-
    # rectory (for example from csv-metadata-quality-web).
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
    requests_cache.install_cache(
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
    )
    # prune old cache entries
    requests_cache.delete()
    for column in df.columns:
        if column in exclude:
@@ -91,7 +107,9 @@ def run(argv):
            continue
        if args.unsafe_fixes:
-            match = re.match(r"^.*?abstract.*$", column)
+            # Skip whitespace and newline fixes on abstracts and descriptions
            # because there are too many with legitimate multi-line metadata.
            match = re.match(r"^.*?(abstract|description).*$", column)
            if match is None:
                # Fix: whitespace
                df[column] = df[column].apply(fix.whitespace, field_name=column)
@@ -102,7 +120,7 @@ def run(argv):
        # Fix: missing space after comma. Only run on author and citation
        # fields for now, as this problem is mostly an issue in names.
        if args.unsafe_fixes:
-            match = re.match(r"^.*?(author|citation).*$", column)
+            match = re.match(r"^.*?(author|[Cc]itation).*$", column)
            if match is not None:
                df[column] = df[column].apply(fix.comma_space, field_name=column)
@@ -123,10 +141,15 @@ def run(argv):
        # Fix: unnecessary Unicode
        df[column] = df[column].apply(fix.unnecessary_unicode)
        # Fix: normalize DOIs
        match = re.match(r"^.*?identifier\.doi.*$", column)
        if match is not None:
            df[column] = df[column].apply(fix.normalize_dois)
        # Fix: invalid and unnecessary multi-value separators. Skip the title
        # and abstract fields because "|" is used to indicate something like
        # a subtitle. Citation fields are skipped as well.
-        match = re.match(r"^.*?(abstract|title).*$", column)
+        match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
        if match is None:
            df[column] = df[column].apply(fix.separators, field_name=column)
            # Run whitespace fix again after fixing invalid separators
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: GPL-3.0-only
 import logging
 import os
 import re
-from datetime import datetime, timedelta
+from datetime import datetime
 import country_converter as coco
 import pandas as pd
 import requests
 import requests_cache
 from colorama import Fore
 from pycountry import languages
 from stdnum import isbn as stdnum_isbn
@@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
        return
    # List of suspicious characters, for example:  ́ˆ~`
-    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
+    suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]
    for character in suspicious_characters:
        # Find the position of the suspicious character in the string
@@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
    if pd.isna(field):
        return
    # enable transparent request cache with thirty days expiry
    expire_after = timedelta(days=30)
    # Allow overriding the location of the requests cache, just in case we are
    # running in an environment where we can't write to the current working di-
    # rectory (for example from csv-metadata-quality-web).
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
    requests_cache.install_cache(
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
    )
    # prune old cache entries
    # requests_cache.remove_expired_responses()
    # Initialize an empty list to hold the validated AGROVOC values
-    values = list()
+    values = []
    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
-        request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
+        request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
        request_params = {"query": value}
        request = requests.get(request_url, params=request_params)
@@ -373,7 +358,7 @@ def duplicate_items(df):
    if items_count_unique < items_count_total:
        # Create a list to hold our items while we check for duplicates
-        items = list()
+        items = []
        for index, row in df.iterrows():
            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
@@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
        if row[region_column_name] is not None:
            regions = row[region_column_name].split("||")
        else:
-            regions = list()
+            regions = []
        for country in countries:
            # Look up the UN M.49 regions for this country code. CoCo seems to
@@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
            un_region = cc.convert(names=country, to="UNRegion")
            if un_region != "not found" and un_region not in regions:
-                print(
+                try:
-                    f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
+                    print(
-                )
+                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
                    )
                except KeyError:
                    print(
                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
                    )
    return
--- a/csv_metadata_quality/data/licenses.json
+++ b/csv_metadata_quality/data/licenses.json
--- a/csv_metadata_quality/experimental.py
+++ b/csv_metadata_quality/experimental.py
@@ -2,8 +2,8 @@
 import re
 import langid
 import pandas as pd
 import py3langid as langid
 from colorama import Fore
 from pycountry import languages
@@ -20,7 +20,7 @@ def correct_language(row, exclude):
    # Initialize some variables at global scope so that we can set them in the
    # loop scope below and still be able to access them afterwards.
    language = ""
-    sample_strings = list()
+    sample_strings = []
    title = None
    # Iterate over the labels of the current row's values. Before we transposed
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -23,7 +23,7 @@ def whitespace(field, field_name):
        return
    # Initialize an empty list to hold the cleaned values
-    values = list()
+    values = []
    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
@@ -64,7 +64,7 @@ def separators(field, field_name):
        return
    # Initialize an empty list to hold the cleaned values
-    values = list()
+    values = []
    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
@@ -175,7 +175,7 @@ def duplicates(field, field_name):
    values = field.split("||")
    # Initialize an empty list to hold the de-duplicated values
-    new_values = list()
+    new_values = []
    # Iterate over all values
    for value in values:
@@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
        if row[region_column_name] is not None:
            regions = row[region_column_name].split("||")
        else:
-            regions = list()
+            regions = []
        # An empty list for our regions so we can keep track for all countries
-        missing_regions = list()
+        missing_regions = []
        for country in countries:
            # Look up the UN M.49 regions for this country code. CoCo seems to
@@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
            # it doesn't already exist in regions.
            if un_region != "not found" and un_region not in regions:
                if un_region not in missing_regions:
-                    print(
+                    try:
-                        f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
+                        print(
-                    )
+                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
                        )
                    except KeyError:
                        # If there is no title column in the CSV we will print
                        # the fix without the title instead of crashing.
                        print(
                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
                        )
                    missing_regions.append(un_region)
        if len(missing_regions) > 0:
@@ -387,3 +395,88 @@ def countries_match_regions(row, exclude):
                row[region_column_name] = "||".join(missing_regions)
    return row
 def normalize_dois(field):
    """Normalize DOIs.
    DOIs are meant to be globally unique identifiers. They are case insensitive,
    but in order to compare them robustly they should be normalized to a common
    format:
        - strip leading and trailing whitespace
        - lowercase the value
        - convert all variations to https://doi.org/10.xxxx/xxxx URI format
    Multi-value fields are split on "||" and every value is normalized on its
    own. A message is printed for each value that was changed.
    Return string with normalized DOI(s), or None if the field is missing.
    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
    """
    # Skip fields with missing values
    if pd.isna(field):
        return
    # Ordered (pattern, replacement) rewrites. The order matters: the scheme
    # and host are cleaned up first, then bare and "doi:"-prefixed values are
    # promoted to full URIs. re.sub() leaves the string untouched when the
    # pattern does not match, so no separate match test is needed.
    rewrites = (
        # Convert to HTTPS
        (r"^http://", "https://"),
        # Convert dx.doi.org to doi.org
        (r"dx\.doi\.org", "doi.org"),
        # Convert www.doi.org to doi.org
        (r"www\.doi\.org", "doi.org"),
        # Convert erroneous %2f to / (the value is already lowercased)
        ("%2f", "/"),
        # Replace values like doi: 10.11648/j.jps.20140201.14, with or
        # without whitespace after the colon
        (r"^doi:\s*10\.", "https://doi.org/10."),
        # Replace values like 10.3390/foods12010115
        (r"^10\.", "https://doi.org/10."),
    )
    # Initialize an empty list to hold the normalized values
    new_values = []
    # Try to split multi-value field on "||" separator and iterate over all
    # values (most items will only have one DOI)
    for value in field.split("||"):
        # Strip leading and trailing whitespace, then lowercase
        new_value = value.strip().lower()
        for pattern, replacement in rewrites:
            new_value = re.sub(pattern, replacement, new_value)
        # Report the original value whenever normalization changed it
        if new_value != value:
            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")
        new_values.append(new_value)
    return "||".join(new_values)
--- a/csv_metadata_quality/util.py
+++ b/csv_metadata_quality/util.py
@@ -2,7 +2,7 @@
 import json
-from importlib.resources import files
+import os
 from ftfy.badness import is_bad
@@ -58,7 +58,7 @@ def is_mojibake(field):
 def load_spdx_licenses():
    """Returns a Python list of SPDX short license identifiers."""
-    with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
+    with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
        licenses = json.load(f)
    # List comprehension to extract the license ID for each license
--- a/data/test.csv
+++ b/data/test.csv
@@ -37,3 +37,7 @@ Mojibake,2021-03-18,,,,PublicaÃ§ao CIAT,,,,Report,,,,
 Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
 Country missing region,2021-12-08,,,,,Kenya,,,,,,,
 Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
 DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
 DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
 Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
 DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,41 +1,63 @@
-[tool.poetry]
+[project]
 name = "csv-metadata-quality"
 version = "0.6.1"
 description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
-authors = ["Alan Orth <alan.orth@gmail.com>"]
+authors = [
-license="GPL-3.0-only"
+    { name = "Alan Orth", email = "alan.orth@gmail.com" }
 ]
 license= { file = "LICENSE.txt" }
 dependencies = [
    "pandas[feather,performance]~=2.2",
    "python-stdnum~=1.20",
    "requests~=2.32",
    "requests-cache~=1.2.1",
    "colorama~=0.4",
    "ftfy~=6.2.0",
    "country-converter~=1.2",
    "pycountry~=24.6.1",
    "py3langid~=0.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.9"
 classifiers = [
  "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
  "Natural Language :: English",
  "Operating System :: OS Independent",
  "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: Implementation :: CPython",
 ]
 [project.urls]
 repository = "https://github.com/ilri/csv-metadata-quality"
 homepage = "https://github.com/ilri/csv-metadata-quality"
-[tool.poetry.scripts]
+[project.scripts]
 csv-metadata-quality = 'csv_metadata_quality.__main__:main'
-[tool.poetry.dependencies]
+# So rye doesn't fall back to setuptools
-python = "^3.9"
+# See: https://packaging.python.org/en/latest/tutorials/packaging-projects/#choosing-build-backend
 pandas = "^1.5.2"
 python-stdnum = "^1.18"
 requests = "^2.28.2"
 requests-cache = "^0.9.8"
 langid = "^1.1.6"
 colorama = "^0.4.6"
 ftfy = "^6.1.1"
 country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
 pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
 [tool.poetry.dev-dependencies]
 pytest = "^7.2.1"
 flake8 = "^6.0.0"
 pytest-clarity = "^1.0.1"
 black = "^23.1.0"
 isort = "^5.12.0"
 csvkit = "^1.1.0"
 [tool.poetry.group.dev.dependencies]
 ipython = "^8.10.0"
 [build-system]
-requires = ["poetry>=0.12"]
+requires = ["hatchling"]
-build-backend = "poetry.masonry.api"
+build-backend = "hatchling.build"
 [tool.rye]
 managed = true
 dev-dependencies = [
    "pytest~=8.3",
    "pytest-clarity~=1.0",
    "isort~=5.13",
    "csvkit~=2.0",
    "ipython~=8.26",
    "fixit~=2.1",
 ]
 # So hatch doesn't try to build other top-level directories like "data"
 [tool.hatch.build.targets.wheel]
 packages = ["csv_metadata_quality"]
 [tool.isort]
 profile = "black"
--- a/renovate.json
+++ b/renovate.json
@@ -0,0 +1,9 @@
 {
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
  "extends": [
    "config:base"
  ],
  "pip_requirements": {
      "enabled": false
  }
 }
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -0,0 +1,188 @@
 # generated by rye
 # use `rye lock` or `rye sync` to update this lockfile
 #
 # last locked with the following flags:
 #   pre: false
 #   features: []
 #   all-features: false
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
 -e file:.
 agate==1.10.2
    # via agate-dbf
    # via agate-excel
    # via agate-sql
    # via csvkit
 agate-dbf==0.2.3
    # via csvkit
 agate-excel==0.4.1
    # via csvkit
 agate-sql==0.7.2
    # via csvkit
 asttokens==2.4.1
    # via stack-data
 attrs==23.2.0
    # via cattrs
    # via requests-cache
 babel==2.15.0
    # via agate
 bottleneck==1.3.8
    # via pandas
 cattrs==23.2.3
    # via requests-cache
 certifi==2024.2.2
    # via requests
 charset-normalizer==3.3.2
    # via requests
 click==8.1.7
    # via fixit
    # via moreorless
 colorama==0.4.6
    # via csv-metadata-quality
 country-converter==1.2
    # via csv-metadata-quality
 csvkit==2.0.1
 dbfread==2.0.7
    # via agate-dbf
 decorator==5.1.1
    # via ipython
 et-xmlfile==1.1.0
    # via openpyxl
 executing==2.0.1
    # via stack-data
 fixit==2.1.0
 ftfy==6.2.0
    # via csv-metadata-quality
 greenlet==3.0.3
    # via sqlalchemy
 idna==3.7
    # via requests
 iniconfig==2.0.0
    # via pytest
 ipython==8.26.0
 isodate==0.6.1
    # via agate
 isort==5.13.2
 jedi==0.19.1
    # via ipython
 leather==0.4.0
    # via agate
 libcst==1.4.0
    # via fixit
 llvmlite==0.43.0
    # via numba
 markdown-it-py==3.0.0
    # via rich
 matplotlib-inline==0.1.7
    # via ipython
 mdurl==0.1.2
    # via markdown-it-py
 moreorless==0.4.0
    # via fixit
 numba==0.60.0
    # via pandas
 numexpr==2.10.0
    # via pandas
 numpy==2.0.0
    # via bottleneck
    # via numba
    # via numexpr
    # via pandas
    # via py3langid
    # via pyarrow
 olefile==0.47
    # via agate-excel
 openpyxl==3.1.2
    # via agate-excel
    # via csvkit
 packaging==24.0
    # via fixit
    # via pytest
 pandas==2.2.2
    # via country-converter
    # via csv-metadata-quality
 parsedatetime==2.6
    # via agate
 parso==0.8.4
    # via jedi
 pathspec==0.12.1
    # via trailrunner
 pexpect==4.9.0
    # via ipython
 platformdirs==4.2.2
    # via requests-cache
 pluggy==1.5.0
    # via pytest
 pprintpp==0.4.0
    # via pytest-clarity
 prompt-toolkit==3.0.43
    # via ipython
 ptyprocess==0.7.0
    # via pexpect
 pure-eval==0.2.2
    # via stack-data
 py3langid==0.3.0
    # via csv-metadata-quality
 pyarrow==16.1.0
    # via pandas
 pycountry==24.6.1
    # via csv-metadata-quality
 pygments==2.18.0
    # via ipython
    # via rich
 pytest==8.3.2
    # via pytest-clarity
 pytest-clarity==1.0.1
 python-dateutil==2.9.0.post0
    # via pandas
 python-slugify==8.0.4
    # via agate
 python-stdnum==1.20
    # via csv-metadata-quality
 pytimeparse==1.1.8
    # via agate
 pytz==2024.1
    # via pandas
 pyyaml==6.0.1
    # via libcst
 requests==2.32.2
    # via csv-metadata-quality
    # via requests-cache
 requests-cache==1.2.1
    # via csv-metadata-quality
 rich==13.7.1
    # via pytest-clarity
 six==1.16.0
    # via asttokens
    # via isodate
    # via python-dateutil
    # via url-normalize
 sqlalchemy==2.0.30
    # via agate-sql
    # via csvkit
 stack-data==0.6.3
    # via ipython
 text-unidecode==1.3
    # via python-slugify
 trailrunner==1.4.0
    # via fixit
 traitlets==5.14.3
    # via ipython
    # via matplotlib-inline
 typing-extensions==4.11.0
    # via sqlalchemy
 tzdata==2024.1
    # via pandas
 url-normalize==1.4.3
    # via requests-cache
 urllib3==2.2.1
    # via requests
    # via requests-cache
 wcwidth==0.2.13
    # via ftfy
    # via prompt-toolkit
 xlrd==2.0.1
    # via agate-excel
    # via csvkit
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,80 +0,0 @@
 agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
 agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
 agate-sql==0.5.9 ; python_version >= "3.9" and python_version < "4.0"
 agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
 appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
 appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
 asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
 attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
 backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
 black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
 charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
 click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
 country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
 csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
 dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
 decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
 et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
 exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
 executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
 flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
 ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
 greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
 idna==3.4 ; python_version >= "3.9" and python_version < "4"
 iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
 ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
 isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
 jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
 langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
 leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
 markdown-it-py==2.2.0 ; python_version >= "3.9" and python_version < "4.0"
 matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
 mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
 mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
 mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
 olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
 openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
 packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
 pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
 parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
 parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
 pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
 pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
 platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
 pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
 prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
 ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
 pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
 pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
 pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
 pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
 pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
 pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
 python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
 python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
 pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
 pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
 requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
 requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
 rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
 six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
 stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
 text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
 tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
 traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
 url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
 urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
 wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
 xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
--- a/requirements.lock
+++ b/requirements.lock
@@ -0,0 +1,78 @@
 # generated by rye
 # use `rye lock` or `rye sync` to update this lockfile
 #
 # last locked with the following flags:
 #   pre: false
 #   features: []
 #   all-features: false
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
 -e file:.
 attrs==23.2.0
    # via cattrs
    # via requests-cache
 bottleneck==1.3.8
    # via pandas
 cattrs==23.2.3
    # via requests-cache
 certifi==2024.2.2
    # via requests
 charset-normalizer==3.3.2
    # via requests
 colorama==0.4.6
    # via csv-metadata-quality
 country-converter==1.2
    # via csv-metadata-quality
 ftfy==6.2.0
    # via csv-metadata-quality
 idna==3.7
    # via requests
 llvmlite==0.43.0
    # via numba
 numba==0.60.0
    # via pandas
 numexpr==2.10.0
    # via pandas
 numpy==2.0.0
    # via bottleneck
    # via numba
    # via numexpr
    # via pandas
    # via py3langid
    # via pyarrow
 pandas==2.2.2
    # via country-converter
    # via csv-metadata-quality
 platformdirs==4.2.2
    # via requests-cache
 py3langid==0.3.0
    # via csv-metadata-quality
 pyarrow==16.1.0
    # via pandas
 pycountry==24.6.1
    # via csv-metadata-quality
 python-dateutil==2.9.0.post0
    # via pandas
 python-stdnum==1.20
    # via csv-metadata-quality
 pytz==2024.1
    # via pandas
 requests==2.32.2
    # via csv-metadata-quality
    # via requests-cache
 requests-cache==1.2.1
    # via csv-metadata-quality
 six==1.16.0
    # via python-dateutil
    # via url-normalize
 tzdata==2024.1
    # via pandas
 url-normalize==1.4.3
    # via requests-cache
 urllib3==2.2.1
    # via requests
    # via requests-cache
 wcwidth==0.2.13
    # via ftfy
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,23 +0,0 @@
 appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
 attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
 charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
 country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
 exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
 ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
 idna==3.4 ; python_version >= "3.9" and python_version < "4"
 langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
 numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
 pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
 pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
 python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
 python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
 pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
 requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
 requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
 six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
 urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
 wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
--- a/setup.py
+++ b/setup.py
@@ -1,36 +0,0 @@
 import setuptools
 with open("README.md", "r") as fh:
    long_description = fh.read()
 install_requires = [
    "pandas",
    "python-stdnum",
    "requests",
    "requests-cache",
    "pycountry",
    "langid",
 ]
 setuptools.setup(
    name="csv-metadata-quality",
    version="0.6.1",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
    license="GPLv3",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/alanorth/csv-metadata-quality",
    classifiers=[
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
    ],
    packages=["csv_metadata_quality"],
    entry_points={
        "console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
    },
    install_requires=install_requires,
 )
--- a/tests/test_check.py
+++ b/tests/test_check.py
@@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "es"
-    exclude = list()
+    exclude = []
    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "spa"
-    exclude = list()
+    exclude = []
    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "en"
-    exclude = list()
+    exclude = []
    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():
    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "eng"
-    exclude = list()
+    exclude = []
    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
@@ -407,7 +407,7 @@ def test_check_doi_field():
    # the citation and a DOI field.
    d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
    series = pd.Series(data=d)
-    exclude = list()
+    exclude = []
    result = check.citation_doi(series, exclude)
@@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
    """Test an item with a DOI in its citation, but no DOI field."""
    citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series), with
    # an empty DOI field and a citation containing a DOI.
@@ -439,7 +439,7 @@ def test_title_in_citation():
    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all the things."
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
@@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):
    title = "Testing all the things"
    citation = "Orth, A. 2021. Testing all teh things."
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series), with
    # the title and citation.
@@ -477,7 +477,7 @@ def test_country_matches_region():
    country = "Kenya"
    region = "Eastern Africa"
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series)
    d = {"cg.coverage.country": country, "cg.coverage.region": region}
@@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
    country = "Kenya"
    region = ""
    missing_region = "Eastern Africa"
-    exclude = list()
+    exclude = []
    # Emulate a column in a transposed dataframe (which is just a series)
    d = {
@@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
    series_correct = pd.Series(data=d_correct)
    pd.testing.assert_series_equal(result, series_correct)
 def test_fix_normalize_dois():
    """Test normalizing a DOI."""
    value = "doi: 10.11648/j.jps.20140201.14"
    assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"
@@ -0,0 +1 @@
 include csv_metadata_quality/data/licenses.json