Mirror of https://github.com/ilri/csv-metadata-quality.git (synced 2025-05-09 14:46:00 +02:00)

Compare commits: v0.6.1...81e3ca3d9c (119 commits)

Commits in this range (SHA1):
81e3ca3d9c, c470f8b375, 0f45448517, 7dd52ca491, 92ff0ee51b, ae38a826ec, c1f630c298, 82b056f0ea, 7fca981b95, 1a9424197b,
f6c6c94a1e, f500fac64b, 8143a7d978, 94cec080d6, 9402af1e30, d71ff9082b, f309b694c4, 4d879f6d13, a30fefcd52, 2341c56c40,
5be2195325, 736948ed2c, ee0b448355, 4f3174a543, d5c25f82fa, 7b3e2b4e68, f92b2fe206, df040b70c7, 10bc8f3e14, 7e6e92ecaa,
a21ffb0fa8, fb341dd9fa, 2e943ee4db, 6d3a9870d6, 82ecf7119a, 1db21cf275, bcd1408798, ee8d255811, 2cc2dbe952, 940a325d61,
59b3b307c9, b305da3f0b, 96a486471c, 530cd5863b, f6018c51b6, 80c3f5b45a, ba4637ea34, 355428a691, 58d4de973e, e1216dae3c,
6b650ff1b3, fa7bde6fc0, f89159fe32, 02058c5a65, 8fed6b71ff, b005b28cbe, c626290599, 1a06470b64, d46a81672e, 2a50e75082,
0d45e73983, 3611aab425, 5c4ad0eb41, f1f39722f6, 1c03999582, 1f637f32cd, b8241e919d, b8dc19cc3f, 93c9b739ac, 4ed2786703,
8728789183, bf90464809, 1878002391, d21d2621e3, f3fb1ff7fb, 1fa81f7558, 7409193b6b, a84fcf0b7b, 25ac290df4, 3f52bad1e3,
0208ad0ade, 3632ae0fc9, 17d089cc6e, bc470a4343, be609a809d, de3387ded7, f343e87f0c, 7d3524fbd5, c614b71a52, d159a839f3,
36e2ebe5f4, 33f67b7a7c, c0e1448439, 5d0804a08f, f01c9edf17, 8d4295b2b3, e2d46e9495, 1491e1edb0, 34142c3e6b, 0c88b96e8d,
2e55b4d6e3, c90aad29f0, 6fd1e1377f, c64b7eb1f1, 29cbc4f3a3, 307af1acfc, b5106de9df, 9eeadfc44e, d4aed378cf, 20a2cce34b,
d661ffe439, 45a310387a, 47b03c49ba, 986b81cbf4, d43a47ae32, ede37569f1, 0c53efe60a, 5f0e25b818, 4776154d6c
.drone.yml (70 changed lines)

@@ -1,3 +1,33 @@
---
kind: pipeline
type: docker
name: python311

steps:
- name: test
image: python:3.11-slim
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

---
kind: pipeline
type: docker

@@ -10,23 +40,23 @@ steps:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

---
kind: pipeline

@@ -40,22 +70,22 @@ steps:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

# vim: ts=2 sw=2 et
.github/workflows/python-app.yml (vendored, 36 changed lines)

@@ -15,37 +15,27 @@ jobs:
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v4
- uses: actions/checkout@v4
- name: Install rye
uses: eifinger/setup-rye@v4
with:
python-version: '3.10'
cache: 'pip'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
- name: Lint with flake8
version: 'latest'
- run: rye sync
- name: Lint
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
rye lint
- name: Test with pytest
run: |
pytest
run: rye test
- name: Test CLI
run: |
python setup.py install
# Basic test
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Test with unsafe fixes
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Test with experimental checks
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
.python-version (new file, 1 line)

@@ -0,0 +1 @@
3.12
CHANGELOG.md (20 changed lines)

@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
### Added
- Ability to normalize DOIs to https://doi.org URI format

### Fixed
- Fixed regex so we don't run the invalid multi-value separator fix on `dcterms.bibliographicCitation` fields
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation` fields
- Don't crash the country/region checker/fixer when a title field is missing

### Changed
- Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)

### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
- SPDX license list

## [0.6.1] - 2023-02-23
### Fixed
- Missing region check should ignore subregion field, if it exists
MANIFEST.in (new file, 1 line)

@@ -0,0 +1 @@
include csv_metadata_quality/data/licenses.json
README.md

@@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
- Remove duplicate metadata values
- Check for duplicate items, using the title, type, and date issued as an indicator
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format

## Installation
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):

@@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Better logging, for example with INFO, WARN, and ERR levels
- Verbose, debug, or quiet options
- Warn if an author is shorter than 3 characters?
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
- Warn if two items use the same file in `filename` column
- Add an option to drop invalid AGROVOC subjects?
- Add tests for application invocation, ie `tests/test_app.py`?
- Validate ISSNs or journal titles against CrossRef API?
- Add configurable field validation, like specify a field name and a validation file?

@@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Warn if item is Open Access, but missing a license
- Warn if item has an ISSN but no journal title
- Update journal titles from ISSN
- Migrate to https://github.com/spdx/license-list-data
- Migrate from Pandas to Polars

## License
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
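The duplicate-item check listed above keys items on a concatenation of title, type, and date issued. A minimal pandas sketch of that indicator (the column names here are only illustrative, not the tool's configuration):

    import pandas as pd

    # Hypothetical frame with the three indicator fields
    df = pd.DataFrame(
        {
            "dc.title": ["Testing all the things", "Testing all the things"],
            "dcterms.type": ["Report", "Report"],
            "dcterms.issued": ["2021-03-11", "2021-03-11"],
        }
    )

    # Build a title+type+date key per row and flag repeated keys as possible duplicates
    keys = df["dc.title"] + df["dcterms.type"] + df["dcterms.issued"]
    for title in df.loc[keys.duplicated(), "dc.title"]:
        print(f"Possible duplicate item: {title}")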
csv_metadata_quality/app.py

@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-only

import argparse
import os
import re
import signal
import sys
from datetime import timedelta

import pandas as pd
import requests_cache
from colorama import Fore

import csv_metadata_quality.check as check

@@ -74,7 +77,7 @@ def run(argv):
signal.signal(signal.SIGINT, signal_handler)

# Read all fields as strings so dates don't get converted from 1998 to 1998.0
df = pd.read_csv(args.input_file, dtype=str)
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")

# Check if the user requested to skip any fields
if args.exclude_fields:

@@ -82,7 +85,20 @@ def run(argv):
# user should be careful to no include spaces here.
exclude = args.exclude_fields.split(",")
else:
exclude = list()
exclude = []

# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)

# prune old cache entries
requests_cache.delete()

for column in df.columns:
if column in exclude:

@@ -91,7 +107,9 @@ def run(argv):
continue

if args.unsafe_fixes:
match = re.match(r"^.*?abstract.*$", column)
# Skip whitespace and newline fixes on abstracts and descriptions
# because there are too many with legitimate multi-line metadata.
match = re.match(r"^.*?(abstract|description).*$", column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)

@@ -102,7 +120,7 @@ def run(argv):
# Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names.
if args.unsafe_fixes:
match = re.match(r"^.*?(author|citation).*$", column)
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
if match is not None:
df[column] = df[column].apply(fix.comma_space, field_name=column)

@@ -123,10 +141,15 @@ def run(argv):
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)

# Fix: normalize DOIs
match = re.match(r"^.*?identifier\.doi.*$", column)
if match is not None:
df[column] = df[column].apply(fix.normalize_dois)

# Fix: invalid and unnecessary multi-value separators. Skip the title
# and abstract fields because "|" is used to indicate something like
# a subtitle.
match = re.match(r"^.*?(abstract|title).*$", column)
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
if match is None:
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators
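For reference, a small standalone illustration (not part of the tool) of how the column-name regexes above gate the fixes; the updated citation pattern also matches DSpace-style columns that capitalize "Citation". The author column name here is only an assumed example:

    import re

    columns = ["dc.contributor.author", "dcterms.bibliographicCitation", "cg.identifier.doi"]

    for column in columns:
        # Same pattern as the comma-space gate in run()
        if re.match(r"^.*?(author|[Cc]itation).*$", column):
            print(f"comma-space fix would run on: {column}")
        # Same pattern as the DOI normalization gate in run()
        if re.match(r"^.*?identifier\.doi.*$", column):
            print(f"DOI normalization would run on: {column}")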
csv_metadata_quality/check.py

@@ -1,14 +1,12 @@
# SPDX-License-Identifier: GPL-3.0-only

import logging
import os
import re
from datetime import datetime, timedelta
from datetime import datetime

import country_converter as coco
import pandas as pd
import requests
import requests_cache
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn

@@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
return

# List of suspicious characters, for example: ́ˆ~`
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]

for character in suspicious_characters:
# Find the position of the suspicious character in the string

@@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
if pd.isna(field):
return

# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)

# prune old cache entries
# requests_cache.remove_expired_responses()

# Initialize an empty list to hold the validated AGROVOC values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_params = {"query": value}

request = requests.get(request_url, params=request_params)

@@ -373,7 +358,7 @@ def duplicate_items(df):

if items_count_unique < items_count_total:
# Create a list to hold our items while we check for duplicates
items = list()
items = []

for index, row in df.iterrows():
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"

@@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
if row[region_column_name] is not None:
regions = row[region_column_name].split("||")
else:
regions = list()
regions = []

for country in countries:
# Look up the UN M.49 regions for this country code. CoCo seems to

@@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
un_region = cc.convert(names=country, to="UNRegion")

if un_region != "not found" and un_region not in regions:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
)
try:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
)

return
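For context, a standalone sketch of the AGROVOC lookup that check.agrovoc() performs against the endpoint shown above. The assumption that the SKOSMOS-style JSON response carries a "results" list is not visible in this hunk and may differ:

    import requests

    request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
    response = requests.get(request_url, params={"query": "SOIL"})

    if response.status_code == 200:
        # An empty results list would mean the term is not a valid AGROVOC subject
        print(response.json().get("results", []))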
File diff suppressed because it is too large.
csv_metadata_quality/experimental.py

@@ -2,8 +2,8 @@

import re

import langid
import pandas as pd
import py3langid as langid
from colorama import Fore
from pycountry import languages

@@ -20,7 +20,7 @@ def correct_language(row, exclude):
# Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards.
language = ""
sample_strings = list()
sample_strings = []
title = None

# Iterate over the labels of the current row's values. Before we transposed
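A minimal sketch of the language detection that correct_language() builds on, assuming py3langid keeps langid's classify() API (it returns a language code and a score); the sample title is taken from the test suite:

    import py3langid as langid

    lang, score = langid.classify(
        "A randomised vaccine field trial in Kenya demonstrates protection against "
        "wildebeest-associated malignant catarrhal fever in cattle"
    )
    print(lang)  # expected to be "en" for this English title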
csv_metadata_quality/fix.py

@@ -23,7 +23,7 @@ def whitespace(field, field_name):
return

# Initialize an empty list to hold the cleaned values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):

@@ -64,7 +64,7 @@ def separators(field, field_name):
return

# Initialize an empty list to hold the cleaned values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):

@@ -175,7 +175,7 @@ def duplicates(field, field_name):
values = field.split("||")

# Initialize an empty list to hold the de-duplicated values
new_values = list()
new_values = []

# Iterate over all values
for value in values:

@@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
if row[region_column_name] is not None:
regions = row[region_column_name].split("||")
else:
regions = list()
regions = []

# An empty list for our regions so we can keep track for all countries
missing_regions = list()
missing_regions = []

for country in countries:
# Look up the UN M.49 regions for this country code. CoCo seems to

@@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
# it doesn't already exist in regions.
if un_region != "not found" and un_region not in regions:
if un_region not in missing_regions:
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
try:
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
# If there is no title column in the CSV we will print
# the fix without the title instead of crashing.
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
)

missing_regions.append(un_region)

if len(missing_regions) > 0:

@@ -387,3 +395,88 @@ def countries_match_regions(row, exclude):
row[region_column_name] = "||".join(missing_regions)

return row


def normalize_dois(field):
"""Normalize DOIs.

DOIs are meant to be globally unique identifiers. They are case insensitive,
but in order to compare them robustly they should be normalized to a common
format:

- strip leading and trailing whitespace
- lowercase all ASCII characters
- convert all variations to https://doi.org/10.xxxx/xxxx URI format

Return string with normalized DOI.

See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
"""

# Skip fields with missing values
if pd.isna(field):
return

# Try to split multi-value field on "||" separator
values = field.split("||")

# Initialize an empty list to hold the de-duplicated values
new_values = []

# Iterate over all values (most items will only have one DOI)
for value in values:
# Strip leading and trailing whitespace
new_value = value.strip()

new_value = new_value.lower()

# Convert to HTTPS
pattern = re.compile(r"^http://")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://", new_value)

# Convert dx.doi.org to doi.org
pattern = re.compile(r"dx\.doi\.org")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "doi.org", new_value)

# Convert www.doi.org to doi.org
pattern = re.compile(r"www\.doi\.org")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "doi.org", new_value)

# Convert erroneous %2f to /
pattern = re.compile("%2f")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "/", new_value)

# Replace values like doi: 10.11648/j.jps.20140201.14
pattern = re.compile(r"^doi: 10\.")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://doi.org/10.", new_value)

# Replace values like 10.3390/foods12010115
pattern = re.compile(r"^10\.")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://doi.org/10.", new_value)

if new_value != value:
print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")

new_values.append(new_value)

new_field = "||".join(new_values)

return new_field
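Example inputs and outputs for the new normalize_dois() fix, using DOI values from data/test.csv below and the module path used by the test suite:

    import csv_metadata_quality.fix as fix

    print(fix.normalize_dois("doi: 10.11648/j.jps.20140201.14"))
    # https://doi.org/10.11648/j.jps.20140201.14
    print(fix.normalize_dois("http://dx.doi.org/10.1016/j.envc.2023.100794"))
    # https://doi.org/10.1016/j.envc.2023.100794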
csv_metadata_quality/util.py

@@ -2,7 +2,7 @@


import json
from importlib.resources import files
import os

from ftfy.badness import is_bad

@@ -58,7 +58,7 @@ def is_mojibake(field):
def load_spdx_licenses():
"""Returns a Python list of SPDX short license identifiers."""

with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
licenses = json.load(f)

# List comprehension to extract the license ID for each license
data/test.csv

@@ -37,3 +37,7 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
poetry.lock (generated, 1756 changed lines)
File diff suppressed because it is too large.
pyproject.toml

@@ -1,41 +1,63 @@
[tool.poetry]
[project]
name = "csv-metadata-quality"
version = "0.6.1"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only"
authors = [
{ name = "Alan Orth", email = "alan.orth@gmail.com" }
]
license= { file = "LICENSE.txt" }
dependencies = [
"pandas[feather,performance]~=2.2",
"python-stdnum~=1.20",
"requests~=2.32",
"requests-cache~=1.2.1",
"colorama~=0.4",
"ftfy~=6.2.0",
"country-converter~=1.2",
"pycountry~=24.6.1",
"py3langid~=0.3",
]
readme = "README.md"
requires-python = ">= 3.9"

classifiers = [
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
]

[project.urls]
repository = "https://github.com/ilri/csv-metadata-quality"
homepage = "https://github.com/ilri/csv-metadata-quality"

[tool.poetry.scripts]
[project.scripts]
csv-metadata-quality = 'csv_metadata_quality.__main__:main'

[tool.poetry.dependencies]
python = "^3.9"
pandas = "^1.5.2"
python-stdnum = "^1.18"
requests = "^2.28.2"
requests-cache = "^0.9.8"
langid = "^1.1.6"
colorama = "^0.4.6"
ftfy = "^6.1.1"
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}

[tool.poetry.dev-dependencies]
pytest = "^7.2.1"
flake8 = "^6.0.0"
pytest-clarity = "^1.0.1"
black = "^23.1.0"
isort = "^5.12.0"
csvkit = "^1.1.0"

[tool.poetry.group.dev.dependencies]
ipython = "^8.10.0"

# So rye doesn't fall back to setuptools
# See: https://packaging.python.org/en/latest/tutorials/packaging-projects/#choosing-build-backend
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = [
"pytest~=8.3",
"pytest-clarity~=1.0",
"isort~=5.13",
"csvkit~=2.0",
"ipython~=8.26",
"fixit~=2.1",
]

# So hatch doesn't try to build other top-level directories like "data"
[tool.hatch.build.targets.wheel]
packages = ["csv_metadata_quality"]

[tool.isort]
profile = "black"
renovate.json (new file, 9 lines)

@@ -0,0 +1,9 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
],
"pip_requirements": {
"enabled": false
}
}
requirements-dev.lock (new file, 188 lines)

@@ -0,0 +1,188 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
#   pre: false
#   features: []
#   all-features: false
#   with-sources: false
#   generate-hashes: false
#   universal: false

-e file:.
agate==1.10.2
# via agate-dbf
# via agate-excel
# via agate-sql
# via csvkit
agate-dbf==0.2.3
# via csvkit
agate-excel==0.4.1
# via csvkit
agate-sql==0.7.2
# via csvkit
asttokens==2.4.1
# via stack-data
attrs==23.2.0
# via cattrs
# via requests-cache
babel==2.15.0
# via agate
bottleneck==1.3.8
# via pandas
cattrs==23.2.3
# via requests-cache
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via fixit
# via moreorless
colorama==0.4.6
# via csv-metadata-quality
country-converter==1.2
# via csv-metadata-quality
csvkit==2.0.1
dbfread==2.0.7
# via agate-dbf
decorator==5.1.1
# via ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
fixit==2.1.0
ftfy==6.2.0
# via csv-metadata-quality
greenlet==3.0.3
# via sqlalchemy
idna==3.7
# via requests
iniconfig==2.0.0
# via pytest
ipython==8.26.0
isodate==0.6.1
# via agate
isort==5.13.2
jedi==0.19.1
# via ipython
leather==0.4.0
# via agate
libcst==1.4.0
# via fixit
llvmlite==0.43.0
# via numba
markdown-it-py==3.0.0
# via rich
matplotlib-inline==0.1.7
# via ipython
mdurl==0.1.2
# via markdown-it-py
moreorless==0.4.0
# via fixit
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
# via pandas
# via py3langid
# via pyarrow
olefile==0.47
# via agate-excel
openpyxl==3.1.2
# via agate-excel
# via csvkit
packaging==24.0
# via fixit
# via pytest
pandas==2.2.2
# via country-converter
# via csv-metadata-quality
parsedatetime==2.6
# via agate
parso==0.8.4
# via jedi
pathspec==0.12.1
# via trailrunner
pexpect==4.9.0
# via ipython
platformdirs==4.2.2
# via requests-cache
pluggy==1.5.0
# via pytest
pprintpp==0.4.0
# via pytest-clarity
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==24.6.1
# via csv-metadata-quality
pygments==2.18.0
# via ipython
# via rich
pytest==8.3.2
# via pytest-clarity
pytest-clarity==1.0.1
python-dateutil==2.9.0.post0
# via pandas
python-slugify==8.0.4
# via agate
python-stdnum==1.20
# via csv-metadata-quality
pytimeparse==1.1.8
# via agate
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via libcst
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.1
# via csv-metadata-quality
rich==13.7.1
# via pytest-clarity
six==1.16.0
# via asttokens
# via isodate
# via python-dateutil
# via url-normalize
sqlalchemy==2.0.30
# via agate-sql
# via csvkit
stack-data==0.6.3
# via ipython
text-unidecode==1.3
# via python-slugify
trailrunner==1.4.0
# via fixit
traitlets==5.14.3
# via ipython
# via matplotlib-inline
typing-extensions==4.11.0
# via sqlalchemy
tzdata==2024.1
# via pandas
url-normalize==1.4.3
# via requests-cache
urllib3==2.2.1
# via requests
# via requests-cache
wcwidth==0.2.13
# via ftfy
# via prompt-toolkit
xlrd==2.0.1
# via agate-excel
# via csvkit
requirements-dev.txt (deleted, 80 lines)

@@ -1,80 +0,0 @@
agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
agate-sql==0.5.9 ; python_version >= "3.9" and python_version < "4.0"
agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
markdown-it-py==2.2.0 ; python_version >= "3.9" and python_version < "4.0"
matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
requirements.lock (new file, 78 lines)

@@ -0,0 +1,78 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
#   pre: false
#   features: []
#   all-features: false
#   with-sources: false
#   generate-hashes: false
#   universal: false

-e file:.
attrs==23.2.0
# via cattrs
# via requests-cache
bottleneck==1.3.8
# via pandas
cattrs==23.2.3
# via requests-cache
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
colorama==0.4.6
# via csv-metadata-quality
country-converter==1.2
# via csv-metadata-quality
ftfy==6.2.0
# via csv-metadata-quality
idna==3.7
# via requests
llvmlite==0.43.0
# via numba
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
# via pandas
# via py3langid
# via pyarrow
pandas==2.2.2
# via country-converter
# via csv-metadata-quality
platformdirs==4.2.2
# via requests-cache
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==24.6.1
# via csv-metadata-quality
python-dateutil==2.9.0.post0
# via pandas
python-stdnum==1.20
# via csv-metadata-quality
pytz==2024.1
# via pandas
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.1
# via csv-metadata-quality
six==1.16.0
# via python-dateutil
# via url-normalize
tzdata==2024.1
# via pandas
url-normalize==1.4.3
# via requests-cache
urllib3==2.2.1
# via requests
# via requests-cache
wcwidth==0.2.13
# via ftfy
requirements.txt (deleted, 23 lines)

@@ -1,23 +0,0 @@
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
setup.py (deleted, 36 lines)

@@ -1,36 +0,0 @@
import setuptools

with open("README.md", "r") as fh:
long_description = fh.read()

install_requires = [
"pandas",
"python-stdnum",
"requests",
"requests-cache",
"pycountry",
"langid",
]

setuptools.setup(
name="csv-metadata-quality",
version="0.6.1",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
license="GPLv3",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/alanorth/csv-metadata-quality",
classifiers=[
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
],
packages=["csv_metadata_quality"],
entry_points={
"console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
},
install_requires=install_requires,
)
tests/test_check.py

@@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "es"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "spa"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "en"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "eng"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -407,7 +407,7 @@ def test_check_doi_field():
# the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
exclude = list()
exclude = []

result = check.citation_doi(series, exclude)

@@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field."""

citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI.

@@ -439,7 +439,7 @@ def test_title_in_citation():

title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things."
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.

@@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):

title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things."
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.

@@ -477,7 +477,7 @@ def test_country_matches_region():

country = "Kenya"
region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region}

@@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {
tests/test_fix.py

@@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {

@@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
series_correct = pd.Series(data=d_correct)

pd.testing.assert_series_equal(result, series_correct)


def test_fix_normalize_dois():
"""Test normalizing a DOI."""

value = "doi: 10.11648/j.jps.20140201.14"

assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"