mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-11-03 22:19:08 +01:00 
			
		
		
		
	Compare commits
	
		
			76 Commits
		
	
	
		
			v0.6.1
			...
			530cd5863b
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						
						
							
						
						530cd5863b
	
				 | 
					
					
						|||
| 
						
						
							
						
						f6018c51b6
	
				 | 
					
					
						|||
| 
						
						
							
						
						80c3f5b45a
	
				 | 
					
					
						|||
| ba4637ea34 | |||
| 355428a691 | |||
| 
						 | 
					58d4de973e | ||
| e1216dae3c | |||
| 
						 | 
					6b650ff1b3 | ||
| fa7bde6fc0 | |||
| 
						 | 
					f89159fe32 | ||
| 
						 | 
					02058c5a65 | ||
| 8fed6b71ff | |||
| b005b28cbe | |||
| 
						 | 
					c626290599 | ||
| 
						 | 
					1a06470b64 | ||
| d46a81672e | |||
| 2a50e75082 | |||
| 0d45e73983 | |||
| 
						 | 
					3611aab425 | ||
| 
						 | 
					5c4ad0eb41 | ||
| 
						 | 
					f1f39722f6 | ||
| 1c03999582 | |||
| 
						
						
							
						
						1f637f32cd
	
				 | 
					
					
						|||
| 
						
						
							
						
						b8241e919d
	
				 | 
					
					
						|||
| 
						
						
							
						
						b8dc19cc3f
	
				 | 
					
					
						|||
| 
						
						
							
						
						93c9b739ac
	
				 | 
					
					
						|||
| 
						
						
							
						
						4ed2786703
	
				 | 
					
					
						|||
| 
						 | 
					8728789183 | ||
| 
						
						
							
						
						bf90464809
	
				 | 
					
					
						|||
| 1878002391 | |||
| d21d2621e3 | |||
| f3fb1ff7fb | |||
| 1fa81f7558 | |||
| 
						 | 
					7409193b6b | ||
| 
						
						
							
						
						a84fcf0b7b
	
				 | 
					
					
						|||
| 
						
						
							
						
						25ac290df4
	
				 | 
					
					
						|||
| 
						
						
							
						
						3f52bad1e3
	
				 | 
					
					
						|||
| 0208ad0ade | |||
| 
						 | 
					3632ae0fc9 | ||
| 
						
						
							
						
						17d089cc6e
	
				 | 
					
					
						|||
| 
						
						
							
						
						bc470a4343
	
				 | 
					
					
						|||
| 
						
						
							
						
						be609a809d
	
				 | 
					
					
						|||
| 
						
						
							
						
						de3387ded7
	
				 | 
					
					
						|||
| 
						
						
							
						
						f343e87f0c
	
				 | 
					
					
						|||
| 
						
						
							
						
						7d3524fbd5
	
				 | 
					
					
						|||
| c614b71a52 | |||
| 
						 | 
					d159a839f3 | ||
| 
						
						
							
						
						36e2ebe5f4
	
				 | 
					
					
						|||
| 
						
						
							
						
						33f67b7a7c
	
				 | 
					
					
						|||
| 
						
						
							
						
						c0e1448439
	
				 | 
					
					
						|||
| 
						
						
							
						
						5d0804a08f
	
				 | 
					
					
						|||
| 
						
						
							
						
						f01c9edf17
	
				 | 
					
					
						|||
| 
						
						
							
						
						8d4295b2b3
	
				 | 
					
					
						|||
| 
						
						
							
						
						e2d46e9495
	
				 | 
					
					
						|||
| 
						
						
							
						
						1491e1edb0
	
				 | 
					
					
						|||
| 
						
						
							
						
						34142c3e6b
	
				 | 
					
					
						|||
| 
						
						
							
						
						0c88b96e8d
	
				 | 
					
					
						|||
| 
						
						
							
						
						2e55b4d6e3
	
				 | 
					
					
						|||
| 
						
						
							
						
						c90aad29f0
	
				 | 
					
					
						|||
| 
						
						
							
						
						6fd1e1377f
	
				 | 
					
					
						|||
| 
						
						
							
						
						c64b7eb1f1
	
				 | 
					
					
						|||
| 
						
						
							
						
						29cbc4f3a3
	
				 | 
					
					
						|||
| 
						
						
							
						
						307af1acfc
	
				 | 
					
					
						|||
| 
						
						
							
						
						b5106de9df
	
				 | 
					
					
						|||
| 
						
						
							
						
						9eeadfc44e
	
				 | 
					
					
						|||
| 
						
						
							
						
						d4aed378cf
	
				 | 
					
					
						|||
| 
						
						
							
						
						20a2cce34b
	
				 | 
					
					
						|||
| 
						
						
							
						
						d661ffe439
	
				 | 
					
					
						|||
| 
						
						
							
						
						45a310387a
	
				 | 
					
					
						|||
| 
						
						
							
						
						47b03c49ba
	
				 | 
					
					
						|||
| 
						
						
							
						
						986b81cbf4
	
				 | 
					
					
						|||
| 
						
						
							
						
						d43a47ae32
	
				 | 
					
					
						|||
| 
						
						
							
						
						ede37569f1
	
				 | 
					
					
						|||
| 
						
						
							
						
						0c53efe60a
	
				 | 
					
					
						|||
| 
						
						
							
						
						5f0e25b818
	
				 | 
					
					
						|||
| 
						
						
							
						
						4776154d6c
	
				 | 
					
					
						
							
								
								
									
										70
									
								
								.drone.yml
									
									
									
									
									
								
							
							
						
						
									
										70
									
								
								.drone.yml
									
									
									
									
									
								
							@@ -1,3 +1,33 @@
 | 
			
		||||
---
 | 
			
		||||
kind: pipeline
 | 
			
		||||
type: docker
 | 
			
		||||
name: python311
 | 
			
		||||
 | 
			
		||||
steps:
 | 
			
		||||
- name: test
 | 
			
		||||
  image: python:3.11-slim
 | 
			
		||||
  commands:
 | 
			
		||||
  - id
 | 
			
		||||
  - python -V
 | 
			
		||||
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
 | 
			
		||||
  - python -m pip install poetry
 | 
			
		||||
  - poetry install
 | 
			
		||||
  - poetry run pytest
 | 
			
		||||
  # Basic test
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
  # Basic test with unsafe fixes
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Geography test
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
 | 
			
		||||
  # Geography test with unsafe fixes
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Test with experimental checks
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
  # Test with AGROVOC validation
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
  # Test with AGROVOC validation (and dropping invalid)
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
kind: pipeline
 | 
			
		||||
type: docker
 | 
			
		||||
@@ -10,23 +40,23 @@ steps:
 | 
			
		||||
  - id
 | 
			
		||||
  - python -V
 | 
			
		||||
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
 | 
			
		||||
  - pip install -r requirements-dev.txt
 | 
			
		||||
  - pytest
 | 
			
		||||
  - python setup.py install
 | 
			
		||||
  - python -m pip install poetry
 | 
			
		||||
  - poetry install
 | 
			
		||||
  - poetry run pytest
 | 
			
		||||
  # Basic test
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
  # Basic test with unsafe fixes
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Geography test
 | 
			
		||||
  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
 | 
			
		||||
  # Geography test with unsafe fixes
 | 
			
		||||
  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Test with experimental checks
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
  # Test with AGROVOC validation
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
  # Test with AGROVOC validation (and dropping invalid)
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
kind: pipeline
 | 
			
		||||
@@ -40,22 +70,22 @@ steps:
 | 
			
		||||
  - id
 | 
			
		||||
  - python -V
 | 
			
		||||
  - apt update && apt install -y gcc g++ libicu-dev pkg-config git
 | 
			
		||||
  - pip install -r requirements-dev.txt
 | 
			
		||||
  - pytest
 | 
			
		||||
  - python setup.py install
 | 
			
		||||
  - python -m pip install poetry
 | 
			
		||||
  - poetry install
 | 
			
		||||
  - poetry run pytest
 | 
			
		||||
  # Basic test
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
  # Basic test with unsafe fixes
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Geography test
 | 
			
		||||
  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
 | 
			
		||||
  # Geography test with unsafe fixes
 | 
			
		||||
  - csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
 | 
			
		||||
  # Test with experimental checks
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
  # Test with AGROVOC validation
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
  # Test with AGROVOC validation (and dropping invalid)
 | 
			
		||||
  - csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
  - poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
 | 
			
		||||
# vim: ts=2 sw=2 et
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										36
									
								
								.github/workflows/python-app.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										36
									
								
								.github/workflows/python-app.yml
									
									
									
									
										vendored
									
									
								
							@@ -15,37 +15,31 @@ jobs:
 | 
			
		||||
    runs-on: ubuntu-22.04
 | 
			
		||||
 | 
			
		||||
    steps:
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
    - name: Set up Python 3.10
 | 
			
		||||
      uses: actions/setup-python@v4
 | 
			
		||||
    - uses: actions/checkout@v4
 | 
			
		||||
    - name: Install poetry
 | 
			
		||||
      run: pipx install poetry
 | 
			
		||||
    - uses: actions/setup-python@v4
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: '3.10'
 | 
			
		||||
        cache: 'pip'
 | 
			
		||||
    - name: Install dependencies
 | 
			
		||||
      run: |
 | 
			
		||||
        python -m pip install --upgrade pip
 | 
			
		||||
        pip install flake8 pytest
 | 
			
		||||
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
 | 
			
		||||
        if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
 | 
			
		||||
        python-version: '3.11'
 | 
			
		||||
        cache: 'poetry'
 | 
			
		||||
    - run: poetry install
 | 
			
		||||
    - name: Lint with flake8
 | 
			
		||||
      run: |
 | 
			
		||||
        # stop the build if there are Python syntax errors or undefined names
 | 
			
		||||
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
 | 
			
		||||
        poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
 | 
			
		||||
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
 | 
			
		||||
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 | 
			
		||||
        poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 | 
			
		||||
    - name: Test with pytest
 | 
			
		||||
      run: |
 | 
			
		||||
        pytest
 | 
			
		||||
      run: poetry run pytest
 | 
			
		||||
    - name: Test CLI
 | 
			
		||||
      run: |
 | 
			
		||||
        python setup.py install
 | 
			
		||||
        # Basic test
 | 
			
		||||
        csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
 | 
			
		||||
        # Test with unsafe fixes
 | 
			
		||||
        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
 | 
			
		||||
        # Test with experimental checks
 | 
			
		||||
        csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
 | 
			
		||||
        # Test with AGROVOC validation
 | 
			
		||||
        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
 | 
			
		||||
        # Test with AGROVOC validation (and dropping invalid)
 | 
			
		||||
        csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
        poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										15
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								CHANGELOG.md
									
									
									
									
									
								
							@@ -4,6 +4,21 @@ All notable changes to this project will be documented in this file.
 | 
			
		||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 | 
			
		||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 | 
			
		||||
 | 
			
		||||
## Unreleased
 | 
			
		||||
### Fixed
 | 
			
		||||
- Fixed regex so we don't run the invalid multi-value separator fix on
 | 
			
		||||
`dcterms.bibliographicCitation` fields
 | 
			
		||||
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
 | 
			
		||||
fields
 | 
			
		||||
- Don't crash the country/region checker/fixer when a title field is missing
 | 
			
		||||
 | 
			
		||||
### Changed
 | 
			
		||||
- Don't run newline fix on description fields
 | 
			
		||||
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
 | 
			
		||||
 | 
			
		||||
### Updated
 | 
			
		||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
 | 
			
		||||
 | 
			
		||||
## [0.6.1] - 2023-02-23
 | 
			
		||||
### Fixed
 | 
			
		||||
- Missing region check should ignore subregion field, if it exists
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
include csv_metadata_quality/data/licenses.json
 | 
			
		||||
@@ -127,7 +127,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 | 
			
		||||
- Warn if an author is shorter than 3 characters?
 | 
			
		||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
 | 
			
		||||
- Warn if two items use the same file in `filename` column
 | 
			
		||||
- Add an option to drop invalid AGROVOC subjects?
 | 
			
		||||
- Add tests for application invocation, ie `tests/test_app.py`?
 | 
			
		||||
- Validate ISSNs or journal titles against CrossRef API?
 | 
			
		||||
- Add configurable field validation, such as specifying a field name and a validation file?
 | 
			
		||||
@@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
 | 
			
		||||
  - Warn if item is Open Access, but missing a license
 | 
			
		||||
  - Warn if item has an ISSN but no journal title
 | 
			
		||||
  - Update journal titles from ISSN
 | 
			
		||||
- Migrate to https://github.com/spdx/license-list-data
 | 
			
		||||
- Migrate from Pandas to Polars
 | 
			
		||||
 | 
			
		||||
## License
 | 
			
		||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,14 @@
 | 
			
		||||
# SPDX-License-Identifier: GPL-3.0-only
 | 
			
		||||
 | 
			
		||||
import argparse
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import signal
 | 
			
		||||
import sys
 | 
			
		||||
from datetime import timedelta
 | 
			
		||||
 | 
			
		||||
import pandas as pd
 | 
			
		||||
import requests_cache
 | 
			
		||||
from colorama import Fore
 | 
			
		||||
 | 
			
		||||
import csv_metadata_quality.check as check
 | 
			
		||||
@@ -74,7 +77,7 @@ def run(argv):
 | 
			
		||||
    signal.signal(signal.SIGINT, signal_handler)
 | 
			
		||||
 | 
			
		||||
    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
 | 
			
		||||
    df = pd.read_csv(args.input_file, dtype=str)
 | 
			
		||||
    df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
 | 
			
		||||
 | 
			
		||||
    # Check if the user requested to skip any fields
 | 
			
		||||
    if args.exclude_fields:
 | 
			
		||||
@@ -82,7 +85,20 @@ def run(argv):
 | 
			
		||||
        # user should be careful to not include spaces here.
 | 
			
		||||
        exclude = args.exclude_fields.split(",")
 | 
			
		||||
    else:
 | 
			
		||||
        exclude = list()
 | 
			
		||||
        exclude = []
 | 
			
		||||
 | 
			
		||||
    # enable transparent request cache with thirty days expiry
 | 
			
		||||
    expire_after = timedelta(days=30)
 | 
			
		||||
    # Allow overriding the location of the requests cache, just in case we are
 | 
			
		||||
    # running in an environment where we can't write to the current working di-
 | 
			
		||||
    # rectory (for example from csv-metadata-quality-web).
 | 
			
		||||
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
 | 
			
		||||
    requests_cache.install_cache(
 | 
			
		||||
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # prune old cache entries
 | 
			
		||||
    requests_cache.delete()
 | 
			
		||||
 | 
			
		||||
    for column in df.columns:
 | 
			
		||||
        if column in exclude:
 | 
			
		||||
@@ -91,7 +107,9 @@ def run(argv):
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        if args.unsafe_fixes:
 | 
			
		||||
            match = re.match(r"^.*?abstract.*$", column)
 | 
			
		||||
            # Skip whitespace and newline fixes on abstracts and descriptions
 | 
			
		||||
            # because there are too many with legitimate multi-line metadata.
 | 
			
		||||
            match = re.match(r"^.*?(abstract|description).*$", column)
 | 
			
		||||
            if match is None:
 | 
			
		||||
                # Fix: whitespace
 | 
			
		||||
                df[column] = df[column].apply(fix.whitespace, field_name=column)
 | 
			
		||||
@@ -102,7 +120,7 @@ def run(argv):
 | 
			
		||||
        # Fix: missing space after comma. Only run on author and citation
 | 
			
		||||
        # fields for now, as this problem is mostly an issue in names.
 | 
			
		||||
        if args.unsafe_fixes:
 | 
			
		||||
            match = re.match(r"^.*?(author|citation).*$", column)
 | 
			
		||||
            match = re.match(r"^.*?(author|[Cc]itation).*$", column)
 | 
			
		||||
            if match is not None:
 | 
			
		||||
                df[column] = df[column].apply(fix.comma_space, field_name=column)
 | 
			
		||||
 | 
			
		||||
@@ -126,7 +144,7 @@ def run(argv):
 | 
			
		||||
        # Fix: invalid and unnecessary multi-value separators. Skip the title
 | 
			
		||||
        # and abstract fields because "|" is used to indicate something like
 | 
			
		||||
        # a subtitle.
 | 
			
		||||
        match = re.match(r"^.*?(abstract|title).*$", column)
 | 
			
		||||
        match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
 | 
			
		||||
        if match is None:
 | 
			
		||||
            df[column] = df[column].apply(fix.separators, field_name=column)
 | 
			
		||||
            # Run whitespace fix again after fixing invalid separators
 | 
			
		||||
 
 | 
			
		||||
@@ -1,14 +1,12 @@
 | 
			
		||||
# SPDX-License-Identifier: GPL-3.0-only
 | 
			
		||||
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
 | 
			
		||||
import country_converter as coco
 | 
			
		||||
import pandas as pd
 | 
			
		||||
import requests
 | 
			
		||||
import requests_cache
 | 
			
		||||
from colorama import Fore
 | 
			
		||||
from pycountry import languages
 | 
			
		||||
from stdnum import isbn as stdnum_isbn
 | 
			
		||||
@@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
 | 
			
		||||
    if pd.isna(field):
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    # enable transparent request cache with thirty days expiry
 | 
			
		||||
    expire_after = timedelta(days=30)
 | 
			
		||||
    # Allow overriding the location of the requests cache, just in case we are
 | 
			
		||||
    # running in an environment where we can't write to the current working di-
 | 
			
		||||
    # rectory (for example from csv-metadata-quality-web).
 | 
			
		||||
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
 | 
			
		||||
    requests_cache.install_cache(
 | 
			
		||||
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # prune old cache entries
 | 
			
		||||
    # requests_cache.remove_expired_responses()
 | 
			
		||||
 | 
			
		||||
    # Initialize an empty list to hold the validated AGROVOC values
 | 
			
		||||
    values = list()
 | 
			
		||||
    values = []
 | 
			
		||||
 | 
			
		||||
    # Try to split multi-value field on "||" separator
 | 
			
		||||
    for value in field.split("||"):
 | 
			
		||||
        request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
 | 
			
		||||
        request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
 | 
			
		||||
        request_params = {"query": value}
 | 
			
		||||
 | 
			
		||||
        request = requests.get(request_url, params=request_params)
 | 
			
		||||
@@ -373,7 +358,7 @@ def duplicate_items(df):
 | 
			
		||||
 | 
			
		||||
    if items_count_unique < items_count_total:
 | 
			
		||||
        # Create a list to hold our items while we check for duplicates
 | 
			
		||||
        items = list()
 | 
			
		||||
        items = []
 | 
			
		||||
 | 
			
		||||
        for index, row in df.iterrows():
 | 
			
		||||
            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
 | 
			
		||||
@@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
 | 
			
		||||
        if row[region_column_name] is not None:
 | 
			
		||||
            regions = row[region_column_name].split("||")
 | 
			
		||||
        else:
 | 
			
		||||
            regions = list()
 | 
			
		||||
            regions = []
 | 
			
		||||
 | 
			
		||||
        for country in countries:
 | 
			
		||||
            # Look up the UN M.49 regions for this country code. CoCo seems to
 | 
			
		||||
@@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
 | 
			
		||||
            un_region = cc.convert(names=country, to="UNRegion")
 | 
			
		||||
 | 
			
		||||
            if un_region != "not found" and un_region not in regions:
 | 
			
		||||
                print(
 | 
			
		||||
                    f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
 | 
			
		||||
                )
 | 
			
		||||
                try:
 | 
			
		||||
                    print(
 | 
			
		||||
                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
 | 
			
		||||
                    )
 | 
			
		||||
                except KeyError:
 | 
			
		||||
                    print(
 | 
			
		||||
                        f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
    return
 | 
			
		||||
 
 | 
			
		||||
@@ -20,7 +20,7 @@ def correct_language(row, exclude):
 | 
			
		||||
    # Initialize some variables at global scope so that we can set them in the
 | 
			
		||||
    # loop scope below and still be able to access them afterwards.
 | 
			
		||||
    language = ""
 | 
			
		||||
    sample_strings = list()
 | 
			
		||||
    sample_strings = []
 | 
			
		||||
    title = None
 | 
			
		||||
 | 
			
		||||
    # Iterate over the labels of the current row's values. Before we transposed
 | 
			
		||||
 
 | 
			
		||||
@@ -23,7 +23,7 @@ def whitespace(field, field_name):
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    # Initialize an empty list to hold the cleaned values
 | 
			
		||||
    values = list()
 | 
			
		||||
    values = []
 | 
			
		||||
 | 
			
		||||
    # Try to split multi-value field on "||" separator
 | 
			
		||||
    for value in field.split("||"):
 | 
			
		||||
@@ -64,7 +64,7 @@ def separators(field, field_name):
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    # Initialize an empty list to hold the cleaned values
 | 
			
		||||
    values = list()
 | 
			
		||||
    values = []
 | 
			
		||||
 | 
			
		||||
    # Try to split multi-value field on "||" separator
 | 
			
		||||
    for value in field.split("||"):
 | 
			
		||||
@@ -175,7 +175,7 @@ def duplicates(field, field_name):
 | 
			
		||||
    values = field.split("||")
 | 
			
		||||
 | 
			
		||||
    # Initialize an empty list to hold the de-duplicated values
 | 
			
		||||
    new_values = list()
 | 
			
		||||
    new_values = []
 | 
			
		||||
 | 
			
		||||
    # Iterate over all values
 | 
			
		||||
    for value in values:
 | 
			
		||||
@@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
 | 
			
		||||
        if row[region_column_name] is not None:
 | 
			
		||||
            regions = row[region_column_name].split("||")
 | 
			
		||||
        else:
 | 
			
		||||
            regions = list()
 | 
			
		||||
            regions = []
 | 
			
		||||
 | 
			
		||||
        # An empty list for our regions so we can keep track for all countries
 | 
			
		||||
        missing_regions = list()
 | 
			
		||||
        missing_regions = []
 | 
			
		||||
 | 
			
		||||
        for country in countries:
 | 
			
		||||
            # Look up the UN M.49 regions for this country code. CoCo seems to
 | 
			
		||||
@@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
 | 
			
		||||
            # it doesn't already exist in regions.
 | 
			
		||||
            if un_region != "not found" and un_region not in regions:
 | 
			
		||||
                if un_region not in missing_regions:
 | 
			
		||||
                    print(
 | 
			
		||||
                        f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
 | 
			
		||||
                    )
 | 
			
		||||
                    try:
 | 
			
		||||
                        print(
 | 
			
		||||
                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
 | 
			
		||||
                        )
 | 
			
		||||
                    except KeyError:
 | 
			
		||||
                        # If there is no title column in the CSV we will print
 | 
			
		||||
                        # the fix without the title instead of crashing.
 | 
			
		||||
                        print(
 | 
			
		||||
                            f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
 | 
			
		||||
                        )
 | 
			
		||||
 | 
			
		||||
                    missing_regions.append(un_region)
 | 
			
		||||
 | 
			
		||||
        if len(missing_regions) > 0:
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
from importlib.resources import files
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
from ftfy.badness import is_bad
 | 
			
		||||
 | 
			
		||||
@@ -58,7 +58,7 @@ def is_mojibake(field):
 | 
			
		||||
def load_spdx_licenses():
 | 
			
		||||
    """Returns a Python list of SPDX short license identifiers."""
 | 
			
		||||
 | 
			
		||||
    with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
 | 
			
		||||
    with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
 | 
			
		||||
        licenses = json.load(f)
 | 
			
		||||
 | 
			
		||||
    # List comprehension to extract the license ID for each license
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1648
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1648
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -12,26 +12,25 @@ csv-metadata-quality = 'csv_metadata_quality.__main__:main'
 | 
			
		||||
 | 
			
		||||
[tool.poetry.dependencies]
 | 
			
		||||
python = "^3.9"
 | 
			
		||||
pandas = "^1.5.2"
 | 
			
		||||
pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
 | 
			
		||||
python-stdnum = "^1.18"
 | 
			
		||||
requests = "^2.28.2"
 | 
			
		||||
requests-cache = "^0.9.8"
 | 
			
		||||
requests-cache = "^1.0.0"
 | 
			
		||||
langid = "^1.1.6"
 | 
			
		||||
colorama = "^0.4.6"
 | 
			
		||||
ftfy = "^6.1.1"
 | 
			
		||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
 | 
			
		||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
 | 
			
		||||
country-converter = "~1.1.0"
 | 
			
		||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
 | 
			
		||||
 | 
			
		||||
[tool.poetry.dev-dependencies]
 | 
			
		||||
[tool.poetry.group.dev.dependencies]
 | 
			
		||||
pytest = "^7.2.1"
 | 
			
		||||
flake8 = "^6.0.0"
 | 
			
		||||
pytest-clarity = "^1.0.1"
 | 
			
		||||
black = "^23.1.0"
 | 
			
		||||
isort = "^5.12.0"
 | 
			
		||||
csvkit = "^1.1.0"
 | 
			
		||||
 | 
			
		||||
[tool.poetry.group.dev.dependencies]
 | 
			
		||||
ipython = "^8.10.0"
 | 
			
		||||
fixit = "^2.1.0"
 | 
			
		||||
 | 
			
		||||
[build-system]
 | 
			
		||||
requires = ["poetry>=0.12"]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										9
									
								
								renovate.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								renovate.json
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,9 @@
 | 
			
		||||
{
 | 
			
		||||
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
 | 
			
		||||
  "extends": [
 | 
			
		||||
    "config:base"
 | 
			
		||||
  ],
 | 
			
		||||
  "pip_requirements": {
 | 
			
		||||
      "enabled": false
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
@@ -5,28 +5,28 @@ agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
 | 
			
		||||
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
babel==2.12.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
black==23.3.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
 | 
			
		||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
 | 
			
		||||
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
 | 
			
		||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
greenlet==2.0.2 ; python_version >= "3.9" and platform_machine == "aarch64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "ppc64le" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "x86_64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "amd64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "AMD64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "win32" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "WIN32" and python_version < "4.0"
 | 
			
		||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
ipython==8.13.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
@@ -37,44 +37,46 @@ matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
 | 
			
		||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
openpyxl==3.1.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
packaging==23.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pathspec==0.11.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 | 
			
		||||
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
platformdirs==3.5.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
prompt-toolkit==3.0.38 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
 | 
			
		||||
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pygments==2.15.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytest==7.3.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-slugify==8.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
rich==13.3.5 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
sqlalchemy==1.4.48 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
 | 
			
		||||
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
 | 
			
		||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
 
 | 
			
		||||
@@ -1,23 +1,25 @@
 | 
			
		||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
 | 
			
		||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
 | 
			
		||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
 | 
			
		||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
 | 
			
		||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										36
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										36
									
								
								setup.py
									
									
									
									
									
								
							@@ -1,36 +0,0 @@
 | 
			
		||||
import setuptools
 | 
			
		||||
 | 
			
		||||
with open("README.md", "r") as fh:
 | 
			
		||||
    long_description = fh.read()
 | 
			
		||||
 | 
			
		||||
install_requires = [
 | 
			
		||||
    "pandas",
 | 
			
		||||
    "python-stdnum",
 | 
			
		||||
    "requests",
 | 
			
		||||
    "requests-cache",
 | 
			
		||||
    "pycountry",
 | 
			
		||||
    "langid",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
setuptools.setup(
 | 
			
		||||
    name="csv-metadata-quality",
 | 
			
		||||
    version="0.6.1",
 | 
			
		||||
    author="Alan Orth",
 | 
			
		||||
    author_email="aorth@mjanja.ch",
 | 
			
		||||
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
 | 
			
		||||
    license="GPLv3",
 | 
			
		||||
    long_description=long_description,
 | 
			
		||||
    long_description_content_type="text/markdown",
 | 
			
		||||
    url="https://github.com/alanorth/csv-metadata-quality",
 | 
			
		||||
    classifiers=[
 | 
			
		||||
        "Programming Language :: Python :: 3.9",
 | 
			
		||||
        "Programming Language :: Python :: 3.10",
 | 
			
		||||
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
 | 
			
		||||
        "Operating System :: OS Independent",
 | 
			
		||||
    ],
 | 
			
		||||
    packages=["csv_metadata_quality"],
 | 
			
		||||
    entry_points={
 | 
			
		||||
        "console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
 | 
			
		||||
    },
 | 
			
		||||
    install_requires=install_requires,
 | 
			
		||||
)
 | 
			
		||||
		Reference in New Issue
	
	Block a user