mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-13 08:27:47 +02:00
Compare commits
77 Commits
v0.6.1
...
b305da3f0b
Author | SHA1 | Date | |
---|---|---|---|
b305da3f0b
|
|||
530cd5863b
|
|||
f6018c51b6
|
|||
80c3f5b45a
|
|||
ba4637ea34 | |||
355428a691 | |||
58d4de973e | |||
e1216dae3c | |||
6b650ff1b3 | |||
fa7bde6fc0 | |||
f89159fe32 | |||
02058c5a65 | |||
8fed6b71ff | |||
b005b28cbe | |||
c626290599 | |||
1a06470b64 | |||
d46a81672e | |||
2a50e75082 | |||
0d45e73983 | |||
3611aab425 | |||
5c4ad0eb41 | |||
f1f39722f6 | |||
1c03999582 | |||
1f637f32cd
|
|||
b8241e919d
|
|||
b8dc19cc3f
|
|||
93c9b739ac
|
|||
4ed2786703
|
|||
8728789183 | |||
bf90464809
|
|||
1878002391 | |||
d21d2621e3 | |||
f3fb1ff7fb | |||
1fa81f7558 | |||
7409193b6b | |||
a84fcf0b7b
|
|||
25ac290df4
|
|||
3f52bad1e3
|
|||
0208ad0ade | |||
3632ae0fc9 | |||
17d089cc6e
|
|||
bc470a4343
|
|||
be609a809d
|
|||
de3387ded7
|
|||
f343e87f0c
|
|||
7d3524fbd5
|
|||
c614b71a52 | |||
d159a839f3 | |||
36e2ebe5f4
|
|||
33f67b7a7c
|
|||
c0e1448439
|
|||
5d0804a08f
|
|||
f01c9edf17
|
|||
8d4295b2b3
|
|||
e2d46e9495
|
|||
1491e1edb0
|
|||
34142c3e6b
|
|||
0c88b96e8d
|
|||
2e55b4d6e3
|
|||
c90aad29f0
|
|||
6fd1e1377f
|
|||
c64b7eb1f1
|
|||
29cbc4f3a3
|
|||
307af1acfc
|
|||
b5106de9df
|
|||
9eeadfc44e
|
|||
d4aed378cf
|
|||
20a2cce34b
|
|||
d661ffe439
|
|||
45a310387a
|
|||
47b03c49ba
|
|||
986b81cbf4
|
|||
d43a47ae32
|
|||
ede37569f1
|
|||
0c53efe60a
|
|||
5f0e25b818
|
|||
4776154d6c
|
70
.drone.yml
70
.drone.yml
@ -1,3 +1,33 @@
|
||||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
name: python311
|
||||
|
||||
steps:
|
||||
- name: test
|
||||
image: python:3.11-slim
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
@ -10,23 +40,23 @@ steps:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
@ -40,22 +70,22 @@ steps:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
# vim: ts=2 sw=2 et
|
||||
|
36
.github/workflows/python-app.yml
vendored
36
.github/workflows/python-app.yml
vendored
@ -15,37 +15,31 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python 3.10
|
||||
uses: actions/setup-python@v4
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
cache: 'pip'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install flake8 pytest
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
|
||||
python-version: '3.11'
|
||||
cache: 'poetry'
|
||||
- run: poetry install
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
# stop the build if there are Python syntax errors or undefined names
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pytest
|
||||
run: poetry run pytest
|
||||
- name: Test CLI
|
||||
run: |
|
||||
python setup.py install
|
||||
# Basic test
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Test with unsafe fixes
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
15
CHANGELOG.md
15
CHANGELOG.md
@ -4,6 +4,21 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Unreleased
|
||||
### Fixed
|
||||
- Fixed regex so we don't run the invalid multi-value separator fix on
|
||||
`dcterms.bibliographicCitation` fields
|
||||
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
|
||||
fields
|
||||
- Don't crash the country/region checker/fixer when a title field is missing
|
||||
|
||||
### Changed
|
||||
- Don't run newline fix on description fields
|
||||
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||
|
||||
### Updated
|
||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||
|
||||
## [0.6.1] - 2023-02-23
|
||||
### Fixed
|
||||
- Missing region check should ignore subregion field, if it exists
|
||||
|
1
MANIFEST.in
Normal file
1
MANIFEST.in
Normal file
@ -0,0 +1 @@
|
||||
include csv_metadata_quality/data/licenses.json
|
@ -127,7 +127,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
||||
- Warn if an author is shorter than 3 characters?
|
||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
||||
- Warn if two items use the same file in `filename` column
|
||||
- Add an option to drop invalid AGROVOC subjects?
|
||||
- Add tests for application invocation, i.e. `tests/test_app.py`?
|
||||
- Validate ISSNs or journal titles against CrossRef API?
|
||||
- Add configurable field validation, like specify a field name and a validation file?
|
||||
@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
||||
- Warn if item is Open Access, but missing a license
|
||||
- Warn if item has an ISSN but no journal title
|
||||
- Update journal titles from ISSN
|
||||
- Migrate to https://github.com/spdx/license-list-data
|
||||
- Migrate from Pandas to Polars
|
||||
|
||||
## License
|
||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||
|
@ -1,11 +1,14 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
|
||||
import csv_metadata_quality.check as check
|
||||
@ -74,7 +77,7 @@ def run(argv):
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||
df = pd.read_csv(args.input_file, dtype=str)
|
||||
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
|
||||
|
||||
# Check if the user requested to skip any fields
|
||||
if args.exclude_fields:
|
||||
@ -82,7 +85,20 @@ def run(argv):
|
||||
# user should be careful to not include spaces here.
|
||||
exclude = args.exclude_fields.split(",")
|
||||
else:
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
requests_cache.delete()
|
||||
|
||||
for column in df.columns:
|
||||
if column in exclude:
|
||||
@ -91,7 +107,9 @@ def run(argv):
|
||||
continue
|
||||
|
||||
if args.unsafe_fixes:
|
||||
match = re.match(r"^.*?abstract.*$", column)
|
||||
# Skip whitespace and newline fixes on abstracts and descriptions
|
||||
# because there are too many with legitimate multi-line metadata.
|
||||
match = re.match(r"^.*?(abstract|description).*$", column)
|
||||
if match is None:
|
||||
# Fix: whitespace
|
||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||
@ -102,7 +120,7 @@ def run(argv):
|
||||
# Fix: missing space after comma. Only run on author and citation
|
||||
# fields for now, as this problem is mostly an issue in names.
|
||||
if args.unsafe_fixes:
|
||||
match = re.match(r"^.*?(author|citation).*$", column)
|
||||
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||
|
||||
@ -126,7 +144,7 @@ def run(argv):
|
||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||
# and abstract fields because "|" is used to indicate something like
|
||||
# a subtitle.
|
||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
||||
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
|
||||
if match is None:
|
||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||
# Run whitespace fix again after fixing invalid separators
|
||||
|
@ -1,14 +1,12 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import country_converter as coco
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
# requests_cache.remove_expired_responses()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_params = {"query": value}
|
||||
|
||||
request = requests.get(request_url, params=request_params)
|
||||
@ -373,7 +358,7 @@ def duplicate_items(df):
|
||||
|
||||
if items_count_unique < items_count_total:
|
||||
# Create a list to hold our items while we check for duplicates
|
||||
items = list()
|
||||
items = []
|
||||
|
||||
for index, row in df.iterrows():
|
||||
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
||||
@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
|
||||
if row[region_column_name] is not None:
|
||||
regions = row[region_column_name].split("||")
|
||||
else:
|
||||
regions = list()
|
||||
regions = []
|
||||
|
||||
for country in countries:
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
|
||||
un_region = cc.convert(names=country, to="UNRegion")
|
||||
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
try:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
except KeyError:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
|
||||
)
|
||||
|
||||
return
|
||||
|
@ -20,7 +20,7 @@ def correct_language(row, exclude):
|
||||
# Initialize some variables at global scope so that we can set them in the
|
||||
# loop scope below and still be able to access them afterwards.
|
||||
language = ""
|
||||
sample_strings = list()
|
||||
sample_strings = []
|
||||
title = None
|
||||
|
||||
# Iterate over the labels of the current row's values. Before we transposed
|
||||
|
@ -23,7 +23,7 @@ def whitespace(field, field_name):
|
||||
return
|
||||
|
||||
# Initialize an empty list to hold the cleaned values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
@ -64,7 +64,7 @@ def separators(field, field_name):
|
||||
return
|
||||
|
||||
# Initialize an empty list to hold the cleaned values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
@ -175,7 +175,7 @@ def duplicates(field, field_name):
|
||||
values = field.split("||")
|
||||
|
||||
# Initialize an empty list to hold the de-duplicated values
|
||||
new_values = list()
|
||||
new_values = []
|
||||
|
||||
# Iterate over all values
|
||||
for value in values:
|
||||
@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
|
||||
if row[region_column_name] is not None:
|
||||
regions = row[region_column_name].split("||")
|
||||
else:
|
||||
regions = list()
|
||||
regions = []
|
||||
|
||||
# An empty list for our regions so we can keep track for all countries
|
||||
missing_regions = list()
|
||||
missing_regions = []
|
||||
|
||||
for country in countries:
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
|
||||
# it doesn't already exist in regions.
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
if un_region not in missing_regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
try:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
except KeyError:
|
||||
# If there is no title column in the CSV we will print
|
||||
# the fix without the title instead of crashing.
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
|
||||
)
|
||||
|
||||
missing_regions.append(un_region)
|
||||
|
||||
if len(missing_regions) > 0:
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
|
||||
import json
|
||||
from importlib.resources import files
|
||||
import os
|
||||
|
||||
from ftfy.badness import is_bad
|
||||
|
||||
@ -58,7 +58,7 @@ def is_mojibake(field):
|
||||
def load_spdx_licenses():
|
||||
"""Returns a Python list of SPDX short license identifiers."""
|
||||
|
||||
with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
|
||||
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
|
||||
licenses = json.load(f)
|
||||
|
||||
# List comprehension to extract the license ID for each license
|
||||
|
1684
poetry.lock
generated
1684
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -12,26 +12,25 @@ csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
pandas = "^1.5.2"
|
||||
pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
|
||||
python-stdnum = "^1.18"
|
||||
requests = "^2.28.2"
|
||||
requests-cache = "^0.9.8"
|
||||
requests-cache = "^1.0.0"
|
||||
langid = "^1.1.6"
|
||||
colorama = "^0.4.6"
|
||||
ftfy = "^6.1.1"
|
||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
|
||||
country-converter = "~1.1.0"
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.2.1"
|
||||
flake8 = "^6.0.0"
|
||||
pytest-clarity = "^1.0.1"
|
||||
black = "^23.1.0"
|
||||
isort = "^5.12.0"
|
||||
csvkit = "^1.1.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipython = "^8.10.0"
|
||||
fixit = "^2.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry>=0.12"]
|
||||
|
9
renovate.json
Normal file
9
renovate.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||
"extends": [
|
||||
"config:base"
|
||||
],
|
||||
"pip_requirements": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
@ -5,28 +5,28 @@ agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
|
||||
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
babel==2.12.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
black==23.3.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
||||
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
||||
greenlet==2.0.2 ; python_version >= "3.9" and platform_machine == "aarch64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "ppc64le" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "x86_64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "amd64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "AMD64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "win32" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "WIN32" and python_version < "4.0"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ipython==8.13.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
@ -37,44 +37,46 @@ matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
|
||||
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
openpyxl==3.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
packaging==23.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pathspec==0.11.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
||||
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
|
||||
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
platformdirs==3.5.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
|
||||
prompt-toolkit==3.0.38 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
||||
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pygments==2.15.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest==7.3.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-slugify==8.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
||||
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
rich==13.3.5 ; python_version >= "3.9" and python_version < "4.0"
|
||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
|
||||
sqlalchemy==1.4.48 ; python_version >= "3.9" and python_version < "4.0"
|
||||
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
|
||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
|
||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
||||
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
|
@ -1,23 +1,25 @@
|
||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
|
||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
||||
|
36
setup.py
36
setup.py
@ -1,36 +0,0 @@
|
||||
import setuptools
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
install_requires = [
|
||||
"pandas",
|
||||
"python-stdnum",
|
||||
"requests",
|
||||
"requests-cache",
|
||||
"pycountry",
|
||||
"langid",
|
||||
]
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.6.1",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
license="GPLv3",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/alanorth/csv-metadata-quality",
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
packages=["csv_metadata_quality"],
|
||||
entry_points={
|
||||
"console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
|
||||
},
|
||||
install_requires=install_requires,
|
||||
)
|
Reference in New Issue
Block a user