mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-07-04 13:33:26 +02:00
Compare commits
147 Commits
53fdb50906
...
renovate/p
Author | SHA1 | Date | |
---|---|---|---|
b1b568347c | |||
22b2e3b8ae
|
|||
be550e21f1
|
|||
753f3340a3
|
|||
188097abe4
|
|||
b7a81b8ec7
|
|||
8a2c567d1f
|
|||
42eb9437e3
|
|||
5400bcb19b
|
|||
febea54f1b
|
|||
b5565124de
|
|||
2869919507
|
|||
f7d66947f7
|
|||
1d701f4056
|
|||
1e339609a6
|
|||
2b0568de30
|
|||
9903ada97a
|
|||
d4b20e282c
|
|||
9785c18301 | |||
de5e292f1a | |||
2675cd288e
|
|||
78dc1336d0
|
|||
28bbb919ce
|
|||
b1de9552a4
|
|||
81e3ca3d9c
|
|||
c470f8b375
|
|||
0f45448517
|
|||
7dd52ca491
|
|||
92ff0ee51b
|
|||
ae38a826ec
|
|||
c1f630c298
|
|||
82b056f0ea
|
|||
7fca981b95
|
|||
1a9424197b
|
|||
f6c6c94a1e
|
|||
f500fac64b
|
|||
8143a7d978
|
|||
94cec080d6
|
|||
9402af1e30
|
|||
d71ff9082b
|
|||
f309b694c4
|
|||
4d879f6d13
|
|||
a30fefcd52
|
|||
2341c56c40
|
|||
5be2195325
|
|||
736948ed2c
|
|||
ee0b448355
|
|||
4f3174a543
|
|||
d5c25f82fa
|
|||
7b3e2b4e68 | |||
f92b2fe206 | |||
df040b70c7 | |||
10bc8f3e14 | |||
7e6e92ecaa
|
|||
a21ffb0fa8
|
|||
fb341dd9fa | |||
2e943ee4db | |||
6d3a9870d6 | |||
82ecf7119a | |||
1db21cf275 | |||
bcd1408798 | |||
ee8d255811 | |||
2cc2dbe952
|
|||
940a325d61
|
|||
59b3b307c9
|
|||
b305da3f0b
|
|||
96a486471c | |||
530cd5863b
|
|||
f6018c51b6
|
|||
80c3f5b45a
|
|||
ba4637ea34 | |||
355428a691 | |||
58d4de973e | |||
e1216dae3c | |||
6b650ff1b3 | |||
fa7bde6fc0 | |||
f89159fe32 | |||
02058c5a65 | |||
8fed6b71ff | |||
b005b28cbe | |||
c626290599 | |||
1a06470b64 | |||
d46a81672e | |||
2a50e75082 | |||
0d45e73983 | |||
3611aab425 | |||
5c4ad0eb41 | |||
f1f39722f6 | |||
1c03999582 | |||
1f637f32cd
|
|||
b8241e919d
|
|||
b8dc19cc3f
|
|||
93c9b739ac
|
|||
4ed2786703
|
|||
8728789183 | |||
bf90464809
|
|||
1878002391 | |||
d21d2621e3 | |||
f3fb1ff7fb | |||
1fa81f7558 | |||
7409193b6b | |||
a84fcf0b7b
|
|||
25ac290df4
|
|||
3f52bad1e3
|
|||
0208ad0ade | |||
3632ae0fc9 | |||
17d089cc6e
|
|||
bc470a4343
|
|||
be609a809d
|
|||
de3387ded7
|
|||
f343e87f0c
|
|||
7d3524fbd5
|
|||
c614b71a52 | |||
d159a839f3 | |||
36e2ebe5f4
|
|||
33f67b7a7c
|
|||
c0e1448439
|
|||
5d0804a08f
|
|||
f01c9edf17
|
|||
8d4295b2b3
|
|||
e2d46e9495
|
|||
1491e1edb0
|
|||
34142c3e6b
|
|||
0c88b96e8d
|
|||
2e55b4d6e3
|
|||
c90aad29f0
|
|||
6fd1e1377f
|
|||
c64b7eb1f1
|
|||
29cbc4f3a3
|
|||
307af1acfc
|
|||
b5106de9df
|
|||
9eeadfc44e
|
|||
d4aed378cf
|
|||
20a2cce34b
|
|||
d661ffe439
|
|||
45a310387a
|
|||
47b03c49ba
|
|||
986b81cbf4
|
|||
d43a47ae32
|
|||
ede37569f1
|
|||
0c53efe60a
|
|||
5f0e25b818
|
|||
4776154d6c
|
|||
fdccdf7318
|
|||
ff2c986eec
|
|||
547574866e
|
|||
8aa7b93d87
|
61
.drone.yml
61
.drone.yml
@ -1,61 +0,0 @@
|
|||||||
---
|
|
||||||
kind: pipeline
|
|
||||||
type: docker
|
|
||||||
name: python310
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: test
|
|
||||||
image: python:3.10-slim
|
|
||||||
commands:
|
|
||||||
- id
|
|
||||||
- python -V
|
|
||||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
|
||||||
- pip install -r requirements-dev.txt
|
|
||||||
- pytest
|
|
||||||
- python setup.py install
|
|
||||||
# Basic test
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
|
||||||
# Basic test with unsafe fixes
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
|
||||||
# Geography test
|
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
|
||||||
# Geography test with unsafe fixes
|
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
|
||||||
# Test with experimental checks
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
|
||||||
# Test with AGROVOC validation
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
|
||||||
|
|
||||||
---
|
|
||||||
kind: pipeline
|
|
||||||
type: docker
|
|
||||||
name: python39
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: test
|
|
||||||
image: python:3.9-slim
|
|
||||||
commands:
|
|
||||||
- id
|
|
||||||
- python -V
|
|
||||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
|
||||||
- pip install -r requirements-dev.txt
|
|
||||||
- pytest
|
|
||||||
- python setup.py install
|
|
||||||
# Basic test
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
|
||||||
# Basic test with unsafe fixes
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
|
||||||
# Geography test
|
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
|
||||||
# Geography test with unsafe fixes
|
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
|
||||||
# Test with experimental checks
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
|
||||||
# Test with AGROVOC validation
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
|
||||||
|
|
||||||
# vim: ts=2 sw=2 et
|
|
38
.github/workflows/python-app.yml
vendored
38
.github/workflows/python-app.yml
vendored
@ -12,40 +12,26 @@ on:
|
|||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
|
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- name: Set up Python 3.10
|
- name: Install uv
|
||||||
uses: actions/setup-python@v4
|
uses: astral-sh/setup-uv@v5
|
||||||
with:
|
with:
|
||||||
python-version: '3.10'
|
version: 'latest'
|
||||||
cache: 'pip'
|
- run: uv sync
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install flake8 pytest
|
|
||||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
|
||||||
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
|
|
||||||
- name: Lint with flake8
|
|
||||||
run: |
|
|
||||||
# stop the build if there are Python syntax errors or undefined names
|
|
||||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
|
||||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
|
||||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: uv run pytest
|
||||||
pytest
|
|
||||||
- name: Test CLI
|
- name: Test CLI
|
||||||
run: |
|
run: |
|
||||||
python setup.py install
|
|
||||||
# Basic test
|
# Basic test
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
uv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Test with unsafe fixes
|
# Test with unsafe fixes
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
uv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
uv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
uv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
# Test with AGROVOC validation (and dropping invalid)
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
uv run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||||
|
1
.python-version
Normal file
1
.python-version
Normal file
@ -0,0 +1 @@
|
|||||||
|
3.13
|
24
CHANGELOG.md
24
CHANGELOG.md
@ -4,7 +4,29 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
## Unreleased
|
## [0.7.0] - 2025-01-31
|
||||||
|
### Added
|
||||||
|
- Ability to normalize DOIs to https://doi.org URI format
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Fixed regex so we don't run the invalid multi-value separator fix on
|
||||||
|
`dcterms.bibliographicCitation` fields
|
||||||
|
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
|
||||||
|
fields
|
||||||
|
- Don't crash the country/region checker/fixer when a title field is missing
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Don't run newline fix on description fields
|
||||||
|
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||||
|
- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
|
||||||
|
- Use uv instead of rye
|
||||||
|
- Remove pytest-clarity — I think pytest itself has gotten much better in the past few years
|
||||||
|
|
||||||
|
### Updated
|
||||||
|
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||||
|
- SPDX license list
|
||||||
|
|
||||||
|
## [0.6.1] - 2023-02-23
|
||||||
### Fixed
|
### Fixed
|
||||||
- Missing region check should ignore subregion field, if it exists
|
- Missing region check should ignore subregion field, if it exists
|
||||||
|
|
||||||
|
1
MANIFEST.in
Normal file
1
MANIFEST.in
Normal file
@ -0,0 +1 @@
|
|||||||
|
include csv_metadata_quality/data/licenses.json
|
18
README.md
18
README.md
@ -1,7 +1,6 @@
|
|||||||
<h1 align="center">DSpace CSV Metadata Quality Checker</h1>
|
<h1 align="center">DSpace CSV Metadata Quality Checker</h1>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://ci.mjanja.ch/alanorth/csv-metadata-quality"><img alt="Build Status" src="https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg"></a>
|
|
||||||
<a href="https://github.com/ilri/csv-metadata-quality/actions"><img alt="Build and Test" src="https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg"></a>
|
<a href="https://github.com/ilri/csv-metadata-quality/actions"><img alt="Build and Test" src="https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg"></a>
|
||||||
<a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
|
<a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
|
||||||
</p>
|
</p>
|
||||||
@ -31,24 +30,25 @@ If you use the DSpace CSV metadata quality checker please cite:
|
|||||||
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
||||||
- Remove duplicate metadata values
|
- Remove duplicate metadata values
|
||||||
- Check for duplicate items, using the title, type, and date issued as an indicator
|
- Check for duplicate items, using the title, type, and date issued as an indicator
|
||||||
|
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
The easiest way to install CSV Metadata Quality is with [uv](https://docs.astral.sh/uv/):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ git clone https://github.com/ilri/csv-metadata-quality.git
|
$ git clone https://github.com/ilri/csv-metadata-quality.git
|
||||||
$ cd csv-metadata-quality
|
$ cd csv-metadata-quality
|
||||||
$ poetry install
|
$ uv sync
|
||||||
$ poetry shell
|
$ source .venv/bin/activate
|
||||||
```
|
```
|
||||||
|
|
||||||
Otherwise, if you don't have poetry, you can use a vanilla Python virtual environment:
|
Otherwise, if you don't have uv, you can use a vanilla Python virtual environment:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ git clone https://github.com/ilri/csv-metadata-quality.git
|
$ git clone https://github.com/ilri/csv-metadata-quality.git
|
||||||
$ cd csv-metadata-quality
|
$ cd csv-metadata-quality
|
||||||
$ python3 -m venv venv
|
$ python3 -m venv .venv
|
||||||
$ source venv/bin/activate
|
$ source .venv/bin/activate
|
||||||
$ pip install -r requirements.txt
|
$ pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -125,9 +125,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Better logging, for example with INFO, WARN, and ERR levels
|
- Better logging, for example with INFO, WARN, and ERR levels
|
||||||
- Verbose, debug, or quiet options
|
- Verbose, debug, or quiet options
|
||||||
- Warn if an author is shorter than 3 characters?
|
- Warn if an author is shorter than 3 characters?
|
||||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
|
||||||
- Warn if two items use the same file in `filename` column
|
- Warn if two items use the same file in `filename` column
|
||||||
- Add an option to drop invalid AGROVOC subjects?
|
|
||||||
- Add tests for application invocation, ie `tests/test_app.py`?
|
- Add tests for application invocation, ie `tests/test_app.py`?
|
||||||
- Validate ISSNs or journal titles against CrossRef API?
|
- Validate ISSNs or journal titles against CrossRef API?
|
||||||
- Add configurable field validation, like specify a field name and a validation file?
|
- Add configurable field validation, like specify a field name and a validation file?
|
||||||
@ -137,7 +135,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Warn if item is Open Access, but missing a license
|
- Warn if item is Open Access, but missing a license
|
||||||
- Warn if item has an ISSN but no journal title
|
- Warn if item has an ISSN but no journal title
|
||||||
- Update journal titles from ISSN
|
- Update journal titles from ISSN
|
||||||
- Migrate to https://github.com/spdx/license-list-data
|
- Migrate from Pandas to Polars
|
||||||
|
|
||||||
## License
|
## License
|
||||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||||
|
@ -37,3 +37,7 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
|
|||||||
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
||||||
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
||||||
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
||||||
|
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
|
||||||
|
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
|
||||||
|
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
|
||||||
|
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
|
||||||
|
|
1553
poetry.lock
generated
1553
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,41 +1,55 @@
|
|||||||
[tool.poetry]
|
[project]
|
||||||
name = "csv-metadata-quality"
|
name = "csv-metadata-quality"
|
||||||
version = "0.6.0"
|
version = "0.7.0"
|
||||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
authors = [
|
||||||
license="GPL-3.0-only"
|
{ name = "Alan Orth", email = "alan.orth@gmail.com" }
|
||||||
|
]
|
||||||
|
license= { file = "LICENSE.txt" }
|
||||||
|
dependencies = [
|
||||||
|
"pandas[feather,performance]~=2.2.3",
|
||||||
|
"python-stdnum~=2.1",
|
||||||
|
"requests~=2.32.3",
|
||||||
|
"requests-cache~=1.2.1",
|
||||||
|
"colorama~=0.4",
|
||||||
|
"ftfy~=6.3.0",
|
||||||
|
"country-converter~=1.3",
|
||||||
|
"pycountry~=24.6.1",
|
||||||
|
"py3langid~=0.3",
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">= 3.10"
|
||||||
|
|
||||||
|
classifiers = [
|
||||||
|
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||||
|
"Natural Language :: English",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
|
"Programming Language :: Python :: Implementation :: CPython",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
repository = "https://github.com/ilri/csv-metadata-quality"
|
repository = "https://github.com/ilri/csv-metadata-quality"
|
||||||
homepage = "https://github.com/ilri/csv-metadata-quality"
|
homepage = "https://github.com/ilri/csv-metadata-quality"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[project.scripts]
|
||||||
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
# See: https://docs.astral.sh/uv/concepts/build-backend/
|
||||||
python = "^3.9"
|
|
||||||
pandas = "^1.5.2"
|
|
||||||
python-stdnum = "^1.18"
|
|
||||||
requests = "^2.28.2"
|
|
||||||
requests-cache = "^0.9.8"
|
|
||||||
langid = "^1.1.6"
|
|
||||||
colorama = "^0.4.6"
|
|
||||||
ftfy = "^6.1.1"
|
|
||||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
|
|
||||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
|
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
|
||||||
pytest = "^7.2.1"
|
|
||||||
flake8 = "^6.0.0"
|
|
||||||
pytest-clarity = "^1.0.1"
|
|
||||||
black = "^23.1.0"
|
|
||||||
isort = "^5.12.0"
|
|
||||||
csvkit = "^1.1.0"
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
|
||||||
ipython = "^8.10.0"
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry>=0.12"]
|
requires = ["uv_build>=0.7.19,<0.8.0"]
|
||||||
build-backend = "poetry.masonry.api"
|
build-backend = "uv_build"
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"pytest~=8.3",
|
||||||
|
"isort~=6.0",
|
||||||
|
"csvkit~=2.0",
|
||||||
|
"ipython~=8.31",
|
||||||
|
]
|
||||||
|
|
||||||
[tool.isort]
|
[tool.isort]
|
||||||
profile = "black"
|
profile = "black"
|
||||||
|
9
renovate.json
Normal file
9
renovate.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||||
|
"extends": [
|
||||||
|
"config:recommended"
|
||||||
|
],
|
||||||
|
"pip_requirements": {
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
}
|
@ -1,80 +0,0 @@
|
|||||||
agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate-sql==0.5.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
|
|
||||||
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
black==22.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
csvkit==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
|
||||||
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
flake8==5.0.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
greenlet==2.0.1 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
|
|
||||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ipython==8.8.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
isort==5.11.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
markdown-it-py==2.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mypy-extensions==0.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
numpy==1.24.1 ; python_version < "4.0" and python_version >= "3.9"
|
|
||||||
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
openpyxl==3.0.10 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pathspec==0.10.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
|
||||||
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
platformdirs==2.6.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
prompt-toolkit==3.0.36 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
|
||||||
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pycodestyle==2.9.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pyflakes==2.5.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-slugify==7.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
rich==13.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
tomli==2.0.1 ; python_version >= "3.9" and python_full_version < "3.11.0a7"
|
|
||||||
traitlets==5.8.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
typing-extensions==4.4.0 ; python_version >= "3.9" and python_version < "3.10"
|
|
||||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
446
requirements.txt
446
requirements.txt
@ -1,23 +1,423 @@
|
|||||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
# This file was autogenerated by uv via the following command:
|
||||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
# uv export --no-dev
|
||||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
-e .
|
||||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
attrs==25.3.0 \
|
||||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
--hash=sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3 \
|
||||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b
|
||||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
# via
|
||||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
# cattrs
|
||||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
# requests-cache
|
||||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
bottleneck==1.5.0 \
|
||||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:049162927cf802208cc8691fb99b108afe74656cdc96b9e2067cf56cb9d84056 \
|
||||||
numpy==1.24.1 ; python_version < "4.0" and python_version >= "3.9"
|
--hash=sha256:07c2c1aa39917b5c9be77e85791aa598e8b2c00f8597a198b93628bbfde72a3f \
|
||||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:0dca825048a3076f34c4a35409e3277b31ceeb3cbb117bbe2a13ff5c214bcabc \
|
||||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:1043d95674566063f638582cc8700c24c4427f532f86b9e7cfc9f9ec84abc1ff \
|
||||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:1214a2bf3b36c66e3898aab821ad8366a3062db6f83a8f083e2f799d202e86ea \
|
||||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:1648f2a0d52b78f6e530385862e279ffa66baae2ce038bfdf5d8b29a638bac46 \
|
||||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:16fead35c0b5d307815997eef67d03c2151f255ca889e0fc3d68703f41aa5302 \
|
||||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:27e38e829497ca0a5eebdb79d3293aaa424f3c31c13806e5c607fd414536b7c3 \
|
||||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
--hash=sha256:2f5e863a4fdaf9c85416789aeb333d1cdd3603037fd854ad58b0e2ac73be16cf \
|
||||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:3886799cceb271eb67d057f6ecb13fb4582bda17a3b13b4fa0334638c59637c6 \
|
||||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
--hash=sha256:3f3e308416886e29441a0b71bce8f3eb4c7a4943be541fd918244aaf25534d36 \
|
||||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
--hash=sha256:436a402f0d60a9d6541d7adb0929501225a151ad03b96b756e0b607db6a106f1 \
|
||||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
--hash=sha256:48c2657102f3288e178cc341f000475a32f49a3cd8b7067e091d5446fa899383 \
|
||||||
|
--hash=sha256:5c4c94cfcba46adfe71894c63c4b186c847965e73727dbaf5fd9ade41ef38e6e \
|
||||||
|
--hash=sha256:613165ce39bf6bd80f5307da0f05842ba534b213a89526f1eba82ea0099592fc \
|
||||||
|
--hash=sha256:7962177b04b865b17e883ace01c68cf50353ef6a9437ec01bad1f5a1a2708490 \
|
||||||
|
--hash=sha256:7967e0189defe9f49025bd6469ff0fe22af5463926af55c7ba1e4592051d8ef8 \
|
||||||
|
--hash=sha256:80ef9eea2a92fc5a1c04734aa1bcf317253241062c962eaa6e7f123b583d0109 \
|
||||||
|
--hash=sha256:816c910c5d1fb53adb32581c52a513b206f503ae253ace70cb32d1fe4e45af1d \
|
||||||
|
--hash=sha256:8892f2d90d63a3dd5884e8f3fe7bbe8c569851a984023340ef926d2205332d96 \
|
||||||
|
--hash=sha256:8d123762f78717fc35ecf10cad45d08273fcb12ab40b3c847190b83fec236f03 \
|
||||||
|
--hash=sha256:97285cfedf3545d9a010b2db2123f9750bf920081e29364cc465052973bd0b5a \
|
||||||
|
--hash=sha256:9be5dfdf1a662d1d4423d7b7e8dd9a1b7046dcc2ce67b6e94a31d1cc57a8558f \
|
||||||
|
--hash=sha256:9ca39aca62f0e827fc8c9b352352224ecb38a98d8f9cbc30f071672c31904aa2 \
|
||||||
|
--hash=sha256:a107ed8b5f998918c24a1e476dbd2dfc3514ab0082df7132c460b01e6ffd8cf4 \
|
||||||
|
--hash=sha256:abc6a24a41f55765215005cec97dd69f41ac747ed0f4d446caa508531957eeda \
|
||||||
|
--hash=sha256:bda7c475d4a7e271dbd0b1d4bbce29065edc8891361857105b7212fe383c9a36 \
|
||||||
|
--hash=sha256:c15a5f009ea72f95d0a35e784c6944af2b6d7dab102341fb3c3412e41ce5adf6 \
|
||||||
|
--hash=sha256:c860242cf20e69d5aab2ec3c5d6c8c2a15f19e4b25b28b8fca2c2a12cefae9d8 \
|
||||||
|
--hash=sha256:dbb0f0d38feda63050aa253cf9435e81a0ecfac954b0df84896636be9eabd9b6 \
|
||||||
|
--hash=sha256:dc8d553d4bf033d3e025cd32d4c034d2daf10709e31ced3909811d1c843e451c \
|
||||||
|
--hash=sha256:f13b644207118564b95eb7b2130555fb4a4b2266a739b2a8f98a5276baa723ea \
|
||||||
|
--hash=sha256:f218e4dae6511180dcc4f06d8300e0c81e7f3df382091f464c5a919d289fab8e \
|
||||||
|
--hash=sha256:f26005740e6ef6013eba8a48241606a963e862a601671eab064b7835cd12ef3d \
|
||||||
|
--hash=sha256:f9545206daaffaecf88d176f657b7c939f6d909275991121dc8dee936dcd8985 \
|
||||||
|
--hash=sha256:fc0c0b661005b059fcb09988f8b5e2cd5e9c702e1bed24819ed38f85145140b5
|
||||||
|
# via pandas
|
||||||
|
cattrs==25.1.1 \
|
||||||
|
--hash=sha256:1b40b2d3402af7be79a7e7e097a9b4cd16d4c06e6d526644b0b26a063a1cc064 \
|
||||||
|
--hash=sha256:c914b734e0f2d59e5b720d145ee010f1fd9a13ee93900922a2f3f9d593b8382c
|
||||||
|
# via requests-cache
|
||||||
|
certifi==2025.6.15 \
|
||||||
|
--hash=sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057 \
|
||||||
|
--hash=sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==3.4.2 \
|
||||||
|
--hash=sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7 \
|
||||||
|
--hash=sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0 \
|
||||||
|
--hash=sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7 \
|
||||||
|
--hash=sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d \
|
||||||
|
--hash=sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0 \
|
||||||
|
--hash=sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db \
|
||||||
|
--hash=sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b \
|
||||||
|
--hash=sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8 \
|
||||||
|
--hash=sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff \
|
||||||
|
--hash=sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e \
|
||||||
|
--hash=sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148 \
|
||||||
|
--hash=sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a \
|
||||||
|
--hash=sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e \
|
||||||
|
--hash=sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63 \
|
||||||
|
--hash=sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c \
|
||||||
|
--hash=sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366 \
|
||||||
|
--hash=sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5 \
|
||||||
|
--hash=sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c \
|
||||||
|
--hash=sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b \
|
||||||
|
--hash=sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0 \
|
||||||
|
--hash=sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941 \
|
||||||
|
--hash=sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0 \
|
||||||
|
--hash=sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86 \
|
||||||
|
--hash=sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6 \
|
||||||
|
--hash=sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0 \
|
||||||
|
--hash=sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1 \
|
||||||
|
--hash=sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6 \
|
||||||
|
--hash=sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981 \
|
||||||
|
--hash=sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c \
|
||||||
|
--hash=sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980 \
|
||||||
|
--hash=sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645 \
|
||||||
|
--hash=sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7 \
|
||||||
|
--hash=sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd \
|
||||||
|
--hash=sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef \
|
||||||
|
--hash=sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2 \
|
||||||
|
--hash=sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d \
|
||||||
|
--hash=sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3 \
|
||||||
|
--hash=sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd \
|
||||||
|
--hash=sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214 \
|
||||||
|
--hash=sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd \
|
||||||
|
--hash=sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a \
|
||||||
|
--hash=sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c \
|
||||||
|
--hash=sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f \
|
||||||
|
--hash=sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28 \
|
||||||
|
--hash=sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691 \
|
||||||
|
--hash=sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82 \
|
||||||
|
--hash=sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a \
|
||||||
|
--hash=sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf \
|
||||||
|
--hash=sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b \
|
||||||
|
--hash=sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9 \
|
||||||
|
--hash=sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544 \
|
||||||
|
--hash=sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509 \
|
||||||
|
--hash=sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a \
|
||||||
|
--hash=sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f
|
||||||
|
# via requests
|
||||||
|
colorama==0.4.6 \
|
||||||
|
--hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
|
||||||
|
--hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
|
||||||
|
# via csv-metadata-quality
|
||||||
|
country-converter==1.3 \
|
||||||
|
--hash=sha256:006958c83adeada455d2f178921fdd051def736259ff250fada912eaf3ca8cf1 \
|
||||||
|
--hash=sha256:f6a1a14d1f98112ca90a5198f645f4e60bb73840e98f3f733893ff5b617c2f38
|
||||||
|
# via csv-metadata-quality
|
||||||
|
exceptiongroup==1.3.0 ; python_full_version < '3.11' \
|
||||||
|
--hash=sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10 \
|
||||||
|
--hash=sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88
|
||||||
|
# via cattrs
|
||||||
|
ftfy==6.3.1 \
|
||||||
|
--hash=sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083 \
|
||||||
|
--hash=sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec
|
||||||
|
# via csv-metadata-quality
|
||||||
|
idna==3.10 \
|
||||||
|
--hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
|
||||||
|
--hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
|
||||||
|
# via
|
||||||
|
# requests
|
||||||
|
# url-normalize
|
||||||
|
llvmlite==0.44.0 \
|
||||||
|
--hash=sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4 \
|
||||||
|
--hash=sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad \
|
||||||
|
--hash=sha256:2fb7c4f2fb86cbae6dca3db9ab203eeea0e22d73b99bc2341cdf9de93612e930 \
|
||||||
|
--hash=sha256:319bddd44e5f71ae2689859b7203080716448a3cd1128fb144fe5c055219d516 \
|
||||||
|
--hash=sha256:40526fb5e313d7b96bda4cbb2c85cd5374e04d80732dd36a282d72a560bb6408 \
|
||||||
|
--hash=sha256:41e3839150db4330e1b2716c0be3b5c4672525b4c9005e17c7597f835f351ce2 \
|
||||||
|
--hash=sha256:46224058b13c96af1365290bdfebe9a6264ae62fb79b2b55693deed11657a8bf \
|
||||||
|
--hash=sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db \
|
||||||
|
--hash=sha256:7202b678cdf904823c764ee0fe2dfe38a76981f4c1e51715b4cb5abb6cf1d9e8 \
|
||||||
|
--hash=sha256:9c58867118bad04a0bb22a2e0068c693719658105e40009ffe95c7000fcde88e \
|
||||||
|
--hash=sha256:9fbadbfba8422123bab5535b293da1cf72f9f478a65645ecd73e781f962ca614 \
|
||||||
|
--hash=sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc \
|
||||||
|
--hash=sha256:ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427 \
|
||||||
|
--hash=sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9 \
|
||||||
|
--hash=sha256:c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1 \
|
||||||
|
--hash=sha256:cccf8eb28f24840f2689fb1a45f9c0f7e582dd24e088dcf96e424834af11f791 \
|
||||||
|
--hash=sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d \
|
||||||
|
--hash=sha256:d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955 \
|
||||||
|
--hash=sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1 \
|
||||||
|
--hash=sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3 \
|
||||||
|
--hash=sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610
|
||||||
|
# via numba
|
||||||
|
numba==0.61.2 \
|
||||||
|
--hash=sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2 \
|
||||||
|
--hash=sha256:3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60 \
|
||||||
|
--hash=sha256:3a10a8fc9afac40b1eac55717cece1b8b1ac0b946f5065c89e00bde646b5b154 \
|
||||||
|
--hash=sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd \
|
||||||
|
--hash=sha256:49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b \
|
||||||
|
--hash=sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8 \
|
||||||
|
--hash=sha256:59321215e2e0ac5fa928a8020ab00b8e57cda8a97384963ac0dfa4d4e6aa54e7 \
|
||||||
|
--hash=sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546 \
|
||||||
|
--hash=sha256:5f154aaea625fb32cfbe3b80c5456d514d416fcdf79733dd69c0df3a11348e9e \
|
||||||
|
--hash=sha256:76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1 \
|
||||||
|
--hash=sha256:7d3bcada3c9afba3bed413fba45845f2fb9cd0d2b27dd58a1be90257e293d140 \
|
||||||
|
--hash=sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d \
|
||||||
|
--hash=sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18 \
|
||||||
|
--hash=sha256:ae45830b129c6137294093b269ef0a22998ccc27bf7cf096ab8dcf7bca8946f9 \
|
||||||
|
--hash=sha256:ae8c7a522c26215d5f62ebec436e3d341f7f590079245a2f1008dfd498cc1642 \
|
||||||
|
--hash=sha256:bbfdf4eca202cebade0b7d43896978e146f39398909a42941c9303f82f403a18 \
|
||||||
|
--hash=sha256:bd1e74609855aa43661edffca37346e4e8462f6903889917e9f41db40907daa2 \
|
||||||
|
--hash=sha256:bdbca73ad81fa196bd53dc12e3aaf1564ae036e0c125f237c7644fe64a4928ab \
|
||||||
|
--hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \
|
||||||
|
--hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \
|
||||||
|
--hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2
|
||||||
|
# via pandas
|
||||||
|
numexpr==2.11.0 \
|
||||||
|
--hash=sha256:096ec768bee2ef14ac757b4178e3c5f05e5f1cb6cae83b2eea9b4ba3ec1a86dd \
|
||||||
|
--hash=sha256:097aa8835d32d6ac52f2be543384019b4b134d1fb67998cbfc4271155edfe54a \
|
||||||
|
--hash=sha256:0a184e5930c77ab91dd9beee4df403b825cd9dfc4e9ba4670d31c9fcb4e2c08e \
|
||||||
|
--hash=sha256:0db4c2dcad09f9594b45fce794f4b903345195a8c216e252de2aa92884fd81a8 \
|
||||||
|
--hash=sha256:2036be213a6a1b5ce49acf60de99b911a0f9d174aab7679dde1fae315134f826 \
|
||||||
|
--hash=sha256:238d19465a272ada3967600fada55e4c6900485aefb42122a78dfcaf2efca65f \
|
||||||
|
--hash=sha256:321736cb98f090ce864b58cc5c37661cb5548e394e0fe24d5f2c7892a89070c3 \
|
||||||
|
--hash=sha256:4229060be866813122385c608bbd3ea48fe0b33e91f2756810d28c1cdbfc98f1 \
|
||||||
|
--hash=sha256:450eba3c93c3e3e8070566ad8d70590949d6e574b1c960bf68edd789811e7da8 \
|
||||||
|
--hash=sha256:4aba2f640d9d45b986a613ce94fcf008c42cc72eeba2990fefdb575228b1d3d1 \
|
||||||
|
--hash=sha256:5ff337b36db141a1a0b49f01282783744f49f0d401cc83a512fc5596eb7db5c6 \
|
||||||
|
--hash=sha256:6b5fdfc86cbf5373ea67d554cc6f08863825ea8e928416bed8d5285e387420c6 \
|
||||||
|
--hash=sha256:6e68a9800a3fa37c438b73a669f507c4973801a456a864ac56b62c3bd63d08af \
|
||||||
|
--hash=sha256:7163b488bfdcd13c300a8407c309e4cee195ef95d07facf5ac2678d66c988805 \
|
||||||
|
--hash=sha256:75b2c01a4eda2e7c357bc67a3f5c3dd76506c15b5fd4dc42845ef2e182181bad \
|
||||||
|
--hash=sha256:7d9e76a77c9644fbd60da3984e516ead5b84817748c2da92515cd36f1941a04d \
|
||||||
|
--hash=sha256:7f082321c244ff5d0e252071fb2c4fe02063a45934144a1456a5370ca139bec2 \
|
||||||
|
--hash=sha256:7f471fd055a9e13cf5f4337ee12379b30b4dcda1ae0d85018d4649e841578c02 \
|
||||||
|
--hash=sha256:7f75797bc75a2e7edf52a1c9e68a1295fa84250161c8f4e41df9e72723332c65 \
|
||||||
|
--hash=sha256:8c9e6b07c136d06495c792f603099039bb1e7c6c29854cc5eb3d7640268df016 \
|
||||||
|
--hash=sha256:a1719788a787808c15c9bb98b6ff0c97d64a0e59c1a6ebe36d4ae4d7c5c09b95 \
|
||||||
|
--hash=sha256:a194e3684b3553ea199c3f4837f422a521c7e2f0cce13527adc3a6b4049f9e7c \
|
||||||
|
--hash=sha256:a69b5c02014448a412012752dc46091902d28932c3be0c6e02e73cecceffb700 \
|
||||||
|
--hash=sha256:ad5cf0ebc3cdb12edb5aa50472108807ffd0a0ce95f87c0366a479fa83a7c346 \
|
||||||
|
--hash=sha256:b5cc434eb4a4df2fe442bcc50df114e82ff7aa234657baf873b2c9cf3f851e8e \
|
||||||
|
--hash=sha256:b9854fa70edbe93242b8bb4840e58d1128c45766d9a70710f05b4f67eb0feb6e \
|
||||||
|
--hash=sha256:d7a19435ca3d7dd502b8d8dce643555eb1b6013989e3f7577857289f6db6be16 \
|
||||||
|
--hash=sha256:eb766218abad05c7c3ddad5367d0ec702d6152cb4a48d9fd56a6cef6abade70c \
|
||||||
|
--hash=sha256:f0eb88dbac8a7e61ee433006d0ddfd6eb921f5c6c224d1b50855bc98fb304c44 \
|
||||||
|
--hash=sha256:f326218262c8d8537887cc4bbd613c8409d62f2cac799835c0360e0d9cefaa5c \
|
||||||
|
--hash=sha256:f677668ab2bb2452fee955af3702fbb3b71919e61e4520762b1e5f54af59c0d8
|
||||||
|
# via pandas
|
||||||
|
numpy==2.2.6 \
|
||||||
|
--hash=sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff \
|
||||||
|
--hash=sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47 \
|
||||||
|
--hash=sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84 \
|
||||||
|
--hash=sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d \
|
||||||
|
--hash=sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6 \
|
||||||
|
--hash=sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f \
|
||||||
|
--hash=sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b \
|
||||||
|
--hash=sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49 \
|
||||||
|
--hash=sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163 \
|
||||||
|
--hash=sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571 \
|
||||||
|
--hash=sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42 \
|
||||||
|
--hash=sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff \
|
||||||
|
--hash=sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491 \
|
||||||
|
--hash=sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4 \
|
||||||
|
--hash=sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566 \
|
||||||
|
--hash=sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf \
|
||||||
|
--hash=sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40 \
|
||||||
|
--hash=sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd \
|
||||||
|
--hash=sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06 \
|
||||||
|
--hash=sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282 \
|
||||||
|
--hash=sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680 \
|
||||||
|
--hash=sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db \
|
||||||
|
--hash=sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3 \
|
||||||
|
--hash=sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90 \
|
||||||
|
--hash=sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1 \
|
||||||
|
--hash=sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289 \
|
||||||
|
--hash=sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab \
|
||||||
|
--hash=sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c \
|
||||||
|
--hash=sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d \
|
||||||
|
--hash=sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb \
|
||||||
|
--hash=sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d \
|
||||||
|
--hash=sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a \
|
||||||
|
--hash=sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf \
|
||||||
|
--hash=sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1 \
|
||||||
|
--hash=sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2 \
|
||||||
|
--hash=sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a \
|
||||||
|
--hash=sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543 \
|
||||||
|
--hash=sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00 \
|
||||||
|
--hash=sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c \
|
||||||
|
--hash=sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f \
|
||||||
|
--hash=sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd \
|
||||||
|
--hash=sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868 \
|
||||||
|
--hash=sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303 \
|
||||||
|
--hash=sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83 \
|
||||||
|
--hash=sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3 \
|
||||||
|
--hash=sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d \
|
||||||
|
--hash=sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87 \
|
||||||
|
--hash=sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa \
|
||||||
|
--hash=sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f \
|
||||||
|
--hash=sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae \
|
||||||
|
--hash=sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda \
|
||||||
|
--hash=sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915 \
|
||||||
|
--hash=sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249 \
|
||||||
|
--hash=sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de \
|
||||||
|
--hash=sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8
|
||||||
|
# via
|
||||||
|
# bottleneck
|
||||||
|
# numba
|
||||||
|
# numexpr
|
||||||
|
# pandas
|
||||||
|
# py3langid
|
||||||
|
pandas==2.2.3 \
|
||||||
|
--hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \
|
||||||
|
--hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \
|
||||||
|
--hash=sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5 \
|
||||||
|
--hash=sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4 \
|
||||||
|
--hash=sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0 \
|
||||||
|
--hash=sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32 \
|
||||||
|
--hash=sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28 \
|
||||||
|
--hash=sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f \
|
||||||
|
--hash=sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348 \
|
||||||
|
--hash=sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18 \
|
||||||
|
--hash=sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468 \
|
||||||
|
--hash=sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5 \
|
||||||
|
--hash=sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667 \
|
||||||
|
--hash=sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645 \
|
||||||
|
--hash=sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13 \
|
||||||
|
--hash=sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3 \
|
||||||
|
--hash=sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d \
|
||||||
|
--hash=sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb \
|
||||||
|
--hash=sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3 \
|
||||||
|
--hash=sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039 \
|
||||||
|
--hash=sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8 \
|
||||||
|
--hash=sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd \
|
||||||
|
--hash=sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659 \
|
||||||
|
--hash=sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57 \
|
||||||
|
--hash=sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4 \
|
||||||
|
--hash=sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a \
|
||||||
|
--hash=sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9 \
|
||||||
|
--hash=sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42 \
|
||||||
|
--hash=sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2 \
|
||||||
|
--hash=sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc \
|
||||||
|
--hash=sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698 \
|
||||||
|
--hash=sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed \
|
||||||
|
--hash=sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015 \
|
||||||
|
--hash=sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24 \
|
||||||
|
--hash=sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319
|
||||||
|
# via
|
||||||
|
# country-converter
|
||||||
|
# csv-metadata-quality
|
||||||
|
platformdirs==4.3.8 \
|
||||||
|
--hash=sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc \
|
||||||
|
--hash=sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4
|
||||||
|
# via requests-cache
|
||||||
|
py3langid==0.3.0 \
|
||||||
|
--hash=sha256:0a875a031a58aaf9dbda7bb8285fd75e801a7bd276216ffabe037901d4b449ec \
|
||||||
|
--hash=sha256:38f022eec31cf9a2bf6f142acb2a9b350fd7d0d5ae7762b1392c6d3567401fd3
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pyarrow==20.0.0 \
|
||||||
|
--hash=sha256:00138f79ee1b5aca81e2bdedb91e3739b987245e11fa3c826f9e57c5d102fb75 \
|
||||||
|
--hash=sha256:15aa1b3b2587e74328a730457068dc6c89e6dcbf438d4369f572af9d320a25ee \
|
||||||
|
--hash=sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6 \
|
||||||
|
--hash=sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781 \
|
||||||
|
--hash=sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0 \
|
||||||
|
--hash=sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd \
|
||||||
|
--hash=sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031 \
|
||||||
|
--hash=sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc \
|
||||||
|
--hash=sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b \
|
||||||
|
--hash=sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8 \
|
||||||
|
--hash=sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c \
|
||||||
|
--hash=sha256:4a8b029a07956b8d7bd742ffca25374dd3f634b35e46cc7a7c3fa4c75b297191 \
|
||||||
|
--hash=sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199 \
|
||||||
|
--hash=sha256:5605919fbe67a7948c1f03b9f3727d82846c053cd2ce9303ace791855923fd20 \
|
||||||
|
--hash=sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232 \
|
||||||
|
--hash=sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a \
|
||||||
|
--hash=sha256:6415a0d0174487456ddc9beaead703d0ded5966129fa4fd3114d76b5d1c5ceae \
|
||||||
|
--hash=sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c \
|
||||||
|
--hash=sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba \
|
||||||
|
--hash=sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab \
|
||||||
|
--hash=sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70 \
|
||||||
|
--hash=sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9 \
|
||||||
|
--hash=sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e \
|
||||||
|
--hash=sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb \
|
||||||
|
--hash=sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b \
|
||||||
|
--hash=sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3 \
|
||||||
|
--hash=sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b \
|
||||||
|
--hash=sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5 \
|
||||||
|
--hash=sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3 \
|
||||||
|
--hash=sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893 \
|
||||||
|
--hash=sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122 \
|
||||||
|
--hash=sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28 \
|
||||||
|
--hash=sha256:a5704f29a74b81673d266e5ec1fe376f060627c2e42c5c7651288ed4b0db29e9 \
|
||||||
|
--hash=sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62 \
|
||||||
|
--hash=sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae \
|
||||||
|
--hash=sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4 \
|
||||||
|
--hash=sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f \
|
||||||
|
--hash=sha256:c7dd06fd7d7b410ca5dc839cc9d485d2bc4ae5240851bcd45d85105cc90a47d7 \
|
||||||
|
--hash=sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63 \
|
||||||
|
--hash=sha256:d5382de8dc34c943249b01c19110783d0d64b207167c728461add1ecc2db88e4 \
|
||||||
|
--hash=sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061 \
|
||||||
|
--hash=sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a \
|
||||||
|
--hash=sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368 \
|
||||||
|
--hash=sha256:f2d67ac28f57a362f1a2c1e6fa98bfe2f03230f7e15927aecd067433b1e70ce8 \
|
||||||
|
--hash=sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c \
|
||||||
|
--hash=sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1
|
||||||
|
# via pandas
|
||||||
|
pycountry==24.6.1 \
|
||||||
|
--hash=sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221 \
|
||||||
|
--hash=sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f
|
||||||
|
# via csv-metadata-quality
|
||||||
|
python-dateutil==2.9.0.post0 \
|
||||||
|
--hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
|
||||||
|
--hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
|
||||||
|
# via pandas
|
||||||
|
python-stdnum==1.20 \
|
||||||
|
--hash=sha256:111008e10391d54fb2afad2a10df70d5cb0c6c0a7ec82fec6f022cb8712961d3 \
|
||||||
|
--hash=sha256:ad2a2cf2eb025de408210235f36b4ae31252de3186240ccaa8126e117cb82690
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pytz==2025.2 \
|
||||||
|
--hash=sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3 \
|
||||||
|
--hash=sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00
|
||||||
|
# via pandas
|
||||||
|
requests==2.32.4 \
|
||||||
|
--hash=sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c \
|
||||||
|
--hash=sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422
|
||||||
|
# via
|
||||||
|
# csv-metadata-quality
|
||||||
|
# requests-cache
|
||||||
|
requests-cache==1.2.1 \
|
||||||
|
--hash=sha256:1285151cddf5331067baa82598afe2d47c7495a1334bfe7a7d329b43e9fd3603 \
|
||||||
|
--hash=sha256:68abc986fdc5b8d0911318fbb5f7c80eebcd4d01bfacc6685ecf8876052511d1
|
||||||
|
# via csv-metadata-quality
|
||||||
|
six==1.17.0 \
|
||||||
|
--hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
|
||||||
|
--hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
|
||||||
|
# via python-dateutil
|
||||||
|
typing-extensions==4.14.0 \
|
||||||
|
--hash=sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4 \
|
||||||
|
--hash=sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af
|
||||||
|
# via
|
||||||
|
# cattrs
|
||||||
|
# exceptiongroup
|
||||||
|
tzdata==2025.2 \
|
||||||
|
--hash=sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 \
|
||||||
|
--hash=sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9
|
||||||
|
# via pandas
|
||||||
|
url-normalize==2.2.1 \
|
||||||
|
--hash=sha256:3deb687587dc91f7b25c9ae5162ffc0f057ae85d22b1e15cf5698311247f567b \
|
||||||
|
--hash=sha256:74a540a3b6eba1d95bdc610c24f2c0141639f3ba903501e61a52a8730247ff37
|
||||||
|
# via requests-cache
|
||||||
|
urllib3==2.5.0 \
|
||||||
|
--hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
|
||||||
|
--hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
|
||||||
|
# via
|
||||||
|
# requests
|
||||||
|
# requests-cache
|
||||||
|
wcwidth==0.2.13 \
|
||||||
|
--hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \
|
||||||
|
--hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5
|
||||||
|
# via ftfy
|
||||||
|
37
setup.py
37
setup.py
@ -1,37 +0,0 @@
|
|||||||
# Packaging script for csv-metadata-quality: reads the README for the long
# description and registers the package and its console entry point.
import setuptools

# Read the README explicitly as UTF-8. Without an explicit encoding, open()
# falls back to the locale's preferred encoding, which is not UTF-8 on every
# platform and can make the build fail or garble the long description.
with open("README.md", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies installed alongside the package.
install_requires = [
    "pandas",
    "python-stdnum",
    "requests",
    "requests-cache",
    "pycountry",
    "langid",
]

setuptools.setup(
    name="csv-metadata-quality",
    version="0.6.0",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
    license="GPLv3",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/alanorth/csv-metadata-quality",
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
    ],
    packages=["csv_metadata_quality"],
    # Expose the command line tool as "csv-metadata-quality".
    entry_points={
        "console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
    },
    install_requires=install_requires,
)
|
|
@ -1,11 +1,14 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import requests_cache
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
|
|
||||||
import csv_metadata_quality.check as check
|
import csv_metadata_quality.check as check
|
||||||
@ -74,7 +77,7 @@ def run(argv):
|
|||||||
signal.signal(signal.SIGINT, signal_handler)
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
|
||||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||||
df = pd.read_csv(args.input_file, dtype=str)
|
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
|
||||||
|
|
||||||
# Check if the user requested to skip any fields
|
# Check if the user requested to skip any fields
|
||||||
if args.exclude_fields:
|
if args.exclude_fields:
|
||||||
@ -82,7 +85,20 @@ def run(argv):
|
|||||||
# user should be careful to no include spaces here.
|
# user should be careful to no include spaces here.
|
||||||
exclude = args.exclude_fields.split(",")
|
exclude = args.exclude_fields.split(",")
|
||||||
else:
|
else:
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
|
# enable transparent request cache with thirty days expiry
|
||||||
|
expire_after = timedelta(days=30)
|
||||||
|
# Allow overriding the location of the requests cache, just in case we are
|
||||||
|
# running in an environment where we can't write to the current working di-
|
||||||
|
# rectory (for example from csv-metadata-quality-web).
|
||||||
|
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||||
|
requests_cache.install_cache(
|
||||||
|
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||||
|
)
|
||||||
|
|
||||||
|
# prune old cache entries
|
||||||
|
requests_cache.delete()
|
||||||
|
|
||||||
for column in df.columns:
|
for column in df.columns:
|
||||||
if column in exclude:
|
if column in exclude:
|
||||||
@ -91,7 +107,9 @@ def run(argv):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
match = re.match(r"^.*?abstract.*$", column)
|
# Skip whitespace and newline fixes on abstracts and descriptions
|
||||||
|
# because there are too many with legitimate multi-line metadata.
|
||||||
|
match = re.match(r"^.*?(abstract|description).*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
# Fix: whitespace
|
# Fix: whitespace
|
||||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
@ -102,7 +120,7 @@ def run(argv):
|
|||||||
# Fix: missing space after comma. Only run on author and citation
|
# Fix: missing space after comma. Only run on author and citation
|
||||||
# fields for now, as this problem is mostly an issue in names.
|
# fields for now, as this problem is mostly an issue in names.
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
match = re.match(r"^.*?(author|citation).*$", column)
|
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||||
|
|
||||||
@ -123,10 +141,15 @@ def run(argv):
|
|||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
|
# Fix: normalize DOIs
|
||||||
|
match = re.match(r"^.*?identifier\.doi.*$", column)
|
||||||
|
if match is not None:
|
||||||
|
df[column] = df[column].apply(fix.normalize_dois)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
# and abstract fields because "|" is used to indicate something like
|
# and abstract fields because "|" is used to indicate something like
|
||||||
# a subtitle.
|
# a subtitle.
|
||||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# Run whitespace fix again after fixing invalid separators
|
@ -1,14 +1,12 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime
|
||||||
|
|
||||||
import country_converter as coco
|
import country_converter as coco
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import requests_cache
|
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
from stdnum import isbn as stdnum_isbn
|
from stdnum import isbn as stdnum_isbn
|
||||||
@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# List of suspicious characters, for example: ́ˆ~`
|
# List of suspicious characters, for example: ́ˆ~`
|
||||||
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
|
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]
|
||||||
|
|
||||||
for character in suspicious_characters:
|
for character in suspicious_characters:
|
||||||
# Find the position of the suspicious character in the string
|
# Find the position of the suspicious character in the string
|
||||||
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
|
|||||||
if pd.isna(field):
|
if pd.isna(field):
|
||||||
return
|
return
|
||||||
|
|
||||||
# enable transparent request cache with thirty days expiry
|
|
||||||
expire_after = timedelta(days=30)
|
|
||||||
# Allow overriding the location of the requests cache, just in case we are
|
|
||||||
# running in an environment where we can't write to the current working di-
|
|
||||||
# rectory (for example from csv-metadata-quality-web).
|
|
||||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
|
||||||
requests_cache.install_cache(
|
|
||||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
|
||||||
)
|
|
||||||
|
|
||||||
# prune old cache entries
|
|
||||||
# requests_cache.remove_expired_responses()
|
|
||||||
|
|
||||||
# Initialize an empty list to hold the validated AGROVOC values
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||||
request_params = {"query": value}
|
request_params = {"query": value}
|
||||||
|
|
||||||
request = requests.get(request_url, params=request_params)
|
request = requests.get(request_url, params=request_params)
|
||||||
@ -373,7 +358,7 @@ def duplicate_items(df):
|
|||||||
|
|
||||||
if items_count_unique < items_count_total:
|
if items_count_unique < items_count_total:
|
||||||
# Create a list to hold our items while we check for duplicates
|
# Create a list to hold our items while we check for duplicates
|
||||||
items = list()
|
items = []
|
||||||
|
|
||||||
for index, row in df.iterrows():
|
for index, row in df.iterrows():
|
||||||
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
||||||
@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
|
|||||||
if row[region_column_name] is not None:
|
if row[region_column_name] is not None:
|
||||||
regions = row[region_column_name].split("||")
|
regions = row[region_column_name].split("||")
|
||||||
else:
|
else:
|
||||||
regions = list()
|
regions = []
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
|
|||||||
un_region = cc.convert(names=country, to="UNRegion")
|
un_region = cc.convert(names=country, to="UNRegion")
|
||||||
|
|
||||||
if un_region != "not found" and un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
|
try:
|
||||||
print(
|
print(
|
||||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
)
|
)
|
||||||
|
except KeyError:
|
||||||
|
print(
|
||||||
|
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
|
||||||
|
)
|
||||||
|
|
||||||
return
|
return
|
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import langid
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import py3langid as langid
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
|
|
||||||
@ -20,7 +20,7 @@ def correct_language(row, exclude):
|
|||||||
# Initialize some variables at global scope so that we can set them in the
|
# Initialize some variables at global scope so that we can set them in the
|
||||||
# loop scope below and still be able to access them afterwards.
|
# loop scope below and still be able to access them afterwards.
|
||||||
language = ""
|
language = ""
|
||||||
sample_strings = list()
|
sample_strings = []
|
||||||
title = None
|
title = None
|
||||||
|
|
||||||
# Iterate over the labels of the current row's values. Before we transposed
|
# Iterate over the labels of the current row's values. Before we transposed
|
@ -23,7 +23,7 @@ def whitespace(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Initialize an empty list to hold the cleaned values
|
# Initialize an empty list to hold the cleaned values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
@ -64,7 +64,7 @@ def separators(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Initialize an empty list to hold the cleaned values
|
# Initialize an empty list to hold the cleaned values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
@ -175,7 +175,7 @@ def duplicates(field, field_name):
|
|||||||
values = field.split("||")
|
values = field.split("||")
|
||||||
|
|
||||||
# Initialize an empty list to hold the de-duplicated values
|
# Initialize an empty list to hold the de-duplicated values
|
||||||
new_values = list()
|
new_values = []
|
||||||
|
|
||||||
# Iterate over all values
|
# Iterate over all values
|
||||||
for value in values:
|
for value in values:
|
||||||
@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
|
|||||||
if row[region_column_name] is not None:
|
if row[region_column_name] is not None:
|
||||||
regions = row[region_column_name].split("||")
|
regions = row[region_column_name].split("||")
|
||||||
else:
|
else:
|
||||||
regions = list()
|
regions = []
|
||||||
|
|
||||||
# An empty list for our regions so we can keep track for all countries
|
# An empty list for our regions so we can keep track for all countries
|
||||||
missing_regions = list()
|
missing_regions = []
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
|
|||||||
# it doesn't already exist in regions.
|
# it doesn't already exist in regions.
|
||||||
if un_region != "not found" and un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
if un_region not in missing_regions:
|
if un_region not in missing_regions:
|
||||||
|
try:
|
||||||
print(
|
print(
|
||||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
)
|
)
|
||||||
|
except KeyError:
|
||||||
|
# If there is no title column in the CSV we will print
|
||||||
|
# the fix without the title instead of crashing.
|
||||||
|
print(
|
||||||
|
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
|
||||||
|
)
|
||||||
|
|
||||||
missing_regions.append(un_region)
|
missing_regions.append(un_region)
|
||||||
|
|
||||||
if len(missing_regions) > 0:
|
if len(missing_regions) > 0:
|
||||||
@ -387,3 +395,88 @@ def countries_match_regions(row, exclude):
|
|||||||
row[region_column_name] = "||".join(missing_regions)
|
row[region_column_name] = "||".join(missing_regions)
|
||||||
|
|
||||||
return row
|
return row
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_dois(field):
    """Normalize DOIs.

    DOIs are meant to be globally unique identifiers. They are case insensitive,
    but in order to compare them robustly they should be normalized to a common
    format:

        - strip leading and trailing whitespace
        - lowercase the value
        - convert all variations to https://doi.org/10.xxxx/xxxx URI format

    Multi-value fields separated by "||" are normalized value by value and
    joined again with the same separator.

    Return string with normalized DOI(s), or None when the field is missing.

    See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Ordered rewrite rules, compiled once per call instead of once per value.
    # The order matters: host names are cleaned up before the scheme-less and
    # bare "10." forms are expanded to full URIs.
    rewrites = (
        # Convert to HTTPS
        (re.compile(r"^http://"), "https://"),
        # Convert dx.doi.org to doi.org
        (re.compile(r"dx\.doi\.org"), "doi.org"),
        # Convert www.doi.org to doi.org
        (re.compile(r"www\.doi\.org"), "doi.org"),
        # Convert erroneous %2f to / (the value is already lowercased)
        (re.compile(r"%2f"), "/"),
        # Replace values like "doi: 10.11648/j.jps.20140201.14", tolerating
        # zero or more spaces after the colon ("doi:10.xxxx" is common too)
        (re.compile(r"^doi:\s*10\."), "https://doi.org/10."),
        # Replace scheme-less values like doi.org/10.3390/foods12010115
        (re.compile(r"^doi\.org/10\."), "https://doi.org/10."),
        # Replace values like 10.3390/foods12010115
        (re.compile(r"^10\."), "https://doi.org/10."),
    )

    # Initialize an empty list to hold the normalized values
    new_values = []

    # Try to split multi-value field on "||" separator and iterate over all
    # values (most items will only have one DOI)
    for value in field.split("||"):
        # Strip leading and trailing whitespace, then lowercase
        new_value = value.strip().lower()

        # sub() is a no-op when the pattern does not match, so there is no
        # need to test for a match first.
        for pattern, replacement in rewrites:
            new_value = pattern.sub(replacement, new_value)

        # Report the original value whenever normalization changed it
        if new_value != value:
            print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")

        new_values.append(new_value)

    new_field = "||".join(new_values)

    return new_field
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from importlib.resources import files
|
import os
|
||||||
|
|
||||||
from ftfy.badness import is_bad
|
from ftfy.badness import is_bad
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ def is_mojibake(field):
|
|||||||
def load_spdx_licenses():
|
def load_spdx_licenses():
|
||||||
"""Returns a Python list of SPDX short license identifiers."""
|
"""Returns a Python list of SPDX short license identifiers."""
|
||||||
|
|
||||||
with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
|
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
|
||||||
licenses = json.load(f)
|
licenses = json.load(f)
|
||||||
|
|
||||||
# List comprehension to extract the license ID for each license
|
# List comprehension to extract the license ID for each license
|
@ -1,3 +1,3 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
VERSION = "0.6.0"
|
VERSION = "0.7.0"
|
@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "es"
|
language = "es"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "spa"
|
language = "spa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "en"
|
language = "en"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "eng"
|
language = "eng"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -407,7 +407,7 @@ def test_check_doi_field():
|
|||||||
# the citation and a DOI field.
|
# the citation and a DOI field.
|
||||||
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
||||||
series = pd.Series(data=d)
|
series = pd.Series(data=d)
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
result = check.citation_doi(series, exclude)
|
result = check.citation_doi(series, exclude)
|
||||||
|
|
||||||
@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
|
|||||||
"""Test an item with a DOI in its citation, but no DOI field."""
|
"""Test an item with a DOI in its citation, but no DOI field."""
|
||||||
|
|
||||||
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# an empty DOI field and a citation containing a DOI.
|
# an empty DOI field and a citation containing a DOI.
|
||||||
@ -439,7 +439,7 @@ def test_title_in_citation():
|
|||||||
|
|
||||||
title = "Testing all the things"
|
title = "Testing all the things"
|
||||||
citation = "Orth, A. 2021. Testing all the things."
|
citation = "Orth, A. 2021. Testing all the things."
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# the title and citation.
|
# the title and citation.
|
||||||
@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):
|
|||||||
|
|
||||||
title = "Testing all the things"
|
title = "Testing all the things"
|
||||||
citation = "Orth, A. 2021. Testing all teh things."
|
citation = "Orth, A. 2021. Testing all teh things."
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# the title and citation.
|
# the title and citation.
|
||||||
@ -477,7 +477,7 @@ def test_country_matches_region():
|
|||||||
|
|
||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = "Eastern Africa"
|
region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
||||||
@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
|
|||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = ""
|
region = ""
|
||||||
missing_region = "Eastern Africa"
|
missing_region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {
|
d = {
|
||||||
|
@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
|
|||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = ""
|
region = ""
|
||||||
missing_region = "Eastern Africa"
|
missing_region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {
|
d = {
|
||||||
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
|
|||||||
series_correct = pd.Series(data=d_correct)
|
series_correct = pd.Series(data=d_correct)
|
||||||
|
|
||||||
pd.testing.assert_series_equal(result, series_correct)
|
pd.testing.assert_series_equal(result, series_correct)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_normalize_dois():
    """Check that a "doi: 10.xxxx/..." value is rewritten as a doi.org URI."""

    raw_doi = "doi: 10.11648/j.jps.20140201.14"
    expected = "https://doi.org/10.11648/j.jps.20140201.14"

    normalized = fix.normalize_dois(raw_doi)

    assert normalized == expected
|
||||||
|
Reference in New Issue
Block a user