mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-09 14:46:00 +02:00
Compare commits
119 Commits
v0.6.1
...
81e3ca3d9c
Author | SHA1 | Date | |
---|---|---|---|
81e3ca3d9c
|
|||
c470f8b375
|
|||
0f45448517
|
|||
7dd52ca491
|
|||
92ff0ee51b
|
|||
ae38a826ec
|
|||
c1f630c298
|
|||
82b056f0ea
|
|||
7fca981b95
|
|||
1a9424197b
|
|||
f6c6c94a1e
|
|||
f500fac64b
|
|||
8143a7d978
|
|||
94cec080d6
|
|||
9402af1e30
|
|||
d71ff9082b
|
|||
f309b694c4
|
|||
4d879f6d13
|
|||
a30fefcd52
|
|||
2341c56c40
|
|||
5be2195325
|
|||
736948ed2c
|
|||
ee0b448355
|
|||
4f3174a543
|
|||
d5c25f82fa
|
|||
7b3e2b4e68 | |||
f92b2fe206 | |||
df040b70c7 | |||
10bc8f3e14 | |||
7e6e92ecaa
|
|||
a21ffb0fa8
|
|||
fb341dd9fa | |||
2e943ee4db | |||
6d3a9870d6 | |||
82ecf7119a | |||
1db21cf275 | |||
bcd1408798 | |||
ee8d255811 | |||
2cc2dbe952
|
|||
940a325d61
|
|||
59b3b307c9
|
|||
b305da3f0b
|
|||
96a486471c | |||
530cd5863b
|
|||
f6018c51b6
|
|||
80c3f5b45a
|
|||
ba4637ea34 | |||
355428a691 | |||
58d4de973e | |||
e1216dae3c | |||
6b650ff1b3 | |||
fa7bde6fc0 | |||
f89159fe32 | |||
02058c5a65 | |||
8fed6b71ff | |||
b005b28cbe | |||
c626290599 | |||
1a06470b64 | |||
d46a81672e | |||
2a50e75082 | |||
0d45e73983 | |||
3611aab425 | |||
5c4ad0eb41 | |||
f1f39722f6 | |||
1c03999582 | |||
1f637f32cd
|
|||
b8241e919d
|
|||
b8dc19cc3f
|
|||
93c9b739ac
|
|||
4ed2786703
|
|||
8728789183 | |||
bf90464809
|
|||
1878002391 | |||
d21d2621e3 | |||
f3fb1ff7fb | |||
1fa81f7558 | |||
7409193b6b | |||
a84fcf0b7b
|
|||
25ac290df4
|
|||
3f52bad1e3
|
|||
0208ad0ade | |||
3632ae0fc9 | |||
17d089cc6e
|
|||
bc470a4343
|
|||
be609a809d
|
|||
de3387ded7
|
|||
f343e87f0c
|
|||
7d3524fbd5
|
|||
c614b71a52 | |||
d159a839f3 | |||
36e2ebe5f4
|
|||
33f67b7a7c
|
|||
c0e1448439
|
|||
5d0804a08f
|
|||
f01c9edf17
|
|||
8d4295b2b3
|
|||
e2d46e9495
|
|||
1491e1edb0
|
|||
34142c3e6b
|
|||
0c88b96e8d
|
|||
2e55b4d6e3
|
|||
c90aad29f0
|
|||
6fd1e1377f
|
|||
c64b7eb1f1
|
|||
29cbc4f3a3
|
|||
307af1acfc
|
|||
b5106de9df
|
|||
9eeadfc44e
|
|||
d4aed378cf
|
|||
20a2cce34b
|
|||
d661ffe439
|
|||
45a310387a
|
|||
47b03c49ba
|
|||
986b81cbf4
|
|||
d43a47ae32
|
|||
ede37569f1
|
|||
0c53efe60a
|
|||
5f0e25b818
|
|||
4776154d6c
|
70
.drone.yml
70
.drone.yml
@ -1,3 +1,33 @@
|
|||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
type: docker
|
||||||
|
name: python311
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: test
|
||||||
|
image: python:3.11-slim
|
||||||
|
commands:
|
||||||
|
- id
|
||||||
|
- python -V
|
||||||
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||||
|
- python -m pip install poetry
|
||||||
|
- poetry install
|
||||||
|
- poetry run pytest
|
||||||
|
# Basic test
|
||||||
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
|
# Basic test with unsafe fixes
|
||||||
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
|
# Geography test
|
||||||
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
|
# Geography test with unsafe fixes
|
||||||
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
|
# Test with experimental checks
|
||||||
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
|
# Test with AGROVOC validation
|
||||||
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||||
|
# Test with AGROVOC validation (and dropping invalid)
|
||||||
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||||
|
|
||||||
---
|
---
|
||||||
kind: pipeline
|
kind: pipeline
|
||||||
type: docker
|
type: docker
|
||||||
@ -10,23 +40,23 @@ steps:
|
|||||||
- id
|
- id
|
||||||
- python -V
|
- python -V
|
||||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||||
- pip install -r requirements-dev.txt
|
- python -m pip install poetry
|
||||||
- pytest
|
- poetry install
|
||||||
- python setup.py install
|
- poetry run pytest
|
||||||
# Basic test
|
# Basic test
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Basic test with unsafe fixes
|
# Basic test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
# Geography test
|
# Geography test
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
# Geography test with unsafe fixes
|
# Geography test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
# Test with AGROVOC validation (and dropping invalid)
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||||
|
|
||||||
---
|
---
|
||||||
kind: pipeline
|
kind: pipeline
|
||||||
@ -40,22 +70,22 @@ steps:
|
|||||||
- id
|
- id
|
||||||
- python -V
|
- python -V
|
||||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||||
- pip install -r requirements-dev.txt
|
- python -m pip install poetry
|
||||||
- pytest
|
- poetry install
|
||||||
- python setup.py install
|
- poetry run pytest
|
||||||
# Basic test
|
# Basic test
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Basic test with unsafe fixes
|
# Basic test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
# Geography test
|
# Geography test
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||||
# Geography test with unsafe fixes
|
# Geography test with unsafe fixes
|
||||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
# Test with AGROVOC validation (and dropping invalid)
|
||||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||||
|
|
||||||
# vim: ts=2 sw=2 et
|
# vim: ts=2 sw=2 et
|
||||||
|
36
.github/workflows/python-app.yml
vendored
36
.github/workflows/python-app.yml
vendored
@ -15,37 +15,27 @@ jobs:
|
|||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- name: Set up Python 3.10
|
- name: Install rye
|
||||||
uses: actions/setup-python@v4
|
uses: eifinger/setup-rye@v4
|
||||||
with:
|
with:
|
||||||
python-version: '3.10'
|
version: 'latest'
|
||||||
cache: 'pip'
|
- run: rye sync
|
||||||
- name: Install dependencies
|
- name: Lint
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install flake8 pytest
|
|
||||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
|
||||||
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
|
|
||||||
- name: Lint with flake8
|
|
||||||
run: |
|
run: |
|
||||||
# stop the build if there are Python syntax errors or undefined names
|
# stop the build if there are Python syntax errors or undefined names
|
||||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
rye lint
|
||||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
|
||||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: rye test
|
||||||
pytest
|
|
||||||
- name: Test CLI
|
- name: Test CLI
|
||||||
run: |
|
run: |
|
||||||
python setup.py install
|
|
||||||
# Basic test
|
# Basic test
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||||
# Test with unsafe fixes
|
# Test with unsafe fixes
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||||
# Test with experimental checks
|
# Test with experimental checks
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||||
# Test with AGROVOC validation
|
# Test with AGROVOC validation
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||||
# Test with AGROVOC validation (and dropping invalid)
|
# Test with AGROVOC validation (and dropping invalid)
|
||||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||||
|
1
.python-version
Normal file
1
.python-version
Normal file
@ -0,0 +1 @@
|
|||||||
|
3.12
|
20
CHANGELOG.md
20
CHANGELOG.md
@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## Unreleased
|
||||||
|
### Added
|
||||||
|
- Ability to normalize DOIs to https://doi.org URI format
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Fixed regex so we don't run the invalid multi-value separator fix on
|
||||||
|
`dcterms.bibliographicCitation` fields
|
||||||
|
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
|
||||||
|
fields
|
||||||
|
- Don't crash the country/region checker/fixer when a title field is missing
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Don't run newline fix on description fields
|
||||||
|
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||||
|
- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
|
||||||
|
|
||||||
|
### Updated
|
||||||
|
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||||
|
- SPDX license list
|
||||||
|
|
||||||
## [0.6.1] - 2023-02-23
|
## [0.6.1] - 2023-02-23
|
||||||
### Fixed
|
### Fixed
|
||||||
- Missing region check should ignore subregion field, if it exists
|
- Missing region check should ignore subregion field, if it exists
|
||||||
|
1
MANIFEST.in
Normal file
1
MANIFEST.in
Normal file
@ -0,0 +1 @@
|
|||||||
|
include csv_metadata_quality/data/licenses.json
|
@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
|
|||||||
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
||||||
- Remove duplicate metadata values
|
- Remove duplicate metadata values
|
||||||
- Check for duplicate items, using the title, type, and date issued as an indicator
|
- Check for duplicate items, using the title, type, and date issued as an indicator
|
||||||
|
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
||||||
@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Better logging, for example with INFO, WARN, and ERR levels
|
- Better logging, for example with INFO, WARN, and ERR levels
|
||||||
- Verbose, debug, or quiet options
|
- Verbose, debug, or quiet options
|
||||||
- Warn if an author is shorter than 3 characters?
|
- Warn if an author is shorter than 3 characters?
|
||||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
|
||||||
- Warn if two items use the same file in `filename` column
|
- Warn if two items use the same file in `filename` column
|
||||||
- Add an option to drop invalid AGROVOC subjects?
|
|
||||||
- Add tests for application invocation, ie `tests/test_app.py`?
|
- Add tests for application invocation, ie `tests/test_app.py`?
|
||||||
- Validate ISSNs or journal titles against CrossRef API?
|
- Validate ISSNs or journal titles against CrossRef API?
|
||||||
- Add configurable field validation, like specify a field name and a validation file?
|
- Add configurable field validation, like specify a field name and a validation file?
|
||||||
@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Warn if item is Open Access, but missing a license
|
- Warn if item is Open Access, but missing a license
|
||||||
- Warn if item has an ISSN but no journal title
|
- Warn if item has an ISSN but no journal title
|
||||||
- Update journal titles from ISSN
|
- Update journal titles from ISSN
|
||||||
- Migrate to https://github.com/spdx/license-list-data
|
- Migrate from Pandas to Polars
|
||||||
|
|
||||||
## License
|
## License
|
||||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import requests_cache
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
|
|
||||||
import csv_metadata_quality.check as check
|
import csv_metadata_quality.check as check
|
||||||
@ -74,7 +77,7 @@ def run(argv):
|
|||||||
signal.signal(signal.SIGINT, signal_handler)
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
|
||||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||||
df = pd.read_csv(args.input_file, dtype=str)
|
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
|
||||||
|
|
||||||
# Check if the user requested to skip any fields
|
# Check if the user requested to skip any fields
|
||||||
if args.exclude_fields:
|
if args.exclude_fields:
|
||||||
@ -82,7 +85,20 @@ def run(argv):
|
|||||||
# user should be careful not to include spaces here.
|
# user should be careful not to include spaces here.
|
||||||
exclude = args.exclude_fields.split(",")
|
exclude = args.exclude_fields.split(",")
|
||||||
else:
|
else:
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
|
# enable transparent request cache with thirty days expiry
|
||||||
|
expire_after = timedelta(days=30)
|
||||||
|
# Allow overriding the location of the requests cache, just in case we are
|
||||||
|
# running in an environment where we can't write to the current working
|
||||||
|
# directory (for example from csv-metadata-quality-web).
|
||||||
|
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||||
|
requests_cache.install_cache(
|
||||||
|
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||||
|
)
|
||||||
|
|
||||||
|
# prune old cache entries
|
||||||
|
requests_cache.delete()
|
||||||
|
|
||||||
for column in df.columns:
|
for column in df.columns:
|
||||||
if column in exclude:
|
if column in exclude:
|
||||||
@ -91,7 +107,9 @@ def run(argv):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
match = re.match(r"^.*?abstract.*$", column)
|
# Skip whitespace and newline fixes on abstracts and descriptions
|
||||||
|
# because there are too many with legitimate multi-line metadata.
|
||||||
|
match = re.match(r"^.*?(abstract|description).*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
# Fix: whitespace
|
# Fix: whitespace
|
||||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||||
@ -102,7 +120,7 @@ def run(argv):
|
|||||||
# Fix: missing space after comma. Only run on author and citation
|
# Fix: missing space after comma. Only run on author and citation
|
||||||
# fields for now, as this problem is mostly an issue in names.
|
# fields for now, as this problem is mostly an issue in names.
|
||||||
if args.unsafe_fixes:
|
if args.unsafe_fixes:
|
||||||
match = re.match(r"^.*?(author|citation).*$", column)
|
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||||
|
|
||||||
@ -123,10 +141,15 @@ def run(argv):
|
|||||||
# Fix: unnecessary Unicode
|
# Fix: unnecessary Unicode
|
||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
|
# Fix: normalize DOIs
|
||||||
|
match = re.match(r"^.*?identifier\.doi.*$", column)
|
||||||
|
if match is not None:
|
||||||
|
df[column] = df[column].apply(fix.normalize_dois)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||||
# and abstract fields because "|" is used to indicate something like
|
# and abstract fields because "|" is used to indicate something like
|
||||||
# a subtitle.
|
# a subtitle.
|
||||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
|
||||||
if match is None:
|
if match is None:
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
# Run whitespace fix again after fixing invalid separators
|
# Run whitespace fix again after fixing invalid separators
|
||||||
|
@ -1,14 +1,12 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime
|
||||||
|
|
||||||
import country_converter as coco
|
import country_converter as coco
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import requests_cache
|
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
from stdnum import isbn as stdnum_isbn
|
from stdnum import isbn as stdnum_isbn
|
||||||
@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# List of suspicious characters, for example: ́ˆ~`
|
# List of suspicious characters, for example: ́ˆ~`
|
||||||
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
|
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]
|
||||||
|
|
||||||
for character in suspicious_characters:
|
for character in suspicious_characters:
|
||||||
# Find the position of the suspicious character in the string
|
# Find the position of the suspicious character in the string
|
||||||
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
|
|||||||
if pd.isna(field):
|
if pd.isna(field):
|
||||||
return
|
return
|
||||||
|
|
||||||
# enable transparent request cache with thirty days expiry
|
|
||||||
expire_after = timedelta(days=30)
|
|
||||||
# Allow overriding the location of the requests cache, just in case we are
|
|
||||||
# running in an environment where we can't write to the current working di-
|
|
||||||
# rectory (for example from csv-metadata-quality-web).
|
|
||||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
|
||||||
requests_cache.install_cache(
|
|
||||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
|
||||||
)
|
|
||||||
|
|
||||||
# prune old cache entries
|
|
||||||
# requests_cache.remove_expired_responses()
|
|
||||||
|
|
||||||
# Initialize an empty list to hold the validated AGROVOC values
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||||
request_params = {"query": value}
|
request_params = {"query": value}
|
||||||
|
|
||||||
request = requests.get(request_url, params=request_params)
|
request = requests.get(request_url, params=request_params)
|
||||||
@ -373,7 +358,7 @@ def duplicate_items(df):
|
|||||||
|
|
||||||
if items_count_unique < items_count_total:
|
if items_count_unique < items_count_total:
|
||||||
# Create a list to hold our items while we check for duplicates
|
# Create a list to hold our items while we check for duplicates
|
||||||
items = list()
|
items = []
|
||||||
|
|
||||||
for index, row in df.iterrows():
|
for index, row in df.iterrows():
|
||||||
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
||||||
@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
|
|||||||
if row[region_column_name] is not None:
|
if row[region_column_name] is not None:
|
||||||
regions = row[region_column_name].split("||")
|
regions = row[region_column_name].split("||")
|
||||||
else:
|
else:
|
||||||
regions = list()
|
regions = []
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
|
|||||||
un_region = cc.convert(names=country, to="UNRegion")
|
un_region = cc.convert(names=country, to="UNRegion")
|
||||||
|
|
||||||
if un_region != "not found" and un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
print(
|
try:
|
||||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
print(
|
||||||
)
|
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
|
)
|
||||||
|
except KeyError:
|
||||||
|
print(
|
||||||
|
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
|
||||||
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import langid
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import py3langid as langid
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
|
|
||||||
@ -20,7 +20,7 @@ def correct_language(row, exclude):
|
|||||||
# Initialize some variables at global scope so that we can set them in the
|
# Initialize some variables at global scope so that we can set them in the
|
||||||
# loop scope below and still be able to access them afterwards.
|
# loop scope below and still be able to access them afterwards.
|
||||||
language = ""
|
language = ""
|
||||||
sample_strings = list()
|
sample_strings = []
|
||||||
title = None
|
title = None
|
||||||
|
|
||||||
# Iterate over the labels of the current row's values. Before we transposed
|
# Iterate over the labels of the current row's values. Before we transposed
|
||||||
|
@ -23,7 +23,7 @@ def whitespace(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Initialize an empty list to hold the cleaned values
|
# Initialize an empty list to hold the cleaned values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
@ -64,7 +64,7 @@ def separators(field, field_name):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Initialize an empty list to hold the cleaned values
|
# Initialize an empty list to hold the cleaned values
|
||||||
values = list()
|
values = []
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
@ -175,7 +175,7 @@ def duplicates(field, field_name):
|
|||||||
values = field.split("||")
|
values = field.split("||")
|
||||||
|
|
||||||
# Initialize an empty list to hold the de-duplicated values
|
# Initialize an empty list to hold the de-duplicated values
|
||||||
new_values = list()
|
new_values = []
|
||||||
|
|
||||||
# Iterate over all values
|
# Iterate over all values
|
||||||
for value in values:
|
for value in values:
|
||||||
@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
|
|||||||
if row[region_column_name] is not None:
|
if row[region_column_name] is not None:
|
||||||
regions = row[region_column_name].split("||")
|
regions = row[region_column_name].split("||")
|
||||||
else:
|
else:
|
||||||
regions = list()
|
regions = []
|
||||||
|
|
||||||
# An empty list for our regions so we can keep track for all countries
|
# An empty list for our regions so we can keep track for all countries
|
||||||
missing_regions = list()
|
missing_regions = []
|
||||||
|
|
||||||
for country in countries:
|
for country in countries:
|
||||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||||
@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
|
|||||||
# it doesn't already exist in regions.
|
# it doesn't already exist in regions.
|
||||||
if un_region != "not found" and un_region not in regions:
|
if un_region != "not found" and un_region not in regions:
|
||||||
if un_region not in missing_regions:
|
if un_region not in missing_regions:
|
||||||
print(
|
try:
|
||||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
print(
|
||||||
)
|
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||||
|
)
|
||||||
|
except KeyError:
|
||||||
|
# If there is no title column in the CSV we will print
|
||||||
|
# the fix without the title instead of crashing.
|
||||||
|
print(
|
||||||
|
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
|
||||||
|
)
|
||||||
|
|
||||||
missing_regions.append(un_region)
|
missing_regions.append(un_region)
|
||||||
|
|
||||||
if len(missing_regions) > 0:
|
if len(missing_regions) > 0:
|
||||||
@ -387,3 +395,88 @@ def countries_match_regions(row, exclude):
|
|||||||
row[region_column_name] = "||".join(missing_regions)
|
row[region_column_name] = "||".join(missing_regions)
|
||||||
|
|
||||||
return row
|
return row
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_dois(field):
|
||||||
|
"""Normalize DOIs.
|
||||||
|
|
||||||
|
DOIs are meant to be globally unique identifiers. They are case insensitive,
|
||||||
|
but in order to compare them robustly they should be normalized to a common
|
||||||
|
format:
|
||||||
|
|
||||||
|
- strip leading and trailing whitespace
|
||||||
|
- lowercase all ASCII characters
|
||||||
|
- convert all variations to https://doi.org/10.xxxx/xxxx URI format
|
||||||
|
|
||||||
|
Return string with normalized DOI.
|
||||||
|
|
||||||
|
See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Skip fields with missing values
|
||||||
|
if pd.isna(field):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Try to split multi-value field on "||" separator
|
||||||
|
values = field.split("||")
|
||||||
|
|
||||||
|
# Initialize an empty list to hold the normalized values
|
||||||
|
new_values = []
|
||||||
|
|
||||||
|
# Iterate over all values (most items will only have one DOI)
|
||||||
|
for value in values:
|
||||||
|
# Strip leading and trailing whitespace
|
||||||
|
new_value = value.strip()
|
||||||
|
|
||||||
|
new_value = new_value.lower()
|
||||||
|
|
||||||
|
# Convert to HTTPS
|
||||||
|
pattern = re.compile(r"^http://")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "https://", new_value)
|
||||||
|
|
||||||
|
# Convert dx.doi.org to doi.org
|
||||||
|
pattern = re.compile(r"dx\.doi\.org")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "doi.org", new_value)
|
||||||
|
|
||||||
|
# Convert www.doi.org to doi.org
|
||||||
|
pattern = re.compile(r"www\.doi\.org")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "doi.org", new_value)
|
||||||
|
|
||||||
|
# Convert erroneous %2f to /
|
||||||
|
pattern = re.compile("%2f")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "/", new_value)
|
||||||
|
|
||||||
|
# Replace values like doi: 10.11648/j.jps.20140201.14
|
||||||
|
pattern = re.compile(r"^doi: 10\.")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "https://doi.org/10.", new_value)
|
||||||
|
|
||||||
|
# Replace values like 10.3390/foods12010115
|
||||||
|
pattern = re.compile(r"^10\.")
|
||||||
|
match = re.findall(pattern, new_value)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
new_value = re.sub(pattern, "https://doi.org/10.", new_value)
|
||||||
|
|
||||||
|
if new_value != value:
|
||||||
|
print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")
|
||||||
|
|
||||||
|
new_values.append(new_value)
|
||||||
|
|
||||||
|
new_field = "||".join(new_values)
|
||||||
|
|
||||||
|
return new_field
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from importlib.resources import files
|
import os
|
||||||
|
|
||||||
from ftfy.badness import is_bad
|
from ftfy.badness import is_bad
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ def is_mojibake(field):
|
|||||||
def load_spdx_licenses():
|
def load_spdx_licenses():
|
||||||
"""Returns a Python list of SPDX short license identifiers."""
|
"""Returns a Python list of SPDX short license identifiers."""
|
||||||
|
|
||||||
with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
|
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
|
||||||
licenses = json.load(f)
|
licenses = json.load(f)
|
||||||
|
|
||||||
# List comprehension to extract the license ID for each license
|
# List comprehension to extract the license ID for each license
|
||||||
|
@ -37,3 +37,7 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
|
|||||||
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
||||||
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
||||||
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
||||||
|
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
|
||||||
|
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
|
||||||
|
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
|
||||||
|
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
|
||||||
|
|
1756
poetry.lock
generated
1756
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,41 +1,63 @@
|
|||||||
[tool.poetry]
|
[project]
|
||||||
name = "csv-metadata-quality"
|
name = "csv-metadata-quality"
|
||||||
version = "0.6.1"
|
version = "0.6.1"
|
||||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
authors = [
|
||||||
license="GPL-3.0-only"
|
{ name = "Alan Orth", email = "alan.orth@gmail.com" }
|
||||||
|
]
|
||||||
|
license= { file = "LICENSE.txt" }
|
||||||
|
dependencies = [
|
||||||
|
"pandas[feather,performance]~=2.2",
|
||||||
|
"python-stdnum~=1.20",
|
||||||
|
"requests~=2.32",
|
||||||
|
"requests-cache~=1.2.1",
|
||||||
|
"colorama~=0.4",
|
||||||
|
"ftfy~=6.2.0",
|
||||||
|
"country-converter~=1.2",
|
||||||
|
"pycountry~=24.6.1",
|
||||||
|
"py3langid~=0.3",
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">= 3.9"
|
||||||
|
|
||||||
|
classifiers = [
|
||||||
|
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||||
|
"Natural Language :: English",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
"Programming Language :: Python :: 3.9",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: Implementation :: CPython",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
repository = "https://github.com/ilri/csv-metadata-quality"
|
repository = "https://github.com/ilri/csv-metadata-quality"
|
||||||
homepage = "https://github.com/ilri/csv-metadata-quality"
|
homepage = "https://github.com/ilri/csv-metadata-quality"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
[project.scripts]
|
||||||
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
# So rye doesn't fall back to setuptools
|
||||||
python = "^3.9"
|
# See: https://packaging.python.org/en/latest/tutorials/packaging-projects/#choosing-build-backend
|
||||||
pandas = "^1.5.2"
|
|
||||||
python-stdnum = "^1.18"
|
|
||||||
requests = "^2.28.2"
|
|
||||||
requests-cache = "^0.9.8"
|
|
||||||
langid = "^1.1.6"
|
|
||||||
colorama = "^0.4.6"
|
|
||||||
ftfy = "^6.1.1"
|
|
||||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
|
|
||||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
|
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
|
||||||
pytest = "^7.2.1"
|
|
||||||
flake8 = "^6.0.0"
|
|
||||||
pytest-clarity = "^1.0.1"
|
|
||||||
black = "^23.1.0"
|
|
||||||
isort = "^5.12.0"
|
|
||||||
csvkit = "^1.1.0"
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
|
||||||
ipython = "^8.10.0"
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry>=0.12"]
|
requires = ["hatchling"]
|
||||||
build-backend = "poetry.masonry.api"
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.rye]
|
||||||
|
managed = true
|
||||||
|
dev-dependencies = [
|
||||||
|
"pytest~=8.3",
|
||||||
|
"pytest-clarity~=1.0",
|
||||||
|
"isort~=5.13",
|
||||||
|
"csvkit~=2.0",
|
||||||
|
"ipython~=8.26",
|
||||||
|
"fixit~=2.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
# So hatch doesn't try to build other top-level directories like "data"
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["csv_metadata_quality"]
|
||||||
|
|
||||||
[tool.isort]
|
[tool.isort]
|
||||||
profile = "black"
|
profile = "black"
|
||||||
|
9
renovate.json
Normal file
9
renovate.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||||
|
"extends": [
|
||||||
|
"config:base"
|
||||||
|
],
|
||||||
|
"pip_requirements": {
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
}
|
188
requirements-dev.lock
Normal file
188
requirements-dev.lock
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
# generated by rye
|
||||||
|
# use `rye lock` or `rye sync` to update this lockfile
|
||||||
|
#
|
||||||
|
# last locked with the following flags:
|
||||||
|
# pre: false
|
||||||
|
# features: []
|
||||||
|
# all-features: false
|
||||||
|
# with-sources: false
|
||||||
|
# generate-hashes: false
|
||||||
|
# universal: false
|
||||||
|
|
||||||
|
-e file:.
|
||||||
|
agate==1.10.2
|
||||||
|
# via agate-dbf
|
||||||
|
# via agate-excel
|
||||||
|
# via agate-sql
|
||||||
|
# via csvkit
|
||||||
|
agate-dbf==0.2.3
|
||||||
|
# via csvkit
|
||||||
|
agate-excel==0.4.1
|
||||||
|
# via csvkit
|
||||||
|
agate-sql==0.7.2
|
||||||
|
# via csvkit
|
||||||
|
asttokens==2.4.1
|
||||||
|
# via stack-data
|
||||||
|
attrs==23.2.0
|
||||||
|
# via cattrs
|
||||||
|
# via requests-cache
|
||||||
|
babel==2.15.0
|
||||||
|
# via agate
|
||||||
|
bottleneck==1.3.8
|
||||||
|
# via pandas
|
||||||
|
cattrs==23.2.3
|
||||||
|
# via requests-cache
|
||||||
|
certifi==2024.2.2
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
# via requests
|
||||||
|
click==8.1.7
|
||||||
|
# via fixit
|
||||||
|
# via moreorless
|
||||||
|
colorama==0.4.6
|
||||||
|
# via csv-metadata-quality
|
||||||
|
country-converter==1.2
|
||||||
|
# via csv-metadata-quality
|
||||||
|
csvkit==2.0.1
|
||||||
|
dbfread==2.0.7
|
||||||
|
# via agate-dbf
|
||||||
|
decorator==5.1.1
|
||||||
|
# via ipython
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
# via openpyxl
|
||||||
|
executing==2.0.1
|
||||||
|
# via stack-data
|
||||||
|
fixit==2.1.0
|
||||||
|
ftfy==6.2.0
|
||||||
|
# via csv-metadata-quality
|
||||||
|
greenlet==3.0.3
|
||||||
|
# via sqlalchemy
|
||||||
|
idna==3.7
|
||||||
|
# via requests
|
||||||
|
iniconfig==2.0.0
|
||||||
|
# via pytest
|
||||||
|
ipython==8.26.0
|
||||||
|
isodate==0.6.1
|
||||||
|
# via agate
|
||||||
|
isort==5.13.2
|
||||||
|
jedi==0.19.1
|
||||||
|
# via ipython
|
||||||
|
leather==0.4.0
|
||||||
|
# via agate
|
||||||
|
libcst==1.4.0
|
||||||
|
# via fixit
|
||||||
|
llvmlite==0.43.0
|
||||||
|
# via numba
|
||||||
|
markdown-it-py==3.0.0
|
||||||
|
# via rich
|
||||||
|
matplotlib-inline==0.1.7
|
||||||
|
# via ipython
|
||||||
|
mdurl==0.1.2
|
||||||
|
# via markdown-it-py
|
||||||
|
moreorless==0.4.0
|
||||||
|
# via fixit
|
||||||
|
numba==0.60.0
|
||||||
|
# via pandas
|
||||||
|
numexpr==2.10.0
|
||||||
|
# via pandas
|
||||||
|
numpy==2.0.0
|
||||||
|
# via bottleneck
|
||||||
|
# via numba
|
||||||
|
# via numexpr
|
||||||
|
# via pandas
|
||||||
|
# via py3langid
|
||||||
|
# via pyarrow
|
||||||
|
olefile==0.47
|
||||||
|
# via agate-excel
|
||||||
|
openpyxl==3.1.2
|
||||||
|
# via agate-excel
|
||||||
|
# via csvkit
|
||||||
|
packaging==24.0
|
||||||
|
# via fixit
|
||||||
|
# via pytest
|
||||||
|
pandas==2.2.2
|
||||||
|
# via country-converter
|
||||||
|
# via csv-metadata-quality
|
||||||
|
parsedatetime==2.6
|
||||||
|
# via agate
|
||||||
|
parso==0.8.4
|
||||||
|
# via jedi
|
||||||
|
pathspec==0.12.1
|
||||||
|
# via trailrunner
|
||||||
|
pexpect==4.9.0
|
||||||
|
# via ipython
|
||||||
|
platformdirs==4.2.2
|
||||||
|
# via requests-cache
|
||||||
|
pluggy==1.5.0
|
||||||
|
# via pytest
|
||||||
|
pprintpp==0.4.0
|
||||||
|
# via pytest-clarity
|
||||||
|
prompt-toolkit==3.0.43
|
||||||
|
# via ipython
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
# via pexpect
|
||||||
|
pure-eval==0.2.2
|
||||||
|
# via stack-data
|
||||||
|
py3langid==0.3.0
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pyarrow==16.1.0
|
||||||
|
# via pandas
|
||||||
|
pycountry==24.6.1
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pygments==2.18.0
|
||||||
|
# via ipython
|
||||||
|
# via rich
|
||||||
|
pytest==8.3.2
|
||||||
|
# via pytest-clarity
|
||||||
|
pytest-clarity==1.0.1
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
# via pandas
|
||||||
|
python-slugify==8.0.4
|
||||||
|
# via agate
|
||||||
|
python-stdnum==1.20
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pytimeparse==1.1.8
|
||||||
|
# via agate
|
||||||
|
pytz==2024.1
|
||||||
|
# via pandas
|
||||||
|
pyyaml==6.0.1
|
||||||
|
# via libcst
|
||||||
|
requests==2.32.2
|
||||||
|
# via csv-metadata-quality
|
||||||
|
# via requests-cache
|
||||||
|
requests-cache==1.2.1
|
||||||
|
# via csv-metadata-quality
|
||||||
|
rich==13.7.1
|
||||||
|
# via pytest-clarity
|
||||||
|
six==1.16.0
|
||||||
|
# via asttokens
|
||||||
|
# via isodate
|
||||||
|
# via python-dateutil
|
||||||
|
# via url-normalize
|
||||||
|
sqlalchemy==2.0.30
|
||||||
|
# via agate-sql
|
||||||
|
# via csvkit
|
||||||
|
stack-data==0.6.3
|
||||||
|
# via ipython
|
||||||
|
text-unidecode==1.3
|
||||||
|
# via python-slugify
|
||||||
|
trailrunner==1.4.0
|
||||||
|
# via fixit
|
||||||
|
traitlets==5.14.3
|
||||||
|
# via ipython
|
||||||
|
# via matplotlib-inline
|
||||||
|
typing-extensions==4.11.0
|
||||||
|
# via sqlalchemy
|
||||||
|
tzdata==2024.1
|
||||||
|
# via pandas
|
||||||
|
url-normalize==1.4.3
|
||||||
|
# via requests-cache
|
||||||
|
urllib3==2.2.1
|
||||||
|
# via requests
|
||||||
|
# via requests-cache
|
||||||
|
wcwidth==0.2.13
|
||||||
|
# via ftfy
|
||||||
|
# via prompt-toolkit
|
||||||
|
xlrd==2.0.1
|
||||||
|
# via agate-excel
|
||||||
|
# via csvkit
|
@ -1,80 +0,0 @@
|
|||||||
agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate-sql==0.5.9 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
|
|
||||||
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
|
||||||
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
|
|
||||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
markdown-it-py==2.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
|
||||||
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
|
||||||
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
|
||||||
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
|
|
||||||
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
|
|
||||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
78
requirements.lock
Normal file
78
requirements.lock
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
# generated by rye
|
||||||
|
# use `rye lock` or `rye sync` to update this lockfile
|
||||||
|
#
|
||||||
|
# last locked with the following flags:
|
||||||
|
# pre: false
|
||||||
|
# features: []
|
||||||
|
# all-features: false
|
||||||
|
# with-sources: false
|
||||||
|
# generate-hashes: false
|
||||||
|
# universal: false
|
||||||
|
|
||||||
|
-e file:.
|
||||||
|
attrs==23.2.0
|
||||||
|
# via cattrs
|
||||||
|
# via requests-cache
|
||||||
|
bottleneck==1.3.8
|
||||||
|
# via pandas
|
||||||
|
cattrs==23.2.3
|
||||||
|
# via requests-cache
|
||||||
|
certifi==2024.2.2
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
# via requests
|
||||||
|
colorama==0.4.6
|
||||||
|
# via csv-metadata-quality
|
||||||
|
country-converter==1.2
|
||||||
|
# via csv-metadata-quality
|
||||||
|
ftfy==6.2.0
|
||||||
|
# via csv-metadata-quality
|
||||||
|
idna==3.7
|
||||||
|
# via requests
|
||||||
|
llvmlite==0.43.0
|
||||||
|
# via numba
|
||||||
|
numba==0.60.0
|
||||||
|
# via pandas
|
||||||
|
numexpr==2.10.0
|
||||||
|
# via pandas
|
||||||
|
numpy==2.0.0
|
||||||
|
# via bottleneck
|
||||||
|
# via numba
|
||||||
|
# via numexpr
|
||||||
|
# via pandas
|
||||||
|
# via py3langid
|
||||||
|
# via pyarrow
|
||||||
|
pandas==2.2.2
|
||||||
|
# via country-converter
|
||||||
|
# via csv-metadata-quality
|
||||||
|
platformdirs==4.2.2
|
||||||
|
# via requests-cache
|
||||||
|
py3langid==0.3.0
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pyarrow==16.1.0
|
||||||
|
# via pandas
|
||||||
|
pycountry==24.6.1
|
||||||
|
# via csv-metadata-quality
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
# via pandas
|
||||||
|
python-stdnum==1.20
|
||||||
|
# via csv-metadata-quality
|
||||||
|
pytz==2024.1
|
||||||
|
# via pandas
|
||||||
|
requests==2.32.2
|
||||||
|
# via csv-metadata-quality
|
||||||
|
# via requests-cache
|
||||||
|
requests-cache==1.2.1
|
||||||
|
# via csv-metadata-quality
|
||||||
|
six==1.16.0
|
||||||
|
# via python-dateutil
|
||||||
|
# via url-normalize
|
||||||
|
tzdata==2024.1
|
||||||
|
# via pandas
|
||||||
|
url-normalize==1.4.3
|
||||||
|
# via requests-cache
|
||||||
|
urllib3==2.2.1
|
||||||
|
# via requests
|
||||||
|
# via requests-cache
|
||||||
|
wcwidth==0.2.13
|
||||||
|
# via ftfy
|
@ -1,23 +0,0 @@
|
|||||||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
|
||||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
|
||||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
|
||||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
|
||||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
|
36
setup.py
36
setup.py
@ -1,36 +0,0 @@
|
|||||||
import setuptools
|
|
||||||
|
|
||||||
with open("README.md", "r") as fh:
|
|
||||||
long_description = fh.read()
|
|
||||||
|
|
||||||
install_requires = [
|
|
||||||
"pandas",
|
|
||||||
"python-stdnum",
|
|
||||||
"requests",
|
|
||||||
"requests-cache",
|
|
||||||
"pycountry",
|
|
||||||
"langid",
|
|
||||||
]
|
|
||||||
|
|
||||||
setuptools.setup(
|
|
||||||
name="csv-metadata-quality",
|
|
||||||
version="0.6.1",
|
|
||||||
author="Alan Orth",
|
|
||||||
author_email="aorth@mjanja.ch",
|
|
||||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
|
||||||
license="GPLv3",
|
|
||||||
long_description=long_description,
|
|
||||||
long_description_content_type="text/markdown",
|
|
||||||
url="https://github.com/alanorth/csv-metadata-quality",
|
|
||||||
classifiers=[
|
|
||||||
"Programming Language :: Python :: 3.9",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
|
||||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
|
||||||
"Operating System :: OS Independent",
|
|
||||||
],
|
|
||||||
packages=["csv_metadata_quality"],
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
|
|
||||||
},
|
|
||||||
install_requires=install_requires,
|
|
||||||
)
|
|
@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "es"
|
language = "es"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "spa"
|
language = "spa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "en"
|
language = "en"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():
|
|||||||
|
|
||||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||||
language = "eng"
|
language = "eng"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Create a dictionary to mimic Pandas series
|
# Create a dictionary to mimic Pandas series
|
||||||
row = {"dc.title": title, "dc.language.iso": language}
|
row = {"dc.title": title, "dc.language.iso": language}
|
||||||
@ -407,7 +407,7 @@ def test_check_doi_field():
|
|||||||
# the citation and a DOI field.
|
# the citation and a DOI field.
|
||||||
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
||||||
series = pd.Series(data=d)
|
series = pd.Series(data=d)
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
result = check.citation_doi(series, exclude)
|
result = check.citation_doi(series, exclude)
|
||||||
|
|
||||||
@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
|
|||||||
"""Test an item with a DOI in its citation, but no DOI field."""
|
"""Test an item with a DOI in its citation, but no DOI field."""
|
||||||
|
|
||||||
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# an empty DOI field and a citation containing a DOI.
|
# an empty DOI field and a citation containing a DOI.
|
||||||
@ -439,7 +439,7 @@ def test_title_in_citation():
|
|||||||
|
|
||||||
title = "Testing all the things"
|
title = "Testing all the things"
|
||||||
citation = "Orth, A. 2021. Testing all the things."
|
citation = "Orth, A. 2021. Testing all the things."
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# the title and citation.
|
# the title and citation.
|
||||||
@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):
|
|||||||
|
|
||||||
title = "Testing all the things"
|
title = "Testing all the things"
|
||||||
citation = "Orth, A. 2021. Testing all teh things."
|
citation = "Orth, A. 2021. Testing all teh things."
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||||
# the title and citation.
|
# the title and citation.
|
||||||
@ -477,7 +477,7 @@ def test_country_matches_region():
|
|||||||
|
|
||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = "Eastern Africa"
|
region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
||||||
@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
|
|||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = ""
|
region = ""
|
||||||
missing_region = "Eastern Africa"
|
missing_region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {
|
d = {
|
||||||
|
@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
|
|||||||
country = "Kenya"
|
country = "Kenya"
|
||||||
region = ""
|
region = ""
|
||||||
missing_region = "Eastern Africa"
|
missing_region = "Eastern Africa"
|
||||||
exclude = list()
|
exclude = []
|
||||||
|
|
||||||
# Emulate a column in a transposed dataframe (which is just a series)
|
# Emulate a column in a transposed dataframe (which is just a series)
|
||||||
d = {
|
d = {
|
||||||
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
|
|||||||
series_correct = pd.Series(data=d_correct)
|
series_correct = pd.Series(data=d_correct)
|
||||||
|
|
||||||
pd.testing.assert_series_equal(result, series_correct)
|
pd.testing.assert_series_equal(result, series_correct)
|
||||||
|
|
||||||
|
|
||||||
|
def test_fix_normalize_dois():
|
||||||
|
"""Test normalizing a DOI."""
|
||||||
|
|
||||||
|
value = "doi: 10.11648/j.jps.20140201.14"
|
||||||
|
|
||||||
|
assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"
|
||||||
|
Reference in New Issue
Block a user