Mirror of https://github.com/ilri/csv-metadata-quality.git (synced 2025-05-09 14:46:00 +02:00)

Compare commits: v0.6.1...81e3ca3d9c (119 commits)

Commits in this range (SHA1):
81e3ca3d9c, c470f8b375, 0f45448517, 7dd52ca491, 92ff0ee51b, ae38a826ec, c1f630c298, 82b056f0ea, 7fca981b95, 1a9424197b,
f6c6c94a1e, f500fac64b, 8143a7d978, 94cec080d6, 9402af1e30, d71ff9082b, f309b694c4, 4d879f6d13, a30fefcd52, 2341c56c40,
5be2195325, 736948ed2c, ee0b448355, 4f3174a543, d5c25f82fa, 7b3e2b4e68, f92b2fe206, df040b70c7, 10bc8f3e14, 7e6e92ecaa,
a21ffb0fa8, fb341dd9fa, 2e943ee4db, 6d3a9870d6, 82ecf7119a, 1db21cf275, bcd1408798, ee8d255811, 2cc2dbe952, 940a325d61,
59b3b307c9, b305da3f0b, 96a486471c, 530cd5863b, f6018c51b6, 80c3f5b45a, ba4637ea34, 355428a691, 58d4de973e, e1216dae3c,
6b650ff1b3, fa7bde6fc0, f89159fe32, 02058c5a65, 8fed6b71ff, b005b28cbe, c626290599, 1a06470b64, d46a81672e, 2a50e75082,
0d45e73983, 3611aab425, 5c4ad0eb41, f1f39722f6, 1c03999582, 1f637f32cd, b8241e919d, b8dc19cc3f, 93c9b739ac, 4ed2786703,
8728789183, bf90464809, 1878002391, d21d2621e3, f3fb1ff7fb, 1fa81f7558, 7409193b6b, a84fcf0b7b, 25ac290df4, 3f52bad1e3,
0208ad0ade, 3632ae0fc9, 17d089cc6e, bc470a4343, be609a809d, de3387ded7, f343e87f0c, 7d3524fbd5, c614b71a52, d159a839f3,
36e2ebe5f4, 33f67b7a7c, c0e1448439, 5d0804a08f, f01c9edf17, 8d4295b2b3, e2d46e9495, 1491e1edb0, 34142c3e6b, 0c88b96e8d,
2e55b4d6e3, c90aad29f0, 6fd1e1377f, c64b7eb1f1, 29cbc4f3a3, 307af1acfc, b5106de9df, 9eeadfc44e, d4aed378cf, 20a2cce34b,
d661ffe439, 45a310387a, 47b03c49ba, 986b81cbf4, d43a47ae32, ede37569f1, 0c53efe60a, 5f0e25b818, 4776154d6c
.drone.yml (70 changed lines)

@@ -1,3 +1,33 @@
---
kind: pipeline
type: docker
name: python311

steps:
- name: test
image: python:3.11-slim
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

---
kind: pipeline
type: docker

@@ -10,23 +40,23 @@ steps:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

---
kind: pipeline

@@ -40,22 +70,22 @@ steps:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
- python -m pip install poetry
- poetry install
- poetry run pytest
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d

# vim: ts=2 sw=2 et
.github/workflows/python-app.yml (vendored, 36 changed lines)

@@ -15,37 +15,27 @@ jobs:
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v4
- uses: actions/checkout@v4
- name: Install rye
uses: eifinger/setup-rye@v4
with:
python-version: '3.10'
cache: 'pip'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
- name: Lint with flake8
version: 'latest'
- run: rye sync
- name: Lint
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
rye lint
- name: Test with pytest
run: |
pytest
run: rye test
- name: Test CLI
run: |
python setup.py install
# Basic test
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Test with unsafe fixes
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Test with experimental checks
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
rye run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
.python-version (new file, 1 line)

@@ -0,0 +1 @@
3.12
CHANGELOG.md (20 changed lines)

@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
### Added
- Ability to normalize DOIs to https://doi.org URI format

### Fixed
- Fixed regex so we don't run the invalid multi-value separator fix on `dcterms.bibliographicCitation` fields
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation` fields
- Don't crash the country/region checker/fixer when a title field is missing

### Changed
- Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)

### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
- SPDX license list

## [0.6.1] - 2023-02-23
### Fixed
- Missing region check should ignore subregion field, if it exists
MANIFEST.in (new file, 1 line)

@@ -0,0 +1 @@
include csv_metadata_quality/data/licenses.json
README.md

@@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
- Remove duplicate metadata values
- Check for duplicate items, using the title, type, and date issued as an indicator
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format

## Installation
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):

@@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Better logging, for example with INFO, WARN, and ERR levels
- Verbose, debug, or quiet options
- Warn if an author is shorter than 3 characters?
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
- Warn if two items use the same file in `filename` column
- Add an option to drop invalid AGROVOC subjects?
- Add tests for application invocation, ie `tests/test_app.py`?
- Validate ISSNs or journal titles against CrossRef API?
- Add configurable field validation, like specify a field name and a validation file?

@@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Warn if item is Open Access, but missing a license
- Warn if item has an ISSN but no journal title
- Update journal titles from ISSN
- Migrate to https://github.com/spdx/license-list-data
- Migrate from Pandas to Polars

## License
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
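The duplicate-item check listed above keys items on a concatenation of title, type, and date issued. A minimal pandas sketch of that indicator (the column names here are only illustrative, not the tool's configuration):

    import pandas as pd

    # Hypothetical frame with the three indicator fields
    df = pd.DataFrame(
        {
            "dc.title": ["Testing all the things", "Testing all the things"],
            "dcterms.type": ["Report", "Report"],
            "dcterms.issued": ["2021-03-11", "2021-03-11"],
        }
    )

    # Build a title+type+date key per row and flag repeated keys as possible duplicates
    keys = df["dc.title"] + df["dcterms.type"] + df["dcterms.issued"]
    for title in df.loc[keys.duplicated(), "dc.title"]:
        print(f"Possible duplicate item: {title}")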
csv_metadata_quality/app.py

@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-only

import argparse
import os
import re
import signal
import sys
from datetime import timedelta

import pandas as pd
import requests_cache
from colorama import Fore

import csv_metadata_quality.check as check

@@ -74,7 +77,7 @@ def run(argv):
signal.signal(signal.SIGINT, signal_handler)

# Read all fields as strings so dates don't get converted from 1998 to 1998.0
df = pd.read_csv(args.input_file, dtype=str)
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")

# Check if the user requested to skip any fields
if args.exclude_fields:

@@ -82,7 +85,20 @@ def run(argv):
# user should be careful to no include spaces here.
exclude = args.exclude_fields.split(",")
else:
exclude = list()
exclude = []

# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)

# prune old cache entries
requests_cache.delete()

for column in df.columns:
if column in exclude:

@@ -91,7 +107,9 @@ def run(argv):
continue

if args.unsafe_fixes:
match = re.match(r"^.*?abstract.*$", column)
# Skip whitespace and newline fixes on abstracts and descriptions
# because there are too many with legitimate multi-line metadata.
match = re.match(r"^.*?(abstract|description).*$", column)
if match is None:
# Fix: whitespace
df[column] = df[column].apply(fix.whitespace, field_name=column)

@@ -102,7 +120,7 @@ def run(argv):
# Fix: missing space after comma. Only run on author and citation
# fields for now, as this problem is mostly an issue in names.
if args.unsafe_fixes:
match = re.match(r"^.*?(author|citation).*$", column)
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
if match is not None:
df[column] = df[column].apply(fix.comma_space, field_name=column)

@@ -123,10 +141,15 @@ def run(argv):
# Fix: unnecessary Unicode
df[column] = df[column].apply(fix.unnecessary_unicode)

# Fix: normalize DOIs
match = re.match(r"^.*?identifier\.doi.*$", column)
if match is not None:
df[column] = df[column].apply(fix.normalize_dois)

# Fix: invalid and unnecessary multi-value separators. Skip the title
# and abstract fields because "|" is used to indicate something like
# a subtitle.
match = re.match(r"^.*?(abstract|title).*$", column)
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
if match is None:
df[column] = df[column].apply(fix.separators, field_name=column)
# Run whitespace fix again after fixing invalid separators
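For reference, a small standalone illustration (not part of the tool) of how the column-name regexes above gate the fixes; the updated citation pattern also matches DSpace-style columns that capitalize "Citation". The author column name here is only an assumed example:

    import re

    columns = ["dc.contributor.author", "dcterms.bibliographicCitation", "cg.identifier.doi"]

    for column in columns:
        # Same pattern as the comma-space gate in run()
        if re.match(r"^.*?(author|[Cc]itation).*$", column):
            print(f"comma-space fix would run on: {column}")
        # Same pattern as the DOI normalization gate in run()
        if re.match(r"^.*?identifier\.doi.*$", column):
            print(f"DOI normalization would run on: {column}")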
csv_metadata_quality/check.py

@@ -1,14 +1,12 @@
# SPDX-License-Identifier: GPL-3.0-only

import logging
import os
import re
from datetime import datetime, timedelta
from datetime import datetime

import country_converter as coco
import pandas as pd
import requests
import requests_cache
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn

@@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
return

# List of suspicious characters, for example: ́ˆ~`
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]

for character in suspicious_characters:
# Find the position of the suspicious character in the string

@@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
if pd.isna(field):
return

# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)

# prune old cache entries
# requests_cache.remove_expired_responses()

# Initialize an empty list to hold the validated AGROVOC values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_params = {"query": value}

request = requests.get(request_url, params=request_params)

@@ -373,7 +358,7 @@ def duplicate_items(df):

if items_count_unique < items_count_total:
# Create a list to hold our items while we check for duplicates
items = list()
items = []

for index, row in df.iterrows():
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"

@@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
if row[region_column_name] is not None:
regions = row[region_column_name].split("||")
else:
regions = list()
regions = []

for country in countries:
# Look up the UN M.49 regions for this country code. CoCo seems to

@@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
un_region = cc.convert(names=country, to="UNRegion")

if un_region != "not found" and un_region not in regions:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
)
try:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
)

return
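For context, a standalone sketch of the AGROVOC lookup that check.agrovoc() performs against the endpoint shown above. The assumption that the SKOSMOS-style JSON response carries a "results" list is not visible in this hunk and may differ:

    import requests

    request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
    response = requests.get(request_url, params={"query": "SOIL"})

    if response.status_code == 200:
        # An empty results list would mean the term is not a valid AGROVOC subject
        print(response.json().get("results", []))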
File diff suppressed because it is too large.
csv_metadata_quality/experimental.py

@@ -2,8 +2,8 @@

import re

import langid
import pandas as pd
import py3langid as langid
from colorama import Fore
from pycountry import languages

@@ -20,7 +20,7 @@ def correct_language(row, exclude):
# Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards.
language = ""
sample_strings = list()
sample_strings = []
title = None

# Iterate over the labels of the current row's values. Before we transposed
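A minimal sketch of the language detection that correct_language() builds on, assuming py3langid keeps langid's classify() API (it returns a language code and a score); the sample title is taken from the test suite:

    import py3langid as langid

    lang, score = langid.classify(
        "A randomised vaccine field trial in Kenya demonstrates protection against "
        "wildebeest-associated malignant catarrhal fever in cattle"
    )
    print(lang)  # expected to be "en" for this English title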
csv_metadata_quality/fix.py

@@ -23,7 +23,7 @@ def whitespace(field, field_name):
return

# Initialize an empty list to hold the cleaned values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):

@@ -64,7 +64,7 @@ def separators(field, field_name):
return

# Initialize an empty list to hold the cleaned values
values = list()
values = []

# Try to split multi-value field on "||" separator
for value in field.split("||"):

@@ -175,7 +175,7 @@ def duplicates(field, field_name):
values = field.split("||")

# Initialize an empty list to hold the de-duplicated values
new_values = list()
new_values = []

# Iterate over all values
for value in values:

@@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
if row[region_column_name] is not None:
regions = row[region_column_name].split("||")
else:
regions = list()
regions = []

# An empty list for our regions so we can keep track for all countries
missing_regions = list()
missing_regions = []

for country in countries:
# Look up the UN M.49 regions for this country code. CoCo seems to

@@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
# it doesn't already exist in regions.
if un_region != "not found" and un_region not in regions:
if un_region not in missing_regions:
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
try:
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
# If there is no title column in the CSV we will print
# the fix without the title instead of crashing.
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
)

missing_regions.append(un_region)

if len(missing_regions) > 0:

@@ -387,3 +395,88 @@ def countries_match_regions(row, exclude):
row[region_column_name] = "||".join(missing_regions)

return row


def normalize_dois(field):
"""Normalize DOIs.

DOIs are meant to be globally unique identifiers. They are case insensitive,
but in order to compare them robustly they should be normalized to a common
format:

- strip leading and trailing whitespace
- lowercase all ASCII characters
- convert all variations to https://doi.org/10.xxxx/xxxx URI format

Return string with normalized DOI.

See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
"""

# Skip fields with missing values
if pd.isna(field):
return

# Try to split multi-value field on "||" separator
values = field.split("||")

# Initialize an empty list to hold the de-duplicated values
new_values = []

# Iterate over all values (most items will only have one DOI)
for value in values:
# Strip leading and trailing whitespace
new_value = value.strip()

new_value = new_value.lower()

# Convert to HTTPS
pattern = re.compile(r"^http://")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://", new_value)

# Convert dx.doi.org to doi.org
pattern = re.compile(r"dx\.doi\.org")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "doi.org", new_value)

# Convert www.doi.org to doi.org
pattern = re.compile(r"www\.doi\.org")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "doi.org", new_value)

# Convert erroneous %2f to /
pattern = re.compile("%2f")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "/", new_value)

# Replace values like doi: 10.11648/j.jps.20140201.14
pattern = re.compile(r"^doi: 10\.")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://doi.org/10.", new_value)

# Replace values like 10.3390/foods12010115
pattern = re.compile(r"^10\.")
match = re.findall(pattern, new_value)

if match:
new_value = re.sub(pattern, "https://doi.org/10.", new_value)

if new_value != value:
print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")

new_values.append(new_value)

new_field = "||".join(new_values)

return new_field
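Example inputs and outputs for the new normalize_dois() fix, using DOI values from data/test.csv below and the module path used by the test suite:

    import csv_metadata_quality.fix as fix

    print(fix.normalize_dois("doi: 10.11648/j.jps.20140201.14"))
    # https://doi.org/10.11648/j.jps.20140201.14
    print(fix.normalize_dois("http://dx.doi.org/10.1016/j.envc.2023.100794"))
    # https://doi.org/10.1016/j.envc.2023.100794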
csv_metadata_quality/util.py

@@ -2,7 +2,7 @@


import json
from importlib.resources import files
import os

from ftfy.badness import is_bad

@@ -58,7 +58,7 @@ def is_mojibake(field):
def load_spdx_licenses():
"""Returns a Python list of SPDX short license identifiers."""

with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
licenses = json.load(f)

# List comprehension to extract the license ID for each license
data/test.csv

@@ -37,3 +37,7 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
DOI with %2f,2024-06-25,,,,,,,,,,https://doi.org/10.1016%2fj.envc.2023.100794,,
poetry.lock (generated, 1756 changed lines)
File diff suppressed because it is too large.
pyproject.toml

@@ -1,41 +1,63 @@
[tool.poetry]
[project]
name = "csv-metadata-quality"
version = "0.6.1"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only"
authors = [
{ name = "Alan Orth", email = "alan.orth@gmail.com" }
]
license= { file = "LICENSE.txt" }
dependencies = [
"pandas[feather,performance]~=2.2",
"python-stdnum~=1.20",
"requests~=2.32",
"requests-cache~=1.2.1",
"colorama~=0.4",
"ftfy~=6.2.0",
"country-converter~=1.2",
"pycountry~=24.6.1",
"py3langid~=0.3",
]
readme = "README.md"
requires-python = ">= 3.9"

classifiers = [
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
]

[project.urls]
repository = "https://github.com/ilri/csv-metadata-quality"
homepage = "https://github.com/ilri/csv-metadata-quality"

[tool.poetry.scripts]
[project.scripts]
csv-metadata-quality = 'csv_metadata_quality.__main__:main'

[tool.poetry.dependencies]
python = "^3.9"
pandas = "^1.5.2"
python-stdnum = "^1.18"
requests = "^2.28.2"
requests-cache = "^0.9.8"
langid = "^1.1.6"
colorama = "^0.4.6"
ftfy = "^6.1.1"
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}

[tool.poetry.dev-dependencies]
pytest = "^7.2.1"
flake8 = "^6.0.0"
pytest-clarity = "^1.0.1"
black = "^23.1.0"
isort = "^5.12.0"
csvkit = "^1.1.0"

[tool.poetry.group.dev.dependencies]
ipython = "^8.10.0"

# So rye doesn't fall back to setuptools
# See: https://packaging.python.org/en/latest/tutorials/packaging-projects/#choosing-build-backend
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.rye]
managed = true
dev-dependencies = [
"pytest~=8.3",
"pytest-clarity~=1.0",
"isort~=5.13",
"csvkit~=2.0",
"ipython~=8.26",
"fixit~=2.1",
]

# So hatch doesn't try to build other top-level directories like "data"
[tool.hatch.build.targets.wheel]
packages = ["csv_metadata_quality"]

[tool.isort]
profile = "black"
renovate.json (new file, 9 lines)

@@ -0,0 +1,9 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
],
"pip_requirements": {
"enabled": false
}
}
requirements-dev.lock (new file, 188 lines)

@@ -0,0 +1,188 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
#   pre: false
#   features: []
#   all-features: false
#   with-sources: false
#   generate-hashes: false
#   universal: false

-e file:.
agate==1.10.2
# via agate-dbf
# via agate-excel
# via agate-sql
# via csvkit
agate-dbf==0.2.3
# via csvkit
agate-excel==0.4.1
# via csvkit
agate-sql==0.7.2
# via csvkit
asttokens==2.4.1
# via stack-data
attrs==23.2.0
# via cattrs
# via requests-cache
babel==2.15.0
# via agate
bottleneck==1.3.8
# via pandas
cattrs==23.2.3
# via requests-cache
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via fixit
# via moreorless
colorama==0.4.6
# via csv-metadata-quality
country-converter==1.2
# via csv-metadata-quality
csvkit==2.0.1
dbfread==2.0.7
# via agate-dbf
decorator==5.1.1
# via ipython
et-xmlfile==1.1.0
# via openpyxl
executing==2.0.1
# via stack-data
fixit==2.1.0
ftfy==6.2.0
# via csv-metadata-quality
greenlet==3.0.3
# via sqlalchemy
idna==3.7
# via requests
iniconfig==2.0.0
# via pytest
ipython==8.26.0
isodate==0.6.1
# via agate
isort==5.13.2
jedi==0.19.1
# via ipython
leather==0.4.0
# via agate
libcst==1.4.0
# via fixit
llvmlite==0.43.0
# via numba
markdown-it-py==3.0.0
# via rich
matplotlib-inline==0.1.7
# via ipython
mdurl==0.1.2
# via markdown-it-py
moreorless==0.4.0
# via fixit
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
# via pandas
# via py3langid
# via pyarrow
olefile==0.47
# via agate-excel
openpyxl==3.1.2
# via agate-excel
# via csvkit
packaging==24.0
# via fixit
# via pytest
pandas==2.2.2
# via country-converter
# via csv-metadata-quality
parsedatetime==2.6
# via agate
parso==0.8.4
# via jedi
pathspec==0.12.1
# via trailrunner
pexpect==4.9.0
# via ipython
platformdirs==4.2.2
# via requests-cache
pluggy==1.5.0
# via pytest
pprintpp==0.4.0
# via pytest-clarity
prompt-toolkit==3.0.43
# via ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==24.6.1
# via csv-metadata-quality
pygments==2.18.0
# via ipython
# via rich
pytest==8.3.2
# via pytest-clarity
pytest-clarity==1.0.1
python-dateutil==2.9.0.post0
# via pandas
python-slugify==8.0.4
# via agate
python-stdnum==1.20
# via csv-metadata-quality
pytimeparse==1.1.8
# via agate
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via libcst
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.1
# via csv-metadata-quality
rich==13.7.1
# via pytest-clarity
six==1.16.0
# via asttokens
# via isodate
# via python-dateutil
# via url-normalize
sqlalchemy==2.0.30
# via agate-sql
# via csvkit
stack-data==0.6.3
# via ipython
text-unidecode==1.3
# via python-slugify
trailrunner==1.4.0
# via fixit
traitlets==5.14.3
# via ipython
# via matplotlib-inline
typing-extensions==4.11.0
# via sqlalchemy
tzdata==2024.1
# via pandas
url-normalize==1.4.3
# via requests-cache
urllib3==2.2.1
# via requests
# via requests-cache
wcwidth==0.2.13
# via ftfy
# via prompt-toolkit
xlrd==2.0.1
# via agate-excel
# via csvkit
requirements-dev.txt (deleted, 80 lines)

@@ -1,80 +0,0 @@
agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
agate-sql==0.5.9 ; python_version >= "3.9" and python_version < "4.0"
agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
markdown-it-py==2.2.0 ; python_version >= "3.9" and python_version < "4.0"
matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
requirements.lock (new file, 78 lines)

@@ -0,0 +1,78 @@
# generated by rye
# use `rye lock` or `rye sync` to update this lockfile
#
# last locked with the following flags:
#   pre: false
#   features: []
#   all-features: false
#   with-sources: false
#   generate-hashes: false
#   universal: false

-e file:.
attrs==23.2.0
# via cattrs
# via requests-cache
bottleneck==1.3.8
# via pandas
cattrs==23.2.3
# via requests-cache
certifi==2024.2.2
# via requests
charset-normalizer==3.3.2
# via requests
colorama==0.4.6
# via csv-metadata-quality
country-converter==1.2
# via csv-metadata-quality
ftfy==6.2.0
# via csv-metadata-quality
idna==3.7
# via requests
llvmlite==0.43.0
# via numba
numba==0.60.0
# via pandas
numexpr==2.10.0
# via pandas
numpy==2.0.0
# via bottleneck
# via numba
# via numexpr
# via pandas
# via py3langid
# via pyarrow
pandas==2.2.2
# via country-converter
# via csv-metadata-quality
platformdirs==4.2.2
# via requests-cache
py3langid==0.3.0
# via csv-metadata-quality
pyarrow==16.1.0
# via pandas
pycountry==24.6.1
# via csv-metadata-quality
python-dateutil==2.9.0.post0
# via pandas
python-stdnum==1.20
# via csv-metadata-quality
pytz==2024.1
# via pandas
requests==2.32.2
# via csv-metadata-quality
# via requests-cache
requests-cache==1.2.1
# via csv-metadata-quality
six==1.16.0
# via python-dateutil
# via url-normalize
tzdata==2024.1
# via pandas
url-normalize==1.4.3
# via requests-cache
urllib3==2.2.1
# via requests
# via requests-cache
wcwidth==0.2.13
# via ftfy
requirements.txt (deleted, 23 lines)

@@ -1,23 +0,0 @@
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
setup.py (deleted, 36 lines)

@@ -1,36 +0,0 @@
import setuptools

with open("README.md", "r") as fh:
long_description = fh.read()

install_requires = [
"pandas",
"python-stdnum",
"requests",
"requests-cache",
"pycountry",
"langid",
]

setuptools.setup(
name="csv-metadata-quality",
version="0.6.1",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
license="GPLv3",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/alanorth/csv-metadata-quality",
classifiers=[
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
],
packages=["csv_metadata_quality"],
entry_points={
"console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
},
install_requires=install_requires,
)
tests/test_check.py

@@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "es"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "spa"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "en"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():

title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "eng"
exclude = list()
exclude = []

# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}

@@ -407,7 +407,7 @@ def test_check_doi_field():
# the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
exclude = list()
exclude = []

result = check.citation_doi(series, exclude)

@@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field."""

citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI.

@@ -439,7 +439,7 @@ def test_title_in_citation():

title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things."
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.

@@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):

title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things."
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.

@@ -477,7 +477,7 @@ def test_country_matches_region():

country = "Kenya"
region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region}

@@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {
tests/test_fix.py

@@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
exclude = []

# Emulate a column in a transposed dataframe (which is just a series)
d = {

@@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
series_correct = pd.Series(data=d_correct)

pd.testing.assert_series_equal(result, series_correct)


def test_fix_normalize_dois():
"""Test normalizing a DOI."""

value = "doi: 10.11648/j.jps.20140201.14"

assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"