Compare commits
100 Commits
Author | SHA1 | Date |
---|---|---|
Alan Orth | 2341c56c40 | |
Alan Orth | 5be2195325 | |
Alan Orth | 736948ed2c | |
Alan Orth | ee0b448355 | |
Alan Orth | 4f3174a543 | |
Alan Orth | d5c25f82fa | |
Alan Orth | 7b3e2b4e68 | |
Alan Orth | f92b2fe206 | |
renovate[bot] | df040b70c7 | |
renovate[bot] | 10bc8f3e14 | |
Alan Orth | 7e6e92ecaa | |
Alan Orth | a21ffb0fa8 | |
Alan Orth | fb341dd9fa | |
Alan Orth | 2e943ee4db | |
Alan Orth | 6d3a9870d6 | |
Alan Orth | 82ecf7119a | |
renovate[bot] | 1db21cf275 | |
renovate[bot] | bcd1408798 | |
renovate[bot] | ee8d255811 | |
Alan Orth | 2cc2dbe952 | |
Alan Orth | 940a325d61 | |
Alan Orth | 59b3b307c9 | |
Alan Orth | b305da3f0b | |
renovate[bot] | 96a486471c | |
Alan Orth | 530cd5863b | |
Alan Orth | f6018c51b6 | |
Alan Orth | 80c3f5b45a | |
Alan Orth | ba4637ea34 | |
Alan Orth | 355428a691 | |
renovate[bot] | 58d4de973e | |
Alan Orth | e1216dae3c | |
renovate[bot] | 6b650ff1b3 | |
Alan Orth | fa7bde6fc0 | |
renovate[bot] | f89159fe32 | |
renovate[bot] | 02058c5a65 | |
Alan Orth | 8fed6b71ff | |
Alan Orth | b005b28cbe | |
renovate[bot] | c626290599 | |
renovate[bot] | 1a06470b64 | |
Alan Orth | d46a81672e | |
Alan Orth | 2a50e75082 | |
Alan Orth | 0d45e73983 | |
renovate[bot] | 3611aab425 | |
renovate[bot] | 5c4ad0eb41 | |
renovate[bot] | f1f39722f6 | |
Alan Orth | 1c03999582 | |
Alan Orth | 1f637f32cd | |
Alan Orth | b8241e919d | |
Alan Orth | b8dc19cc3f | |
Alan Orth | 93c9b739ac | |
Alan Orth | 4ed2786703 | |
renovate[bot] | 8728789183 | |
Alan Orth | bf90464809 | |
Alan Orth | 1878002391 | |
Alan Orth | d21d2621e3 | |
Alan Orth | f3fb1ff7fb | |
Alan Orth | 1fa81f7558 | |
renovate[bot] | 7409193b6b | |
Alan Orth | a84fcf0b7b | |
Alan Orth | 25ac290df4 | |
Alan Orth | 3f52bad1e3 | |
Alan Orth | 0208ad0ade | |
renovate[bot] | 3632ae0fc9 | |
Alan Orth | 17d089cc6e | |
Alan Orth | bc470a4343 | |
Alan Orth | be609a809d | |
Alan Orth | de3387ded7 | |
Alan Orth | f343e87f0c | |
Alan Orth | 7d3524fbd5 | |
Alan Orth | c614b71a52 | |
renovate[bot] | d159a839f3 | |
Alan Orth | 36e2ebe5f4 | |
Alan Orth | 33f67b7a7c | |
Alan Orth | c0e1448439 | |
Alan Orth | 5d0804a08f | |
Alan Orth | f01c9edf17 | |
Alan Orth | 8d4295b2b3 | |
Alan Orth | e2d46e9495 | |
Alan Orth | 1491e1edb0 | |
Alan Orth | 34142c3e6b | |
Alan Orth | 0c88b96e8d | |
Alan Orth | 2e55b4d6e3 | |
Alan Orth | c90aad29f0 | |
Alan Orth | 6fd1e1377f | |
Alan Orth | c64b7eb1f1 | |
Alan Orth | 29cbc4f3a3 | |
Alan Orth | 307af1acfc | |
Alan Orth | b5106de9df | |
Alan Orth | 9eeadfc44e | |
Alan Orth | d4aed378cf | |
Alan Orth | 20a2cce34b | |
Alan Orth | d661ffe439 | |
Alan Orth | 45a310387a | |
Alan Orth | 47b03c49ba | |
Alan Orth | 986b81cbf4 | |
Alan Orth | d43a47ae32 | |
Alan Orth | ede37569f1 | |
Alan Orth | 0c53efe60a | |
Alan Orth | 5f0e25b818 | |
Alan Orth | 4776154d6c |
70
.drone.yml
70
.drone.yml
|
@ -1,3 +1,33 @@
|
|||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
name: python311
|
||||
|
||||
steps:
|
||||
- name: test
|
||||
image: python:3.11-slim
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
|
@ -10,23 +40,23 @@ steps:
|
|||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
---
|
||||
kind: pipeline
|
||||
|
@ -40,22 +70,22 @@ steps:
|
|||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
- python -m pip install poetry
|
||||
- poetry install
|
||||
- poetry run pytest
|
||||
# Basic test
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Basic test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Geography test
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
|
||||
# Geography test with unsafe fixes
|
||||
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
- poetry run csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
- poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
||||
# vim: ts=2 sw=2 et
|
||||
|
|
|
@ -15,37 +15,31 @@ jobs:
|
|||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python 3.10
|
||||
uses: actions/setup-python@v4
|
||||
- uses: actions/checkout@v4
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
cache: 'pip'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install flake8 pytest
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
|
||||
python-version: '3.11'
|
||||
cache: 'poetry'
|
||||
- run: poetry install
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
# stop the build if there are Python syntax errors or undefined names
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pytest
|
||||
run: poetry run pytest
|
||||
- name: Test CLI
|
||||
run: |
|
||||
python setup.py install
|
||||
# Basic test
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv
|
||||
# Test with unsafe fixes
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
|
||||
# Test with experimental checks
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
|
||||
# Test with AGROVOC validation
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
|
||||
# Test with AGROVOC validation (and dropping invalid)
|
||||
csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
poetry run csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
|
||||
|
|
20
CHANGELOG.md
20
CHANGELOG.md
|
@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
|
|||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Unreleased
|
||||
### Added
|
||||
- Ability to normalize DOIs to https://doi.org URI format
|
||||
|
||||
### Fixed
|
||||
- Fixed regex so we don't run the invalid multi-value separator fix on
|
||||
`dcterms.bibliographicCitation` fields
|
||||
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
|
||||
fields
|
||||
- Don't crash the country/region checker/fixer when a title field is missing
|
||||
|
||||
### Changed
|
||||
- Don't run newline fix on description fields
|
||||
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||
- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
|
||||
|
||||
### Updated
|
||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||
- SPDX license list
|
||||
|
||||
## [0.6.1] - 2023-02-23
|
||||
### Fixed
|
||||
- Missing region check should ignore subregion field, if it exists
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
include csv_metadata_quality/data/licenses.json
|
|
@ -31,6 +31,7 @@ If you use the DSpace CSV metadata quality checker please cite:
|
|||
- Check for countries with missing regions (and attempt to fix with `--unsafe-fixes`)
|
||||
- Remove duplicate metadata values
|
||||
- Check for duplicate items, using the title, type, and date issued as an indicator
|
||||
- [Normalize DOIs](https://www.crossref.org/documentation/member-setup/constructing-your-dois/) to https://doi.org URI format
|
||||
|
||||
## Installation
|
||||
The easiest way to install CSV Metadata Quality is with [poetry](https://python-poetry.org):
|
||||
|
@ -125,9 +126,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||
- Better logging, for example with INFO, WARN, and ERR levels
|
||||
- Verbose, debug, or quiet options
|
||||
- Warn if an author is shorter than 3 characters?
|
||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
||||
- Warn if two items use the same file in `filename` column
|
||||
- Add an option to drop invalid AGROVOC subjects?
|
||||
- Add tests for application invocation, ie `tests/test_app.py`?
|
||||
- Validate ISSNs or journal titles against CrossRef API?
|
||||
- Add configurable field validation, like specify a field name and a validation file?
|
||||
|
@ -137,7 +136,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||
- Warn if item is Open Access, but missing a license
|
||||
- Warn if item has an ISSN but no journal title
|
||||
- Update journal titles from ISSN
|
||||
- Migrate to https://github.com/spdx/license-list-data
|
||||
- Migrate from Pandas to Polars
|
||||
|
||||
## License
|
||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
|
||||
import csv_metadata_quality.check as check
|
||||
|
@ -74,7 +77,7 @@ def run(argv):
|
|||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||
df = pd.read_csv(args.input_file, dtype=str)
|
||||
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
|
||||
|
||||
# Check if the user requested to skip any fields
|
||||
if args.exclude_fields:
|
||||
|
@ -82,7 +85,20 @@ def run(argv):
|
|||
# user should be careful to not include spaces here.
|
||||
exclude = args.exclude_fields.split(",")
|
||||
else:
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
requests_cache.delete()
|
||||
|
||||
for column in df.columns:
|
||||
if column in exclude:
|
||||
|
@ -91,7 +107,9 @@ def run(argv):
|
|||
continue
|
||||
|
||||
if args.unsafe_fixes:
|
||||
match = re.match(r"^.*?abstract.*$", column)
|
||||
# Skip whitespace and newline fixes on abstracts and descriptions
|
||||
# because there are too many with legitimate multi-line metadata.
|
||||
match = re.match(r"^.*?(abstract|description).*$", column)
|
||||
if match is None:
|
||||
# Fix: whitespace
|
||||
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
||||
|
@ -102,7 +120,7 @@ def run(argv):
|
|||
# Fix: missing space after comma. Only run on author and citation
|
||||
# fields for now, as this problem is mostly an issue in names.
|
||||
if args.unsafe_fixes:
|
||||
match = re.match(r"^.*?(author|citation).*$", column)
|
||||
match = re.match(r"^.*?(author|[Cc]itation).*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
||||
|
||||
|
@ -123,10 +141,15 @@ def run(argv):
|
|||
# Fix: unnecessary Unicode
|
||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||
|
||||
# Fix: normalize DOIs
|
||||
match = re.match(r"^.*?identifier\.doi.*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(fix.normalize_dois)
|
||||
|
||||
# Fix: invalid and unnecessary multi-value separators. Skip the title
|
||||
# and abstract fields because "|" is used to indicate something like
|
||||
# a subtitle.
|
||||
match = re.match(r"^.*?(abstract|title).*$", column)
|
||||
match = re.match(r"^.*?(abstract|[Cc]itation|title).*$", column)
|
||||
if match is None:
|
||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||
# Run whitespace fix again after fixing invalid separators
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime
|
||||
|
||||
import country_converter as coco
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
|
@ -135,7 +133,7 @@ def suspicious_characters(field, field_name):
|
|||
return
|
||||
|
||||
# List of suspicious characters, for example: ́ˆ~`
|
||||
suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
|
||||
suspicious_characters = ["\u00b4", "\u02c6", "\u007e", "\u0060"]
|
||||
|
||||
for character in suspicious_characters:
|
||||
# Find the position of the suspicious character in the string
|
||||
|
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
|
|||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
# requests_cache.remove_expired_responses()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_params = {"query": value}
|
||||
|
||||
request = requests.get(request_url, params=request_params)
|
||||
|
@ -373,7 +358,7 @@ def duplicate_items(df):
|
|||
|
||||
if items_count_unique < items_count_total:
|
||||
# Create a list to hold our items while we check for duplicates
|
||||
items = list()
|
||||
items = []
|
||||
|
||||
for index, row in df.iterrows():
|
||||
item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
|
||||
|
@ -554,7 +539,7 @@ def countries_match_regions(row, exclude):
|
|||
if row[region_column_name] is not None:
|
||||
regions = row[region_column_name].split("||")
|
||||
else:
|
||||
regions = list()
|
||||
regions = []
|
||||
|
||||
for country in countries:
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
|
@ -563,8 +548,13 @@ def countries_match_regions(row, exclude):
|
|||
un_region = cc.convert(names=country, to="UNRegion")
|
||||
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
try:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
except KeyError:
|
||||
print(
|
||||
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
|
||||
)
|
||||
|
||||
return
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -2,8 +2,8 @@
|
|||
|
||||
import re
|
||||
|
||||
import langid
|
||||
import pandas as pd
|
||||
import py3langid as langid
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
|
||||
|
@ -20,7 +20,7 @@ def correct_language(row, exclude):
|
|||
# Initialize some variables at global scope so that we can set them in the
|
||||
# loop scope below and still be able to access them afterwards.
|
||||
language = ""
|
||||
sample_strings = list()
|
||||
sample_strings = []
|
||||
title = None
|
||||
|
||||
# Iterate over the labels of the current row's values. Before we transposed
|
||||
|
|
|
@ -23,7 +23,7 @@ def whitespace(field, field_name):
|
|||
return
|
||||
|
||||
# Initialize an empty list to hold the cleaned values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
@ -64,7 +64,7 @@ def separators(field, field_name):
|
|||
return
|
||||
|
||||
# Initialize an empty list to hold the cleaned values
|
||||
values = list()
|
||||
values = []
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
@ -175,7 +175,7 @@ def duplicates(field, field_name):
|
|||
values = field.split("||")
|
||||
|
||||
# Initialize an empty list to hold the de-duplicated values
|
||||
new_values = list()
|
||||
new_values = []
|
||||
|
||||
# Iterate over all values
|
||||
for value in values:
|
||||
|
@ -355,10 +355,10 @@ def countries_match_regions(row, exclude):
|
|||
if row[region_column_name] is not None:
|
||||
regions = row[region_column_name].split("||")
|
||||
else:
|
||||
regions = list()
|
||||
regions = []
|
||||
|
||||
# An empty list for our regions so we can keep track for all countries
|
||||
missing_regions = list()
|
||||
missing_regions = []
|
||||
|
||||
for country in countries:
|
||||
# Look up the UN M.49 regions for this country code. CoCo seems to
|
||||
|
@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
|
|||
# it doesn't already exist in regions.
|
||||
if un_region != "not found" and un_region not in regions:
|
||||
if un_region not in missing_regions:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
try:
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
|
||||
)
|
||||
except KeyError:
|
||||
# If there is no title column in the CSV we will print
|
||||
# the fix without the title instead of crashing.
|
||||
print(
|
||||
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
|
||||
)
|
||||
|
||||
missing_regions.append(un_region)
|
||||
|
||||
if len(missing_regions) > 0:
|
||||
|
@ -387,3 +395,74 @@ def countries_match_regions(row, exclude):
|
|||
row[region_column_name] = "||".join(missing_regions)
|
||||
|
||||
return row
|
||||
|
||||
|
||||
def normalize_dois(field):
|
||||
"""Normalize DOIs.
|
||||
|
||||
DOIs are meant to be globally unique identifiers. They are case insensitive,
|
||||
but in order to compare them robustly they should be normalized to a common
|
||||
format:
|
||||
|
||||
- strip leading and trailing whitespace
|
||||
- lowercase all ASCII characters
|
||||
- convert all variations to https://doi.org/10.xxxx/xxxx URI format
|
||||
|
||||
Return string with normalized DOI.
|
||||
|
||||
See: https://www.crossref.org/documentation/member-setup/constructing-your-dois/
|
||||
"""
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
values = field.split("||")
|
||||
|
||||
# Initialize an empty list to hold the normalized values
|
||||
new_values = []
|
||||
|
||||
# Iterate over all values (most items will only have one DOI)
|
||||
for value in values:
|
||||
# Strip leading and trailing whitespace
|
||||
new_value = value.strip()
|
||||
|
||||
new_value = new_value.lower()
|
||||
|
||||
# Convert to HTTPS
|
||||
pattern = re.compile(r"^http://")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
||||
if match:
|
||||
new_value = re.sub(pattern, "https://", new_value)
|
||||
|
||||
# Convert dx.doi.org to doi.org
|
||||
pattern = re.compile(r"dx\.doi\.org")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
||||
if match:
|
||||
new_value = re.sub(pattern, "doi.org", new_value)
|
||||
|
||||
# Replace values like doi: 10.11648/j.jps.20140201.14
|
||||
pattern = re.compile(r"^doi: 10\.")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
||||
if match:
|
||||
new_value = re.sub(pattern, "https://doi.org/10.", new_value)
|
||||
|
||||
# Replace values like 10.3390/foods12010115
|
||||
pattern = re.compile(r"^10\.")
|
||||
match = re.findall(pattern, new_value)
|
||||
|
||||
if match:
|
||||
new_value = re.sub(pattern, "https://doi.org/10.", new_value)
|
||||
|
||||
if new_value != value:
|
||||
print(f"{Fore.GREEN}Normalized DOI: {Fore.RESET}{value}")
|
||||
|
||||
new_values.append(new_value)
|
||||
|
||||
new_field = "||".join(new_values)
|
||||
|
||||
return new_field
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
|
||||
import json
|
||||
from importlib.resources import files
|
||||
import os
|
||||
|
||||
from ftfy.badness import is_bad
|
||||
|
||||
|
@ -58,7 +58,7 @@ def is_mojibake(field):
|
|||
def load_spdx_licenses():
|
||||
"""Returns a Python list of SPDX short license identifiers."""
|
||||
|
||||
with open(files("csv_metadata_quality").joinpath("data/licenses.json")) as f:
|
||||
with open(os.path.join(os.path.dirname(__file__), "data/licenses.json")) as f:
|
||||
licenses = json.load(f)
|
||||
|
||||
# List comprehension to extract the license ID for each license
|
||||
|
|
|
@ -37,3 +37,6 @@ Mojibake,2021-03-18,,,,Publicaçao CIAT,,,,Report,,,,
|
|||
Title missing from citation,2021-12-05,,,,,,,,,"Orth, A. 2021. Title missing f rom citation.",,,
|
||||
Country missing region,2021-12-08,,,,,Kenya,,,,,,,
|
||||
Subregion field shouldn’t trigger region checks,2022-12-07,,,,,Kenya,,,,,,Eastern Africa,Baringo
|
||||
DOI with HTTP and dx.doi.org,2024-04-23,,,,,,,,,,http://dx.doi.org/10.1016/j.envc.2023.100794,,
|
||||
DOI with colon,2024-04-23,,,,,,,,,,doi: 10.11648/j.jps.20140201.14,,
|
||||
Upper case bare DOI,2024-04-23,,,,,,,,,,10.19103/AS.2018.0043.16,,
|
||||
|
|
|
File diff suppressed because it is too large
Load Diff
|
@ -12,26 +12,25 @@ csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
|||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
pandas = "^1.5.2"
|
||||
pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
|
||||
python-stdnum = "^1.18"
|
||||
requests = "^2.28.2"
|
||||
requests-cache = "^0.9.8"
|
||||
langid = "^1.1.6"
|
||||
requests-cache = "^1.0.0"
|
||||
colorama = "^0.4.6"
|
||||
ftfy = "^6.1.1"
|
||||
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}
|
||||
country-converter = "~1.1.0"
|
||||
pycountry = "^23.12.7"
|
||||
py3langid = "^0.2.2"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.2.1"
|
||||
flake8 = "^6.0.0"
|
||||
flake8 = "^7.0.0"
|
||||
pytest-clarity = "^1.0.1"
|
||||
black = "^23.1.0"
|
||||
isort = "^5.12.0"
|
||||
csvkit = "^1.1.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipython = "^8.10.0"
|
||||
fixit = "^2.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry>=0.12"]
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||
"extends": [
|
||||
"config:base"
|
||||
],
|
||||
"pip_requirements": {
|
||||
"enabled": false
|
||||
}
|
||||
}
|
|
@ -5,28 +5,28 @@ agate==1.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
|||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
|
||||
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
babel==2.12.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
black==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
black==23.3.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
csvkit==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
flake8==6.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
||||
greenlet==2.0.2 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
||||
greenlet==2.0.2 ; python_version >= "3.9" and platform_machine == "aarch64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "ppc64le" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "x86_64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "amd64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "AMD64" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "win32" and python_version < "4.0" or python_version >= "3.9" and platform_machine == "WIN32" and python_version < "4.0"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ipython==8.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ipython==8.13.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
isort==5.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
|
@ -37,44 +37,46 @@ matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
|||
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
mypy-extensions==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
|
||||
openpyxl==3.1.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
packaging==23.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
openpyxl==3.1.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
packaging==23.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
parsedatetime==2.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pathspec==0.11.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pathspec==0.11.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
||||
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
|
||||
platformdirs==3.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
platformdirs==3.5.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
prompt-toolkit==3.0.37 ; python_version >= "3.9" and python_version < "4.0"
|
||||
prompt-toolkit==3.0.38 ; python_version >= "3.9" and python_version < "4.0"
|
||||
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
|
||||
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycodestyle==2.10.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyflakes==3.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pygments==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pygments==2.15.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest==7.2.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytest==7.3.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-slugify==8.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-slugify==8.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
||||
rich==13.3.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
rich==13.3.5 ; python_version >= "3.9" and python_version < "4.0"
|
||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
sqlalchemy==1.4.46 ; python_version >= "3.9" and python_version < "4.0"
|
||||
sqlalchemy==1.4.48 ; python_version >= "3.9" and python_version < "4.0"
|
||||
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
traitlets==5.9.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
typing-extensions==4.5.0 ; python_version >= "3.9" and python_version < "3.10"
|
||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
|
||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
||||
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
|
|
|
@ -1,23 +1,25 @@
|
|||
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
|
||||
charset-normalizer==3.0.1 ; python_version >= "3.9" and python_version < "4"
|
||||
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4.0"
|
||||
charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.0 ; python_version >= "3.9" and python_version < "3.11"
|
||||
country-converter==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
exceptiongroup==1.1.1 ; python_version >= "3.9" and python_version < "3.11"
|
||||
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "4.0"
|
||||
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.2 ; python_version < "4.0" and python_version >= "3.9"
|
||||
pandas==1.5.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
numpy==1.24.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pandas==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pyarrow==11.0.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.13.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
|
||||
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2022.7.1 ; python_version >= "3.9" and python_version < "4.0"
|
||||
pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests-cache==0.9.8 ; python_version >= "3.9" and python_version < "4.0"
|
||||
requests==2.28.2 ; python_version >= "3.9" and python_version < "4"
|
||||
requests==2.29.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
|
||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
|
||||
urllib3==1.26.14 ; python_version >= "3.9" and python_version < "4"
|
||||
urllib3==1.26.15 ; python_version >= "3.9" and python_version < "4.0"
|
||||
wcwidth==0.2.6 ; python_version >= "3.9" and python_version < "4"
|
||||
|
|
36
setup.py
36
setup.py
|
@ -1,36 +0,0 @@
|
|||
import setuptools
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
install_requires = [
|
||||
"pandas",
|
||||
"python-stdnum",
|
||||
"requests",
|
||||
"requests-cache",
|
||||
"pycountry",
|
||||
"langid",
|
||||
]
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.6.1",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
license="GPLv3",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/alanorth/csv-metadata-quality",
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
packages=["csv_metadata_quality"],
|
||||
entry_points={
|
||||
"console_scripts": ["csv-metadata-quality = csv_metadata_quality.__main__:main"]
|
||||
},
|
||||
install_requires=install_requires,
|
||||
)
|
|
@ -257,7 +257,7 @@ def test_check_incorrect_iso_639_1_language(capsys):
|
|||
|
||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||
language = "es"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Create a dictionary to mimic Pandas series
|
||||
row = {"dc.title": title, "dc.language.iso": language}
|
||||
|
@ -277,7 +277,7 @@ def test_check_incorrect_iso_639_3_language(capsys):
|
|||
|
||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||
language = "spa"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Create a dictionary to mimic Pandas series
|
||||
row = {"dc.title": title, "dc.language.iso": language}
|
||||
|
@ -297,7 +297,7 @@ def test_check_correct_iso_639_1_language():
|
|||
|
||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||
language = "en"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Create a dictionary to mimic Pandas series
|
||||
row = {"dc.title": title, "dc.language.iso": language}
|
||||
|
@ -313,7 +313,7 @@ def test_check_correct_iso_639_3_language():
|
|||
|
||||
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
|
||||
language = "eng"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Create a dictionary to mimic Pandas series
|
||||
row = {"dc.title": title, "dc.language.iso": language}
|
||||
|
@ -407,7 +407,7 @@ def test_check_doi_field():
|
|||
# the citation and a DOI field.
|
||||
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
|
||||
series = pd.Series(data=d)
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
result = check.citation_doi(series, exclude)
|
||||
|
||||
|
@ -418,7 +418,7 @@ def test_check_doi_only_in_citation(capsys):
|
|||
"""Test an item with a DOI in its citation, but no DOI field."""
|
||||
|
||||
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# an empty DOI field and a citation containing a DOI.
|
||||
|
@ -439,7 +439,7 @@ def test_title_in_citation():
|
|||
|
||||
title = "Testing all the things"
|
||||
citation = "Orth, A. 2021. Testing all the things."
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# the title and citation.
|
||||
|
@ -456,7 +456,7 @@ def test_title_not_in_citation(capsys):
|
|||
|
||||
title = "Testing all the things"
|
||||
citation = "Orth, A. 2021. Testing all teh things."
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series), with
|
||||
# the title and citation.
|
||||
|
@ -477,7 +477,7 @@ def test_country_matches_region():
|
|||
|
||||
country = "Kenya"
|
||||
region = "Eastern Africa"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {"cg.coverage.country": country, "cg.coverage.region": region}
|
||||
|
@ -495,7 +495,7 @@ def test_country_not_matching_region(capsys):
|
|||
country = "Kenya"
|
||||
region = ""
|
||||
missing_region = "Eastern Africa"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {
|
||||
|
|
|
@ -131,7 +131,7 @@ def test_fix_country_not_matching_region():
|
|||
country = "Kenya"
|
||||
region = ""
|
||||
missing_region = "Eastern Africa"
|
||||
exclude = list()
|
||||
exclude = []
|
||||
|
||||
# Emulate a column in a transposed dataframe (which is just a series)
|
||||
d = {
|
||||
|
@ -152,3 +152,11 @@ def test_fix_country_not_matching_region():
|
|||
series_correct = pd.Series(data=d_correct)
|
||||
|
||||
pd.testing.assert_series_equal(result, series_correct)
|
||||
|
||||
|
||||
def test_fix_normalize_dois():
|
||||
"""Test normalizing a DOI."""
|
||||
|
||||
value = "doi: 10.11648/j.jps.20140201.14"
|
||||
|
||||
assert fix.normalize_dois(value) == "https://doi.org/10.11648/j.jps.20140201.14"
|
||||
|
|
Loading…
Reference in New Issue