1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-10 15:16:01 +02:00

6 Commits

Author SHA1 Message Date
46098861ed poetry.lock: Run poetry update
All checks were successful
continuous-integration/drone/push Build is passing
2021-03-11 22:45:32 +02:00
fa84cfa440 Bump version to 0.4.6-dev 2021-03-11 22:44:36 +02:00
6cc1401f88 pyproject.toml: Minimum Python is technically 3.7.1
All checks were successful
continuous-integration/drone/push Build is passing
See: https://pandas.pydata.org/pandas-docs/stable/whatsnew/v1.2.0.html
2021-03-11 13:41:58 +02:00
ad2cda8a41 README.md: Add note about SPDX license identifiers
All checks were successful
continuous-integration/drone/push Build is passing
2021-03-11 12:21:34 +02:00
dc6920802e .github/workflows/python-app.yml: Use Python 3.9
I now use this version in my development environment. Eventually I
should add a matrix of versions to use, but I don't know the GitHub
Actions syntax well enough yet.
2021-03-11 12:17:57 +02:00
6ca449d8ed README.md: Update note about Python 3.8 to 3.8+
Currently the lower bound on Python version support is 3.7 because
of Pandas 1.2.0 requiring it, but I use 3.9 on my development box.
2021-03-11 12:16:07 +02:00
6 changed files with 54 additions and 12 deletions

View File

@ -16,10 +16,10 @@ jobs:
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python 3.8 - name: Set up Python 3.9
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
python-version: 3.8 python-version: 3.9
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip

View File

@ -1,7 +1,7 @@
# DSpace CSV Metadata Quality Checker ![GitHub Actions](https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg) [![Build Status](https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg)](https://ci.mjanja.ch/alanorth/csv-metadata-quality) # DSpace CSV Metadata Quality Checker ![GitHub Actions](https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg) [![Build Status](https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg)](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc. A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
Requires Python 3.7 or greater (3.8 recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested. Requires Python 3.7.1 or greater (3.8+ recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
If you use the DSpace CSV metadata quality checker please cite: If you use the DSpace CSV metadata quality checker please cite:
@ -13,6 +13,7 @@ If you use the DSpace CSV metadata quality checker please cite:
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3) - Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
- Experimental validation of titles and abstracts against item's Dublin Core language field - Experimental validation of titles and abstracts against item's Dublin Core language field
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option) - Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
- Validation of licenses against the list of [SPDX license identifiers](https://spdx.org/licenses)
- Fix leading, trailing, and excessive (ie, more than one) whitespace - Fix leading, trailing, and excessive (ie, more than one) whitespace
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes` - Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
- Fix problematic newlines (line feeds) using `--unsafe-fixes` - Fix problematic newlines (line feeds) using `--unsafe-fixes`

View File

@ -1 +1 @@
VERSION = "0.4.6" VERSION = "0.4.6-dev"

51
poetry.lock generated
View File

@ -212,6 +212,7 @@ optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies] [package.dependencies]
importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
mccabe = ">=0.6.0,<0.7.0" mccabe = ">=0.6.0,<0.7.0"
pycodestyle = ">=2.6.0a1,<2.7.0" pycodestyle = ">=2.6.0a1,<2.7.0"
pyflakes = ">=2.2.0,<2.3.0" pyflakes = ">=2.2.0,<2.3.0"
@ -224,6 +225,22 @@ category = "main"
optional = false optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "importlib-metadata"
version = "3.7.2"
description = "Read metadata from Python packages"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.dependencies]
typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
zipp = ">=0.5"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"]
[[package]] [[package]]
name = "iniconfig" name = "iniconfig"
version = "1.1.1" version = "1.1.1"
@ -449,12 +466,15 @@ category = "dev"
optional = false optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.dependencies]
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
[package.extras] [package.extras]
dev = ["pre-commit", "tox"] dev = ["pre-commit", "tox"]
[[package]] [[package]]
name = "prompt-toolkit" name = "prompt-toolkit"
version = "3.0.16" version = "3.0.17"
description = "Library for building powerful interactive command lines in Python" description = "Library for building powerful interactive command lines in Python"
category = "dev" category = "dev"
optional = false optional = false
@ -539,6 +559,7 @@ python-versions = ">=3.6"
atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
attrs = ">=19.2.0" attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""} colorama = {version = "*", markers = "sys_platform == \"win32\""}
importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
iniconfig = "*" iniconfig = "*"
packaging = "*" packaging = "*"
pluggy = ">=0.12,<1.0.0a1" pluggy = ">=0.12,<1.0.0a1"
@ -770,10 +791,22 @@ category = "main"
optional = false optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "zipp"
version = "3.4.1"
description = "Backport of pathlib-compatible object wrapper for zip files"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"]
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.8" python-versions = "^3.7.1"
content-hash = "6a9ee0f26b50f361d7e0e6a2275f0e3174dee1c89fbd460583c4ea3d873857b8" content-hash = "e60e882e091af667b968c00521fd378e1220c1836d394d90bbc783920e38bb62"
[metadata.files] [metadata.files]
agate = [ agate = [
@ -853,6 +886,10 @@ idna = [
{file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
{file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
] ]
importlib-metadata = [
{file = "importlib_metadata-3.7.2-py3-none-any.whl", hash = "sha256:407d13f55dc6f2a844e62325d18ad7019a436c4bfcaee34cda35f2be6e7c3e34"},
{file = "importlib_metadata-3.7.2.tar.gz", hash = "sha256:18d5ff601069f98d5d605b6a4b50c18a34811d655c55548adc833e687289acde"},
]
iniconfig = [ iniconfig = [
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
@ -969,8 +1006,8 @@ pluggy = [
{file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
] ]
prompt-toolkit = [ prompt-toolkit = [
{file = "prompt_toolkit-3.0.16-py3-none-any.whl", hash = "sha256:62c811e46bd09130fb11ab759012a4ae385ce4fb2073442d1898867a824183bd"}, {file = "prompt_toolkit-3.0.17-py3-none-any.whl", hash = "sha256:4cea7d09e46723885cb8bc54678175453e5071e9449821dce6f017b1d1fbfc1a"},
{file = "prompt_toolkit-3.0.16.tar.gz", hash = "sha256:0fa02fa80363844a4ab4b8d6891f62dd0645ba672723130423ca4037b80c1974"}, {file = "prompt_toolkit-3.0.17.tar.gz", hash = "sha256:9397a7162cf45449147ad6042fa37983a081b8a73363a5253dd4072666333137"},
] ]
ptyprocess = [ ptyprocess = [
{file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"},
@ -1191,3 +1228,7 @@ xlrd = [
{file = "xlrd-1.2.0-py2.py3-none-any.whl", hash = "sha256:e551fb498759fa3a5384a94ccd4c3c02eb7c00ea424426e212ac0c57be9dfbde"}, {file = "xlrd-1.2.0-py2.py3-none-any.whl", hash = "sha256:e551fb498759fa3a5384a94ccd4c3c02eb7c00ea424426e212ac0c57be9dfbde"},
{file = "xlrd-1.2.0.tar.gz", hash = "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2"}, {file = "xlrd-1.2.0.tar.gz", hash = "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2"},
] ]
zipp = [
{file = "zipp-3.4.1-py3-none-any.whl", hash = "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"},
{file = "zipp-3.4.1.tar.gz", hash = "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76"},
]

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "csv-metadata-quality" name = "csv-metadata-quality"
version = "0.4.6" version = "0.4.6-dev"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem." description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"] authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only" license="GPL-3.0-only"
@ -11,7 +11,7 @@ homepage = "https://github.com/ilri/csv-metadata-quality"
csv-metadata-quality = 'csv_metadata_quality.__main__:main' csv-metadata-quality = 'csv_metadata_quality.__main__:main'
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.8" python = "^3.7.1"
pandas = "^1.0.4" pandas = "^1.0.4"
python-stdnum = "^1.13" python-stdnum = "^1.13"
xlrd = "^1.2.0" xlrd = "^1.2.0"

View File

@ -14,7 +14,7 @@ install_requires = [
setuptools.setup( setuptools.setup(
name="csv-metadata-quality", name="csv-metadata-quality",
version="0.4.6", version="0.4.6-dev",
author="Alan Orth", author="Alan Orth",
author_email="aorth@mjanja.ch", author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.", description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",