1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-02-20 16:56:21 +01:00

Compare commits

..

No commits in common. "ad2cda8a41ef117db2637de17957bd58f351a5ba" and "3dbe656f9fd51d2e11bfac795ff54632fb1b4867" have entirely different histories.

9 changed files with 12 additions and 19 deletions

View File

@ -9,7 +9,6 @@ steps:
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
@ -26,7 +25,6 @@ steps:
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
@ -43,7 +41,6 @@ steps:
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt
- pytest
- python setup.py install

View File

@ -16,10 +16,10 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip

View File

@ -4,19 +4,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.4.6] - 2021-03-11
### Added
## Unreleased
## Added
- Validation of dcterms.license field against SPDX license identifiers
### Changed
## Changed
- Use DCTERMS fields where possible in `data/test.csv`
### Updated
- Run `poetry update` to update project dependencies
### Fixed
- Output for all fixes should be green, because it is good
## [0.4.5] - 2021-03-04
### Added
- Check dates in dcterms.issued field as well, not just fields that have the

View File

@ -1,7 +1,7 @@
# DSpace CSV Metadata Quality Checker ![GitHub Actions](https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg) [![Build Status](https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg)](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
Requires Python 3.7 or greater (3.8+ recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
Requires Python 3.7 or greater (3.8 recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
If you use the DSpace CSV metadata quality checker please cite:
@ -13,7 +13,6 @@ If you use the DSpace CSV metadata quality checker please cite:
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
- Experimental validation of titles and abstracts against item's Dublin Core language field
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
- Validation of licenses against the list of [SPDX license identifiers](https://spdx.org/licenses)
- Fix leading, trailing, and excessive (ie, more than one) whitespace
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
- Fix problematic newlines (line feeds) using `--unsafe-fixes`

View File

@ -77,7 +77,7 @@ def separators(field, field_name):
if match:
print(
f"{Fore.GREEN}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
)
value = re.sub(pattern, "||", value)

View File

@ -1 +1 @@
VERSION = "0.4.6"
VERSION = "0.4.5"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "csv-metadata-quality"
version = "0.4.6"
version = "0.4.5"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only"

View File

@ -14,7 +14,7 @@ install_requires = [
setuptools.setup(
name="csv-metadata-quality",
version="0.4.6",
version="0.4.5",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",

View File

@ -224,7 +224,7 @@ def test_check_invalid_agrovoc(capsys):
"""Test invalid AGROVOC subject."""
value = "FOREST"
field_name = "dcterms.subject"
field_name = "dc.subject"
check.agrovoc(value, field_name)
@ -239,7 +239,7 @@ def test_check_valid_agrovoc():
"""Test valid AGROVOC subject."""
value = "FORESTS"
field_name = "dcterms.subject"
field_name = "dc.subject"
result = check.agrovoc(value, field_name)