1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-02-22 09:46:22 +01:00

Compare commits

..

No commits in common. "ad2cda8a41ef117db2637de17957bd58f351a5ba" and "3dbe656f9fd51d2e11bfac795ff54632fb1b4867" have entirely different histories.

9 changed files with 12 additions and 19 deletions

View File

@ -9,7 +9,6 @@ steps:
commands: commands:
- id - id
- python -V - python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt - pip install -r requirements-dev.txt
- pytest - pytest
- python setup.py install - python setup.py install
@ -26,7 +25,6 @@ steps:
commands: commands:
- id - id
- python -V - python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt - pip install -r requirements-dev.txt
- pytest - pytest
- python setup.py install - python setup.py install
@ -43,7 +41,6 @@ steps:
commands: commands:
- id - id
- python -V - python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config
- pip install -r requirements-dev.txt - pip install -r requirements-dev.txt
- pytest - pytest
- python setup.py install - python setup.py install

View File

@ -16,10 +16,10 @@ jobs:
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- name: Set up Python 3.9 - name: Set up Python 3.8
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
python-version: 3.9 python-version: 3.8
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip

View File

@ -4,19 +4,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.4.6] - 2021-03-11 ## Unreleased
### Added ## Added
- Validation of dcterms.license field against SPDX license identifiers - Validation of dcterms.license field against SPDX license identifiers
### Changed ## Changed
- Use DCTERMS fields where possible in `data/test.csv` - Use DCTERMS fields where possible in `data/test.csv`
### Updated ### Updated
- Run `poetry update` to update project dependencies - Run `poetry update` to update project dependencies
### Fixed
- Output for all fixes should be green, because it is good
## [0.4.5] - 2021-03-04 ## [0.4.5] - 2021-03-04
### Added ### Added
- Check dates in dcterms.issued field as well, not just fields that have the - Check dates in dcterms.issued field as well, not just fields that have the

View File

@ -1,7 +1,7 @@
# DSpace CSV Metadata Quality Checker ![GitHub Actions](https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg) [![Build Status](https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg)](https://ci.mjanja.ch/alanorth/csv-metadata-quality) # DSpace CSV Metadata Quality Checker ![GitHub Actions](https://github.com/ilri/csv-metadata-quality/workflows/Build%20and%20Test/badge.svg) [![Build Status](https://ci.mjanja.ch/api/badges/alanorth/csv-metadata-quality/status.svg)](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc. A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
Requires Python 3.7 or greater (3.8+ recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested. Requires Python 3.7 or greater (3.8 recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
If you use the DSpace CSV metadata quality checker please cite: If you use the DSpace CSV metadata quality checker please cite:
@ -13,7 +13,6 @@ If you use the DSpace CSV metadata quality checker please cite:
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3) - Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
- Experimental validation of titles and abstracts against item's Dublin Core language field - Experimental validation of titles and abstracts against item's Dublin Core language field
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option) - Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
- Validation of licenses against the list of [SPDX license identifiers](https://spdx.org/licenses)
- Fix leading, trailing, and excessive (ie, more than one) whitespace - Fix leading, trailing, and excessive (ie, more than one) whitespace
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes` - Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
- Fix problematic newlines (line feeds) using `--unsafe-fixes` - Fix problematic newlines (line feeds) using `--unsafe-fixes`

View File

@ -77,7 +77,7 @@ def separators(field, field_name):
if match: if match:
print( print(
f"{Fore.GREEN}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}" f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
) )
value = re.sub(pattern, "||", value) value = re.sub(pattern, "||", value)

View File

@ -1 +1 @@
VERSION = "0.4.6" VERSION = "0.4.5"

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "csv-metadata-quality" name = "csv-metadata-quality"
version = "0.4.6" version = "0.4.5"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem." description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"] authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only" license="GPL-3.0-only"

View File

@ -14,7 +14,7 @@ install_requires = [
setuptools.setup( setuptools.setup(
name="csv-metadata-quality", name="csv-metadata-quality",
version="0.4.6", version="0.4.5",
author="Alan Orth", author="Alan Orth",
author_email="aorth@mjanja.ch", author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.", description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",

View File

@ -224,7 +224,7 @@ def test_check_invalid_agrovoc(capsys):
"""Test invalid AGROVOC subject.""" """Test invalid AGROVOC subject."""
value = "FOREST" value = "FOREST"
field_name = "dcterms.subject" field_name = "dc.subject"
check.agrovoc(value, field_name) check.agrovoc(value, field_name)
@ -239,7 +239,7 @@ def test_check_valid_agrovoc():
"""Test valid AGROVOC subject.""" """Test valid AGROVOC subject."""
value = "FORESTS" value = "FORESTS"
field_name = "dcterms.subject" field_name = "dc.subject"
result = check.agrovoc(value, field_name) result = check.agrovoc(value, field_name)