mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-02-21 17:26:21 +01:00
Compare commits
8 Commits
3dbe656f9f
...
ad2cda8a41
Author | SHA1 | Date | |
---|---|---|---|
ad2cda8a41 | |||
dc6920802e | |||
6ca449d8ed | |||
1554cfd5c9 | |||
00b8faad6d | |||
b19d81abdd | |||
a0ea829f5c | |||
0089efa914 |
@ -9,6 +9,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- id
|
- id
|
||||||
- python -V
|
- python -V
|
||||||
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||||
- pip install -r requirements-dev.txt
|
- pip install -r requirements-dev.txt
|
||||||
- pytest
|
- pytest
|
||||||
- python setup.py install
|
- python setup.py install
|
||||||
@ -25,6 +26,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- id
|
- id
|
||||||
- python -V
|
- python -V
|
||||||
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||||
- pip install -r requirements-dev.txt
|
- pip install -r requirements-dev.txt
|
||||||
- pytest
|
- pytest
|
||||||
- python setup.py install
|
- python setup.py install
|
||||||
@ -41,6 +43,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- id
|
- id
|
||||||
- python -V
|
- python -V
|
||||||
|
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||||
- pip install -r requirements-dev.txt
|
- pip install -r requirements-dev.txt
|
||||||
- pytest
|
- pytest
|
||||||
- python setup.py install
|
- python setup.py install
|
||||||
|
4
.github/workflows/python-app.yml
vendored
4
.github/workflows/python-app.yml
vendored
@ -16,10 +16,10 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
- name: Set up Python 3.8
|
- name: Set up Python 3.9
|
||||||
uses: actions/setup-python@v2
|
uses: actions/setup-python@v2
|
||||||
with:
|
with:
|
||||||
python-version: 3.8
|
python-version: 3.9
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
|
@ -4,16 +4,19 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
## Unreleased
|
## [0.4.6] - 2021-03-11
|
||||||
## Added
|
### Added
|
||||||
- Validation of dcterms.license field against SPDX license identifiers
|
- Validation of dcterms.license field against SPDX license identifiers
|
||||||
|
|
||||||
## Changed
|
### Changed
|
||||||
- Use DCTERMS fields where possible in `data/test.csv`
|
- Use DCTERMS fields where possible in `data/test.csv`
|
||||||
|
|
||||||
### Updated
|
### Updated
|
||||||
- Run `poetry update` to update project dependencies
|
- Run `poetry update` to update project dependencies
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Print the messages for all fixes in green instead of red, because applying a fix is a good outcome
|
||||||
|
|
||||||
## [0.4.5] - 2021-03-04
|
## [0.4.5] - 2021-03-04
|
||||||
### Added
|
### Added
|
||||||
- Check dates in dcterms.issued field as well, not just fields that have the
|
- Check dates in dcterms.issued field as well, not just fields that have the
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# DSpace CSV Metadata Quality Checker ![GitHub Actions](s3://crabby-images/d21e4/d21e41d201503679441f1c46a22b004e906bae4c) [![Build Status](s3://crabby-images/78772/78772a7d5a544a67e2f8aadd7dfbeffee3edb4c7)](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
|
# DSpace CSV Metadata Quality Checker ![GitHub Actions](s3://crabby-images/d21e4/d21e41d201503679441f1c46a22b004e906bae4c) [![Build Status](s3://crabby-images/78772/78772a7d5a544a67e2f8aadd7dfbeffee3edb4c7)](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
|
||||||
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
|
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
|
||||||
|
|
||||||
Requires Python 3.7 or greater (3.8 recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
|
Requires Python 3.7 or greater (3.8+ recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
|
||||||
|
|
||||||
If you use the DSpace CSV metadata quality checker please cite:
|
If you use the DSpace CSV metadata quality checker please cite:
|
||||||
|
|
||||||
@ -13,6 +13,7 @@ If you use the DSpace CSV metadata quality checker please cite:
|
|||||||
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
|
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
|
||||||
- Experimental validation of titles and abstracts against item's Dublin Core language field
|
- Experimental validation of titles and abstracts against item's Dublin Core language field
|
||||||
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
|
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
|
||||||
|
- Validate licenses against the list of [SPDX license identifiers](https://spdx.org/licenses)
|
||||||
- Fix leading, trailing, and excessive (ie, more than one) whitespace
|
- Fix leading, trailing, and excessive (ie, more than one) whitespace
|
||||||
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
|
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
|
||||||
- Fix problematic newlines (line feeds) using `--unsafe-fixes`
|
- Fix problematic newlines (line feeds) using `--unsafe-fixes`
|
||||||
|
@ -77,7 +77,7 @@ def separators(field, field_name):
|
|||||||
|
|
||||||
if match:
|
if match:
|
||||||
print(
|
print(
|
||||||
f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
f"{Fore.GREEN}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
||||||
)
|
)
|
||||||
|
|
||||||
value = re.sub(pattern, "||", value)
|
value = re.sub(pattern, "||", value)
|
||||||
|
@ -1 +1 @@
|
|||||||
VERSION = "0.4.5"
|
VERSION = "0.4.6"
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "csv-metadata-quality"
|
name = "csv-metadata-quality"
|
||||||
version = "0.4.5"
|
version = "0.4.6"
|
||||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
||||||
license="GPL-3.0-only"
|
license="GPL-3.0-only"
|
||||||
|
2
setup.py
2
setup.py
@ -14,7 +14,7 @@ install_requires = [
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="csv-metadata-quality",
|
name="csv-metadata-quality",
|
||||||
version="0.4.5",
|
version="0.4.6",
|
||||||
author="Alan Orth",
|
author="Alan Orth",
|
||||||
author_email="aorth@mjanja.ch",
|
author_email="aorth@mjanja.ch",
|
||||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||||
|
@ -224,7 +224,7 @@ def test_check_invalid_agrovoc(capsys):
|
|||||||
"""Test invalid AGROVOC subject."""
|
"""Test invalid AGROVOC subject."""
|
||||||
|
|
||||||
value = "FOREST"
|
value = "FOREST"
|
||||||
field_name = "dc.subject"
|
field_name = "dcterms.subject"
|
||||||
|
|
||||||
check.agrovoc(value, field_name)
|
check.agrovoc(value, field_name)
|
||||||
|
|
||||||
@ -239,7 +239,7 @@ def test_check_valid_agrovoc():
|
|||||||
"""Test valid AGROVOC subject."""
|
"""Test valid AGROVOC subject."""
|
||||||
|
|
||||||
value = "FORESTS"
|
value = "FORESTS"
|
||||||
field_name = "dc.subject"
|
field_name = "dcterms.subject"
|
||||||
|
|
||||||
result = check.agrovoc(value, field_name)
|
result = check.agrovoc(value, field_name)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user