1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-28 08:38:18 +01:00

Compare commits

..

No commits in common. "8c23382b224abee745d1e473e9daf89f86148fb2" and "051777bcece3a5ec319c4c3fc7f56f7ff7e09807" have entirely different histories.

10 changed files with 1051 additions and 7307 deletions

View File

@ -58,4 +58,34 @@ steps:
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
---
kind: pipeline
type: docker
name: python38
steps:
- name: test
image: python:3.8-slim
commands:
- id
- python -V
- apt update && apt install -y gcc g++ libicu-dev pkg-config git
- pip install -r requirements-dev.txt
- pytest
- python setup.py install
# Basic test
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv
# Basic test with unsafe fixes
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -u
# Geography test
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv
# Geography test with unsafe fixes
- csv-metadata-quality -i data/test-geography.csv -o /tmp/test.csv -u
# Test with experimental checks
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
# Test with AGROVOC validation
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject
# Test with AGROVOC validation (and dropping invalid)
- csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dcterms.subject -d
# vim: ts=2 sw=2 et

View File

@ -8,11 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Missing region check should ignore subregion field, if it exists
### Changed
- Use SPDX license data from SPDX themselves instead of spdx-license-list
because it is deprecated and outdated
- Require Python 3.9+
## [0.6.0] - 2022-09-02
### Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding

View File

@ -8,7 +8,7 @@
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
Requires Python 3.9 or greater. CSV support comes from the [Pandas](https://pandas.pydata.org/) library.
Requires Python 3.8 or greater. CSV support comes from the [Pandas](https://pandas.pydata.org/) library.
If you use the DSpace CSV metadata quality checker please cite:

View File

@ -9,12 +9,13 @@ import country_converter as coco
import pandas as pd
import requests
import requests_cache
import spdx_license_list
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn
from stdnum import issn as stdnum_issn
from csv_metadata_quality.util import is_mojibake, load_spdx_licenses
from csv_metadata_quality.util import is_mojibake
def issn(field):
@ -316,11 +317,9 @@ def spdx_license_identifier(field):
if pd.isna(field):
return
spdx_licenses = load_spdx_licenses()
# Try to split multi-value field on "||" separator
for value in field.split("||"):
if value not in spdx_licenses:
if value not in spdx_license_list.LICENSES:
print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")
return

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import json
from importlib.resources import files
from ftfy.badness import is_bad
@ -53,13 +49,3 @@ def is_mojibake(field):
else:
# Encodable as CP-1252, Mojibake alert level high
return True
def load_spdx_licenses():
    """Return a Python list of SPDX short license identifiers.

    The identifiers are read from the ``data/licenses.json`` resource that
    lives inside the ``csv_metadata_quality`` package. Every entry of the
    top-level ``"licenses"`` array contributes its ``"licenseId"`` value, in
    file order.

    Returns:
        list: one license identifier per entry in the JSON file.

    Raises:
        FileNotFoundError: if the data file is missing from the package.
        KeyError: if the JSON lacks ``"licenses"`` or an entry lacks
            ``"licenseId"``.
    """
    resource = files("csv_metadata_quality").joinpath("data/licenses.json")

    # Read through the resource object itself instead of handing it to the
    # built-in open(): a Traversable is not guaranteed to be a real
    # filesystem path. JSON is UTF-8, so decode it explicitly rather than
    # relying on the platform's locale encoding.
    licenses = json.loads(resource.read_text(encoding="utf-8"))

    # Extract the license ID for each license ("entry" avoids shadowing the
    # "license" builtin).
    return [entry["licenseId"] for entry in licenses["licenses"]]

1780
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -11,13 +11,14 @@ homepage = "https://github.com/ilri/csv-metadata-quality"
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
[tool.poetry.dependencies]
python = "^3.9"
python = "^3.8"
pandas = "^1.5.1"
python-stdnum = "^1.17"
requests = "^2.28.1"
requests-cache = "^0.9.7"
langid = "^1.1.6"
colorama = "^0.4.5"
spdx-license-list = "^0.5.2"
ftfy = "^6.1.1"
country-converter = {git = "https://github.com/alanorth/country_converter.git", rev = "myanmar-region"}
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.12.0"}

View File

@ -1,80 +1,82 @@
agate-dbf==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
agate-excel==0.2.5 ; python_version >= "3.9" and python_version < "4.0"
agate-sql==0.5.8 ; python_version >= "3.9" and python_version < "4.0"
agate==1.6.3 ; python_version >= "3.9" and python_version < "4.0"
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
appnope==0.1.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "darwin"
asttokens==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.1.0 ; python_version >= "3.9" and python_version < "4.0"
babel==2.11.0 ; python_version >= "3.9" and python_version < "4.0"
backcall==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
black==22.12.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==2.1.1 ; python_version >= "3.9" and python_version < "4"
click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
commonmark==0.9.1 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
csvkit==1.0.7 ; python_version >= "3.9" and python_version < "4.0"
dbfread==2.0.7 ; python_version >= "3.9" and python_version < "4.0"
decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0"
et-xmlfile==1.1.0 ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.0.4 ; python_version >= "3.9" and python_version < "3.11"
executing==1.2.0 ; python_version >= "3.9" and python_version < "4.0"
flake8==5.0.4 ; python_version >= "3.9" and python_version < "4.0"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
future==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
greenlet==2.0.1 ; python_version >= "3.9" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
iniconfig==1.1.1 ; python_version >= "3.9" and python_version < "4.0"
ipython==8.7.0 ; python_version >= "3.9" and python_version < "4.0"
isodate==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
isort==5.11.1 ; python_version >= "3.9" and python_version < "4.0"
jedi==0.18.2 ; python_version >= "3.9" and python_version < "4.0"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
leather==0.3.4 ; python_version >= "3.9" and python_version < "4.0"
matplotlib-inline==0.1.6 ; python_version >= "3.9" and python_version < "4.0"
mccabe==0.7.0 ; python_version >= "3.9" and python_version < "4.0"
mypy-extensions==0.4.3 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.23.5 ; python_version < "4.0" and python_version >= "3.9"
olefile==0.46 ; python_version >= "3.9" and python_version < "4.0"
openpyxl==3.0.10 ; python_version >= "3.9" and python_version < "4.0"
packaging==22.0 ; python_version >= "3.9" and python_version < "4.0"
pandas==1.5.2 ; python_version >= "3.9" and python_version < "4.0"
parsedatetime==2.4 ; python_version >= "3.9" and python_version < "4.0"
parso==0.8.3 ; python_version >= "3.9" and python_version < "4.0"
pathspec==0.10.3 ; python_version >= "3.9" and python_version < "4.0"
pexpect==4.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pickleshare==0.7.5 ; python_version >= "3.9" and python_version < "4.0"
platformdirs==2.6.0 ; python_version >= "3.9" and python_version < "4.0"
pluggy==1.0.0 ; python_version >= "3.9" and python_version < "4.0"
pprintpp==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
prompt-toolkit==3.0.36 ; python_version >= "3.9" and python_version < "4.0"
ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform != "win32"
pure-eval==0.2.2 ; python_version >= "3.9" and python_version < "4.0"
pycodestyle==2.9.1 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
pyflakes==2.5.0 ; python_version >= "3.9" and python_version < "4.0"
pygments==2.13.0 ; python_version >= "3.9" and python_version < "4.0"
pytest-clarity==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
pytest==7.2.0 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-slugify==7.0.0 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytimeparse==1.1.8 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.6 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.7 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.1 ; python_version >= "3.9" and python_version < "4"
rich==12.6.0 ; python_version >= "3.9" and python_version < "4.0"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
sqlalchemy==1.4.45 ; python_version >= "3.9" and python_version < "4.0"
stack-data==0.6.2 ; python_version >= "3.9" and python_version < "4.0"
text-unidecode==1.3 ; python_version >= "3.9" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.9" and python_full_version < "3.11.0a7"
traitlets==5.7.1 ; python_version >= "3.9" and python_version < "4.0"
typing-extensions==4.4.0 ; python_version >= "3.9" and python_version < "3.10"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.13 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.9" and python_version < "4"
xlrd==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
agate-dbf==0.2.2 ; python_version >= "3.8" and python_version < "4.0"
agate-excel==0.2.5 ; python_version >= "3.8" and python_version < "4.0"
agate-sql==0.5.8 ; python_version >= "3.8" and python_version < "4.0"
agate==1.6.3 ; python_version >= "3.8" and python_version < "4.0"
appdirs==1.4.4 ; python_version >= "3.8" and python_version < "4.0"
appnope==0.1.3 ; python_version >= "3.8" and python_version < "4.0" and sys_platform == "darwin"
asttokens==2.1.0 ; python_version >= "3.8" and python_version < "4.0"
attrs==22.1.0 ; python_version >= "3.8" and python_version < "4.0"
babel==2.11.0 ; python_version >= "3.8" and python_version < "4.0"
backcall==0.2.0 ; python_version >= "3.8" and python_version < "4.0"
black==22.10.0 ; python_version >= "3.8" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.8" and python_version < "4.0"
certifi==2022.9.24 ; python_version >= "3.8" and python_version < "4"
charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "4"
click==8.1.3 ; python_version >= "3.8" and python_version < "4.0"
colorama==0.4.6 ; python_version >= "3.8" and python_version < "4.0"
commonmark==0.9.1 ; python_version >= "3.8" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.8" and python_version < "4.0"
csvkit==1.0.7 ; python_version >= "3.8" and python_version < "4.0"
dbfread==2.0.7 ; python_version >= "3.8" and python_version < "4.0"
decorator==5.1.1 ; python_version >= "3.8" and python_version < "4.0"
et-xmlfile==1.1.0 ; python_version >= "3.8" and python_version < "4.0"
exceptiongroup==1.0.4 ; python_version >= "3.8" and python_version < "3.11"
executing==1.2.0 ; python_version >= "3.8" and python_version < "4.0"
flake8==5.0.4 ; python_version >= "3.8" and python_version < "4.0"
ftfy==6.1.1 ; python_version >= "3.8" and python_version < "4"
future==0.18.2 ; python_version >= "3.8" and python_version < "4.0"
greenlet==2.0.1 ; python_version >= "3.8" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
idna==3.4 ; python_version >= "3.8" and python_version < "4"
iniconfig==1.1.1 ; python_version >= "3.8" and python_version < "4.0"
ipython==8.7.0 ; python_version >= "3.8" and python_version < "4.0"
isodate==0.6.1 ; python_version >= "3.8" and python_version < "4.0"
isort==5.10.1 ; python_version >= "3.8" and python_version < "4.0"
jedi==0.18.2 ; python_version >= "3.8" and python_version < "4.0"
langid==1.1.6 ; python_version >= "3.8" and python_version < "4.0"
leather==0.3.4 ; python_version >= "3.8" and python_version < "4.0"
matplotlib-inline==0.1.6 ; python_version >= "3.8" and python_version < "4.0"
mccabe==0.7.0 ; python_version >= "3.8" and python_version < "4.0"
mypy-extensions==0.4.3 ; python_version >= "3.8" and python_version < "4.0"
numpy==1.23.5 ; python_version < "4.0" and python_version >= "3.8"
olefile==0.46 ; python_version >= "3.8" and python_version < "4.0"
openpyxl==3.0.10 ; python_version >= "3.8" and python_version < "4.0"
packaging==21.3 ; python_version >= "3.8" and python_version < "4.0"
pandas==1.5.2 ; python_version >= "3.8" and python_version < "4.0"
parsedatetime==2.4 ; python_version >= "3.8" and python_version < "4.0"
parso==0.8.3 ; python_version >= "3.8" and python_version < "4.0"
pathspec==0.10.2 ; python_version >= "3.8" and python_version < "4.0"
pexpect==4.8.0 ; python_version >= "3.8" and python_version < "4.0" and sys_platform != "win32"
pickleshare==0.7.5 ; python_version >= "3.8" and python_version < "4.0"
platformdirs==2.5.4 ; python_version >= "3.8" and python_version < "4.0"
pluggy==1.0.0 ; python_version >= "3.8" and python_version < "4.0"
pprintpp==0.4.0 ; python_version >= "3.8" and python_version < "4.0"
prompt-toolkit==3.0.33 ; python_version >= "3.8" and python_version < "4.0"
ptyprocess==0.7.0 ; python_version >= "3.8" and python_version < "4.0" and sys_platform != "win32"
pure-eval==0.2.2 ; python_version >= "3.8" and python_version < "4.0"
pycodestyle==2.9.1 ; python_version >= "3.8" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.8" and python_version < "4.0"
pyflakes==2.5.0 ; python_version >= "3.8" and python_version < "4.0"
pygments==2.13.0 ; python_version >= "3.8" and python_version < "4.0"
pyparsing==3.0.9 ; python_version >= "3.8" and python_version < "4.0"
pytest-clarity==1.0.1 ; python_version >= "3.8" and python_version < "4.0"
pytest==7.2.0 ; python_version >= "3.8" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "4.0"
python-slugify==7.0.0 ; python_version >= "3.8" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.8" and python_version < "4.0"
pytimeparse==1.1.8 ; python_version >= "3.8" and python_version < "4.0"
pytz==2022.6 ; python_version >= "3.8" and python_version < "4.0"
requests-cache==0.9.7 ; python_version >= "3.8" and python_version < "4.0"
requests==2.28.1 ; python_version >= "3.8" and python_version < "4"
rich==12.6.0 ; python_version >= "3.8" and python_version < "4.0"
six==1.16.0 ; python_version >= "3.8" and python_version < "4.0"
spdx-license-list==0.5.2 ; python_version >= "3.8" and python_version < "4.0"
sqlalchemy==1.4.44 ; python_version >= "3.8" and python_version < "4.0"
stack-data==0.6.2 ; python_version >= "3.8" and python_version < "4.0"
text-unidecode==1.3 ; python_version >= "3.8" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.8" and python_full_version < "3.11.0a7"
traitlets==5.5.0 ; python_version >= "3.8" and python_version < "4.0"
typing-extensions==4.4.0 ; python_version >= "3.8" and python_version < "3.10"
url-normalize==1.4.3 ; python_version >= "3.8" and python_version < "4.0"
urllib3==1.26.13 ; python_version >= "3.8" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "4"
xlrd==2.0.1 ; python_version >= "3.8" and python_version < "4.0"

View File

@ -1,23 +1,24 @@
appdirs==1.4.4 ; python_version >= "3.9" and python_version < "4.0"
attrs==22.1.0 ; python_version >= "3.9" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.9" and python_version < "4.0"
certifi==2022.12.7 ; python_version >= "3.9" and python_version < "4"
charset-normalizer==2.1.1 ; python_version >= "3.9" and python_version < "4"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.9" and python_version < "4.0"
exceptiongroup==1.0.4 ; python_version >= "3.9" and python_version < "3.11"
ftfy==6.1.1 ; python_version >= "3.9" and python_version < "4"
idna==3.4 ; python_version >= "3.9" and python_version < "4"
langid==1.1.6 ; python_version >= "3.9" and python_version < "4.0"
numpy==1.23.5 ; python_version < "4.0" and python_version >= "3.9"
pandas==1.5.2 ; python_version >= "3.9" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.9" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.9" and python_version < "4.0"
pytz==2022.6 ; python_version >= "3.9" and python_version < "4.0"
requests-cache==0.9.7 ; python_version >= "3.9" and python_version < "4.0"
requests==2.28.1 ; python_version >= "3.9" and python_version < "4"
six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
url-normalize==1.4.3 ; python_version >= "3.9" and python_version < "4.0"
urllib3==1.26.13 ; python_version >= "3.9" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.9" and python_version < "4"
appdirs==1.4.4 ; python_version >= "3.8" and python_version < "4.0"
attrs==22.1.0 ; python_version >= "3.8" and python_version < "4.0"
cattrs==22.2.0 ; python_version >= "3.8" and python_version < "4.0"
certifi==2022.9.24 ; python_version >= "3.8" and python_version < "4"
charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "4"
colorama==0.4.6 ; python_version >= "3.8" and python_version < "4.0"
country-converter @ git+https://github.com/alanorth/country_converter.git@myanmar-region ; python_version >= "3.8" and python_version < "4.0"
exceptiongroup==1.0.4 ; python_version >= "3.8" and python_version < "3.11"
ftfy==6.1.1 ; python_version >= "3.8" and python_version < "4"
idna==3.4 ; python_version >= "3.8" and python_version < "4"
langid==1.1.6 ; python_version >= "3.8" and python_version < "4.0"
numpy==1.23.5 ; python_version < "4.0" and python_version >= "3.8"
pandas==1.5.2 ; python_version >= "3.8" and python_version < "4.0"
pycountry @ git+https://github.com/alanorth/pycountry@iso-codes-4.12.0 ; python_version >= "3.8" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "4.0"
python-stdnum==1.18 ; python_version >= "3.8" and python_version < "4.0"
pytz==2022.6 ; python_version >= "3.8" and python_version < "4.0"
requests-cache==0.9.7 ; python_version >= "3.8" and python_version < "4.0"
requests==2.28.1 ; python_version >= "3.8" and python_version < "4"
six==1.16.0 ; python_version >= "3.8" and python_version < "4.0"
spdx-license-list==0.5.2 ; python_version >= "3.8" and python_version < "4.0"
url-normalize==1.4.3 ; python_version >= "3.8" and python_version < "4.0"
urllib3==1.26.13 ; python_version >= "3.8" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "4"