mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-10 23:26:01 +02:00
Compare commits
20 Commits
v0.4.5
...
6cc1401f88
Author | SHA1 | Date | |
---|---|---|---|
6cc1401f88
|
|||
ad2cda8a41
|
|||
dc6920802e
|
|||
6ca449d8ed
|
|||
1554cfd5c9
|
|||
00b8faad6d
|
|||
b19d81abdd
|
|||
a0ea829f5c
|
|||
0089efa914
|
|||
3dbe656f9f
|
|||
7ad821dcad
|
|||
cd876c4fb3
|
|||
d88ea56488
|
|||
e0e3ca6c58
|
|||
abae8ca4fb
|
|||
d7d4d4efca
|
|||
5318953150
|
|||
3b17914002
|
|||
6e4b0e5c1b
|
|||
b16fa9121f
|
@ -9,6 +9,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
@ -25,6 +26,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
@ -41,6 +43,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
|
4
.github/workflows/python-app.yml
vendored
4
.github/workflows/python-app.yml
vendored
@ -16,10 +16,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python 3.8
|
||||
- name: Set up Python 3.9
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.8
|
||||
python-version: 3.9
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
13
CHANGELOG.md
13
CHANGELOG.md
@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.4.6] - 2021-03-11
|
||||
### Added
|
||||
- Validation of dcterms.license field against SPDX license identifiers
|
||||
|
||||
### Changed
|
||||
- Use DCTERMS fields where possible in `data/test.csv`
|
||||
|
||||
### Updated
|
||||
- Run `poetry update` to update project dependencies
|
||||
|
||||
### Fixed
|
||||
- Output for all fixes should be green, because it is good
|
||||
|
||||
## [0.4.5] - 2021-03-04
|
||||
### Added
|
||||
- Check dates in dcterms.issued field as well, not just fields that have the
|
||||
|
@ -1,7 +1,7 @@
|
||||
# DSpace CSV Metadata Quality Checker [Build Status](https://ci.mjanja.ch/alanorth/csv-metadata-quality)
|
||||
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
|
||||
|
||||
Requires Python 3.7 or greater (3.8 recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
|
||||
Requires Python 3.7.1 or greater (3.8+ recommended). CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
|
||||
|
||||
If you use the DSpace CSV metadata quality checker please cite:
|
||||
|
||||
@ -13,6 +13,7 @@ If you use the DSpace CSV metadata quality checker please cite:
|
||||
- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
|
||||
- Experimental validation of titles and abstracts against the item's Dublin Core language field
|
||||
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
|
||||
- Validation of licenses against the list of [SPDX license identifiers](https://spdx.org/licenses)
|
||||
- Fix leading, trailing, and excessive (ie, more than one) whitespace
|
||||
- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes`
|
||||
- Fix problematic newlines (line feeds) using `--unsafe-fixes`
|
||||
@ -103,7 +104,6 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
||||
- Better logging, for example with INFO, WARN, and ERR levels
|
||||
- Verbose, debug, or quiet options
|
||||
- Warn if an author is shorter than 3 characters?
|
||||
- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
|
||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
||||
- Warn if two items use the same file in `filename` column
|
||||
- Add an option to drop invalid AGROVOC subjects?
|
||||
|
@ -150,6 +150,11 @@ def run(argv):
|
||||
if column == "filename":
|
||||
df[column] = df[column].apply(check.filename_extension)
|
||||
|
||||
# Check: SPDX license identifier
|
||||
match = re.match(r"dcterms\.license.*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.spdx_license_identifier)
|
||||
|
||||
##
|
||||
# Perform some checks on rows so we can consider items as a whole rather
|
||||
# than simply on a field-by-field basis. This allows us to check whether
|
||||
|
@ -1,10 +1,14 @@
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
import spdx_license_list
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
from stdnum import issn as stdnum_issn
|
||||
|
||||
|
||||
def issn(field):
|
||||
@ -17,8 +21,6 @@ def issn(field):
|
||||
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
|
||||
"""
|
||||
|
||||
from stdnum import issn
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -26,7 +28,7 @@ def issn(field):
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
||||
if not issn.is_valid(value):
|
||||
if not stdnum_issn.is_valid(value):
|
||||
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
@ -42,8 +44,6 @@ def isbn(field):
|
||||
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
|
||||
"""
|
||||
|
||||
from stdnum import isbn
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -51,7 +51,7 @@ def isbn(field):
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
||||
if not isbn.is_valid(value):
|
||||
if not stdnum_isbn.is_valid(value):
|
||||
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
@ -67,8 +67,6 @@ def separators(field, field_name):
|
||||
Prints the field with the invalid multi-value separator.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -277,8 +275,6 @@ def filename_extension(field):
|
||||
than .pdf, .xls(x), .doc(x), ppt(x), case insensitive).
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -317,3 +313,23 @@ def filename_extension(field):
|
||||
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def spdx_license_identifier(field):
    """Check if a license is a valid SPDX identifier.

    The field is split on the "||" multi-value separator and each value is
    looked up in ``spdx_license_list.LICENSES``. A warning is printed for
    every value that is not found there.

    Returns the field unchanged, or None when the field is missing.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field
|
||||
|
@ -77,7 +77,7 @@ def separators(field, field_name):
|
||||
|
||||
if match:
|
||||
print(
|
||||
f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
||||
f"{Fore.GREEN}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
||||
)
|
||||
|
||||
value = re.sub(pattern, "||", value)
|
||||
|
@ -1 +1 @@
|
||||
VERSION = "0.4.5"
|
||||
VERSION = "0.4.6"
|
||||
|
@ -1,31 +1,32 @@
|
||||
dc.title,dc.date.issued,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
|
||||
Leading space,2019-07-29,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
|
||||
Invalid date,2019-07-260,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,
|
||||
Invalid language,2019-07-29,,,Span,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
|
||||
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license
|
||||
Leading space,2019-07-29,,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,
|
||||
Invalid date,2019-07-260,,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,
|
||||
Invalid language,2019-07-29,,,Span,,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,
|
||||
Newline (LF),2019-07-30,,,,"TANZA
|
||||
NIA",,
|
||||
Missing date,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,
|
||||
NIA",,,
|
||||
Missing date,,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,
|
||||
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY
|
||||
|
|
56
poetry.lock
generated
56
poetry.lock
generated
@ -1,6 +1,6 @@
|
||||
[[package]]
|
||||
name = "agate"
|
||||
version = "1.6.1"
|
||||
version = "1.6.2"
|
||||
description = "A data analysis library that is optimized for humans instead of machines."
|
||||
category = "dev"
|
||||
optional = false
|
||||
@ -11,6 +11,7 @@ Babel = ">=2.0"
|
||||
isodate = ">=0.5.4"
|
||||
leather = ">=0.3.2"
|
||||
parsedatetime = ">=2.1"
|
||||
PyICU = ">=2.4.2"
|
||||
python-slugify = ">=1.2.1"
|
||||
pytimeparse = ">=1.1.5"
|
||||
six = ">=1.9.0"
|
||||
@ -294,14 +295,6 @@ pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
|
||||
requirements_deprecated_finder = ["pipreqs", "pip-api"]
|
||||
colors = ["colorama (>=0.4.3,<0.5.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "jdcal"
|
||||
version = "1.4.1"
|
||||
description = "Julian dates from proleptic Gregorian and Julian calendars."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "jedi"
|
||||
version = "0.18.0"
|
||||
@ -365,7 +358,7 @@ python-versions = ">=3.7"
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.0.6"
|
||||
version = "3.0.7"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
category = "dev"
|
||||
optional = false
|
||||
@ -373,7 +366,6 @@ python-versions = ">=3.6,"
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
jdcal = "*"
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
@ -513,12 +505,20 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.8.0"
|
||||
version = "2.8.1"
|
||||
description = "Pygments is a syntax highlighting package written in Python."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
|
||||
[[package]]
|
||||
name = "pyicu"
|
||||
version = "2.6"
|
||||
description = "Python extension wrapping the ICU C++ API"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pyparsing"
|
||||
version = "2.4.7"
|
||||
@ -659,6 +659,14 @@ category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
|
||||
[[package]]
|
||||
name = "spdx-license-list"
|
||||
version = "0.5.2"
|
||||
description = "A simple tool/library for working with SPDX license definitions."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "sqlalchemy"
|
||||
version = "1.3.23"
|
||||
@ -765,12 +773,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.8"
|
||||
content-hash = "8c4ba410bbdc930d2d74f7864470a18827029a5697869833959708d7425460ae"
|
||||
content-hash = "6a9ee0f26b50f361d7e0e6a2275f0e3174dee1c89fbd460583c4ea3d873857b8"
|
||||
|
||||
[metadata.files]
|
||||
agate = [
|
||||
{file = "agate-1.6.1-py2.py3-none-any.whl", hash = "sha256:48d6f80b35611c1ba25a642cbc5b90fcbdeeb2a54711c4a8d062ee2809334d1c"},
|
||||
{file = "agate-1.6.1.tar.gz", hash = "sha256:c93aaa500b439d71e4a5cf088d0006d2ce2c76f1950960c8843114e5f361dfd3"},
|
||||
{file = "agate-1.6.2.tar.gz", hash = "sha256:8dbd4a57a2cffecfa2d8109ef5993ec4be12a8a7c81fbc0c8c79d96d4c4399ed"},
|
||||
]
|
||||
agate-dbf = [
|
||||
{file = "agate-dbf-0.2.2.tar.gz", hash = "sha256:589682b78c5c03f2dc8511e6e3edb659fb7336cd118e248896bb0b44c2f1917b"},
|
||||
@ -866,10 +873,6 @@ isort = [
|
||||
{file = "isort-5.7.0-py3-none-any.whl", hash = "sha256:fff4f0c04e1825522ce6949973e83110a6e907750cd92d128b0d14aaaadbffdc"},
|
||||
{file = "isort-5.7.0.tar.gz", hash = "sha256:c729845434366216d320e936b8ad6f9d681aab72dc7cbc2d51bedc3582f3ad1e"},
|
||||
]
|
||||
jdcal = [
|
||||
{file = "jdcal-1.4.1-py2.py3-none-any.whl", hash = "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba"},
|
||||
{file = "jdcal-1.4.1.tar.gz", hash = "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"},
|
||||
]
|
||||
jedi = [
|
||||
{file = "jedi-0.18.0-py2.py3-none-any.whl", hash = "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93"},
|
||||
{file = "jedi-0.18.0.tar.gz", hash = "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"},
|
||||
@ -916,8 +919,8 @@ numpy = [
|
||||
{file = "numpy-1.20.1.zip", hash = "sha256:3bc63486a870294683980d76ec1e3efc786295ae00128f9ea38e2c6e74d5a60a"},
|
||||
]
|
||||
openpyxl = [
|
||||
{file = "openpyxl-3.0.6-py2.py3-none-any.whl", hash = "sha256:1a4b3869c2500b5c713e8e28341cdada49ecfcff1b10cd9006945f5bcefc090d"},
|
||||
{file = "openpyxl-3.0.6.tar.gz", hash = "sha256:b229112b46e158b910a5d1b270b212c42773d39cab24e8db527f775b82afc041"},
|
||||
{file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"},
|
||||
{file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"},
|
||||
]
|
||||
packaging = [
|
||||
{file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
|
||||
@ -989,8 +992,11 @@ pyflakes = [
|
||||
{file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"},
|
||||
]
|
||||
pygments = [
|
||||
{file = "Pygments-2.8.0-py3-none-any.whl", hash = "sha256:b21b072d0ccdf29297a82a2363359d99623597b8a265b8081760e4d0f7153c88"},
|
||||
{file = "Pygments-2.8.0.tar.gz", hash = "sha256:37a13ba168a02ac54cc5891a42b1caec333e59b66addb7fa633ea8a6d73445c0"},
|
||||
{file = "Pygments-2.8.1-py3-none-any.whl", hash = "sha256:534ef71d539ae97d4c3a4cf7d6f110f214b0e687e92f9cb9d2a3b0d3101289c8"},
|
||||
{file = "Pygments-2.8.1.tar.gz", hash = "sha256:2656e1a6edcdabf4275f9a3640db59fd5de107d88e8663c5d4e9a0fa62f77f94"},
|
||||
]
|
||||
pyicu = [
|
||||
{file = "PyICU-2.6.tar.gz", hash = "sha256:a9a5bf6833360f8f69e9375b91c1a7dd6e0c9157a42aee5bb7d6891804d96371"},
|
||||
]
|
||||
pyparsing = [
|
||||
{file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
|
||||
@ -1077,6 +1083,10 @@ six = [
|
||||
{file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
|
||||
{file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
|
||||
]
|
||||
spdx-license-list = [
|
||||
{file = "spdx_license_list-0.5.2-py3-none-any.whl", hash = "sha256:1b338470c7b403dbecceca563a316382c7977516128ca6c1e8f7078e3ed6e7b0"},
|
||||
{file = "spdx_license_list-0.5.2.tar.gz", hash = "sha256:952996f72ab807972dc2278bb9b91e5294767211e51f09aad9c0e2ff5b82a31b"},
|
||||
]
|
||||
sqlalchemy = [
|
||||
{file = "SQLAlchemy-1.3.23-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:fd3b96f8c705af8e938eaa99cbd8fd1450f632d38cad55e7367c33b263bf98ec"},
|
||||
{file = "SQLAlchemy-1.3.23-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:29cccc9606750fe10c5d0e8bd847f17a97f3850b8682aef1f56f5d5e1a5a64b1"},
|
||||
|
@ -1,14 +1,17 @@
|
||||
[tool.poetry]
|
||||
name = "csv-metadata-quality"
|
||||
version = "0.4.5"
|
||||
version = "0.4.6"
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
||||
license="GPL-3.0-only"
|
||||
repository = "https://github.com/ilri/csv-metadata-quality"
|
||||
homepage = "https://github.com/ilri/csv-metadata-quality"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
python = "^3.7.1"
|
||||
pandas = "^1.0.4"
|
||||
python-stdnum = "^1.13"
|
||||
xlrd = "^1.2.0"
|
||||
@ -17,6 +20,7 @@ requests-cache = "^0.5.2"
|
||||
pycountry = "^19.8.18"
|
||||
langid = "^1.1.6"
|
||||
colorama = "^0.4.4"
|
||||
spdx-license-list = "^0.5.2"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^6.1.1"
|
||||
|
@ -1,7 +1,7 @@
|
||||
agate-dbf==0.2.2
|
||||
agate-excel==0.2.3
|
||||
agate-sql==0.5.5
|
||||
agate==1.6.1
|
||||
agate==1.6.2
|
||||
appdirs==1.4.4; python_version >= "3.6"
|
||||
appnope==0.1.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "darwin"
|
||||
atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6") or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6")
|
||||
@ -24,14 +24,13 @@ ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0"
|
||||
ipython==7.21.0; python_version >= "3.7" and python_version < "4.0"
|
||||
isodate==0.6.0
|
||||
isort==5.7.0; python_version >= "3.6" and python_version < "4.0"
|
||||
jdcal==1.4.1; python_version >= "3.6"
|
||||
jedi==0.18.0; python_version >= "3.7" and python_version < "4.0"
|
||||
langid==1.1.6
|
||||
leather==0.3.3
|
||||
mccabe==0.6.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
mypy-extensions==0.4.3; python_version >= "3.6"
|
||||
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
openpyxl==3.0.6; python_version >= "3.6"
|
||||
openpyxl==3.0.7; python_version >= "3.6"
|
||||
packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pandas==1.2.3; python_full_version >= "3.7.1"
|
||||
parsedatetime==2.6
|
||||
@ -46,7 +45,8 @@ py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_
|
||||
pycodestyle==2.6.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
pycountry==19.8.18
|
||||
pyflakes==2.2.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
pygments==2.8.0; python_version >= "3.7" and python_version < "4.0"
|
||||
pygments==2.8.1; python_version >= "3.7" and python_version < "4.0"
|
||||
pyicu==2.6
|
||||
pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pytest-clarity==0.3.0a0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
|
||||
pytest==6.2.2; python_version >= "3.6"
|
||||
@ -59,6 +59,7 @@ regex==2020.11.13; python_version >= "3.6"
|
||||
requests-cache==0.5.2
|
||||
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
six==1.15.0; python_full_version >= "3.7.1"
|
||||
spdx-license-list==0.5.2
|
||||
sqlalchemy==1.3.23; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
termcolor==1.1.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
text-unidecode==1.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
|
@ -12,5 +12,6 @@ pytz==2021.1; python_full_version >= "3.7.1"
|
||||
requests-cache==0.5.2
|
||||
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
six==1.15.0; python_full_version >= "3.7.1"
|
||||
spdx-license-list==0.5.2
|
||||
urllib3==1.26.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version < "4"
|
||||
xlrd==1.2.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
|
||||
|
2
setup.py
2
setup.py
@ -14,7 +14,7 @@ install_requires = [
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.4.5",
|
||||
version="0.4.6",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
|
@ -224,7 +224,7 @@ def test_check_invalid_agrovoc(capsys):
|
||||
"""Test invalid AGROVOC subject."""
|
||||
|
||||
value = "FOREST"
|
||||
field_name = "dc.subject"
|
||||
field_name = "dcterms.subject"
|
||||
|
||||
check.agrovoc(value, field_name)
|
||||
|
||||
@ -239,7 +239,7 @@ def test_check_valid_agrovoc():
|
||||
"""Test valid AGROVOC subject."""
|
||||
|
||||
value = "FORESTS"
|
||||
field_name = "dc.subject"
|
||||
field_name = "dcterms.subject"
|
||||
|
||||
result = check.agrovoc(value, field_name)
|
||||
|
||||
@ -336,3 +336,27 @@ def test_check_correct_iso_639_3_language():
|
||||
result = experimental.correct_language(series)
|
||||
|
||||
assert result == language
|
||||
|
||||
|
||||
def test_check_valid_spdx_license_identifier():
    """A valid SPDX license identifier is returned unchanged by the check."""

    spdx_id = "CC-BY-SA-4.0"

    assert check.spdx_license_identifier(spdx_id) == spdx_id
|
||||
|
||||
|
||||
def test_check_invalid_spdx_license_identifier(capsys):
    """Test invalid SPDX license identifier.

    The check is expected to print a yellow warning naming the offending
    value; the captured stdout is compared against that exact message.
    """

    license_id = "CC-BY-SA"

    # Only the printed warning is asserted on, so the return value is discarded
    check.spdx_license_identifier(license_id)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license_id}\n"
    )
|
||||
|
Reference in New Issue
Block a user