mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-10 07:06:00 +02:00
Compare commits
28 Commits
a7fc5a246c
...
v0.4.6
Author | SHA1 | Date | |
---|---|---|---|
1554cfd5c9
|
|||
00b8faad6d
|
|||
b19d81abdd
|
|||
a0ea829f5c
|
|||
0089efa914
|
|||
3dbe656f9f
|
|||
7ad821dcad
|
|||
cd876c4fb3
|
|||
d88ea56488
|
|||
e0e3ca6c58
|
|||
abae8ca4fb
|
|||
d7d4d4efca
|
|||
5318953150
|
|||
3b17914002
|
|||
6e4b0e5c1b
|
|||
b16fa9121f
|
|||
202bda862a
|
|||
7479310ac0
|
|||
98a91bc9c2
|
|||
fc5bedcc5c
|
|||
44d12d771a
|
|||
4a7000e975
|
|||
27b2d81ca8
|
|||
91ebd0f606
|
|||
dd2cfae047
|
|||
d76e72532a
|
|||
13980d2dde
|
|||
9aaaa62461
|
@ -9,6 +9,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
@ -25,6 +26,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
@ -41,6 +43,7 @@ steps:
|
||||
commands:
|
||||
- id
|
||||
- python -V
|
||||
- apt update && apt install -y gcc g++ libicu-dev pkg-config
|
||||
- pip install -r requirements-dev.txt
|
||||
- pytest
|
||||
- python setup.py install
|
||||
|
25
CHANGELOG.md
25
CHANGELOG.md
@ -4,10 +4,33 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Unreleased
|
||||
## [0.4.6] - 2021-03-11
|
||||
### Added
|
||||
- Validation of dcterms.license field against SPDX license identifiers
|
||||
|
||||
### Changed
|
||||
- Use DCTERMS fields where possible in `data/test.csv`
|
||||
|
||||
### Updated
|
||||
- Run `poetry update` to update project dependencies
|
||||
|
||||
### Fixed
|
||||
- Output for all fixes should be green, because it is good
|
||||
|
||||
## [0.4.5] - 2021-03-04
|
||||
### Added
|
||||
- Check dates in dcterms.issued field as well, not just fields that have the
|
||||
word "date" in them
|
||||
|
||||
### Updated
|
||||
- Run `poetry update` to update project dependencies
|
||||
|
||||
## [0.4.4] - 2021-02-21
|
||||
### Added
|
||||
- Accept dates formatted in ISO 8601 extended with combined date and time, for
|
||||
example: 2020-08-31T11:04:56Z
|
||||
- Colorized output: red for errors, yellow for warnings and information, green
|
||||
for changes
|
||||
|
||||
### Updated
|
||||
- Run `poetry update` to update project dependencies
|
||||
|
10
README.md
10
README.md
@ -103,14 +103,18 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
||||
- Better logging, for example with INFO, WARN, and ERR levels
|
||||
- Verbose, debug, or quiet options
|
||||
- Warn if an author is shorter than 3 characters?
|
||||
- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
|
||||
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
|
||||
- Warn if two items use the same file in `filename` column
|
||||
- Add an option to drop invalid AGROVOC subjects?
|
||||
- Add tests for application invocation, i.e. `tests/test_app.py`?
|
||||
- Validate ISSNs or journal titles against CrossRef API?
|
||||
- Better ISO 8601 date parsing (currently only supports simple dates, perhaps we need to use dateutil.parser.isoparse())
|
||||
- Fix lazy date check (assumes field name has "date" but could be dcterms.issued etc!)
|
||||
- Add configurable field validation, like specify a field name and a validation file?
|
||||
- Perhaps like --validate=field.name,filename
|
||||
- Add some row-based item sanity checks and fixes:
|
||||
- Warn if item is Open Access, but missing a filename or URL
|
||||
- Warn if item is Open Access, but missing a license
|
||||
- Warn if item has an ISSN but no journal title
|
||||
- Update journal titles from ISSN
|
||||
|
||||
## License
|
||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||
|
@ -142,7 +142,7 @@ def run(argv):
|
||||
df[column] = df[column].apply(check.isbn)
|
||||
|
||||
# Check: invalid date
|
||||
match = re.match(r"^.*?date.*$", column)
|
||||
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.date, field_name=column)
|
||||
|
||||
@ -150,6 +150,11 @@ def run(argv):
|
||||
if column == "filename":
|
||||
df[column] = df[column].apply(check.filename_extension)
|
||||
|
||||
# Check: SPDX license identifier
|
||||
match = re.match(r"dcterms\.license.*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.spdx_license_identifier)
|
||||
|
||||
##
|
||||
# Perform some checks on rows so we can consider items as a whole rather
|
||||
# than simply on a field-by-field basis. This allows us to check whether
|
||||
|
@ -1,10 +1,14 @@
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
import spdx_license_list
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
from stdnum import issn as stdnum_issn
|
||||
|
||||
|
||||
def issn(field):
|
||||
@ -17,8 +21,6 @@ def issn(field):
|
||||
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
|
||||
"""
|
||||
|
||||
from stdnum import issn
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -26,7 +28,7 @@ def issn(field):
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
||||
if not issn.is_valid(value):
|
||||
if not stdnum_issn.is_valid(value):
|
||||
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
@ -42,8 +44,6 @@ def isbn(field):
|
||||
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
|
||||
"""
|
||||
|
||||
from stdnum import isbn
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -51,7 +51,7 @@ def isbn(field):
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
|
||||
if not isbn.is_valid(value):
|
||||
if not stdnum_isbn.is_valid(value):
|
||||
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
@ -67,8 +67,6 @@ def separators(field, field_name):
|
||||
Prints the field with the invalid multi-value separator.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -277,8 +275,6 @@ def filename_extension(field):
|
||||
than .pdf, .xls(x), .doc(x), ppt(x), case insensitive).
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
@ -317,3 +313,23 @@ def filename_extension(field):
|
||||
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def spdx_license_identifier(field):
    """Report license values that are not known SPDX identifiers.

    The field is split on the "||" multi-value separator and every resulting
    value is looked up in ``spdx_license_list.LICENSES``. Each value that is
    not found there is printed as a warning.

    Returns the field unchanged, or None when the field is a missing value.
    """

    # Nothing to validate when the cell is empty
    if pd.isna(field):
        return

    # A single cell may carry several licenses joined by "||"
    candidates = field.split("||")
    for value in candidates:
        if value in spdx_license_list.LICENSES:
            continue

        print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field
|
||||
|
@ -77,7 +77,7 @@ def separators(field, field_name):
|
||||
|
||||
if match:
|
||||
print(
|
||||
f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
||||
f"{Fore.GREEN}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
|
||||
)
|
||||
|
||||
value = re.sub(pattern, "||", value)
|
||||
|
@ -1 +1 @@
|
||||
VERSION = "0.4.3"
|
||||
VERSION = "0.4.6"
|
||||
|
@ -1,31 +1,32 @@
|
||||
dc.title,dc.date.issued,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
|
||||
Leading space,2019-07-29,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
|
||||
Invalid date,2019-07-260,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,
|
||||
Invalid language,2019-07-29,,,Span,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
|
||||
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license
|
||||
Leading space,2019-07-29,,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,
|
||||
Invalid date,2019-07-260,,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,,
|
||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,
|
||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,
|
||||
Invalid language,2019-07-29,,,Span,,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,
|
||||
Newline (LF),2019-07-30,,,,"TANZA
|
||||
NIA",,
|
||||
Missing date,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,
|
||||
NIA",,,
|
||||
Missing date,,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,,
|
||||
"Missing space,after comma",2019-08-27,,,,,,,
|
||||
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,
|
||||
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,
|
||||
Composéd Unicode,2020-01-14,,,,,,,
|
||||
Decomposéd Unicode,2020-01-14,,,,,,,
|
||||
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,
|
||||
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY
|
||||
|
|
98
poetry.lock
generated
98
poetry.lock
generated
@ -1,6 +1,6 @@
|
||||
[[package]]
|
||||
name = "agate"
|
||||
version = "1.6.1"
|
||||
version = "1.6.2"
|
||||
description = "A data analysis library that is optimized for humans instead of machines."
|
||||
category = "dev"
|
||||
optional = false
|
||||
@ -11,6 +11,7 @@ Babel = ">=2.0"
|
||||
isodate = ">=0.5.4"
|
||||
leather = ">=0.3.2"
|
||||
parsedatetime = ">=2.1"
|
||||
PyICU = ">=2.4.2"
|
||||
python-slugify = ">=1.2.1"
|
||||
pytimeparse = ">=1.1.5"
|
||||
six = ">=1.9.0"
|
||||
@ -233,7 +234,7 @@ python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "ipython"
|
||||
version = "7.20.0"
|
||||
version = "7.21.0"
|
||||
description = "IPython: Productive Interactive Computing"
|
||||
category = "dev"
|
||||
optional = false
|
||||
@ -294,14 +295,6 @@ pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
|
||||
requirements_deprecated_finder = ["pipreqs", "pip-api"]
|
||||
colors = ["colorama (>=0.4.3,<0.5.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "jdcal"
|
||||
version = "1.4.1"
|
||||
description = "Julian dates from proleptic Gregorian and Julian calendars."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "jedi"
|
||||
version = "0.18.0"
|
||||
@ -365,7 +358,7 @@ python-versions = ">=3.7"
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.0.6"
|
||||
version = "3.0.7"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
category = "dev"
|
||||
optional = false
|
||||
@ -373,7 +366,6 @@ python-versions = ">=3.6,"
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
jdcal = "*"
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
@ -388,7 +380,7 @@ pyparsing = ">=2.0.2"
|
||||
|
||||
[[package]]
|
||||
name = "pandas"
|
||||
version = "1.2.2"
|
||||
version = "1.2.3"
|
||||
description = "Powerful data structures for data analysis, time series, and statistics"
|
||||
category = "main"
|
||||
optional = false
|
||||
@ -513,12 +505,20 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.8.0"
|
||||
version = "2.8.1"
|
||||
description = "Pygments is a syntax highlighting package written in Python."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
|
||||
[[package]]
|
||||
name = "pyicu"
|
||||
version = "2.6"
|
||||
description = "Python extension wrapping the ICU C++ API"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pyparsing"
|
||||
version = "2.4.7"
|
||||
@ -659,6 +659,14 @@ category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
|
||||
|
||||
[[package]]
|
||||
name = "spdx-license-list"
|
||||
version = "0.5.2"
|
||||
description = "A simple tool/library for working with SPDX license definitions."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "sqlalchemy"
|
||||
version = "1.3.23"
|
||||
@ -765,12 +773,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = "^3.8"
|
||||
content-hash = "8c4ba410bbdc930d2d74f7864470a18827029a5697869833959708d7425460ae"
|
||||
content-hash = "6a9ee0f26b50f361d7e0e6a2275f0e3174dee1c89fbd460583c4ea3d873857b8"
|
||||
|
||||
[metadata.files]
|
||||
agate = [
|
||||
{file = "agate-1.6.1-py2.py3-none-any.whl", hash = "sha256:48d6f80b35611c1ba25a642cbc5b90fcbdeeb2a54711c4a8d062ee2809334d1c"},
|
||||
{file = "agate-1.6.1.tar.gz", hash = "sha256:c93aaa500b439d71e4a5cf088d0006d2ce2c76f1950960c8843114e5f361dfd3"},
|
||||
{file = "agate-1.6.2.tar.gz", hash = "sha256:8dbd4a57a2cffecfa2d8109ef5993ec4be12a8a7c81fbc0c8c79d96d4c4399ed"},
|
||||
]
|
||||
agate-dbf = [
|
||||
{file = "agate-dbf-0.2.2.tar.gz", hash = "sha256:589682b78c5c03f2dc8511e6e3edb659fb7336cd118e248896bb0b44c2f1917b"},
|
||||
@ -851,8 +858,8 @@ iniconfig = [
|
||||
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
|
||||
]
|
||||
ipython = [
|
||||
{file = "ipython-7.20.0-py3-none-any.whl", hash = "sha256:1918dea4bfdc5d1a830fcfce9a710d1d809cbed123e85eab0539259cb0f56640"},
|
||||
{file = "ipython-7.20.0.tar.gz", hash = "sha256:1923af00820a8cf58e91d56b89efc59780a6e81363b94464a0f17c039dffff9e"},
|
||||
{file = "ipython-7.21.0-py3-none-any.whl", hash = "sha256:34207ffb2f653bced2bc8e3756c1db86e7d93e44ed049daae9814fed66d408ec"},
|
||||
{file = "ipython-7.21.0.tar.gz", hash = "sha256:04323f72d5b85b606330b6d7e2dc8d2683ad46c3905e955aa96ecc7a99388e70"},
|
||||
]
|
||||
ipython-genutils = [
|
||||
{file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"},
|
||||
@ -866,10 +873,6 @@ isort = [
|
||||
{file = "isort-5.7.0-py3-none-any.whl", hash = "sha256:fff4f0c04e1825522ce6949973e83110a6e907750cd92d128b0d14aaaadbffdc"},
|
||||
{file = "isort-5.7.0.tar.gz", hash = "sha256:c729845434366216d320e936b8ad6f9d681aab72dc7cbc2d51bedc3582f3ad1e"},
|
||||
]
|
||||
jdcal = [
|
||||
{file = "jdcal-1.4.1-py2.py3-none-any.whl", hash = "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba"},
|
||||
{file = "jdcal-1.4.1.tar.gz", hash = "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"},
|
||||
]
|
||||
jedi = [
|
||||
{file = "jedi-0.18.0-py2.py3-none-any.whl", hash = "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93"},
|
||||
{file = "jedi-0.18.0.tar.gz", hash = "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"},
|
||||
@ -916,32 +919,30 @@ numpy = [
|
||||
{file = "numpy-1.20.1.zip", hash = "sha256:3bc63486a870294683980d76ec1e3efc786295ae00128f9ea38e2c6e74d5a60a"},
|
||||
]
|
||||
openpyxl = [
|
||||
{file = "openpyxl-3.0.6-py2.py3-none-any.whl", hash = "sha256:1a4b3869c2500b5c713e8e28341cdada49ecfcff1b10cd9006945f5bcefc090d"},
|
||||
{file = "openpyxl-3.0.6.tar.gz", hash = "sha256:b229112b46e158b910a5d1b270b212c42773d39cab24e8db527f775b82afc041"},
|
||||
{file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"},
|
||||
{file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"},
|
||||
]
|
||||
packaging = [
|
||||
{file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
|
||||
{file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
|
||||
]
|
||||
pandas = [
|
||||
{file = "pandas-1.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c76a108272a4de63189b8f64086bbaf8348841d7e610b52f50959fbbf401524f"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e61a089151f1ed78682aa77a3bcae0495cf8e585546c26924857d7e8a9960568"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fc351cd2df318674669481eb978a7799f24fd14ef26987a1aa75105b0531d1a1"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-win32.whl", hash = "sha256:05ca6bda50123158eb15e716789083ca4c3b874fd47688df1716daa72644ee1c"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:08b6bbe74ae2b3e4741a744d2bce35ce0868a6b4189d8b84be26bb334f73da4c"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:230de25bd9791748b2638c726a5f37d77a96a83854710110fadd068d1e2c2c9f"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:a50cf3110a1914442e7b7b9cef394ef6bed0d801b8a34d56f4c4e927bbbcc7d0"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4d33537a375cfb2db4d388f9a929b6582a364137ea6c6b161b0166440d6ffe36"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8ac028cd9a6e1efe43f3dc36f708263838283535cc45430a98b9803f44f4c84b"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-win32.whl", hash = "sha256:c43d1beb098a1da15934262009a7120aac8dafa20d042b31dab48c28868eb5a4"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:69a70d79a791fa1fd5f6e84b8b6dec2ec92369bde4ab2e18d43fc8a1825f51d1"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cbad4155028b8ca66aa19a8b13f593ebbf51bfb6c3f2685fe64f04d695a81864"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:fbddbb20f30308ba2546193d64e18c23b69f59d48cdef73676cbed803495c8dc"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:214ae60b1f863844e97c87f758c29940ffad96c666257323a4bb2a33c58719c2"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:26b4919eb3039a686a86cd4f4a74224f8f66e3a419767da26909dcdd3b37c31e"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-win32.whl", hash = "sha256:e3c250faaf9979d0ec836d25e420428db37783fa5fed218da49c9fc06f80f51c"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:e9bbcc7b5c432600797981706f5b54611990c6a86b2e424329c995eea5f9c42b"},
|
||||
{file = "pandas-1.2.2.tar.gz", hash = "sha256:14ed84b463e9b84c8ff9308a79b04bf591ae3122a376ee0f62c68a1bd917a773"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4d821b9b911fc1b7d428978d04ace33f0af32bb7549525c8a7b08444bce46b74"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9f5829e64507ad10e2561b60baf285c470f3c4454b007c860e77849b88865ae7"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:97b1954533b2a74c7e20d1342c4f01311d3203b48f2ebf651891e6a6eaf01104"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-win32.whl", hash = "sha256:5e3c8c60541396110586bcbe6eccdc335a38e7de8c217060edaf4722260b158f"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8a051e957c5206f722e83f295f95a2cf053e890f9a1fba0065780a8c2d045f5d"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a93e34f10f67d81de706ce00bf8bb3798403cabce4ccb2de10c61b5ae8786ab5"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:46fc671c542a8392a4f4c13edc8527e3a10f6cb62912d856f82248feb747f06e"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:43e00770552595c2250d8d712ec8b6e08ca73089ac823122344f023efa4abea3"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-win32.whl", hash = "sha256:475b7772b6e18a93a43ea83517932deff33954a10d4fbae18d0c1aba4182310f"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:72ffcea00ae8ffcdbdefff800284311e155fbb5ed6758f1a6110fc1f8f8f0c1c"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:621c044a1b5e535cf7dcb3ab39fca6f867095c3ef223a524f18f60c7fee028ea"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0f27fd1adfa256388dc34895ca5437eaf254832223812afd817a6f73127f969c"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:dbb255975eb94143f2e6ec7dadda671d25147939047839cd6b8a4aff0379bb9b"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-win32.whl", hash = "sha256:d59842a5aa89ca03c2099312163ffdd06f56486050e641a45d926a072f04d994"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:09761bf5f8c741d47d4b8b9073288de1be39bbfccc281d70b889ade12b2aad29"},
|
||||
{file = "pandas-1.2.3.tar.gz", hash = "sha256:df6f10b85aef7a5bb25259ad651ad1cc1d6bb09000595cab47e718cbac250b1d"},
|
||||
]
|
||||
parsedatetime = [
|
||||
{file = "parsedatetime-2.6-py3-none-any.whl", hash = "sha256:cb96edd7016872f58479e35879294258c71437195760746faffedb692aef000b"},
|
||||
@ -991,8 +992,11 @@ pyflakes = [
|
||||
{file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"},
|
||||
]
|
||||
pygments = [
|
||||
{file = "Pygments-2.8.0-py3-none-any.whl", hash = "sha256:b21b072d0ccdf29297a82a2363359d99623597b8a265b8081760e4d0f7153c88"},
|
||||
{file = "Pygments-2.8.0.tar.gz", hash = "sha256:37a13ba168a02ac54cc5891a42b1caec333e59b66addb7fa633ea8a6d73445c0"},
|
||||
{file = "Pygments-2.8.1-py3-none-any.whl", hash = "sha256:534ef71d539ae97d4c3a4cf7d6f110f214b0e687e92f9cb9d2a3b0d3101289c8"},
|
||||
{file = "Pygments-2.8.1.tar.gz", hash = "sha256:2656e1a6edcdabf4275f9a3640db59fd5de107d88e8663c5d4e9a0fa62f77f94"},
|
||||
]
|
||||
pyicu = [
|
||||
{file = "PyICU-2.6.tar.gz", hash = "sha256:a9a5bf6833360f8f69e9375b91c1a7dd6e0c9157a42aee5bb7d6891804d96371"},
|
||||
]
|
||||
pyparsing = [
|
||||
{file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
|
||||
@ -1079,6 +1083,10 @@ six = [
|
||||
{file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
|
||||
{file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
|
||||
]
|
||||
spdx-license-list = [
|
||||
{file = "spdx_license_list-0.5.2-py3-none-any.whl", hash = "sha256:1b338470c7b403dbecceca563a316382c7977516128ca6c1e8f7078e3ed6e7b0"},
|
||||
{file = "spdx_license_list-0.5.2.tar.gz", hash = "sha256:952996f72ab807972dc2278bb9b91e5294767211e51f09aad9c0e2ff5b82a31b"},
|
||||
]
|
||||
sqlalchemy = [
|
||||
{file = "SQLAlchemy-1.3.23-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:fd3b96f8c705af8e938eaa99cbd8fd1450f632d38cad55e7367c33b263bf98ec"},
|
||||
{file = "SQLAlchemy-1.3.23-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:29cccc9606750fe10c5d0e8bd847f17a97f3850b8682aef1f56f5d5e1a5a64b1"},
|
||||
|
@ -1,12 +1,15 @@
|
||||
[tool.poetry]
|
||||
name = "csv-metadata-quality"
|
||||
version = "0.4.3"
|
||||
version = "0.4.6"
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
||||
license="GPL-3.0-only"
|
||||
repository = "https://github.com/ilri/csv-metadata-quality"
|
||||
homepage = "https://github.com/ilri/csv-metadata-quality"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
pandas = "^1.0.4"
|
||||
@ -17,6 +20,7 @@ requests-cache = "^0.5.2"
|
||||
pycountry = "^19.8.18"
|
||||
langid = "^1.1.6"
|
||||
colorama = "^0.4.4"
|
||||
spdx-license-list = "^0.5.2"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^6.1.1"
|
||||
|
@ -1,7 +1,7 @@
|
||||
agate-dbf==0.2.2
|
||||
agate-excel==0.2.3
|
||||
agate-sql==0.5.5
|
||||
agate==1.6.1
|
||||
agate==1.6.2
|
||||
appdirs==1.4.4; python_version >= "3.6"
|
||||
appnope==0.1.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "darwin"
|
||||
atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6") or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6")
|
||||
@ -12,7 +12,7 @@ black==20.8b1; python_version >= "3.6"
|
||||
certifi==2020.12.5; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
chardet==4.0.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
click==7.1.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6"
|
||||
colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and sys_platform == "win32" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6") or sys_platform == "win32" and python_version >= "3.7" and python_full_version >= "3.5.0" and python_version < "4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6")
|
||||
colorama==0.4.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
csvkit==1.0.5
|
||||
dbfread==2.0.7
|
||||
decorator==4.4.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.2.0"
|
||||
@ -21,44 +21,45 @@ flake8==3.8.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (p
|
||||
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
iniconfig==1.1.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0"
|
||||
ipython==7.20.0; python_version >= "3.7" and python_version < "4.0"
|
||||
ipython==7.21.0; python_version >= "3.7" and python_version < "4.0"
|
||||
isodate==0.6.0
|
||||
isort==5.7.0; python_version >= "3.6" and python_version < "4.0"
|
||||
jdcal==1.4.1; python_version >= "3.6"
|
||||
jedi==0.18.0; python_version >= "3.7" and python_version < "4.0"
|
||||
langid==1.1.6
|
||||
leather==0.3.3
|
||||
mccabe==0.6.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
mypy-extensions==0.4.3; python_version >= "3.6"
|
||||
numpy==1.20.0; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
openpyxl==3.0.6; python_version >= "3.6"
|
||||
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
openpyxl==3.0.7; python_version >= "3.6"
|
||||
packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pandas==1.2.1; python_full_version >= "3.7.1"
|
||||
pandas==1.2.3; python_full_version >= "3.7.1"
|
||||
parsedatetime==2.6
|
||||
parso==0.8.1; python_version >= "3.7" and python_version < "4.0"
|
||||
pathspec==0.8.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6"
|
||||
pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" and sys_platform != "win32"
|
||||
pickleshare==0.7.5; python_version >= "3.7" and python_version < "4.0"
|
||||
pluggy==0.13.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
prompt-toolkit==3.0.14; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1"
|
||||
prompt-toolkit==3.0.16; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1"
|
||||
ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" and sys_platform != "win32"
|
||||
py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pycodestyle==2.6.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
pycountry==19.8.18
|
||||
pyflakes==2.2.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
pygments==2.7.4; python_version >= "3.7" and python_version < "4.0"
|
||||
pygments==2.8.1; python_version >= "3.7" and python_version < "4.0"
|
||||
pyicu==2.6
|
||||
pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pytest-clarity==0.3.0a0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
|
||||
pytest==6.2.2; python_version >= "3.6"
|
||||
python-dateutil==2.8.1; python_full_version >= "3.7.1"
|
||||
python-slugify==4.0.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
python-stdnum==1.15
|
||||
python-stdnum==1.16
|
||||
pytimeparse==1.1.8
|
||||
pytz==2021.1; python_full_version >= "3.7.1"
|
||||
regex==2020.11.13; python_version >= "3.6"
|
||||
requests-cache==0.5.2
|
||||
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
six==1.15.0; python_full_version >= "3.7.1"
|
||||
spdx-license-list==0.5.2
|
||||
sqlalchemy==1.3.23; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
termcolor==1.1.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
|
||||
text-unidecode==1.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
|
@ -1,15 +1,17 @@
|
||||
certifi==2020.12.5; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
chardet==4.0.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
colorama==0.4.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
langid==1.1.6
|
||||
numpy==1.20.0; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
pandas==1.2.1; python_full_version >= "3.7.1"
|
||||
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
pandas==1.2.3; python_full_version >= "3.7.1"
|
||||
pycountry==19.8.18
|
||||
python-dateutil==2.8.1; python_full_version >= "3.7.1"
|
||||
python-stdnum==1.15
|
||||
python-stdnum==1.16
|
||||
pytz==2021.1; python_full_version >= "3.7.1"
|
||||
requests-cache==0.5.2
|
||||
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
|
||||
six==1.15.0; python_full_version >= "3.7.1"
|
||||
spdx-license-list==0.5.2
|
||||
urllib3==1.26.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version < "4"
|
||||
xlrd==1.2.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
|
||||
|
2
setup.py
2
setup.py
@ -14,7 +14,7 @@ install_requires = [
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.4.3",
|
||||
version="0.4.6",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
|
@ -224,7 +224,7 @@ def test_check_invalid_agrovoc(capsys):
|
||||
"""Test invalid AGROVOC subject."""
|
||||
|
||||
value = "FOREST"
|
||||
field_name = "dc.subject"
|
||||
field_name = "dcterms.subject"
|
||||
|
||||
check.agrovoc(value, field_name)
|
||||
|
||||
@ -239,7 +239,7 @@ def test_check_valid_agrovoc():
|
||||
"""Test valid AGROVOC subject."""
|
||||
|
||||
value = "FORESTS"
|
||||
field_name = "dc.subject"
|
||||
field_name = "dcterms.subject"
|
||||
|
||||
result = check.agrovoc(value, field_name)
|
||||
|
||||
@ -336,3 +336,27 @@ def test_check_correct_iso_639_3_language():
|
||||
result = experimental.correct_language(series)
|
||||
|
||||
assert result == language
|
||||
|
||||
|
||||
def test_check_valid_spdx_license_identifier():
|
||||
"""Test valid SPDX license identifier."""
|
||||
|
||||
license = "CC-BY-SA-4.0"
|
||||
|
||||
result = check.spdx_license_identifier(license)
|
||||
|
||||
assert result == license
|
||||
|
||||
|
||||
def test_check_invalid_spdx_license_identifier(capsys):
|
||||
"""Test invalid SPDX license identifier."""
|
||||
|
||||
license = "CC-BY-SA"
|
||||
|
||||
result = check.spdx_license_identifier(license)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert (
|
||||
captured.out
|
||||
== f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license}\n"
|
||||
)
|
||||
|
Reference in New Issue
Block a user