1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-02-20 16:56:21 +01:00

Compare commits

..

No commits in common. "3dbe656f9fd51d2e11bfac795ff54632fb1b4867" and "b16fa9121f676d69bc49044fdae88ba7ed9e7dd3" have entirely different histories.

10 changed files with 68 additions and 136 deletions

View File

@ -4,16 +4,6 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
### Added
- Validation of dcterms.license field against SPDX license identifiers
### Changed
- Use DCTERMS fields where possible in `data/test.csv`
### Updated
- Run `poetry update` to update project dependencies
## [0.4.5] - 2021-03-04
### Added
- Check dates in dcterms.issued field as well, not just fields that have the

View File

@ -103,6 +103,7 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
- Better logging, for example with INFO, WARN, and ERR levels
- Verbose, debug, or quiet options
- Warn if an author is shorter than 3 characters?
- Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
- Warn if two items use the same file in `filename` column
- Add an option to drop invalid AGROVOC subjects?

View File

@ -150,11 +150,6 @@ def run(argv):
if column == "filename":
df[column] = df[column].apply(check.filename_extension)
# Check: SPDX license identifier
match = re.match(r"dcterms\.license.*$", column)
if match is not None:
df[column] = df[column].apply(check.spdx_license_identifier)
##
# Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether

View File

@ -1,14 +1,10 @@
import re
from datetime import datetime, timedelta
import pandas as pd
import requests
import requests_cache
import spdx_license_list
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn
from stdnum import issn as stdnum_issn
def issn(field):
@ -21,6 +17,8 @@ def issn(field):
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
"""
from stdnum import issn
# Skip fields with missing values
if pd.isna(field):
return
@ -28,7 +26,7 @@ def issn(field):
# Try to split multi-value field on "||" separator
for value in field.split("||"):
if not stdnum_issn.is_valid(value):
if not issn.is_valid(value):
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
return field
@ -44,6 +42,8 @@ def isbn(field):
See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
"""
from stdnum import isbn
# Skip fields with missing values
if pd.isna(field):
return
@ -51,7 +51,7 @@ def isbn(field):
# Try to split multi-value field on "||" separator
for value in field.split("||"):
if not stdnum_isbn.is_valid(value):
if not isbn.is_valid(value):
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
return field
@ -67,6 +67,8 @@ def separators(field, field_name):
Prints the field with the invalid multi-value separator.
"""
import re
# Skip fields with missing values
if pd.isna(field):
return
@ -275,6 +277,8 @@ def filename_extension(field):
than .pdf, .xls(x), .doc(x), .ppt(x), case insensitive).
"""
import re
# Skip fields with missing values
if pd.isna(field):
return
@ -313,23 +317,3 @@ def filename_extension(field):
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field
def spdx_license_identifier(field):
    """Check whether each license in a field is a known SPDX identifier.

    The field is split on the "||" multi-value separator and every value is
    looked up in ``spdx_license_list.LICENSES``. Each value that is not found
    there is printed as a warning; the field itself is never modified.

    Returns None when the field is missing (as judged by ``pd.isna``),
    otherwise returns the field unchanged.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # Try to split multi-value field on "||" separator
    for value in field.split("||"):
        if value not in spdx_license_list.LICENSES:
            print(f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{value}")

    return field

View File

@ -1,32 +1,31 @@
dc.title,dcterms.issued,dc.identifier.issn,dc.identifier.isbn,dcterms.language,dcterms.subject,cg.coverage.country,filename,dcterms.license
Leading space,2019-07-29,,,,,,,
Trailing space ,2019-07-29,,,,,,,
Excessive space,2019-07-29,,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,,
Invalid date,2019-07-260,,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,,
Unnecessary Unicode,2019-07-29,,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,
Invalid language,2019-07-29,,,Span,,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,
dc.title,dc.date.issued,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
Leading space,2019-07-29,,,,,,
Trailing space ,2019-07-29,,,,,,
Excessive space,2019-07-29,,,,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
Duplicate||Duplicate,2019-07-29,,,,,,
Invalid ISSN,2019-07-29,2321-2302,,,,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
Invalid date,2019-07-260,,,,,,
Multiple dates,2019-07-26||2019-01-10,,,,,,
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
Unnecessary Unicode,2019-07-29,,,,,,
Suspicious character||foreˆt,2019-07-29,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,
Invalid language,2019-07-29,,,Span,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
Newline (LF),2019-07-30,,,,"TANZA
NIA",,,
Missing date,,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck,
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,,
"Missing space,after comma",2019-08-27,,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,,
Composéd Unicode,2020-01-14,,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,
Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY
NIA",,
Missing date,,,,,,,
Invalid country,2019-08-01,,,,,KENYAA,
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,,
"Missing space,after comma",2019-08-27,,,,,,
Incorrect ISO 639-1 language,2019-09-26,,,es,,,
Incorrect ISO 639-3 language,2019-09-26,,,spa,,,
Composéd Unicode,2020-01-14,,,,,,
Decomposéd Unicode,2020-01-14,,,,,,
Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,

1 dc.title dcterms.issued dc.date.issued dc.identifier.issn dc.identifier.isbn dcterms.language dc.language.iso dcterms.subject dc.subject cg.coverage.country filename dcterms.license
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 2019-07-29 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA
23 Uncommon filename extension 2019-08-10 file.pdf.lck
24 Unneccesary unicode (U+002D + U+00AD) 2019-08-10 978-­92-­9043-­823-­6
25 Missing space,after comma 2019-08-27
26 Incorrect ISO 639-1 language 2019-09-26 es
27 Incorrect ISO 639-3 language 2019-09-26 spa
28 Composéd Unicode 2020-01-14
29 Decomposéd Unicode 2020-01-14
30 Unnecessary multi-value separator 2021-01-03 0378-5955||
31 CC-BY

56
poetry.lock generated
View File

@ -1,6 +1,6 @@
[[package]]
name = "agate"
version = "1.6.2"
version = "1.6.1"
description = "A data analysis library that is optimized for humans instead of machines."
category = "dev"
optional = false
@ -11,7 +11,6 @@ Babel = ">=2.0"
isodate = ">=0.5.4"
leather = ">=0.3.2"
parsedatetime = ">=2.1"
PyICU = ">=2.4.2"
python-slugify = ">=1.2.1"
pytimeparse = ">=1.1.5"
six = ">=1.9.0"
@ -295,6 +294,14 @@ pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
requirements_deprecated_finder = ["pipreqs", "pip-api"]
colors = ["colorama (>=0.4.3,<0.5.0)"]
[[package]]
name = "jdcal"
version = "1.4.1"
description = "Julian dates from proleptic Gregorian and Julian calendars."
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "jedi"
version = "0.18.0"
@ -358,7 +365,7 @@ python-versions = ">=3.7"
[[package]]
name = "openpyxl"
version = "3.0.7"
version = "3.0.6"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
category = "dev"
optional = false
@ -366,6 +373,7 @@ python-versions = ">=3.6,"
[package.dependencies]
et-xmlfile = "*"
jdcal = "*"
[[package]]
name = "packaging"
@ -505,20 +513,12 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "pygments"
version = "2.8.1"
version = "2.8.0"
description = "Pygments is a syntax highlighting package written in Python."
category = "dev"
optional = false
python-versions = ">=3.5"
[[package]]
name = "pyicu"
version = "2.6"
description = "Python extension wrapping the ICU C++ API"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "pyparsing"
version = "2.4.7"
@ -659,14 +659,6 @@ category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "spdx-license-list"
version = "0.5.2"
description = "A simple tool/library for working with SPDX license definitions."
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "sqlalchemy"
version = "1.3.23"
@ -773,11 +765,12 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "6a9ee0f26b50f361d7e0e6a2275f0e3174dee1c89fbd460583c4ea3d873857b8"
content-hash = "8c4ba410bbdc930d2d74f7864470a18827029a5697869833959708d7425460ae"
[metadata.files]
agate = [
{file = "agate-1.6.2.tar.gz", hash = "sha256:8dbd4a57a2cffecfa2d8109ef5993ec4be12a8a7c81fbc0c8c79d96d4c4399ed"},
{file = "agate-1.6.1-py2.py3-none-any.whl", hash = "sha256:48d6f80b35611c1ba25a642cbc5b90fcbdeeb2a54711c4a8d062ee2809334d1c"},
{file = "agate-1.6.1.tar.gz", hash = "sha256:c93aaa500b439d71e4a5cf088d0006d2ce2c76f1950960c8843114e5f361dfd3"},
]
agate-dbf = [
{file = "agate-dbf-0.2.2.tar.gz", hash = "sha256:589682b78c5c03f2dc8511e6e3edb659fb7336cd118e248896bb0b44c2f1917b"},
@ -873,6 +866,10 @@ isort = [
{file = "isort-5.7.0-py3-none-any.whl", hash = "sha256:fff4f0c04e1825522ce6949973e83110a6e907750cd92d128b0d14aaaadbffdc"},
{file = "isort-5.7.0.tar.gz", hash = "sha256:c729845434366216d320e936b8ad6f9d681aab72dc7cbc2d51bedc3582f3ad1e"},
]
jdcal = [
{file = "jdcal-1.4.1-py2.py3-none-any.whl", hash = "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba"},
{file = "jdcal-1.4.1.tar.gz", hash = "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"},
]
jedi = [
{file = "jedi-0.18.0-py2.py3-none-any.whl", hash = "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93"},
{file = "jedi-0.18.0.tar.gz", hash = "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"},
@ -919,8 +916,8 @@ numpy = [
{file = "numpy-1.20.1.zip", hash = "sha256:3bc63486a870294683980d76ec1e3efc786295ae00128f9ea38e2c6e74d5a60a"},
]
openpyxl = [
{file = "openpyxl-3.0.7-py2.py3-none-any.whl", hash = "sha256:46af4eaf201a89b610fcca177eed957635f88770a5462fb6aae4a2a52b0ff516"},
{file = "openpyxl-3.0.7.tar.gz", hash = "sha256:6456a3b472e1ef0facb1129f3c6ef00713cebf62e736cd7a75bcc3247432f251"},
{file = "openpyxl-3.0.6-py2.py3-none-any.whl", hash = "sha256:1a4b3869c2500b5c713e8e28341cdada49ecfcff1b10cd9006945f5bcefc090d"},
{file = "openpyxl-3.0.6.tar.gz", hash = "sha256:b229112b46e158b910a5d1b270b212c42773d39cab24e8db527f775b82afc041"},
]
packaging = [
{file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
@ -992,11 +989,8 @@ pyflakes = [
{file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"},
]
pygments = [
{file = "Pygments-2.8.1-py3-none-any.whl", hash = "sha256:534ef71d539ae97d4c3a4cf7d6f110f214b0e687e92f9cb9d2a3b0d3101289c8"},
{file = "Pygments-2.8.1.tar.gz", hash = "sha256:2656e1a6edcdabf4275f9a3640db59fd5de107d88e8663c5d4e9a0fa62f77f94"},
]
pyicu = [
{file = "PyICU-2.6.tar.gz", hash = "sha256:a9a5bf6833360f8f69e9375b91c1a7dd6e0c9157a42aee5bb7d6891804d96371"},
{file = "Pygments-2.8.0-py3-none-any.whl", hash = "sha256:b21b072d0ccdf29297a82a2363359d99623597b8a265b8081760e4d0f7153c88"},
{file = "Pygments-2.8.0.tar.gz", hash = "sha256:37a13ba168a02ac54cc5891a42b1caec333e59b66addb7fa633ea8a6d73445c0"},
]
pyparsing = [
{file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
@ -1083,10 +1077,6 @@ six = [
{file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
{file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
]
spdx-license-list = [
{file = "spdx_license_list-0.5.2-py3-none-any.whl", hash = "sha256:1b338470c7b403dbecceca563a316382c7977516128ca6c1e8f7078e3ed6e7b0"},
{file = "spdx_license_list-0.5.2.tar.gz", hash = "sha256:952996f72ab807972dc2278bb9b91e5294767211e51f09aad9c0e2ff5b82a31b"},
]
sqlalchemy = [
{file = "SQLAlchemy-1.3.23-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:fd3b96f8c705af8e938eaa99cbd8fd1450f632d38cad55e7367c33b263bf98ec"},
{file = "SQLAlchemy-1.3.23-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:29cccc9606750fe10c5d0e8bd847f17a97f3850b8682aef1f56f5d5e1a5a64b1"},

View File

@ -20,7 +20,6 @@ requests-cache = "^0.5.2"
pycountry = "^19.8.18"
langid = "^1.1.6"
colorama = "^0.4.4"
spdx-license-list = "^0.5.2"
[tool.poetry.dev-dependencies]
pytest = "^6.1.1"

View File

@ -1,7 +1,7 @@
agate-dbf==0.2.2
agate-excel==0.2.3
agate-sql==0.5.5
agate==1.6.2
agate==1.6.1
appdirs==1.4.4; python_version >= "3.6"
appnope==0.1.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "darwin"
atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6") or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0" and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6")
@ -24,13 +24,14 @@ ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0"
ipython==7.21.0; python_version >= "3.7" and python_version < "4.0"
isodate==0.6.0
isort==5.7.0; python_version >= "3.6" and python_version < "4.0"
jdcal==1.4.1; python_version >= "3.6"
jedi==0.18.0; python_version >= "3.7" and python_version < "4.0"
langid==1.1.6
leather==0.3.3
mccabe==0.6.1; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
mypy-extensions==0.4.3; python_version >= "3.6"
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
openpyxl==3.0.7; python_version >= "3.6"
openpyxl==3.0.6; python_version >= "3.6"
packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
pandas==1.2.3; python_full_version >= "3.7.1"
parsedatetime==2.6
@ -45,8 +46,7 @@ py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_
pycodestyle==2.6.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
pycountry==19.8.18
pyflakes==2.2.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
pygments==2.8.1; python_version >= "3.7" and python_version < "4.0"
pyicu==2.6
pygments==2.8.0; python_version >= "3.7" and python_version < "4.0"
pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
pytest-clarity==0.3.0a0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
pytest==6.2.2; python_version >= "3.6"
@ -59,7 +59,6 @@ regex==2020.11.13; python_version >= "3.6"
requests-cache==0.5.2
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
six==1.15.0; python_full_version >= "3.7.1"
spdx-license-list==0.5.2
sqlalchemy==1.3.23; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
termcolor==1.1.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.4.0"
text-unidecode==1.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"

View File

@ -12,6 +12,5 @@ pytz==2021.1; python_full_version >= "3.7.1"
requests-cache==0.5.2
requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
six==1.15.0; python_full_version >= "3.7.1"
spdx-license-list==0.5.2
urllib3==1.26.3; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version < "4"
xlrd==1.2.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")

View File

@ -336,27 +336,3 @@ def test_check_correct_iso_639_3_language():
result = experimental.correct_language(series)
assert result == language
def test_check_valid_spdx_license_identifier():
    """Test valid SPDX license identifier."""

    # Named license_id rather than license to avoid shadowing the builtin
    license_id = "CC-BY-SA-4.0"

    result = check.spdx_license_identifier(license_id)

    # A valid identifier is handed back unchanged
    assert result == license_id
def test_check_invalid_spdx_license_identifier(capsys):
    """Test invalid SPDX license identifier."""

    # Named license_id rather than license to avoid shadowing the builtin
    license_id = "CC-BY-SA"

    result = check.spdx_license_identifier(license_id)

    # The warning is written to stdout, so compare against the captured output
    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Non-SPDX license identifier: {Fore.RESET}{license_id}\n"
    )

    # The check only reports; the field is returned unchanged
    assert result == license_id