1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-28 08:38:18 +01:00

Compare commits

..

No commits in common. "58b7b6e9d8d0834a01a6d894c3386ddf84dd906d" and "2e489fc9216aef8c71046f89e5a2f077b6eb5b1b" have entirely different histories.

14 changed files with 172 additions and 191 deletions

View File

@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.0] = 2022-09-02
## Unreleased
### Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding
issues with ftfy
@ -13,7 +13,6 @@ issues with ftfy
### Updated
- Python dependencies
- Metadatata field exclude logic
### Added
- Ability to drop invalid AGROVOC values with `-d` when checking AGROVOC values
@ -21,10 +20,6 @@ with `-a <field.name>`
- Ability to add missing UN M.49 regions when both country and region columns
are present. Enable with `-u` (unsafe fixes) for now.
### Removed
- Support for reading Excel files (both `.xls` and `.xlsx`) as it was completely
untested
## [0.5.0] - 2021-12-08
### Added
- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)

View File

@ -8,7 +8,7 @@
A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem (though it could theoretically work on any CSV that uses Dublin Core fields as columns). The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, unnecessary Unicode, AGROVOC terms, etc.
Requires Python 3.8 or greater. CSV support comes from the [Pandas](https://pandas.pydata.org/) library.
Requires Python 3.8 or greater. CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
If you use the DSpace CSV metadata quality checker please cite:

View File

@ -36,7 +36,7 @@ def parse_args(argv):
parser.add_argument(
"--input-file",
"-i",
help="Path to input file. Must be a UTF-8 CSV.",
help="Path to input file. Can be UTF-8 CSV or Excel XLSX.",
required=True,
type=argparse.FileType("r", encoding="UTF-8"),
)
@ -76,16 +76,16 @@ def run(argv):
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
df = pd.read_csv(args.input_file, dtype=str)
for column in df.columns:
# Check if the user requested to skip any fields
if args.exclude_fields:
# Split the list of excluded fields on ',' into a list. Note that the
# user should be careful to no include spaces here.
exclude = args.exclude_fields.split(",")
else:
exclude = list()
for column in df.columns:
if column in exclude:
skip = False
# Split the list of excludes on ',' so we can test exact matches
# rather than fuzzy matches with regexes or "if word in string"
for exclude in args.exclude_fields.split(","):
if column == exclude and skip is False:
skip = True
if skip:
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
continue
@ -200,22 +200,20 @@ def run(argv):
# should rename column in this for loop...
for column in df_transposed.columns:
# Check: citation DOI
check.citation_doi(df_transposed[column], exclude)
check.citation_doi(df_transposed[column])
# Check: title in citation
check.title_in_citation(df_transposed[column], exclude)
check.title_in_citation(df_transposed[column])
if args.unsafe_fixes:
# Fix: countries match regions
df_transposed[column] = fix.countries_match_regions(
df_transposed[column], exclude
)
df_transposed[column] = fix.countries_match_regions(df_transposed[column])
else:
# Check: countries match regions
check.countries_match_regions(df_transposed[column], exclude)
check.countries_match_regions(df_transposed[column])
if args.experimental_checks:
experimental.correct_language(df_transposed[column], exclude)
experimental.correct_language(df_transposed[column])
# Transpose the DataFrame back before writing. This is probably wasteful to
# do every time since we technically only need to do it if we've done the

View File

@ -391,20 +391,13 @@ def mojibake(field, field_name):
return
def citation_doi(row, exclude):
def citation_doi(row):
"""Check for the scenario where an item has a DOI listed in its citation,
but does not have a cg.identifier.doi field.
Function prints a warning if the DOI field is missing, but there is a DOI
in the citation.
"""
# Check if the user requested us to skip any DOI fields so we can
# just return before going any further.
for field in exclude:
match = re.match(r"^.*?doi.*$", field)
if match is not None:
return
# Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards.
citation = ""
@ -422,10 +415,9 @@ def citation_doi(row, exclude):
if match is not None:
return
# Check if the current label is a citation field and make sure the user
# hasn't asked to skip it. If not, then set the citation.
# Get the name of the citation field
match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None and label not in exclude:
if match is not None:
citation = row[label]
if citation != "":
@ -441,7 +433,7 @@ def citation_doi(row, exclude):
return
def title_in_citation(row, exclude):
def title_in_citation(row):
"""Check for the scenario where an item's title is missing from its cita-
tion. This could mean that it is missing entirely, or perhaps just exists
in a different format (whitespace, accents, etc).
@ -463,12 +455,12 @@ def title_in_citation(row, exclude):
# Find the name of the title column
match = re.match(r"^(dc|dcterms)\.title.*$", label)
if match is not None and label not in exclude:
if match is not None:
title = row[label]
# Find the name of the citation column
match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None and label not in exclude:
if match is not None:
citation = row[label]
if citation != "":
@ -478,7 +470,7 @@ def title_in_citation(row, exclude):
return
def countries_match_regions(row, exclude):
def countries_match_regions(row):
"""Check for the scenario where an item has country coverage metadata, but
does not have the corresponding region metadata. For example, an item that
has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -522,12 +514,6 @@ def countries_match_regions(row, exclude):
if match is not None:
title_column_name = label
# Make sure the user has not asked to exclude any metadata fields. If so, we
# should return immediately.
column_names = [country_column_name, region_column_name, title_column_name]
if any(field in column_names for field in exclude):
return
# Make sure we found the country and region columns
if country_column_name != "" and region_column_name != "":
# If we don't have any countries then we should return early before

View File

@ -8,7 +8,7 @@ from colorama import Fore
from pycountry import languages
def correct_language(row, exclude):
def correct_language(row):
"""Analyze the text used in the title, abstract, and citation fields to pre-
dict the language being used and compare it with the item's dc.language.iso
field.
@ -39,8 +39,7 @@ def correct_language(row, exclude):
language = row[label]
# Extract title if it is present (note that we don't allow excluding
# the title here because it complicates things).
# Extract title if it is present
match = re.match(r"^.*?title.*$", label)
if match is not None:
title = row[label]
@ -49,12 +48,12 @@ def correct_language(row, exclude):
# Extract abstract if it is present
match = re.match(r"^.*?abstract.*$", label)
if match is not None and label not in exclude:
if match is not None:
sample_strings.append(row[label])
# Extract citation if it is present
match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None and label not in exclude:
if match is not None:
sample_strings.append(row[label])
# Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction

View File

@ -293,7 +293,7 @@ def mojibake(field, field_name):
return field
def countries_match_regions(row, exclude):
def countries_match_regions(row):
"""Check for the scenario where an item has country coverage metadata, but
does not have the corresponding region metadata. For example, an item that
has country coverage "Kenya" should also have region "Eastern Africa" acc-
@ -337,12 +337,6 @@ def countries_match_regions(row, exclude):
if match is not None:
title_column_name = label
# Make sure the user has not asked to exclude any metadata fields. If so, we
# should return immediately.
column_names = [country_column_name, region_column_name, title_column_name]
if any(field in column_names for field in exclude):
return row
# Make sure we found the country and region columns
if country_column_name != "" and region_column_name != "":
# If we don't have any countries then we should return early before

View File

@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-3.0-only
VERSION = "0.6.0"
VERSION = "0.6.0-dev"

184
poetry.lock generated
View File

@ -74,6 +74,14 @@ category = "main"
optional = false
python-versions = "*"
[[package]]
name = "atomicwrites"
version = "1.4.1"
description = "Atomic file writes."
category = "dev"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "attrs"
version = "22.1.0"
@ -241,16 +249,16 @@ test = ["pytest (>=6)"]
[[package]]
name = "flake8"
version = "5.0.4"
version = "4.0.1"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = ">=3.6.1"
python-versions = ">=3.6"
[package.dependencies]
mccabe = ">=0.7.0,<0.8.0"
pycodestyle = ">=2.9.0,<2.10.0"
pyflakes = ">=2.5.0,<2.6.0"
mccabe = ">=0.6.0,<0.7.0"
pycodestyle = ">=2.8.0,<2.9.0"
pyflakes = ">=2.4.0,<2.5.0"
[[package]]
name = "ftfy"
@ -275,7 +283,7 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
name = "greenlet"
version = "1.1.3"
description = "Lightweight in-process concurrent programming"
category = "dev"
category = "main"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*"
@ -347,11 +355,11 @@ six = ">=1.6.1"
[[package]]
name = "mccabe"
version = "0.7.0"
version = "0.6.1"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = ">=3.6"
python-versions = "*"
[[package]]
name = "mypy-extensions"
@ -433,7 +441,7 @@ future = "*"
[[package]]
name = "pathspec"
version = "0.10.1"
version = "0.10.0"
description = "Utility library for gitignore style pattern matching of file paths."
category = "dev"
optional = false
@ -481,11 +489,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "pycodestyle"
version = "2.9.1"
version = "2.8.0"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=3.6"
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "pycountry"
@ -500,11 +508,11 @@ setuptools = "*"
[[package]]
name = "pyflakes"
version = "2.5.0"
version = "2.4.0"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=3.6"
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "Pygments"
@ -530,23 +538,24 @@ diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "pytest"
version = "7.1.3"
version = "6.2.5"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.6"
[package.dependencies]
atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""}
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
py = ">=1.8.2"
tomli = ">=1.0.0"
toml = "*"
[package.extras]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
[[package]]
name = "pytest-clarity"
@ -707,31 +716,30 @@ python-versions = "*"
[[package]]
name = "SQLAlchemy"
version = "1.4.40"
version = "1.4.22"
description = "Database Abstraction Library"
category = "dev"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\""}
[package.extras]
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing_extensions (!=3.10.0.1)"]
aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)"]
asyncio = ["greenlet (!=0.4.17)"]
asyncmy = ["asyncmy (>=0.2.3,!=0.2.4)", "greenlet (!=0.4.17)"]
mariadb_connector = ["mariadb (>=1.0.1,!=1.1.2)"]
mariadb_connector = ["mariadb (>=1.0.1)"]
mssql = ["pyodbc"]
mssql_pymssql = ["pymssql"]
mssql_pyodbc = ["pyodbc"]
mypy = ["mypy (>=0.910)", "sqlalchemy2-stubs"]
mypy = ["mypy (>=0.800)", "sqlalchemy2-stubs"]
mysql = ["mysqlclient (>=1.4.0)", "mysqlclient (>=1.4.0,<2)"]
mysql_connector = ["mysql-connector-python"]
mysql_connector = ["mysqlconnector"]
oracle = ["cx_oracle (>=7)", "cx_oracle (>=7,<8)"]
postgresql = ["psycopg2 (>=2.7)"]
postgresql_asyncpg = ["asyncpg", "greenlet (!=0.4.17)"]
postgresql_pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"]
postgresql_pg8000 = ["pg8000 (>=1.16.6)"]
postgresql_psycopg2binary = ["psycopg2-binary"]
postgresql_psycopg2cffi = ["psycopg2cffi"]
pymysql = ["pymysql", "pymysql (<1)"]
@ -745,6 +753,14 @@ category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "toml"
version = "0.10.2"
description = "Python Library for Tom's Obvious, Minimal Language"
category = "dev"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "tomli"
version = "2.0.1"
@ -795,21 +811,16 @@ python-versions = "*"
[[package]]
name = "xlrd"
version = "2.0.1"
description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files"
category = "dev"
version = "1.2.0"
description = "Library for developers to extract data from Microsoft Excel (tm) spreadsheet files"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.extras]
build = ["twine", "wheel"]
docs = ["sphinx"]
test = ["pytest", "pytest-cov"]
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "266a3911a7403fb9fe468626e179c919143d6efc26cab0adad69bd64c9e2a06f"
content-hash = "71d3e50c7f44aa2e1d800e31b198cb8f614c4b0c31fdde1c20eae26eeaef646d"
[metadata.files]
agate = [
@ -832,6 +843,9 @@ appdirs = [
{file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
{file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
]
atomicwrites = [
{file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"},
]
attrs = [
{file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"},
{file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"},
@ -909,8 +923,8 @@ exceptiongroup = [
{file = "exceptiongroup-1.0.0rc9.tar.gz", hash = "sha256:9086a4a21ef9b31c72181c77c040a074ba0889ee56a7b289ff0afb0d97655f96"},
]
flake8 = [
{file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
{file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
{file = "flake8-4.0.1-py2.py3-none-any.whl", hash = "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d"},
{file = "flake8-4.0.1.tar.gz", hash = "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"},
]
ftfy = [
{file = "ftfy-6.1.1-py3-none-any.whl", hash = "sha256:0ffd33fce16b54cccaec78d6ec73d95ad370e5df5a25255c8966a6147bd667ca"},
@ -999,8 +1013,8 @@ leather = [
{file = "leather-0.3.4.tar.gz", hash = "sha256:b43e21c8fa46b2679de8449f4d953c06418666dc058ce41055ee8a8d3bb40918"},
]
mccabe = [
{file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
{file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
{file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
]
mypy-extensions = [
{file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
@ -1075,8 +1089,8 @@ parsedatetime = [
{file = "parsedatetime-2.4.tar.gz", hash = "sha256:3d817c58fb9570d1eec1dd46fa9448cd644eeed4fb612684b02dfda3a79cb84b"},
]
pathspec = [
{file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"},
{file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"},
{file = "pathspec-0.10.0-py3-none-any.whl", hash = "sha256:aefa80ac32d5bf1f96139dca67cefb69a431beff4e6bf1168468f37d7ab87015"},
{file = "pathspec-0.10.0.tar.gz", hash = "sha256:01eecd304ba0e6eeed188ae5fa568e99ef10265af7fd9ab737d6412b4ee0ab85"},
]
platformdirs = [
{file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
@ -1095,15 +1109,15 @@ py = [
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
]
pycodestyle = [
{file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
{file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
{file = "pycodestyle-2.8.0-py2.py3-none-any.whl", hash = "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20"},
{file = "pycodestyle-2.8.0.tar.gz", hash = "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"},
]
pycountry = [
{file = "pycountry-22.3.5.tar.gz", hash = "sha256:b2163a246c585894d808f18783e19137cb70a0c18fb36748dc01fc6f109c1646"},
]
pyflakes = [
{file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
{file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
{file = "pyflakes-2.4.0-py2.py3-none-any.whl", hash = "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"},
{file = "pyflakes-2.4.0.tar.gz", hash = "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c"},
]
Pygments = [
{file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"},
@ -1114,8 +1128,8 @@ pyparsing = [
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
pytest = [
{file = "pytest-7.1.3-py3-none-any.whl", hash = "sha256:1377bda3466d70b55e3f5cecfa55bb7cfcf219c7964629b967c37cf0bda818b7"},
{file = "pytest-7.1.3.tar.gz", hash = "sha256:4f365fec2dff9c1162f834d9f18af1ba13062db0c708bf7b946f8a5c76180c39"},
{file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"},
{file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"},
]
pytest-clarity = [
{file = "pytest-clarity-1.0.1.tar.gz", hash = "sha256:505fe345fad4fe11c6a4187fe683f2c7c52c077caa1e135f3e483fe112db7772"},
@ -1165,47 +1179,45 @@ spdx-license-list = [
{file = "spdx_license_list-0.5.2.tar.gz", hash = "sha256:952996f72ab807972dc2278bb9b91e5294767211e51f09aad9c0e2ff5b82a31b"},
]
SQLAlchemy = [
{file = "SQLAlchemy-1.4.40-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:b07fc38e6392a65935dc8b486229679142b2ea33c94059366b4d8b56f1e35a97"},
{file = "SQLAlchemy-1.4.40-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fb4edb6c354eac0fcc07cb91797e142f702532dbb16c1d62839d6eec35f814cf"},
{file = "SQLAlchemy-1.4.40-cp27-cp27m-win32.whl", hash = "sha256:2026632051a93997cf8f6fda14360f99230be1725b7ab2ef15be205a4b8a5430"},
{file = "SQLAlchemy-1.4.40-cp27-cp27m-win_amd64.whl", hash = "sha256:f2aa85aebc0ef6b342d5d3542f969caa8c6a63c8d36cf5098769158a9fa2123c"},
{file = "SQLAlchemy-1.4.40-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a0b9e3d81f86ba04007f0349e373a5b8c81ec2047aadb8d669caf8c54a092461"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:1ab08141d93de83559f6a7d9a962830f918623a885b3759ec2b9d1a531ff28fe"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00dd998b43b282c71de46b061627b5edb9332510eb1edfc5017b9e4356ed44ea"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bb342c0e25cc8f78a0e7c692da3b984f072666b316fbbec2a0e371cb4dfef5f0"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23b693876ac7963b6bc7b1a5f3a2642f38d2624af834faad5933913928089d1b"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-win32.whl", hash = "sha256:2cf50611ef4221ad587fb7a1708e61ff72966f84330c6317642e08d6db4138fd"},
{file = "SQLAlchemy-1.4.40-cp310-cp310-win_amd64.whl", hash = "sha256:26ee4dbac5dd7abf18bf3cd8f04e51f72c339caf702f68172d308888cd26c6c9"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:b41b87b929118838bafc4bb18cf3c5cd1b3be4b61cd9042e75174df79e8ac7a2"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:885e11638946472b4a0a7db8e6df604b2cf64d23dc40eedc3806d869fcb18fae"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b7ff0a8bf0aec1908b92b8dfa1246128bf4f94adbdd3da6730e9c542e112542d"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfa8ab4ba0c97ab6bcae1f0948497d14c11b6c6ecd1b32b8a79546a0823d8211"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-win32.whl", hash = "sha256:d259fa08e4b3ed952c01711268bcf6cd2442b0c54866d64aece122f83da77c6d"},
{file = "SQLAlchemy-1.4.40-cp36-cp36m-win_amd64.whl", hash = "sha256:c8d974c991eef0cd29418a5957ae544559dc326685a6f26b3a914c87759bf2f4"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:28b1791a30d62fc104070965f1a2866699c45bbf5adc0be0cf5f22935edcac58"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7ccdca6cd167611f4a62a8c2c0c4285c2535640d77108f782ce3f3cccb70f3a"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:69deec3a94de10062080d91e1ba69595efeafeafe68b996426dec9720031fb25"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63ad778f4e80913fb171247e4fa82123d0068615ae1d51a9791fc4284cb81748"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-win32.whl", hash = "sha256:9ced2450c9fd016f9232d976661623e54c450679eeefc7aa48a3d29924a63189"},
{file = "SQLAlchemy-1.4.40-cp37-cp37m-win_amd64.whl", hash = "sha256:cdee4d475e35684d210dc6b430ff8ca2ed0636378ac19b457e2f6f350d1f5acc"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:08b47c971327e733ffd6bae2d4f50a7b761793efe69d41067fcba86282819eea"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cf03d37819dc17a388d313919daf32058d19ba1e592efdf14ce8cbd997e6023"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a62c0ecbb9976550f26f7bf75569f425e661e7249349487f1483115e5fc893a6"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ec440990ab00650d0c7ea2c75bc225087afdd7ddcb248e3d934def4dff62762"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-win32.whl", hash = "sha256:2b64955850a14b9d481c17becf0d3f62fb1bb31ac2c45c2caf5ad06d9e811187"},
{file = "SQLAlchemy-1.4.40-cp38-cp38-win_amd64.whl", hash = "sha256:959bf4390766a8696aa01285016c766b4eb676f712878aac5fce956dd49695d9"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:0992f3cc640ec0f88f721e426da884c34ff0a60eb73d3d64172e23dfadfc8a0b"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa9e0d7832b7511b3b3fd0e67fac85ff11fd752834c143ca2364c9b778c0485a"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c9d0f1a9538cc5e75f2ea0cb6c3d70155a1b7f18092c052e0d84105622a41b63"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c956a5d1adb49a35d78ef0fae26717afc48a36262359bb5b0cbd7a3a247c26f"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-win32.whl", hash = "sha256:6b70d02bbe1adbbf715d2249cacf9ac17c6f8d22dfcb3f1a4fbc5bf64364da8a"},
{file = "SQLAlchemy-1.4.40-cp39-cp39-win_amd64.whl", hash = "sha256:bf073c619b5a7f7cd731507d0fdc7329bee14b247a63b0419929e4acd24afea8"},
{file = "SQLAlchemy-1.4.40.tar.gz", hash = "sha256:44a660506080cc975e1dfa5776fe5f6315ddc626a77b50bf0eee18b0389ea265"},
{file = "SQLAlchemy-1.4.22-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:488608953385d6c127d2dcbc4b11f8d7f2f30b89f6bd27c01b042253d985cc2f"},
{file = "SQLAlchemy-1.4.22-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:5d856cc50fd26fc8dd04892ed5a5a3d7eeb914fea2c2e484183e2d84c14926e0"},
{file = "SQLAlchemy-1.4.22-cp27-cp27m-win32.whl", hash = "sha256:a00d9c6d3a8afe1d1681cd8a5266d2f0ed684b0b44bada2ca82403b9e8b25d39"},
{file = "SQLAlchemy-1.4.22-cp27-cp27m-win_amd64.whl", hash = "sha256:5908ea6c652a050d768580d01219c98c071e71910ab8e7b42c02af4010608397"},
{file = "SQLAlchemy-1.4.22-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b7fb937c720847879c7402fe300cfdb2aeff22349fa4ea3651bca4e2d6555939"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:9bfe882d5a1bbde0245dca0bd48da0976bd6634cf2041d2fdf0417c5463e40e5"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eedd76f135461cf237534a6dc0d1e0f6bb88a1dc193678fab48a11d223462da5"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6a16c7c4452293da5143afa3056680db2d187b380b3ef4d470d4e29885720de3"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44d23ea797a5e0be71bc5454b9ae99158ea0edc79e2393c6e9a2354de88329c0"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-win32.whl", hash = "sha256:a5e14cb0c0a4ac095395f24575a0e7ab5d1be27f5f9347f1762f21505e3ba9f1"},
{file = "SQLAlchemy-1.4.22-cp36-cp36m-win_amd64.whl", hash = "sha256:bc34a007e604091ca3a4a057525efc4cefd2b7fe970f44d20b9cfa109ab1bddb"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:756f5d2f5b92d27450167247fb574b09c4cd192a3f8c2e493b3e518a204ee543"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fcbb4b4756b250ed19adc5e28c005b8ed56fdb5c21efa24c6822c0575b4964d"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:09dbb4bc01a734ccddbf188deb2a69aede4b3c153a72b6d5c6900be7fb2945b1"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f028ef6a1d828bc754852a022b2160e036202ac8658a6c7d34875aafd14a9a15"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-win32.whl", hash = "sha256:68393d3fd31469845b6ba11f5b4209edbea0b58506be0e077aafbf9aa2e21e11"},
{file = "SQLAlchemy-1.4.22-cp37-cp37m-win_amd64.whl", hash = "sha256:891927a49b2363a4199763a9d436d97b0b42c65922a4ea09025600b81a00d17e"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:fd2102a8f8a659522719ed73865dff3d3cc76eb0833039dc473e0ad3041d04be"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4014978de28163cd8027434916a92d0f5bb1a3a38dff5e8bf8bff4d9372a9117"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f814d80844969b0d22ea63663da4de5ca1c434cfbae226188901e5d368792c17"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d09a760b0a045b4d799102ae7965b5491ccf102123f14b2a8cc6c01d1021a2d9"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-win32.whl", hash = "sha256:26daa429f039e29b1e523bf763bfab17490556b974c77b5ca7acb545b9230e9a"},
{file = "SQLAlchemy-1.4.22-cp38-cp38-win_amd64.whl", hash = "sha256:12bac5fa1a6ea870bdccb96fe01610641dd44ebe001ed91ef7fcd980e9702db5"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:39b5d36ab71f73c068cdcf70c38075511de73616e6c7fdd112d6268c2704d9f5"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5102b9face693e8b2db3b2539c7e1a5d9a5b4dc0d79967670626ffd2f710d6e6"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c9373ef67a127799027091fa53449125351a8c943ddaa97bec4e99271dbb21f4"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36a089dc604032d41343d86290ce85d4e6886012eea73faa88001260abf5ff81"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-win32.whl", hash = "sha256:b48148ceedfb55f764562e04c00539bb9ea72bf07820ca15a594a9a049ff6b0e"},
{file = "SQLAlchemy-1.4.22-cp39-cp39-win_amd64.whl", hash = "sha256:1fdae7d980a2fa617d119d0dc13ecb5c23cc63a8b04ffcb5298f2c59d86851e9"},
{file = "SQLAlchemy-1.4.22.tar.gz", hash = "sha256:ec1be26cdccd60d180359a527d5980d959a26269a2c7b1b327a1eea0cab37ed8"},
]
text-unidecode = [
{file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"},
{file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"},
]
toml = [
{file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
tomli = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
@ -1227,6 +1239,6 @@ wcwidth = [
{file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"},
]
xlrd = [
{file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"},
{file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"},
{file = "xlrd-1.2.0-py2.py3-none-any.whl", hash = "sha256:e551fb498759fa3a5384a94ccd4c3c02eb7c00ea424426e212ac0c57be9dfbde"},
{file = "xlrd-1.2.0.tar.gz", hash = "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2"},
]

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "csv-metadata-quality"
version = "0.6.0"
version = "0.6.0-dev"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only"
@ -14,22 +14,24 @@ csv-metadata-quality = 'csv_metadata_quality.__main__:main'
python = "^3.8"
pandas = "^1.4.0"
python-stdnum = "^1.13"
requests = "^2.28.1"
requests-cache = "^0.9.6"
pycountry = "^22.3.5"
xlrd = "^1.2.0"
requests = "^2.27.1"
requests-cache = "^0.9.1"
pycountry = "^22.1.10"
langid = "^1.1.6"
colorama = "^0.4.5"
colorama = "^0.4.4"
spdx-license-list = "^0.5.2"
ftfy = "^6.1.1"
country-converter = "^0.7.7"
ftfy = "^6.0"
SQLAlchemy = ">=1.3.3,<1.4.23"
country-converter = "^0.7.4"
[tool.poetry.dev-dependencies]
pytest = "^7.1.3"
flake8 = "^5.0.4"
pytest = "^6.1.1"
flake8 = "^4.0.1"
pytest-clarity = "^1.0.1"
black = "^22.8.0"
isort = "^5.10.1"
csvkit = "^1.0.7"
black = "^22.1.0"
isort = "^5.5.4"
csvkit = "^1.0.5"
[build-system]
requires = ["poetry>=0.12"]

View File

@ -3,6 +3,7 @@ agate-excel==0.2.5 ; python_version >= "3.8" and python_version < "4.0"
agate-sql==0.5.8 ; python_version >= "3.8" and python_version < "4.0"
agate==1.6.3 ; python_version >= "3.8" and python_version < "4.0"
appdirs==1.4.4 ; python_version >= "3.8" and python_version < "4.0"
atomicwrites==1.4.1 ; python_version >= "3.8" and python_version < "4.0" and sys_platform == "win32"
attrs==22.1.0 ; python_version >= "3.8" and python_version < "4.0"
babel==2.10.3 ; python_version >= "3.8" and python_version < "4.0"
black==22.8.0 ; python_version >= "3.8" and python_version < "4.0"
@ -17,17 +18,17 @@ csvkit==1.0.7 ; python_version >= "3.8" and python_version < "4.0"
dbfread==2.0.7 ; python_version >= "3.8" and python_version < "4.0"
et-xmlfile==1.1.0 ; python_version >= "3.8" and python_version < "4.0"
exceptiongroup==1.0.0rc9 ; python_version >= "3.8" and python_version <= "3.10"
flake8==5.0.4 ; python_version >= "3.8" and python_version < "4.0"
flake8==4.0.1 ; python_version >= "3.8" and python_version < "4.0"
ftfy==6.1.1 ; python_version >= "3.8" and python_version < "4"
future==0.18.2 ; python_version >= "3.8" and python_version < "4.0"
greenlet==1.1.3 ; python_version >= "3.8" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "4.0"
greenlet==1.1.3 ; python_version >= "3.8" and python_version < "4.0"
idna==3.3 ; python_version >= "3.8" and python_version < "4"
iniconfig==1.1.1 ; python_version >= "3.8" and python_version < "4.0"
isodate==0.6.1 ; python_version >= "3.8" and python_version < "4.0"
isort==5.10.1 ; python_version >= "3.8" and python_version < "4.0"
langid==1.1.6 ; python_version >= "3.8" and python_version < "4.0"
leather==0.3.4 ; python_version >= "3.8" and python_version < "4.0"
mccabe==0.7.0 ; python_version >= "3.8" and python_version < "4.0"
mccabe==0.6.1 ; python_version >= "3.8" and python_version < "4.0"
mypy-extensions==0.4.3 ; python_version >= "3.8" and python_version < "4.0"
numpy==1.23.2 ; python_version < "4.0" and python_version >= "3.8"
olefile==0.46 ; python_version >= "3.8" and python_version < "4.0"
@ -35,18 +36,18 @@ openpyxl==3.0.10 ; python_version >= "3.8" and python_version < "4.0"
packaging==21.3 ; python_version >= "3.8" and python_version < "4.0"
pandas==1.4.4 ; python_version >= "3.8" and python_version < "4.0"
parsedatetime==2.4 ; python_version >= "3.8" and python_version < "4.0"
pathspec==0.10.1 ; python_version >= "3.8" and python_version < "4.0"
pathspec==0.10.0 ; python_version >= "3.8" and python_version < "4.0"
platformdirs==2.5.2 ; python_version >= "3.8" and python_version < "4.0"
pluggy==1.0.0 ; python_version >= "3.8" and python_version < "4.0"
pprintpp==0.4.0 ; python_version >= "3.8" and python_version < "4.0"
py==1.11.0 ; python_version >= "3.8" and python_version < "4.0"
pycodestyle==2.9.1 ; python_version >= "3.8" and python_version < "4.0"
pycodestyle==2.8.0 ; python_version >= "3.8" and python_version < "4.0"
pycountry==22.3.5 ; python_version >= "3.8" and python_version < "4"
pyflakes==2.5.0 ; python_version >= "3.8" and python_version < "4.0"
pyflakes==2.4.0 ; python_version >= "3.8" and python_version < "4.0"
pygments==2.13.0 ; python_version >= "3.8" and python_version < "4.0"
pyparsing==3.0.9 ; python_version >= "3.8" and python_version < "4.0"
pytest-clarity==1.0.1 ; python_version >= "3.8" and python_version < "4.0"
pytest==7.1.3 ; python_version >= "3.8" and python_version < "4.0"
pytest==6.2.5 ; python_version >= "3.8" and python_version < "4.0"
python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "4.0"
python-slugify==6.1.2 ; python_version >= "3.8" and python_version < "4.0"
python-stdnum==1.17 ; python_version >= "3.8" and python_version < "4.0"
@ -58,11 +59,12 @@ rich==12.5.1 ; python_version >= "3.8" and python_version < "4.0"
setuptools==65.3.0 ; python_version >= "3.8" and python_version < "4"
six==1.16.0 ; python_version >= "3.8" and python_version < "4.0"
spdx-license-list==0.5.2 ; python_version >= "3.8" and python_version < "4.0"
sqlalchemy==1.4.40 ; python_version >= "3.8" and python_version < "4.0"
sqlalchemy==1.4.22 ; python_version >= "3.8" and python_version < "4.0"
text-unidecode==1.3 ; python_version >= "3.8" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.8" and python_version < "4.0"
toml==0.10.2 ; python_version >= "3.8" and python_version < "4.0"
tomli==2.0.1 ; python_version >= "3.8" and python_full_version < "3.11.0a7"
typing-extensions==4.3.0 ; python_version >= "3.8" and python_version < "3.10"
url-normalize==1.4.3 ; python_version >= "3.8" and python_version < "4.0"
urllib3==1.26.12 ; python_version >= "3.8" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "4"
xlrd==2.0.1 ; python_version >= "3.8" and python_version < "4.0"
xlrd==1.2.0 ; python_version >= "3.8" and python_version < "4.0"

View File

@ -7,6 +7,7 @@ colorama==0.4.5 ; python_version >= "3.8" and python_version < "4.0"
country-converter==0.7.7 ; python_version >= "3.8" and python_version < "4.0"
exceptiongroup==1.0.0rc9 ; python_version >= "3.8" and python_version <= "3.10"
ftfy==6.1.1 ; python_version >= "3.8" and python_version < "4"
greenlet==1.1.3 ; python_version >= "3.8" and python_version < "4.0"
idna==3.3 ; python_version >= "3.8" and python_version < "4"
langid==1.1.6 ; python_version >= "3.8" and python_version < "4.0"
numpy==1.23.2 ; python_version < "4.0" and python_version >= "3.8"
@ -20,6 +21,8 @@ requests==2.28.1 ; python_version >= "3.8" and python_version < "4"
setuptools==65.3.0 ; python_version >= "3.8" and python_version < "4"
six==1.16.0 ; python_version >= "3.8" and python_version < "4.0"
spdx-license-list==0.5.2 ; python_version >= "3.8" and python_version < "4.0"
sqlalchemy==1.4.22 ; python_version >= "3.8" and python_version < "4.0"
url-normalize==1.4.3 ; python_version >= "3.8" and python_version < "4.0"
urllib3==1.26.12 ; python_version >= "3.8" and python_version < "4"
wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "4"
xlrd==1.2.0 ; python_version >= "3.8" and python_version < "4.0"

View File

@ -14,7 +14,7 @@ install_requires = [
setuptools.setup(
name="csv-metadata-quality",
version="0.6.0",
version="0.6.0-dev",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
@ -23,6 +23,7 @@ setuptools.setup(
long_description_content_type="text/markdown",
url="https://github.com/alanorth/csv-metadata-quality",
classifiers=[
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",

View File

@ -257,13 +257,12 @@ def test_check_incorrect_iso_639_1_language(capsys):
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "es"
exclude = list()
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
experimental.correct_language(series, exclude)
experimental.correct_language(series)
captured = capsys.readouterr()
assert (
@ -277,13 +276,12 @@ def test_check_incorrect_iso_639_3_language(capsys):
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "spa"
exclude = list()
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
experimental.correct_language(series, exclude)
experimental.correct_language(series)
captured = capsys.readouterr()
assert (
@ -297,13 +295,12 @@ def test_check_correct_iso_639_1_language():
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "en"
exclude = list()
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
result = experimental.correct_language(series, exclude)
result = experimental.correct_language(series)
assert result == None
@ -313,13 +310,12 @@ def test_check_correct_iso_639_3_language():
title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
language = "eng"
exclude = list()
# Create a dictionary to mimic Pandas series
row = {"dc.title": title, "dc.language.iso": language}
series = pd.Series(row)
result = experimental.correct_language(series, exclude)
result = experimental.correct_language(series)
assert result == None
@ -407,9 +403,8 @@ def test_check_doi_field():
# the citation and a DOI field.
d = {"cg.identifier.doi": doi, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
exclude = list()
result = check.citation_doi(series, exclude)
result = check.citation_doi(series)
assert result == None
@ -418,14 +413,13 @@ def test_check_doi_only_in_citation(capsys):
"""Test an item with a DOI in its citation, but no DOI field."""
citation = "Orth, A. 2021. Testing all the things. doi: 10.1186/1743-422X-9-218"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# an empty DOI field and a citation containing a DOI.
d = {"cg.identifier.doi": None, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.citation_doi(series, exclude)
check.citation_doi(series)
captured = capsys.readouterr()
assert (
@ -439,14 +433,13 @@ def test_title_in_citation():
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all the things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
result = check.title_in_citation(series, exclude)
result = check.title_in_citation(series)
assert result == None
@ -456,14 +449,13 @@ def test_title_not_in_citation(capsys):
title = "Testing all the things"
citation = "Orth, A. 2021. Testing all teh things."
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series), with
# the title and citation.
d = {"dc.title": title, "dcterms.bibliographicCitation": citation}
series = pd.Series(data=d)
check.title_in_citation(series, exclude)
check.title_in_citation(series)
captured = capsys.readouterr()
assert (
@ -477,13 +469,12 @@ def test_country_matches_region():
country = "Kenya"
region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {"cg.coverage.country": country, "cg.coverage.region": region}
series = pd.Series(data=d)
result = check.countries_match_regions(series, exclude)
result = check.countries_match_regions(series)
assert result == None
@ -495,7 +486,6 @@ def test_country_not_matching_region(capsys):
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {
@ -505,7 +495,7 @@ def test_country_not_matching_region(capsys):
}
series = pd.Series(data=d)
check.countries_match_regions(series, exclude)
check.countries_match_regions(series)
captured = capsys.readouterr()
assert (

View File

@ -131,7 +131,6 @@ def test_fix_country_not_matching_region():
country = "Kenya"
region = ""
missing_region = "Eastern Africa"
exclude = list()
# Emulate a column in a transposed dataframe (which is just a series)
d = {
@ -141,7 +140,7 @@ def test_fix_country_not_matching_region():
}
series = pd.Series(data=d)
result = fix.countries_match_regions(series, exclude)
result = fix.countries_match_regions(series)
# Emulate the correct series we are expecting
d_correct = {