mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-10-24 18:31:14 +02:00
Compare commits
10 Commits
v0.4.4
...
b16fa9121f
Author | SHA1 | Date | |
---|---|---|---|
b16fa9121f
|
|||
202bda862a
|
|||
7479310ac0
|
|||
98a91bc9c2
|
|||
fc5bedcc5c
|
|||
44d12d771a
|
|||
4a7000e975
|
|||
27b2d81ca8
|
|||
91ebd0f606
|
|||
dd2cfae047
|
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.4.5] - 2021-03-04
|
||||
### Added
|
||||
- Check dates in dcterms.issued field as well, not just fields that have the
|
||||
word "date" in them
|
||||
|
||||
### Updated
|
||||
- Run `poetry update` to update project dependencies
|
||||
|
||||
## [0.4.4] - 2021-02-21
|
||||
### Added
|
||||
- Accept dates formatted in ISO 8601 extended with combined date and time, for
|
||||
|
@@ -109,8 +109,13 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
||||
- Add an option to drop invalid AGROVOC subjects?
|
||||
- Add tests for application invocation, i.e. `tests/test_app.py`?
|
||||
- Validate ISSNs or journal titles against CrossRef API?
|
||||
- Better ISO 8601 date parsing (currently only supports simple dates; perhaps we need to use `dateutil.parser.isoparse()`)
|
||||
- Fix lazy date check (assumes field name has "date" but could be dcterms.issued etc!)
|
||||
- Add configurable field validation, like specify a field name and a validation file?
|
||||
- Perhaps like --validate=field.name,filename
|
||||
- Add some row-based item sanity checks and fixes:
|
||||
- Warn if item is Open Access, but missing a filename or URL
|
||||
- Warn if item is Open Access, but missing a license
|
||||
- Warn if item has an ISSN but no journal title
|
||||
- Update journal titles from ISSN
|
||||
|
||||
## License
|
||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||
|
@@ -142,7 +142,7 @@ def run(argv):
|
||||
df[column] = df[column].apply(check.isbn)
|
||||
|
||||
# Check: invalid date
|
||||
match = re.match(r"^.*?date.*$", column)
|
||||
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.date, field_name=column)
|
||||
|
||||
|
@@ -1 +1 @@
|
||||
VERSION = "0.4.4"
|
||||
VERSION = "0.4.5"
|
||||
|
42
poetry.lock
generated
42
poetry.lock
generated
@@ -233,7 +233,7 @@ python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
name = "ipython"
|
||||
version = "7.20.0"
|
||||
version = "7.21.0"
|
||||
description = "IPython: Productive Interactive Computing"
|
||||
category = "dev"
|
||||
optional = false
|
||||
@@ -388,7 +388,7 @@ pyparsing = ">=2.0.2"
|
||||
|
||||
[[package]]
|
||||
name = "pandas"
|
||||
version = "1.2.2"
|
||||
version = "1.2.3"
|
||||
description = "Powerful data structures for data analysis, time series, and statistics"
|
||||
category = "main"
|
||||
optional = false
|
||||
@@ -851,8 +851,8 @@ iniconfig = [
|
||||
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
|
||||
]
|
||||
ipython = [
|
||||
{file = "ipython-7.20.0-py3-none-any.whl", hash = "sha256:1918dea4bfdc5d1a830fcfce9a710d1d809cbed123e85eab0539259cb0f56640"},
|
||||
{file = "ipython-7.20.0.tar.gz", hash = "sha256:1923af00820a8cf58e91d56b89efc59780a6e81363b94464a0f17c039dffff9e"},
|
||||
{file = "ipython-7.21.0-py3-none-any.whl", hash = "sha256:34207ffb2f653bced2bc8e3756c1db86e7d93e44ed049daae9814fed66d408ec"},
|
||||
{file = "ipython-7.21.0.tar.gz", hash = "sha256:04323f72d5b85b606330b6d7e2dc8d2683ad46c3905e955aa96ecc7a99388e70"},
|
||||
]
|
||||
ipython-genutils = [
|
||||
{file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"},
|
||||
@@ -924,24 +924,22 @@ packaging = [
|
||||
{file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
|
||||
]
|
||||
pandas = [
|
||||
{file = "pandas-1.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c76a108272a4de63189b8f64086bbaf8348841d7e610b52f50959fbbf401524f"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e61a089151f1ed78682aa77a3bcae0495cf8e585546c26924857d7e8a9960568"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fc351cd2df318674669481eb978a7799f24fd14ef26987a1aa75105b0531d1a1"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-win32.whl", hash = "sha256:05ca6bda50123158eb15e716789083ca4c3b874fd47688df1716daa72644ee1c"},
|
||||
{file = "pandas-1.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:08b6bbe74ae2b3e4741a744d2bce35ce0868a6b4189d8b84be26bb334f73da4c"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:230de25bd9791748b2638c726a5f37d77a96a83854710110fadd068d1e2c2c9f"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:a50cf3110a1914442e7b7b9cef394ef6bed0d801b8a34d56f4c4e927bbbcc7d0"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4d33537a375cfb2db4d388f9a929b6582a364137ea6c6b161b0166440d6ffe36"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8ac028cd9a6e1efe43f3dc36f708263838283535cc45430a98b9803f44f4c84b"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-win32.whl", hash = "sha256:c43d1beb098a1da15934262009a7120aac8dafa20d042b31dab48c28868eb5a4"},
|
||||
{file = "pandas-1.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:69a70d79a791fa1fd5f6e84b8b6dec2ec92369bde4ab2e18d43fc8a1825f51d1"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cbad4155028b8ca66aa19a8b13f593ebbf51bfb6c3f2685fe64f04d695a81864"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:fbddbb20f30308ba2546193d64e18c23b69f59d48cdef73676cbed803495c8dc"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:214ae60b1f863844e97c87f758c29940ffad96c666257323a4bb2a33c58719c2"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:26b4919eb3039a686a86cd4f4a74224f8f66e3a419767da26909dcdd3b37c31e"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-win32.whl", hash = "sha256:e3c250faaf9979d0ec836d25e420428db37783fa5fed218da49c9fc06f80f51c"},
|
||||
{file = "pandas-1.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:e9bbcc7b5c432600797981706f5b54611990c6a86b2e424329c995eea5f9c42b"},
|
||||
{file = "pandas-1.2.2.tar.gz", hash = "sha256:14ed84b463e9b84c8ff9308a79b04bf591ae3122a376ee0f62c68a1bd917a773"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4d821b9b911fc1b7d428978d04ace33f0af32bb7549525c8a7b08444bce46b74"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9f5829e64507ad10e2561b60baf285c470f3c4454b007c860e77849b88865ae7"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:97b1954533b2a74c7e20d1342c4f01311d3203b48f2ebf651891e6a6eaf01104"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-win32.whl", hash = "sha256:5e3c8c60541396110586bcbe6eccdc335a38e7de8c217060edaf4722260b158f"},
|
||||
{file = "pandas-1.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8a051e957c5206f722e83f295f95a2cf053e890f9a1fba0065780a8c2d045f5d"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a93e34f10f67d81de706ce00bf8bb3798403cabce4ccb2de10c61b5ae8786ab5"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:46fc671c542a8392a4f4c13edc8527e3a10f6cb62912d856f82248feb747f06e"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:43e00770552595c2250d8d712ec8b6e08ca73089ac823122344f023efa4abea3"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-win32.whl", hash = "sha256:475b7772b6e18a93a43ea83517932deff33954a10d4fbae18d0c1aba4182310f"},
|
||||
{file = "pandas-1.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:72ffcea00ae8ffcdbdefff800284311e155fbb5ed6758f1a6110fc1f8f8f0c1c"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:621c044a1b5e535cf7dcb3ab39fca6f867095c3ef223a524f18f60c7fee028ea"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0f27fd1adfa256388dc34895ca5437eaf254832223812afd817a6f73127f969c"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:dbb255975eb94143f2e6ec7dadda671d25147939047839cd6b8a4aff0379bb9b"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-win32.whl", hash = "sha256:d59842a5aa89ca03c2099312163ffdd06f56486050e641a45d926a072f04d994"},
|
||||
{file = "pandas-1.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:09761bf5f8c741d47d4b8b9073288de1be39bbfccc281d70b889ade12b2aad29"},
|
||||
{file = "pandas-1.2.3.tar.gz", hash = "sha256:df6f10b85aef7a5bb25259ad651ad1cc1d6bb09000595cab47e718cbac250b1d"},
|
||||
]
|
||||
parsedatetime = [
|
||||
{file = "parsedatetime-2.6-py3-none-any.whl", hash = "sha256:cb96edd7016872f58479e35879294258c71437195760746faffedb692aef000b"},
|
||||
|
@@ -1,12 +1,15 @@
|
||||
[tool.poetry]
|
||||
name = "csv-metadata-quality"
|
||||
version = "0.4.4"
|
||||
version = "0.4.5"
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
|
||||
authors = ["Alan Orth <alan.orth@gmail.com>"]
|
||||
license="GPL-3.0-only"
|
||||
repository = "https://github.com/ilri/csv-metadata-quality"
|
||||
homepage = "https://github.com/ilri/csv-metadata-quality"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
csv-metadata-quality = 'csv_metadata_quality.__main__:main'
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
pandas = "^1.0.4"
|
||||
|
@@ -21,7 +21,7 @@ flake8==3.8.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (p
|
||||
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
iniconfig==1.1.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0"
|
||||
ipython==7.20.0; python_version >= "3.7" and python_version < "4.0"
|
||||
ipython==7.21.0; python_version >= "3.7" and python_version < "4.0"
|
||||
isodate==0.6.0
|
||||
isort==5.7.0; python_version >= "3.6" and python_version < "4.0"
|
||||
jdcal==1.4.1; python_version >= "3.6"
|
||||
@@ -33,7 +33,7 @@ mypy-extensions==0.4.3; python_version >= "3.6"
|
||||
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
openpyxl==3.0.6; python_version >= "3.6"
|
||||
packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6"
|
||||
pandas==1.2.2; python_full_version >= "3.7.1"
|
||||
pandas==1.2.3; python_full_version >= "3.7.1"
|
||||
parsedatetime==2.6
|
||||
parso==0.8.1; python_version >= "3.7" and python_version < "4.0"
|
||||
pathspec==0.8.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version >= "3.6"
|
||||
|
@@ -4,7 +4,7 @@ colorama==0.4.4; (python_version >= "2.7" and python_full_version < "3.0.0") or
|
||||
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
|
||||
langid==1.1.6
|
||||
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"
|
||||
pandas==1.2.2; python_full_version >= "3.7.1"
|
||||
pandas==1.2.3; python_full_version >= "3.7.1"
|
||||
pycountry==19.8.18
|
||||
python-dateutil==2.8.1; python_full_version >= "3.7.1"
|
||||
python-stdnum==1.16
|
||||
|
2
setup.py
2
setup.py
@@ -14,7 +14,7 @@ install_requires = [
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.4.3",
|
||||
version="0.4.5",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
|
Reference in New Issue
Block a user