Mirror of https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-09 22:56:01 +02:00
Compare commits
55 Commits
e15c98cccb, 93c4e1a993, 9963b2bb64, 76291c1876, 604bd5bda6, e7c220039b, d7b5e378bc, 8435ee242d, 7ac1c6f554, 86d4623fd3, ddbe970342, 31c78ca6f3, 154d05b5e2, 186f146edb, a4cb301943, 219e37526d, f304ca6a33, 3d5c8bdf5d, 480956d54d, d9fc09f121, b5899001b7, c92977d1ca, 280a99c8a8, 0388145b81, d97dcd19db, b375f0e895, 865c61d316, 3b2ba57b75, 2805c556a9, c354a3687c, 07f80cb37f, 89d72540f1, 81190d56bb, 2af714fb05, cc863a6bdd, 113e7cd8b6, bd984f3db5, 3f4e84a638, c52b3ed131, 884e8f970d, 6d02f5026a, e7cb8920db, ed5612fbcf, 3247495cee, 7255bf4707, 3aaf18c290, 745306edd7, e324e321a2, 232ff99898, 13d5221378, 3c7a9eb75b, a99fbd8a51, e801042340, 62ef2a4489, 9ce7dc6716
CHANGELOG.md (28 changes)

@@ -4,6 +4,34 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2019-09-26
+### Updated
+- Update python dependencies to latest versions, including numpy 1.17.2, pandas
+0.25.1, pytest 5.1.3, and requests-cache 0.5.2
+
+## Added
+- csvkit to dev requirements (csvcut etc are useful during development)
+- Experimental language validation using `-e` (see README.md)
+
+### Changed
+- Re-formatted code with black and isort
+
+## [0.2.2] - 2019-08-27
+### Changed
+- Output of date checks to include column names (helps debugging in case there are multiple date fields)
+
+### Added
+- Ability to exclude certain fields using `--exclude-fields`
+- Fix for missing space after a comma, ie "Orth,Alan S."
+
+### Improved
+- AGROVOC lookup code
+
+## [0.2.1] - 2019-08-11
+### Added
+- Check for uncommon filename extensions
+- Replacement of unneccessary Unicode characters like soft hyphens (U+00AD)
+
 ## [0.2.0] - 2019-08-09
 ### Added
 - Handle Ctrl-C interrupt gracefully
Pipfile (4 changes)

@@ -8,6 +8,9 @@ pytest = "*"
 ipython = "*"
 flake8 = "*"
 pytest-clarity = "*"
+black = "*"
+isort = "*"
+csvkit = "*"
 
 [packages]
 pandas = "*"
@@ -17,6 +20,7 @@ requests = "*"
 requests-cache = "*"
 pycountry = "*"
 csv-metadata-quality = {editable = true,path = "."}
+langid = "*"
 
 [requires]
 python_version = "3.7"
Pipfile.lock (315 changes, generated)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "f8f0a9f208ec41f4d8183ecfc68356b40674b083b2f126c37468b3c9533ba5df"
+            "sha256": "59562d8c59eb09e23b49475d6901687edbf605f5b84e283e90cc8e2de518641f"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -18,10 +18,10 @@
     "default": {
         "certifi": {
             "hashes": [
-                "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
-                "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695"
+                "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50",
+                "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef"
             ],
-            "version": "==2019.6.16"
+            "version": "==2019.9.11"
         },
         "chardet": {
             "hashes": [
@@ -41,53 +41,61 @@
             ],
             "version": "==2.8"
         },
+        "langid": {
+            "hashes": [
+                "sha256:044bcae1912dab85c33d8e98f2811b8f4ff1213e5e9a9e9510137b84da2cb293"
+            ],
+            "index": "pypi",
+            "version": "==1.1.6"
+        },
         "numpy": {
             "hashes": [
-                "sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e",
-                "sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd",
-                "sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473",
-                "sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a",
-                "sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658",
-                "sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61",
-                "sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4",
-                "sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302",
-                "sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a",
-                "sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094",
-                "sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511",
-                "sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a",
-                "sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554",
-                "sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54",
-                "sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c",
-                "sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0"
+                "sha256:05dbfe72684cc14b92568de1bc1f41e5f62b00f714afc9adee42f6311738091f",
+                "sha256:0d82cb7271a577529d07bbb05cb58675f2deb09772175fab96dc8de025d8ac05",
+                "sha256:10132aa1fef99adc85a905d82e8497a580f83739837d7cbd234649f2e9b9dc58",
+                "sha256:12322df2e21f033a60c80319c25011194cd2a21294cc66fee0908aeae2c27832",
+                "sha256:16f19b3aa775dddc9814e02a46b8e6ae6a54ed8cf143962b4e53f0471dbd7b16",
+                "sha256:3d0b0989dd2d066db006158de7220802899a1e5c8cf622abe2d0bd158fd01c2c",
+                "sha256:438a3f0e7b681642898fd7993d38e2bf140a2d1eafaf3e89bb626db7f50db355",
+                "sha256:5fd214f482ab53f2cea57414c5fb3e58895b17df6e6f5bca5be6a0bb6aea23bb",
+                "sha256:73615d3edc84dd7c4aeb212fa3748fb83217e00d201875a47327f55363cef2df",
+                "sha256:7bd355ad7496f4ce1d235e9814ec81ee3d28308d591c067ce92e49f745ba2c2f",
+                "sha256:7d077f2976b8f3de08a0dcf5d72083f4af5411e8fddacd662aae27baa2601196",
+                "sha256:a4092682778dc48093e8bda8d26ee8360153e2047826f95a3f5eae09f0ae3abf",
+                "sha256:b458de8624c9f6034af492372eb2fee41a8e605f03f4732f43fc099e227858b2",
+                "sha256:e70fc8ff03a961f13363c2c95ef8285e0cf6a720f8271836f852cc0fa64e97c8",
+                "sha256:ee8e9d7cad5fe6dde50ede0d2e978d81eafeaa6233fb0b8719f60214cf226578",
+                "sha256:f4a4f6aba148858a5a5d546a99280f71f5ee6ec8182a7d195af1a914195b21a2"
             ],
-            "version": "==1.17.0"
+            "version": "==1.17.2"
         },
         "pandas": {
             "hashes": [
-                "sha256:074a032f99bb55d178b93bd98999c971542f19317829af08c99504febd9e9b8b",
-                "sha256:20f1728182b49575c2f6f681b3e2af5fac9e84abdf29488e76d569a7969b362e",
-                "sha256:2745ba6e16c34d13d765c3657bb64fa20a0e2daf503e6216a36ed61770066179",
-                "sha256:32c44e5b628c48ba17703f734d59f369d4cdcb4239ef26047d6c8a8bfda29a6b",
-                "sha256:3b9f7dcee6744d9dcdd53bce19b91d20b4311bf904303fa00ef58e7df398e901",
-                "sha256:544f2033250980fb6f069ce4a960e5f64d99b8165d01dc39afd0b244eeeef7d7",
-                "sha256:58f9ef68975b9f00ba96755d5702afdf039dea9acef6a0cfd8ddcde32918a79c",
-                "sha256:9023972a92073a495eba1380824b197ad1737550fe1c4ef8322e65fe58662888",
-                "sha256:914341ad2d5b1ea522798efa4016430b66107d05781dbfe7cf05eba8f37df995",
-                "sha256:9d151bfb0e751e2c987f931c57792871c8d7ff292bcdfcaa7233012c367940ee",
-                "sha256:b932b127da810fef57d427260dde1ad54542c136c44b227a1e367551bb1a684b",
-                "sha256:cfb862aa37f4dd5be0730731fdb8185ac935aba8b51bf3bd035658111c9ee1c9",
-                "sha256:de7ecb4b120e98b91e8a2a21f186571266a8d1faa31d92421e979c7ca67d8e5c",
-                "sha256:df7e1933a0b83920769611c5d6b9a1bf301e3fa6a544641c6678c67621fe9843"
+                "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232",
+                "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9",
+                "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15",
+                "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21",
+                "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919",
+                "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8",
+                "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5",
+                "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d",
+                "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee",
+                "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6",
+                "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165",
+                "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9",
+                "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958",
+                "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18",
+                "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2"
             ],
             "index": "pypi",
-            "version": "==0.25.0"
+            "version": "==0.25.1"
         },
         "pycountry": {
             "hashes": [
-                "sha256:68e58bfd3bedeea49ba9d4b38f2bd5e042f9753628eba9a819fb03f551d89096"
+                "sha256:3c57aa40adcf293d59bebaffbe60d8c39976fba78d846a018dc0c2ec9c6cb3cb"
             ],
             "index": "pypi",
-            "version": "==19.7.15"
+            "version": "==19.8.18"
         },
         "python-dateutil": {
             "hashes": [
@@ -121,11 +129,11 @@
         },
         "requests-cache": {
            "hashes": [
-                "sha256:6822f788c5ee248995c4bfbd725de2002ad710182ba26a666e85b64981866060",
-                "sha256:73a7211870f7d67af5fd81cad2f67cfe1cd3eb4ee6a85155e07613968cc72dfc"
+                "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb",
+                "sha256:81e13559baee64677a7d73b85498a5a8f0639e204517b5d05ff378e44a57831a"
             ],
             "index": "pypi",
-            "version": "==0.5.0"
+            "version": "==0.5.2"
         },
         "six": {
             "hashes": [
@@ -136,10 +144,10 @@
         },
         "urllib3": {
             "hashes": [
-                "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1",
-                "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232"
+                "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398",
+                "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86"
             ],
-            "version": "==1.25.3"
+            "version": "==1.25.6"
         },
         "xlrd": {
             "hashes": [
@@ -151,6 +159,38 @@
         }
     },
     "develop": {
+        "agate": {
+            "hashes": [
+                "sha256:48d6f80b35611c1ba25a642cbc5b90fcbdeeb2a54711c4a8d062ee2809334d1c",
+                "sha256:c93aaa500b439d71e4a5cf088d0006d2ce2c76f1950960c8843114e5f361dfd3"
+            ],
+            "version": "==1.6.1"
+        },
+        "agate-dbf": {
+            "hashes": [
+                "sha256:00c93c498ec9a04cc587bf63dd7340e67e2541f0df4c9a7259d7cb3dd4ce372f"
+            ],
+            "version": "==0.2.1"
+        },
+        "agate-excel": {
+            "hashes": [
+                "sha256:8f255ef2c87c436b7132049e1dd86c8e08bf82d8c773aea86f3069b461a17d52"
+            ],
+            "version": "==0.2.3"
+        },
+        "agate-sql": {
+            "hashes": [
+                "sha256:9277490ba8b8e7c747a9ae3671f52fe486784b48d4a14e78ca197fb0e36f281b"
+            ],
+            "version": "==0.5.4"
+        },
+        "appdirs": {
+            "hashes": [
+                "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92",
+                "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"
+            ],
+            "version": "==1.4.3"
+        },
         "atomicwrites": {
             "hashes": [
                 "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
@@ -165,6 +205,13 @@
             ],
             "version": "==19.1.0"
         },
+        "babel": {
+            "hashes": [
+                "sha256:af92e6106cb7c55286b25b38ad7695f8b4efb36a90ba483d7f7a6628c46158ab",
+                "sha256:e86135ae101e31e2c8ec20a4e0c5220f4eed12487d5cf3f78be7e98d3a57fc28"
+            ],
+            "version": "==2.7.0"
+        },
         "backcall": {
             "hashes": [
                 "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
@@ -172,6 +219,35 @@
             ],
             "version": "==0.1.0"
         },
+        "black": {
+            "hashes": [
+                "sha256:09a9dcb7c46ed496a9850b76e4e825d6049ecd38b611f1224857a79bd985a8cf",
+                "sha256:68950ffd4d9169716bcb8719a56c07a2f4485354fec061cdd5910aa07369731c"
+            ],
+            "index": "pypi",
+            "version": "==19.3b0"
+        },
+        "click": {
+            "hashes": [
+                "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+                "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+            ],
+            "version": "==7.0"
+        },
+        "csvkit": {
+            "hashes": [
+                "sha256:1353a383531bee191820edfb88418c13dfe1cdfa9dd3dc46f431c05cd2a260a0"
+            ],
+            "index": "pypi",
+            "version": "==1.0.4"
+        },
+        "dbfread": {
+            "hashes": [
+                "sha256:07c8a9af06ffad3f6f03e8fe91ad7d2733e31a26d2b72c4dd4cfbae07ee3b73d",
+                "sha256:f604def58c59694fa0160d7be5d0b8d594467278d2bb6a47d46daf7162c84cec"
+            ],
+            "version": "==2.0.7"
+        },
         "decorator": {
             "hashes": [
                 "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
@@ -186,6 +262,12 @@
             ],
             "version": "==0.3"
         },
+        "et-xmlfile": {
+            "hashes": [
+                "sha256:614d9722d572f6246302c4491846d2c393c199cfa4edc9af593437691683335b"
+            ],
+            "version": "==1.0.1"
+        },
         "flake8": {
             "hashes": [
                 "sha256:19241c1cbc971b9962473e4438a2ca19749a7dd002dd1a946eaba171b4114548",
@@ -194,20 +276,27 @@
             "index": "pypi",
             "version": "==3.7.8"
         },
+        "future": {
+            "hashes": [
+                "sha256:67045236dcfd6816dc439556d009594abf643e5eb48992e36beac09c2ca659b8"
+            ],
+            "version": "==0.17.1"
+        },
         "importlib-metadata": {
             "hashes": [
-                "sha256:23d3d873e008a513952355379d93cbcab874c58f4f034ff657c7a87422fa64e8",
-                "sha256:80d2de76188eabfbfcf27e6a37342c2827801e59c4cc14b0371c56fed43820e3"
+                "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26",
+                "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af"
             ],
-            "version": "==0.19"
+            "markers": "python_version < '3.8'",
+            "version": "==0.23"
         },
         "ipython": {
             "hashes": [
-                "sha256:1d3a1692921e932751bc1a1f7bb96dc38671eeefdc66ed33ee4cbc57e92a410e",
-                "sha256:537cd0176ff6abd06ef3e23f2d0c4c2c8a4d9277b7451544c6cbf56d1c79a83d"
+                "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b",
+                "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1"
             ],
             "index": "pypi",
-            "version": "==7.7.0"
+            "version": "==7.8.0"
         },
         "ipython-genutils": {
             "hashes": [
@@ -216,12 +305,41 @@
             ],
             "version": "==0.2.0"
         },
+        "isodate": {
+            "hashes": [
+                "sha256:2e364a3d5759479cdb2d37cce6b9376ea504db2ff90252a2e5b7cc89cc9ff2d8",
+                "sha256:aa4d33c06640f5352aca96e4b81afd8ab3b47337cc12089822d6f322ac772c81"
+            ],
+            "version": "==0.6.0"
+        },
+        "isort": {
+            "hashes": [
+                "sha256:54da7e92468955c4fceacd0c86bd0ec997b0e1ee80d97f67c35a78b719dccab1",
+                "sha256:6e811fcb295968434526407adb8796944f1988c5b65e8139058f2014cbe100fd"
+            ],
+            "index": "pypi",
+            "version": "==4.3.21"
+        },
+        "jdcal": {
+            "hashes": [
+                "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba",
+                "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"
+            ],
+            "version": "==1.4.1"
+        },
         "jedi": {
             "hashes": [
-                "sha256:53c850f1a7d3cfcd306cc513e2450a54bdf5cacd7604b74e42dd1f0758eaaf36",
-                "sha256:e07457174ef7cb2342ff94fa56484fe41cec7ef69b0059f01d3f812379cb6f7c"
+                "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27",
+                "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e"
             ],
-            "version": "==0.14.1"
+            "version": "==0.15.1"
         },
+        "leather": {
+            "hashes": [
+                "sha256:076d1603b5281488285718ce1a5ce78cf1027fe1e76adf9c548caf83c519b988",
+                "sha256:e0bb36a6d5f59fbf3c1a6e75e7c8bee29e67f06f5b48c0134407dde612eba5e2"
+            ],
+            "version": "==0.3.3"
+        },
         "mccabe": {
             "hashes": [
@@ -237,12 +355,25 @@
             ],
             "version": "==7.2.0"
         },
+        "openpyxl": {
+            "hashes": [
+                "sha256:340a1ab2069764559b9d58027a43a24db18db0e25deb80f81ecb8ca7ee5253db"
+            ],
+            "version": "==3.0.0"
+        },
         "packaging": {
             "hashes": [
-                "sha256:a7ac867b97fdc07ee80a8058fe4435ccd274ecc3b0ed61d852d7d53055528cf9",
-                "sha256:c491ca87294da7cc01902edbe30a5bc6c4c28172b5138ab4e4aa1b9d7bfaeafe"
+                "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47",
+                "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108"
            ],
-            "version": "==19.1"
+            "version": "==19.2"
         },
+        "parsedatetime": {
+            "hashes": [
+                "sha256:3d817c58fb9570d1eec1dd46fa9448cd644eeed4fb612684b02dfda3a79cb84b",
+                "sha256:9ee3529454bf35c40a77115f5a596771e59e1aee8c53306f346c461b8e913094"
+            ],
+            "version": "==2.4"
+        },
         "parso": {
             "hashes": [
@@ -268,10 +399,10 @@
         },
         "pluggy": {
             "hashes": [
-                "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc",
-                "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c"
+                "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6",
+                "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34"
             ],
-            "version": "==0.12.0"
+            "version": "==0.13.0"
         },
         "prompt-toolkit": {
             "hashes": [
@@ -325,11 +456,11 @@
         },
         "pytest": {
             "hashes": [
-                "sha256:6ef6d06de77ce2961156013e9dff62f1b2688aa04d0dc244299fe7d67e09370d",
-                "sha256:a736fed91c12681a7b34617c8fcefe39ea04599ca72c608751c31d89579a3f77"
+                "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2",
+                "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31"
             ],
             "index": "pypi",
-            "version": "==5.0.1"
+            "version": "==5.1.3"
         },
         "pytest-clarity": {
             "hashes": [
@@ -338,6 +469,26 @@
             "index": "pypi",
             "version": "==0.2.0a1"
         },
+        "python-slugify": {
+            "hashes": [
+                "sha256:575d03256a132fc1efb4c52966c6eb11c57a13b071618f0b26076057a23f6937"
+            ],
+            "version": "==3.0.4"
+        },
+        "pytimeparse": {
+            "hashes": [
+                "sha256:04b7be6cc8bd9f5647a6325444926c3ac34ee6bc7e69da4367ba282f076036bd",
+                "sha256:e86136477be924d7e670646a98561957e8ca7308d44841e21f5ddea757556a0a"
+            ],
+            "version": "==1.1.8"
+        },
+        "pytz": {
+            "hashes": [
+                "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
+                "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
+            ],
+            "version": "==2019.2"
+        },
         "six": {
             "hashes": [
                 "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
@@ -345,18 +496,38 @@
             ],
             "version": "==1.12.0"
         },
+        "sqlalchemy": {
+            "hashes": [
+                "sha256:2f8ff566a4d3a92246d367f2e9cd6ed3edeef670dcd6dda6dfdc9efed88bcd80"
+            ],
+            "version": "==1.3.8"
+        },
+        "termcolor": {
+            "hashes": [
+                "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
+            ],
+            "version": "==1.1.0"
+        },
+        "text-unidecode": {
+            "hashes": [
+                "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+                "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+            ],
+            "version": "==1.3"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c",
+                "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e"
+            ],
+            "version": "==0.10.0"
+        },
         "traitlets": {
             "hashes": [
-                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
-                "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+                "sha256:262089114405f22f4833be96b31e143ab906d7764a22c04c71fee0bbda4787ba",
+                "sha256:6ad5b30dacd5e2424c46cc94a0aeab990d98ae17d181acea2cc4272ac3409fca"
             ],
-            "version": "==4.3.2"
+            "version": "==4.3.3.dev0"
         },
         "wcwidth": {
             "hashes": [
@@ -365,12 +536,20 @@
             ],
             "version": "==0.1.7"
         },
+        "xlrd": {
+            "hashes": [
+                "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2",
+                "sha256:e551fb498759fa3a5384a94ccd4c3c02eb7c00ea424426e212ac0c57be9dfbde"
+            ],
+            "index": "pypi",
+            "version": "==1.2.0"
+        },
         "zipp": {
             "hashes": [
-                "sha256:4970c3758f4e89a7857a973b1e2a5d75bcdc47794442f2e2dd4fe8e0466e809a",
-                "sha256:8a5712cfd3bb4248015eb3b0b3c54a5f6ee3f2425963ef2a0125b8bc40aafaec"
+                "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e",
+                "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335"
            ],
-            "version": "==0.5.2"
+            "version": "==0.6.0"
         }
     }
 }
README.md (28 changes)

@@ -1,4 +1,4 @@
-# CSV Metadata Quality [](https://travis-ci.org/alanorth/csv-metadata-quality) [](https://builds.sr.ht/~alanorth/csv-metadata-quality?)
+# CSV Metadata Quality [](https://travis-ci.org/ilri/csv-metadata-quality) [](https://builds.sr.ht/~alanorth/csv-metadata-quality?)
 A simple, but opinionated metadata quality checker and fixer designed to work with CSVs in the DSpace ecosystem. The implementation is essentially a pipeline of checks and fixes that begins with splitting multi-value fields on the standard DSpace "||" separator, trimming leading/trailing whitespace, and then proceeding to more specialized cases like ISSNs, ISBNs, languages, etc.
 
 Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
@@ -6,7 +6,8 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](https://pandas.pydata.org/) library, though your mileage may vary with Excel because this is much less tested.
 ## Functionality
 
 - Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
-- Validate languages against ISO 639-2 and ISO 639-3
+- Validate languages against ISO 639-1 (alpha2) and ISO 639-3 (alpha3)
+- Experimental validation of titles and abstracts against item's Dublin Core language field
 - Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
 - Fix leading, trailing, and excessive (ie, more than one) whitespace
 - Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
@@ -19,7 +20,7 @@ The easiest way to install CSV Metadata Quality is with [pipenv](https://github.com/pypa/pipenv):
 
 ```
-$ git clone https://git.sr.ht/~alanorth/csv-metadata-quality
+$ git clone https://github.com/ilri/csv-metadata-quality.git
 $ cd csv-metadata-quality
 $ pipenv install
 $ pipenv shell
@@ -28,7 +29,7 @@ $ pipenv shell
 Otherwise, if you don't have pipenv, you can use a vanilla Python virtual environment:
 
 ```
-$ git clone https://git.sr.ht/~alanorth/csv-metadata-quality
+$ git clone https://github.com/ilri/csv-metadata-quality.git
 $ cd csv-metadata-quality
 $ python3 -m venv venv
 $ source venv/bin/activate
@@ -69,11 +70,30 @@ Invalid AGROVOC (cg.coverage.country): KENYAA
 
 *Note: Requests to the AGROVOC REST API are cached using [requests_cache](https://pypi.org/project/requests-cache/) to speed up subsequent runs with the same data and to be kind to the system's administrators.*
 
+## Experimental Checks
+You can enable experimental support for validating whether the value of an item's `dc.language.iso` or `dcterms.language` field matches the actual language used in its title, abstract, and citation.
+
+```
+$ csv-metadata-quality -i data/test.csv -o /tmp/test.csv -e
+...
+Possibly incorrect language es (detected en): Incorrect ISO 639-1 language
+Possibly incorrect language spa (detected eng): Incorrect ISO 639-3 language
+```
+
+This currently uses the [Python langid](https://github.com/saffsd/langid.py) library. In the future I would like to move to the fastText library, but there is currently an [issue with their Python bindings](https://github.com/facebookresearch/fastText/issues/909) that makes this unfeasible.
+
 ## Todo
 
 - Reporting / summary
 - Better logging, for example with INFO, WARN, and ERR levels
 - Verbose, debug, or quiet options
 - Warn if an author is shorter than 3 characters?
 - Validate dc.rights field against SPDX? Perhaps with an option like `-m spdx` to enable the spdx module?
+- Validate DOIs? Normalize to https://doi.org format? Or use just the DOI part: 10.1016/j.worlddev.2010.06.006
+- Warn if two items use the same file in `filename` column
+- Add an option to drop invalid AGROVOC subjects?
+- Add check for author names with incorrect spacing after commas, ie "Orth,Alan S."
+- Add tests for application invocation, ie `tests/test_app.py`?
 
 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
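For context, a minimal sketch of the langid-based detection described above; the sample text is hypothetical, and the restricted language set here is a shortened stand-in for the allow list hard-coded in the experimental check:

```
import langid

# Restrict the detection space to reduce false positives, as the
# experimental check does with its hard-coded language list.
langid.set_languages(["en", "es", "fr"])

# Hypothetical title text for an item whose dc.language.iso claims "es"
sample_text = "Adoption of dairy technologies in smallholder farms"

# classify() returns a (language, score) tuple, e.g. ("en", -42.1)
detected_language, score = langid.classify(sample_text)

if detected_language != "es":
    print(f"Possibly incorrect language es (detected {detected_language})")
```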
@@ -1,10 +1,11 @@
-from csv_metadata_quality import app
 from sys import argv
 
+from csv_metadata_quality import app
+
 
 def main():
     app.run(argv)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
@@ -1,20 +1,53 @@
-from csv_metadata_quality.version import VERSION
 import argparse
-import csv_metadata_quality.check as check
-import csv_metadata_quality.fix as fix
-import pandas as pd
 import re
 import signal
 import sys
 
+import pandas as pd
+
+import csv_metadata_quality.check as check
+import csv_metadata_quality.experimental as experimental
+import csv_metadata_quality.fix as fix
+from csv_metadata_quality.version import VERSION
+
 
 def parse_args(argv):
-    parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.')
-    parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country')
-    parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8'))
-    parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8'))
-    parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true')
-    parser.add_argument('--version', '-V', action='version', version=f'CSV Metadata Quality v{VERSION}')
+    parser = argparse.ArgumentParser(description="Metadata quality checker and fixer.")
+    parser.add_argument(
+        "--agrovoc-fields",
+        "-a",
+        help="Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country",
+    )
+    parser.add_argument(
+        "--experimental-checks",
+        "-e",
+        help="Enable experimental checks like language detection", action="store_true"
+    )
+    parser.add_argument(
+        "--input-file",
+        "-i",
+        help="Path to input file. Can be UTF-8 CSV or Excel XLSX.",
+        required=True,
+        type=argparse.FileType("r", encoding="UTF-8"),
+    )
+    parser.add_argument(
+        "--output-file",
+        "-o",
+        help="Path to output file (always CSV).",
+        required=True,
+        type=argparse.FileType("w", encoding="UTF-8"),
+    )
+    parser.add_argument(
+        "--unsafe-fixes", "-u", help="Perform unsafe fixes.", action="store_true"
+    )
+    parser.add_argument(
+        "--version", "-V", action="version", version=f"CSV Metadata Quality v{VERSION}"
+    )
+    parser.add_argument(
+        "--exclude-fields",
+        "-x",
+        help="Comma-separated list of fields to skip, for example: dc.contributor.author,dc.identifier.citation",
+    )
     args = parser.parse_args()
 
     return args
@@ -33,7 +66,20 @@ def run(argv):
     # Read all fields as strings so dates don't get converted from 1998 to 1998.0
     df = pd.read_csv(args.input_file, dtype=str)
 
-    for column in df.columns.values.tolist():
+    for column in df.columns:
+        # Check if the user requested to skip any fields
+        if args.exclude_fields:
+            skip = False
+            # Split the list of excludes on ',' so we can test exact matches
+            # rather than fuzzy matches with regexes or "if word in string"
+            for exclude in args.exclude_fields.split(","):
+                if column == exclude and skip is False:
+                    skip = True
+            if skip:
+                print(f"Skipping {column}")
+
+                continue
+
         # Fix: whitespace
         df[column] = df[column].apply(fix.whitespace)
 
@@ -41,6 +87,13 @@ def run(argv):
         if args.unsafe_fixes:
             df[column] = df[column].apply(fix.newlines)
 
+        # Fix: missing space after comma. Only run on author and citation
+        # fields for now, as this problem is mostly an issue in names.
+        if args.unsafe_fixes:
+            match = re.match(r"^.*?(author|citation).*$", column)
+            if match is not None:
+                df[column] = df[column].apply(fix.comma_space, field_name=column)
+
         # Fix: unnecessary Unicode
         df[column] = df[column].apply(fix.unnecessary_unicode)
 
@@ -62,29 +115,51 @@ def run(argv):
         # Check: invalid AGROVOC subject
         if args.agrovoc_fields:
             # Identify fields the user wants to validate against AGROVOC
-            for field in args.agrovoc_fields.split(','):
+            for field in args.agrovoc_fields.split(","):
                 if column == field:
                     df[column] = df[column].apply(check.agrovoc, field_name=column)
 
         # Check: invalid language
-        match = re.match(r'^.*?language.*$', column)
+        match = re.match(r"^.*?language.*$", column)
         if match is not None:
             df[column] = df[column].apply(check.language)
 
         # Check: invalid ISSN
-        match = re.match(r'^.*?issn.*$', column)
+        match = re.match(r"^.*?issn.*$", column)
         if match is not None:
             df[column] = df[column].apply(check.issn)
 
         # Check: invalid ISBN
-        match = re.match(r'^.*?isbn.*$', column)
+        match = re.match(r"^.*?isbn.*$", column)
         if match is not None:
             df[column] = df[column].apply(check.isbn)
 
         # Check: invalid date
-        match = re.match(r'^.*?date.*$', column)
+        match = re.match(r"^.*?date.*$", column)
         if match is not None:
-            df[column] = df[column].apply(check.date)
+            df[column] = df[column].apply(check.date, field_name=column)
 
+        # Check: filename extension
+        if column == "filename":
+            df[column] = df[column].apply(check.filename_extension)
+
+    ##
+    # Perform some checks on rows so we can consider items as a whole rather
+    # than simple on a field-by-field basis. This allows us to check whether
+    # the language used in the title and abstract matches the language indi-
+    # cated in the language field, for example.
+    #
+    # This is slower and apparently frowned upon in the Pandas community be-
+    # cause it requires iterating over rows rather than using apply over a
+    # column. For now it will have to do.
+    ##
+
+    if args.experimental_checks:
+        # Transpose the DataFrame so we can consider each row as a column
+        df_transposed = df.T
+
+        for column in df_transposed.columns:
+            experimental.correct_language(df_transposed[column])
+
     # Write
     df.to_csv(args.output_file, index=False)
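For orientation, the run() loop above dispatches checks by regex-matching column names and mapping each Series through a handler. A standalone sketch of that dispatch pattern, with a hypothetical frame and a stub in place of check.date():

```
import re

import pandas as pd

# Hypothetical frame standing in for a parsed CSV (all values read as str)
df = pd.DataFrame({"dc.date.issued": ["1990", "2019-07-260"]})


def check_date(field, field_name):
    # Stub standing in for check.date(); the real checks print problems
    print(f"Would validate date ({field_name}): {field}")
    return field


for column in df.columns:
    # Match "date" anywhere in the column name, as the app does
    if re.match(r"^.*?date.*$", column) is not None:
        df[column] = df[column].apply(check_date, field_name=column)
```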
@@ -18,10 +18,10 @@ def issn(field):
         return
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
 
         if not issn.is_valid(value):
-            print(f'Invalid ISSN: {value}')
+            print(f"Invalid ISSN: {value}")
 
     return field
 
@@ -43,10 +43,10 @@ def isbn(field):
         return
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
 
         if not isbn.is_valid(value):
-            print(f'Invalid ISBN: {value}')
+            print(f"Invalid ISBN: {value}")
 
     return field
 
@@ -64,18 +64,18 @@ def separators(field):
         return
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
 
         # After splitting, see if there are any remaining "|" characters
-        match = re.findall(r'^.*?\|.*$', value)
+        match = re.findall(r"^.*?\|.*$", value)
 
         if match:
-            print(f'Invalid multi-value separator: {field}')
+            print(f"Invalid multi-value separator: {field}")
 
     return field
 
 
-def date(field):
+def date(field, field_name):
     """Check if a date is valid.
 
     In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
@@ -88,22 +88,22 @@ def date(field):
     from datetime import datetime
 
     if pd.isna(field):
-        print(f'Missing date.')
+        print(f"Missing date ({field_name}).")
 
         return
 
     # Try to split multi-value field on "||" separator
-    multiple_dates = field.split('||')
+    multiple_dates = field.split("||")
 
     # We don't allow multi-value date fields
     if len(multiple_dates) > 1:
-        print(f'Multiple dates not allowed: {field}')
+        print(f"Multiple dates not allowed ({field_name}): {field}")
 
         return field
 
     try:
         # Check if date is valid YYYY format
-        datetime.strptime(field, '%Y')
+        datetime.strptime(field, "%Y")
 
         return field
     except ValueError:
@@ -111,7 +111,7 @@ def date(field):
 
     try:
         # Check if date is valid YYYY-MM format
-        datetime.strptime(field, '%Y-%m')
+        datetime.strptime(field, "%Y-%m")
 
         return field
     except ValueError:
@@ -119,11 +119,11 @@ def date(field):
 
     try:
         # Check if date is valid YYYY-MM-DD format
-        datetime.strptime(field, '%Y-%m-%d')
+        datetime.strptime(field, "%Y-%m-%d")
 
         return field
     except ValueError:
-        print(f'Invalid date: {field}')
+        print(f"Invalid date ({field_name}): {field}")
 
         return field
 
@@ -140,7 +140,7 @@ def suspicious_characters(field, field_name):
         return
 
     # List of suspicious characters, for example: ́ˆ~`
-    suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
+    suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"]
 
     for character in suspicious_characters:
         # Find the position of the suspicious character in the string
@@ -156,14 +156,16 @@ def suspicious_characters(field, field_name):
             # character and spanning enough of the rest to give a preview,
             # but not too much to cause the line to break in terminals with
             # a default of 80 characters width.
-            suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
-            print(f'{suspicious_character_msg:1.80}')
+            suspicious_character_msg = (
+                f"Suspicious character ({field_name}): {field_subset}"
+            )
+            print(f"{suspicious_character_msg:1.80}")
 
     return field
 
 
 def language(field):
-    """Check if a language is valid ISO 639-2 or ISO 639-3.
+    """Check if a language is valid ISO 639-1 (alpha 2) or ISO 639-3 (alpha 3).
 
     Prints the value if it is invalid.
     """
@@ -177,22 +179,22 @@ def language(field):
     # need to handle "Other" values here...
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
 
         # After splitting, check if language value is 2 or 3 characters so we
-        # can check it against ISO 639-2 or ISO 639-3 accordingly.
+        # can check it against ISO 639-1 or ISO 639-3 accordingly.
         if len(value) == 2:
             if not languages.get(alpha_2=value):
-                print(f'Invalid ISO 639-2 language: {value}')
+                print(f"Invalid ISO 639-1 language: {value}")
 
                 pass
         elif len(value) == 3:
             if not languages.get(alpha_3=value):
-                print(f'Invalid ISO 639-3 language: {value}')
+                print(f"Invalid ISO 639-3 language: {value}")
 
                 pass
         else:
-            print(f'Invalid language: {value}')
+            print(f"Invalid language: {value}")
 
     return field
 
@@ -212,7 +214,6 @@ def agrovoc(field, field_name):
     """
 
     from datetime import timedelta
-    import re
     import requests
     import requests_cache
 
@@ -221,35 +222,80 @@ def agrovoc(field, field_name):
         return
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
-        # match lines beginning with words, paying attention to subjects with
-        # special characters like spaces, quotes, dashes, parentheses, etc:
-        # SUBJECT
-        # ANOTHER SUBJECT
-        # XANTHOMONAS CAMPESTRIS PV. MANIHOTIS
-        # WOMEN'S PARTICIPATION
-        # COMMUNITY-BASED FOREST MANAGEMENT
-        # INTERACCIÓN GENOTIPO AMBIENTE
-        # COCOA (PLANT)
-        pattern = re.compile(r'^[\w\-\.\'\(\)]+?[\w\s\-\.\'\(\)]+$')
-
-        if pattern.match(value):
-            request_url = f'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}'
-
-            # enable transparent request cache with thirty days expiry
-            expire_after = timedelta(days=30)
-            requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after)
-
-            request = requests.get(request_url)
-
-            # prune old cache entries
-            requests_cache.core.remove_expired_responses()
-
-            if request.status_code == requests.codes.ok:
-                data = request.json()
-
-                # check if there are any results
-                if len(data['results']) == 0:
-                    print(f'Invalid AGROVOC ({field_name}): {value}')
+    for value in field.split("||"):
+        request_url = (
+            f"http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}"
+        )
+
+        # enable transparent request cache with thirty days expiry
+        expire_after = timedelta(days=30)
+        requests_cache.install_cache(
+            "agrovoc-response-cache", expire_after=expire_after
+        )
+
+        request = requests.get(request_url)
+
+        # prune old cache entries
+        requests_cache.core.remove_expired_responses()
+
+        if request.status_code == requests.codes.ok:
+            data = request.json()
+
+            # check if there are any results
+            if len(data["results"]) == 0:
+                print(f"Invalid AGROVOC ({field_name}): {value}")
 
     return field
 
 
+def filename_extension(field):
+    """Check filename extension.
+
+    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
+    tool, which creates a Simple Archive Format bundle for importing metadata
+    with accompanying PDFs or other files into DSpace.
+
+    This check warns if a filename has an uncommon extension (that is, other
+    than .pdf, .xls(x), .doc(x), ppt(x), case insensitive).
+    """
+
+    import re
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    values = field.split("||")
+
+    # List of common filename extentions
+    common_filename_extensions = [
+        ".pdf",
+        ".doc",
+        ".docx",
+        ".ppt",
+        ".pptx",
+        ".xls",
+        ".xlsx",
+    ]
+
+    # Iterate over all values
+    for value in values:
+        # Assume filename extension does not match
+        filename_extension_match = False
+
+        for filename_extension in common_filename_extensions:
+            # Check for extension at the end of the filename
+            pattern = re.escape(filename_extension) + r"$"
+            match = re.search(pattern, value, re.IGNORECASE)
+
+            if match is not None:
+                # Register the match and stop checking for this filename
+                filename_extension_match = True
+
+                break
+
+        if filename_extension_match is False:
+            print(f"Filename with uncommon extension: {value}")
+
+    return field
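The date check above is a cascade of strptime attempts; a compact, equivalent sketch (the sample values come from data/test.csv):

```
from datetime import datetime


def is_valid_dspace_date(value):
    # Mirror the cascade in check.date(): try YYYY, then YYYY-MM, then
    # YYYY-MM-DD, and fail only if none of the formats parse.
    for fmt in ("%Y", "%Y-%m", "%Y-%m-%d"):
        try:
            datetime.strptime(value, fmt)
            return True
        except ValueError:
            continue
    return False


print(is_valid_dspace_date("1990"))         # True
print(is_valid_dspace_date("2019-07-26"))   # True
print(is_valid_dspace_date("2019-07-260"))  # False, like the test data row
```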
csv_metadata_quality/experimental.py (95 lines, new file)

@@ -0,0 +1,95 @@
+import pandas as pd
+
+
+def correct_language(row):
+    """Analyze the text used in the title, abstract, and citation fields to pre-
+    dict the language being used and compare it with the item's dc.language.iso
+    field.
+
+    Function prints an error if the language field does not match the detected
+    language and returns the value in the language field if it does match.
+    """
+
+    from pycountry import languages
+    import langid
+    import re
+
+    # Initialize some variables at global scope so that we can set them in the
+    # loop scope below and still be able to access them afterwards.
+    language = ""
+    sample_strings = list()
+    title = None
+
+    # Iterate over the labels of the current row's values. Before we transposed
+    # the DataFrame these were the columns in the CSV, ie dc.title and dc.type.
+    for label in row.axes[0]:
+        # Skip fields with missing values
+        if pd.isna(row[label]):
+            continue
+
+        # Check if current row has multiple language values (separated by "||")
+        match = re.match(r"^.*?language.*$", label)
+        if match is not None:
+            # Skip fields with multiple language values
+            if "||" in row[label]:
+                return
+
+            language = row[label]
+
+        # Extract title if it is present
+        match = re.match(r"^.*?title.*$", label)
+        if match is not None:
+            title = row[label]
+            # Append title to sample strings
+            sample_strings.append(row[label])
+
+        # Extract abstract if it is present
+        match = re.match(r"^.*?abstract.*$", label)
+        if match is not None:
+            sample_strings.append(row[label])
+
+        # Extract citation if it is present
+        match = re.match(r"^.*?citation.*$", label)
+        if match is not None:
+            sample_strings.append(row[label])
+
+    # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction
+    if language != "":
+        # Check language value like "es"
+        if len(language) == 2:
+            if not languages.get(alpha_2=language):
+                return
+        # Check language value like "spa"
+        elif len(language) == 3:
+            if not languages.get(alpha_3=language):
+                return
+        # Language value is something else like "Span", do not proceed
+        else:
+            return
+    # Language is blank, do not proceed
+    else:
+        return
+
+    # Concatenate all sample strings into one string
+    sample_text = " ".join(sample_strings)
+
+    # Restrict the langid detection space to reduce false positives
+    langid.set_languages(
+        ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "vi", "zh"]
+    )
+    langid_classification = langid.classify(sample_text)
+
+    # langid returns an ISO 639-1 (alpha 2) representation of the detected language, but the current item's language field might be ISO 639-3 (alpha 3) so we should use a pycountry Language object to compare both represenations and give appropriate error messages that match the format used by in the input file.
+    detected_language = languages.get(alpha_2=langid_classification[0])
+    if len(language) == 2 and language != detected_language.alpha_2:
+        print(
+            f"Possibly incorrect language {language} (detected {detected_language.alpha_2}): {title}"
+        )
+
+    elif len(language) == 3 and language != detected_language.alpha_3:
+        print(
+            f"Possibly incorrect language {language} (detected {detected_language.alpha_3}): {title}"
+        )
+
+    else:
+        return language
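A small sketch of the pycountry bridging used at the end of correct_language(): langid reports an alpha-2 code, so an alpha-3 metadata value has to be compared through a Language object (the two input values here are hypothetical):

```
from pycountry import languages

langid_result = "en"       # hypothetical langid.classify() output
metadata_language = "spa"  # hypothetical dc.language.iso value

# Look the detected language up by its alpha-2 code...
detected_language = languages.get(alpha_2=langid_result)

# ...and compare in whichever representation the metadata uses
if len(metadata_language) == 3 and metadata_language != detected_language.alpha_3:
    print(
        f"Possibly incorrect language {metadata_language} "
        f"(detected {detected_language.alpha_3})"
    )
```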
@@ -1,6 +1,7 @@
-import pandas as pd
 import re
 
+import pandas as pd
+
 
 def whitespace(field):
     """Fix whitespace issues.
@@ -16,23 +17,23 @@ def whitespace(field):
     values = list()
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
         # Strip leading and trailing whitespace
         value = value.strip()
 
         # Replace excessive whitespace (>2) with one space
-        pattern = re.compile(r'\s{2,}')
+        pattern = re.compile(r"\s{2,}")
         match = re.findall(pattern, value)
 
         if match:
-            print(f'Excessive whitespace: {value}')
-            value = re.sub(pattern, ' ', value)
+            print(f"Excessive whitespace: {value}")
+            value = re.sub(pattern, " ", value)
 
         # Save cleaned value
         values.append(value)
 
     # Create a new field consisting of all values joined with "||"
-    new_field = '||'.join(values)
+    new_field = "||".join(values)
 
     return new_field
 
@@ -48,34 +49,37 @@ def separators(field):
     values = list()
 
     # Try to split multi-value field on "||" separator
-    for value in field.split('||'):
+    for value in field.split("||"):
         # After splitting, see if there are any remaining "|" characters
-        pattern = re.compile(r'\|')
+        pattern = re.compile(r"\|")
         match = re.findall(pattern, value)
 
         if match:
-            print(f'Fixing invalid multi-value separator: {value}')
+            print(f"Fixing invalid multi-value separator: {value}")
 
-            value = re.sub(pattern, '||', value)
+            value = re.sub(pattern, "||", value)
 
         # Save cleaned value
         values.append(value)
 
     # Create a new field consisting of all values joined with "||"
-    new_field = '||'.join(values)
+    new_field = "||".join(values)
 
     return new_field
 
 
 def unnecessary_unicode(field):
-    """Remove unnecessary Unicode characters.
+    """Remove and replace unnecessary Unicode characters.
 
     Removes unnecessary Unicode characters like:
     - Zero-width space (U+200B)
     - Replacement character (U+FFFD)
     - No-break space (U+00A0)
 
-    Return string with characters removed.
+    Replaces unnecessary Unicode characters like:
+    - Soft hyphen (U+00AD) → hyphen
+
+    Return string with characters removed or replaced.
     """
 
     # Skip fields with missing values
@@ -83,28 +87,36 @@ def unnecessary_unicode(field):
         return
 
     # Check for zero-width space characters (U+200B)
-    pattern = re.compile(r'\u200B')
+    pattern = re.compile(r"\u200B")
     match = re.findall(pattern, field)
 
     if match:
-        print(f'Removing unnecessary Unicode (U+200B): {field}')
-        field = re.sub(pattern, '', field)
+        print(f"Removing unnecessary Unicode (U+200B): {field}")
+        field = re.sub(pattern, "", field)
 
     # Check for replacement characters (U+FFFD)
-    pattern = re.compile(r'\uFFFD')
+    pattern = re.compile(r"\uFFFD")
     match = re.findall(pattern, field)
 
     if match:
-        print(f'Removing unnecessary Unicode (U+FFFD): {field}')
-        field = re.sub(pattern, '', field)
+        print(f"Removing unnecessary Unicode (U+FFFD): {field}")
+        field = re.sub(pattern, "", field)
 
     # Check for no-break spaces (U+00A0)
-    pattern = re.compile(r'\u00A0')
+    pattern = re.compile(r"\u00A0")
     match = re.findall(pattern, field)
 
     if match:
-        print(f'Removing unnecessary Unicode (U+00A0): {field}')
-        field = re.sub(pattern, '', field)
+        print(f"Removing unnecessary Unicode (U+00A0): {field}")
+        field = re.sub(pattern, "", field)
+
+    # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen
+    pattern = re.compile(r"\u002D*?\u00AD")
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f"Replacing unnecessary Unicode (U+00AD): {field}")
+        field = re.sub(pattern, "-", field)
 
     return field
 
@@ -117,7 +129,7 @@ def duplicates(field):
         return
 
     # Try to split multi-value field on "||" separator
-    values = field.split('||')
+    values = field.split("||")
 
     # Initialize an empty list to hold the de-duplicated values
     new_values = list()
@@ -128,10 +140,10 @@ def duplicates(field):
         if value not in new_values:
             new_values.append(value)
         else:
-            print(f'Dropping duplicate value: {value}')
+            print(f"Dropping duplicate value: {value}")
 
     # Create a new field consisting of all values joined with "||"
-    new_field = '||'.join(new_values)
+    new_field = "||".join(new_values)
 
     return new_field
 
@@ -158,10 +170,34 @@ def newlines(field):
         return
 
     # Check for Unix line feed (LF)
-    match = re.findall(r'\n', field)
+    match = re.findall(r"\n", field)
 
     if match:
-        print(f'Removing newline: {field}')
-        field = field.replace('\n', '')
+        print(f"Removing newline: {field}")
+        field = field.replace("\n", "")
 
     return field
 
 
+def comma_space(field, field_name):
+    """Fix occurrences of commas missing a trailing space, for example:
+
+    Orth,Alan S.
+
+    This is a very common mistake in author and citation fields.
+
+    Return string with a space added.
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Check for comma followed by a word character
+    match = re.findall(r",\w", field)
+
+    if match:
+        print(f"Adding space after comma ({field_name}): {field}")
+        field = re.sub(r",(\w)", r", \1", field)
+
+    return field
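The comma_space() fix above hinges on the backreference in re.sub(); a quick demonstration using the example value from its docstring:

```
import re

field = "Orth,Alan S."

# Insert a space between a comma and the word character that follows it;
# the captured character is restored by the \1 backreference.
fixed = re.sub(r",(\w)", r", \1", field)

print(fixed)  # Orth, Alan S.
```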
@@ -1 +1 @@
-VERSION = '0.2.0'
+VERSION = "0.3.0"
@@ -1,23 +1,28 @@
-dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country
-Leading space,2019-07-29,,,,,
-Trailing space ,2019-07-29,,,,,
-Excessive space,2019-07-29,,,,,
-Miscellaenous ||whitespace | issues ,2019-07-29,,,,,
-Duplicate||Duplicate,2019-07-29,,,,,
-Invalid ISSN,2019-07-29,2321-2302,,,,
-Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,
-Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,
-Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,
-Invalid date,2019-07-260,,,,,
-Multiple dates,2019-07-26||2019-01-10,,,,,
-Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,
-Unnecessary Unicode,2019-07-29,,,,,
-Suspicious character||foreˆt,2019-07-29,,,,,
-Invalid ISO 639-2 language,2019-07-29,,,jp,,
-Invalid ISO 639-3 language,2019-07-29,,,chi,,
-Invalid language,2019-07-29,,,Span,,
-Invalid AGROVOC subject,2019-07-29,,,,FOREST,
+dc.title,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
+Leading space,2019-07-29,,,,,,
+Trailing space ,2019-07-29,,,,,,
+Excessive space,2019-07-29,,,,,,
+Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
+Duplicate||Duplicate,2019-07-29,,,,,,
+Invalid ISSN,2019-07-29,2321-2302,,,,,
+Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
+Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
+Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
+Invalid date,2019-07-260,,,,,,
+Multiple dates,2019-07-26||2019-01-10,,,,,,
+Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
+Unnecessary Unicode,2019-07-29,,,,,,
+Suspicious character||foreˆt,2019-07-29,,,,,,
+Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,
+Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,
+Invalid language,2019-07-29,,,Span,,,
+Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
 Newline (LF),2019-07-30,,,,"TANZA
-NIA",
-Missing date,,,,,,
-Invalid country,2019-08-01,,,,,KENYAA
+NIA",,
+Missing date,,,,,,,
+Invalid country,2019-08-01,,,,,KENYAA,
+Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
+Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
+"Missing space,after comma",2019-08-27,,,,,,
+Incorrect ISO 639-1 language,2019-09-26,,,es,,,
+Incorrect ISO 639-3 language,2019-09-26,,,spa,,,
@@ -1,21 +1,39 @@
 -i https://pypi.org/simple
+agate-dbf==0.2.1
+agate-excel==0.2.3
+agate-sql==0.5.4
+agate==1.6.1
+appdirs==1.4.3
 atomicwrites==1.3.0
 attrs==19.1.0
+babel==2.7.0
 backcall==0.1.0
+black==19.3b0
+click==7.0
+csvkit==1.0.4
+dbfread==2.0.7
 decorator==4.4.0
 entrypoints==0.3
+et-xmlfile==1.0.1
 flake8==3.7.8
-importlib-metadata==0.19
+future==0.17.1
+importlib-metadata==0.23 ; python_version < '3.8'
 ipython-genutils==0.2.0
-ipython==7.7.0
-jedi==0.14.1
+ipython==7.8.0
+isodate==0.6.0
+isort==4.3.21
+jdcal==1.4.1
+jedi==0.15.1
+leather==0.3.3
 mccabe==0.6.1
 more-itertools==7.2.0
-packaging==19.1
+openpyxl==3.0.0
+packaging==19.2
+parsedatetime==2.4
 parso==0.5.1
 pexpect==4.7.0 ; sys_platform != 'win32'
 pickleshare==0.7.5
-pluggy==0.12.0
+pluggy==0.13.0
 prompt-toolkit==2.0.9
 ptyprocess==0.6.0
 py==1.8.0
@@ -24,9 +42,16 @@ pyflakes==2.1.1
 pygments==2.4.2
 pyparsing==2.4.2
 pytest-clarity==0.2.0a1
-pytest==5.0.1
+pytest==5.1.3
+python-slugify==3.0.4
+pytimeparse==1.1.8
 pytz==2019.2
 six==1.12.0
+sqlalchemy==1.3.8
+termcolor==1.1.0
-traitlets==4.3.2
+text-unidecode==1.3
+toml==0.10.0
+traitlets==4.3.3.dev0
 wcwidth==0.1.7
-zipp==0.5.2
+xlrd==1.2.0
+zipp==0.6.0
@@ -1,16 +1,17 @@
 -i https://pypi.org/simple
 -e .
-certifi==2019.6.16
+certifi==2019.9.11
 chardet==3.0.4
 idna==2.8
-numpy==1.17.0
-pandas==0.25.0
-pycountry==19.7.15
+langid==1.1.6
+numpy==1.17.2
+pandas==0.25.1
+pycountry==19.8.18
 python-dateutil==2.8.0
 python-stdnum==1.11
 pytz==2019.2
-requests-cache==0.5.0
+requests-cache==0.5.2
 requests==2.22.0
 six==1.12.0
-urllib3==1.25.3
+urllib3==1.25.6
 xlrd==1.2.0
6
setup.cfg
Normal file
@ -0,0 +1,6 @@
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
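These isort settings are the black-compatible profile: mode 3 is the "vertical hanging indent" wrapping style, and the trailing comma, parentheses, and 88-character line length match black's output. For illustration, an import wrapped under these settings would look like this (the module and names here are hypothetical):

from csv_metadata_quality.check import (
    agrovoc,
    date,
    isbn,
    issn,
    language,
)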
5
setup.py
@ -8,12 +8,13 @@ install_requires = [
    'python-stdnum',
    'requests',
    'requests-cache',
    'pycountry'
    'pycountry',
    'langid'
]

setuptools.setup(
    name="csv-metadata-quality",
    version="0.2.0",
    version="0.3.0",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
@ -1,21 +1,23 @@
import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental
import pandas as pd


def test_check_invalid_issn(capsys):
    '''Test checking invalid ISSN.'''
    """Test checking invalid ISSN."""

    value = '2321-2302'
    value = "2321-2302"

    check.issn(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid ISSN: {value}\n'
    assert captured.out == f"Invalid ISSN: {value}\n"


def test_check_valid_issn():
    '''Test checking valid ISSN.'''
    """Test checking valid ISSN."""

    value = '0024-9319'
    value = "0024-9319"

    result = check.issn(value)
@ -23,20 +25,20 @@ def test_check_valid_issn():


def test_check_invalid_isbn(capsys):
    '''Test checking invalid ISBN.'''
    """Test checking invalid ISBN."""

    value = '99921-58-10-6'
    value = "99921-58-10-6"

    check.isbn(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid ISBN: {value}\n'
    assert captured.out == f"Invalid ISBN: {value}\n"


def test_check_valid_isbn():
    '''Test checking valid ISBN.'''
    """Test checking valid ISBN."""

    value = '99921-58-10-7'
    value = "99921-58-10-7"

    result = check.isbn(value)
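The check.issn and check.isbn functions under test are not part of this diff. As a rough sketch of what they verify, assuming they build on the python-stdnum package pinned in requirements.txt (not necessarily the project's exact implementation):

import stdnum.isbn
import stdnum.issn

def issn(value):
    # Warn about invalid ISSNs (bad checksum or format), otherwise pass
    # the value through unchanged.
    if not stdnum.issn.is_valid(value):
        print(f"Invalid ISSN: {value}")
        return

    return value

def isbn(value):
    # Same pattern for ISBN-10/ISBN-13 values.
    if not stdnum.isbn.is_valid(value):
        print(f"Invalid ISBN: {value}")
        return

    return value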
@ -44,20 +46,20 @@ def test_check_valid_isbn():


def test_check_invalid_separators(capsys):
    '''Test checking invalid multi-value separators.'''
    """Test checking invalid multi-value separators."""

    value = 'Alan|Orth'
    value = "Alan|Orth"

    check.separators(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid multi-value separator: {value}\n'
    assert captured.out == f"Invalid multi-value separator: {value}\n"


def test_check_valid_separators():
    '''Test checking valid multi-value separators.'''
    """Test checking valid multi-value separators."""

    value = 'Alan||Orth'
    value = "Alan||Orth"

    result = check.separators(value)
@ -65,65 +67,73 @@ def test_check_valid_separators():


def test_check_missing_date(capsys):
    '''Test checking missing date.'''
    """Test checking missing date."""

    value = None

    check.date(value)
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f'Missing date.\n'
    assert captured.out == f"Missing date ({field_name}).\n"


def test_check_multiple_dates(capsys):
    '''Test checking multiple dates.'''
    """Test checking multiple dates."""

    value = '1990||1991'
    value = "1990||1991"

    check.date(value)
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f'Multiple dates not allowed: {value}\n'
    assert captured.out == f"Multiple dates not allowed ({field_name}): {value}\n"


def test_check_invalid_date(capsys):
    '''Test checking invalid ISO8601 date.'''
    """Test checking invalid ISO8601 date."""

    value = '1990-0'
    value = "1990-0"

    check.date(value)
    field_name = "dc.date.issued"

    check.date(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid date: {value}\n'
    assert captured.out == f"Invalid date ({field_name}): {value}\n"


def test_check_valid_date():
    '''Test checking valid ISO8601 date.'''
    """Test checking valid ISO8601 date."""

    value = '1990'
    value = "1990"

    result = check.date(value)
    field_name = "dc.date.issued"

    result = check.date(value, field_name)

    assert result == value
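The new field_name parameter threads the column name into each message, which the updated assertions above rely on. A minimal sketch of a date check consistent with those assertions (illustrative; the project's real implementation may differ):

from datetime import datetime

import pandas as pd

def date(value, field_name):
    if pd.isnull(value):
        print(f"Missing date ({field_name}).")
        return

    if "||" in str(value):
        print(f"Multiple dates not allowed ({field_name}): {value}")
        return

    # Accept the ISO 8601 shapes used in the fixture: YYYY, YYYY-MM,
    # and YYYY-MM-DD.
    for pattern in ("%Y", "%Y-%m", "%Y-%m-%d"):
        try:
            datetime.strptime(value, pattern)
            return value
        except ValueError:
            pass

    print(f"Invalid date ({field_name}): {value}")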
def test_check_suspicious_characters(capsys):
    '''Test checking for suspicious characters.'''
    """Test checking for suspicious characters."""

    value = 'foreˆt'
    value = "foreˆt"

    field_name = 'dc.contributor.author'
    field_name = "dc.contributor.author"

    check.suspicious_characters(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f'Suspicious character ({field_name}): ˆt\n'
    assert captured.out == f"Suspicious character ({field_name}): ˆt\n"


def test_check_valid_iso639_2_language():
    '''Test valid ISO 639-2 language.'''
def test_check_valid_iso639_1_language():
    """Test valid ISO 639-1 (alpha 2) language."""

    value = 'ja'
    value = "ja"

    result = check.language(value)
@ -131,66 +141,155 @@ def test_check_valid_iso639_2_language():


def test_check_valid_iso639_3_language():
    '''Test invalid ISO 639-3 language.'''
    """Test valid ISO 639-3 (alpha 3) language."""

    value = 'eng'
    value = "eng"

    result = check.language(value)

    assert result == value


def test_check_invalid_iso639_2_language(capsys):
    '''Test invalid ISO 639-2 language.'''
def test_check_invalid_iso639_1_language(capsys):
    """Test invalid ISO 639-1 (alpha 2) language."""

    value = 'jp'
    value = "jp"

    check.language(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid ISO 639-2 language: {value}\n'
    assert captured.out == f"Invalid ISO 639-1 language: {value}\n"


def test_check_invalid_iso639_3_language(capsys):
    '''Test invalid ISO 639-3 language.'''
    """Test invalid ISO 639-3 (alpha 3) language."""

    value = 'chi'
    value = "chi"

    check.language(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid ISO 639-3 language: {value}\n'
    assert captured.out == f"Invalid ISO 639-3 language: {value}\n"


def test_check_invalid_language(capsys):
    '''Test invalid language.'''
    """Test invalid language."""

    value = 'Span'
    value = "Span"

    check.language(value)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid language: {value}\n'
    assert captured.out == f"Invalid language: {value}\n"
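check.language distinguishes two-letter ISO 639-1 codes from three-letter ISO 639-3 codes, which explains the three different messages asserted above. A sketch using pycountry, which is pinned in requirements.txt (not necessarily the project's exact code):

import pycountry

def language(value):
    # Two characters: look the code up in ISO 639-1 (alpha-2).
    if len(value) == 2:
        if not pycountry.languages.get(alpha_2=value):
            print(f"Invalid ISO 639-1 language: {value}")
            return
    # Three characters: look it up in ISO 639-3 (alpha-3).
    elif len(value) == 3:
        if not pycountry.languages.get(alpha_3=value):
            print(f"Invalid ISO 639-3 language: {value}")
            return
    # Anything else cannot be a valid code at all.
    else:
        print(f"Invalid language: {value}")
        return

    return value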
def test_check_invalid_agrovoc(capsys):
    '''Test invalid AGROVOC subject.'''
    """Test invalid AGROVOC subject."""

    value = 'FOREST'
    field_name = 'dc.subject'
    value = "FOREST"
    field_name = "dc.subject"

    check.agrovoc(value, field_name)

    captured = capsys.readouterr()
    assert captured.out == f'Invalid AGROVOC ({field_name}): {value}\n'
    assert captured.out == f"Invalid AGROVOC ({field_name}): {value}\n"


def test_check_valid_agrovoc():
    '''Test valid AGROVOC subject.'''
    """Test valid AGROVOC subject."""

    value = 'FORESTS'
    field_name = 'dc.subject'
    value = "FORESTS"
    field_name = "dc.subject"

    result = check.agrovoc(value, field_name)

    assert result == value
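check.agrovoc is a network-backed check: the term is looked up in the AGROVOC thesaurus and flagged when no concept matches ("FOREST" fails where the preferred term "FORESTS" succeeds). A rough sketch of that shape; the endpoint URL and JSON layout below are hypothetical stand-ins, since the real service details are not shown in this diff:

import requests
import requests_cache

def agrovoc(value, field_name):
    # Cache responses so repeated subjects don't re-query the service.
    requests_cache.install_cache("agrovoc-response-cache")

    # Hypothetical endpoint and parameters, for illustration only.
    url = "https://agrovoc.example.org/rest/v1/search"
    response = requests.get(url, params={"query": value})

    if response.status_code == requests.codes.ok:
        # Assume an empty result list means the term is not in AGROVOC.
        if len(response.json().get("results", [])) == 0:
            print(f"Invalid AGROVOC ({field_name}): {value}")
            return

    return value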
def test_check_uncommon_filename_extension(capsys):
    """Test uncommon filename extension."""

    value = "file.pdf.lck"

    check.filename_extension(value)

    captured = capsys.readouterr()
    assert captured.out == f"Filename with uncommon extension: {value}\n"


def test_check_common_filename_extension():
    """Test common filename extension."""

    value = "file.pdf"

    result = check.filename_extension(value)

    assert result == value
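The filename-extension check warns when an attachment's final extension is not a common document type, which catches artifacts like the .lck lock file above. A sketch consistent with the two tests (the extension whitelist here is illustrative):

def filename_extension(value):
    # Compare the last dot-separated segment against common extensions.
    common = ["csv", "doc", "docx", "jpg", "pdf", "png", "ppt", "pptx", "xls", "xlsx", "zip"]
    extension = value.split(".")[-1].lower()

    if extension not in common:
        print(f"Filename with uncommon extension: {value}")

    return value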
def test_check_incorrect_iso_639_1_language(capsys):
    """Test incorrect ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "es"

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    experimental.correct_language(series)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"Possibly incorrect language {language} (detected en): {title}\n"
    )


def test_check_incorrect_iso_639_3_language(capsys):
    """Test incorrect ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "spa"

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    experimental.correct_language(series)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"Possibly incorrect language {language} (detected eng): {title}\n"
    )


def test_check_correct_iso_639_1_language():
    """Test correct ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "en"

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    result = experimental.correct_language(series)

    assert result == language


def test_check_correct_iso_639_3_language():
    """Test correct ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title."""

    title = "A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle"
    language = "eng"

    # Create a dictionary to mimic Pandas series
    row = {"dc.title": title, "dc.language.iso": language}
    series = pd.Series(row)

    result = experimental.correct_language(series)

    assert result == language
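The experimental correct_language check is what the new langid dependency is for: it predicts the title's language and compares the prediction with the declared dc.language.iso value. A minimal sketch consistent with the assertions above (illustrative, not the exact implementation):

import langid
import pycountry

def correct_language(row):
    title = row["dc.title"]
    language = row["dc.language.iso"]

    # langid.classify returns a (language, score) tuple where the
    # language is an ISO 639-1 code such as "en".
    detected, _score = langid.classify(title)

    # If the declared code is three letters, report the prediction as
    # ISO 639-3 so the two codes are comparable.
    if len(language) == 3:
        detected = pycountry.languages.get(alpha_2=detected).alpha_3

    if detected == language:
        return language

    print(f"Possibly incorrect language {language} (detected {detected}): {title}")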
@ -2,57 +2,67 @@ import csv_metadata_quality.fix as fix


def test_fix_leading_whitespace():
    '''Test fixing leading whitespace.'''
    """Test fixing leading whitespace."""

    value = ' Alan'
    value = " Alan"

    assert fix.whitespace(value) == 'Alan'
    assert fix.whitespace(value) == "Alan"


def test_fix_trailing_whitespace():
    '''Test fixing trailing whitespace.'''
    """Test fixing trailing whitespace."""

    value = 'Alan '
    value = "Alan "

    assert fix.whitespace(value) == 'Alan'
    assert fix.whitespace(value) == "Alan"


def test_fix_excessive_whitespace():
    '''Test fixing excessive whitespace.'''
    """Test fixing excessive whitespace."""

    value = 'Alan  Orth'
    value = "Alan  Orth"

    assert fix.whitespace(value) == 'Alan Orth'
    assert fix.whitespace(value) == "Alan Orth"


def test_fix_invalid_separators():
    '''Test fixing invalid multi-value separators.'''
    """Test fixing invalid multi-value separators."""

    value = 'Alan|Orth'
    value = "Alan|Orth"

    assert fix.separators(value) == 'Alan||Orth'
    assert fix.separators(value) == "Alan||Orth"


def test_fix_unnecessary_unicode():
    '''Test fixing unnecessary Unicode.'''
    """Test fixing unnecessary Unicode."""

    value = 'Alan Orth'
    value = "Alan Orth"

    assert fix.unnecessary_unicode(value) == 'Alan Orth'
    assert fix.unnecessary_unicode(value) == "Alan Orth"


def test_fix_duplicates():
    '''Test fixing duplicate metadata values.'''
    """Test fixing duplicate metadata values."""

    value = 'Kenya||Kenya'
    value = "Kenya||Kenya"

    assert fix.duplicates(value) == 'Kenya'
    assert fix.duplicates(value) == "Kenya"


def test_fix_newlines():
    '''Test fixing newlines.'''
    """Test fixing newlines."""

    value = '''Ken
ya'''
    value = """Ken
ya"""

    assert fix.newlines(value) == 'Kenya'
    assert fix.newlines(value) == "Kenya"


def test_fix_comma_space():
    """Test adding space after comma."""

    value = "Orth,Alan S."

    field_name = "dc.contributor.author"

    assert fix.comma_space(value, field_name) == "Orth, Alan S."
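fix.comma_space normalizes author names like "Orth,Alan S." by inserting the missing space after the comma. A sketch of the regular-expression approach the test implies (not necessarily the project's exact code):

import re

def comma_space(value, field_name):
    # Insert a space after any comma that is immediately followed by a
    # non-whitespace character, e.g. "Orth,Alan S." -> "Orth, Alan S."
    if re.search(r",\S", value):
        print(f"Fixing missing space after comma ({field_name}): {value}")
        value = re.sub(r",(\S)", r", \1", value)

    return value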
Reference in New Issue
Block a user