1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-25 07:10:17 +01:00

Add ISSN and ISBN checks using python-stdnum

This commit is contained in:
Alan Orth 2019-07-26 23:14:10 +03:00
parent 30a4b0005f
commit e160b17fb0
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 76 additions and 25 deletions

View File

@ -7,6 +7,7 @@ verify_ssl = true
[packages] [packages]
pandas = "*" pandas = "*"
python-stdnum = "*"
[requires] [requires]
python_version = "3.7" python_version = "3.7"

51
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "fba383031295c98ef4ac605996144102d686f2d38c5a20f33ee0bebae0713200" "sha256": "276cc93ee297b83cefb775fe3ab89c4f0780d81e69d4143cc95ec745efda2a4c"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -18,31 +18,24 @@
"default": { "default": {
"numpy": { "numpy": {
"hashes": [ "hashes": [
"sha256:0778076e764e146d3078b17c24c4d89e0ecd4ac5401beff8e1c87879043a0633", "sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e",
"sha256:141c7102f20abe6cf0d54c4ced8d565b86df4d3077ba2343b61a6db996cefec7", "sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd",
"sha256:14270a1ee8917d11e7753fb54fc7ffd1934f4d529235beec0b275e2ccf00333b", "sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473",
"sha256:27e11c7a8ec9d5838bc59f809bfa86efc8a4fd02e58960fa9c49d998e14332d5", "sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a",
"sha256:2a04dda79606f3d2f760384c38ccd3d5b9bb79d4c8126b67aff5eb09a253763e", "sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658",
"sha256:3c26010c1b51e1224a3ca6b8df807de6e95128b0908c7e34f190e7775455b0ca", "sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61",
"sha256:52c40f1a4262c896420c6ea1c6fda62cf67070e3947e3307f5562bd783a90336", "sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4",
"sha256:6e4f8d9e8aa79321657079b9ac03f3cf3fd067bf31c1cca4f56d49543f4356a5", "sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302",
"sha256:7242be12a58fec245ee9734e625964b97cf7e3f2f7d016603f9e56660ce479c7", "sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a",
"sha256:7dc253b542bfd4b4eb88d9dbae4ca079e7bf2e2afd819ee18891a43db66c60c7", "sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094",
"sha256:94f5bd885f67bbb25c82d80184abbf7ce4f6c3c3a41fbaa4182f034bba803e69", "sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511",
"sha256:a89e188daa119ffa0d03ce5123dee3f8ffd5115c896c2a9d4f0dbb3d8b95bfa3", "sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a",
"sha256:ad3399da9b0ca36e2f24de72f67ab2854a62e623274607e37e0ce5f5d5fa9166", "sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554",
"sha256:b0348be89275fd1d4c44ffa39530c41a21062f52299b1e3ee7d1c61f060044b8", "sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54",
"sha256:b5554368e4ede1856121b0dfa35ce71768102e4aa55e526cb8de7f374ff78722", "sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c",
"sha256:cbddc56b2502d3f87fda4f98d948eb5b11f36ff3902e17cb6cc44727f2200525", "sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0"
"sha256:d79f18f41751725c56eceab2a886f021d70fd70a6188fd386e29a045945ffc10",
"sha256:dc2ca26a19ab32dc475dbad9dfe723d3a64c835f4c23f625c2b6566ca32b9f29",
"sha256:dd9bcd4f294eb0633bb33d1a74febdd2b9018b8b8ed325f861fffcd2c7660bb8",
"sha256:e8baab1bc7c9152715844f1faca6744f2416929de10d7639ed49555a85549f52",
"sha256:ec31fe12668af687b99acf1567399632a7c47b0e17cfb9ae47c098644ef36797",
"sha256:f12b4f7e2d8f9da3141564e6737d79016fe5336cc92de6814eba579744f65b0a",
"sha256:f58ac38d5ca045a377b3b377c84df8175ab992c970a53332fa8ac2373df44ff7"
], ],
"version": "==1.16.4" "version": "==1.17.0"
}, },
"pandas": { "pandas": {
"hashes": [ "hashes": [
@ -71,6 +64,14 @@
], ],
"version": "==2.8.0" "version": "==2.8.0"
}, },
"python-stdnum": {
"hashes": [
"sha256:d5f0af1bee9ddd9a20b398b46ce062dbd4d41fcc9646940f2667256a44df3854",
"sha256:f445ec32bf5246c90389204cabba465f494545371c29a83fa2d30e6c872a6763"
],
"index": "pypi",
"version": "==1.11"
},
"pytz": { "pytz": {
"hashes": [ "hashes": [
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",

View File

@ -1,3 +1,4 @@
import csv_metadata_quality.check as check
import csv_metadata_quality.fix as fix import csv_metadata_quality.fix as fix
import pandas as pd import pandas as pd
@ -13,5 +14,11 @@ def run():
df[column] = df[column].apply(fix.whitespace) df[column] = df[column].apply(fix.whitespace)
if column == 'dc.identifier.issn':
df[column] = df[column].apply(check.issn)
if column == 'dc.identifier.isbn':
df[column] = df[column].apply(check.isbn)
# Write # Write
df.to_csv('/tmp/omg.fixed.csv', index=False) df.to_csv('/tmp/omg.fixed.csv', index=False)

42
csv_metadata_quality/check.py Executable file
View File

@ -0,0 +1,42 @@
import pandas as pd
def issn(field):
    """Check if an ISSN is valid.

    Prints each invalid ISSN found in the field. A field may hold several
    values separated by "||"; every value is checked individually.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: The raw cell value: a string, or a missing value
            (None / NaN) for empty cells.

    Returns:
        The field, unchanged. Returning it (rather than None) lets callers
        use this function with Series.apply() and assign the result back
        to the column without wiping out the data.
    """
    # Skip fields with missing values
    if pd.isna(field):
        return field

    # Imported locally under an alias: this function has the same name as
    # the stdnum module, so a module-level import would be shadowed.
    from stdnum import issn as issn_validator

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not issn_validator.is_valid(value):
            print(f'Invalid ISSN: {value}')

    return field
def isbn(field):
    """Check if an ISBN is valid.

    Prints each invalid ISBN found in the field. A field may hold several
    values separated by "||"; every value is checked individually.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: The raw cell value: a string, or a missing value
            (None / NaN) for empty cells.

    Returns:
        The field, unchanged. Returning it (rather than None) lets callers
        use this function with Series.apply() and assign the result back
        to the column without wiping out the data.
    """
    # Skip fields with missing values; without this guard a NaN cell
    # (a float) would raise AttributeError on .split() below.
    if pd.isna(field):
        return field

    # Imported locally under an alias: this function has the same name as
    # the stdnum module, so a module-level import would be shadowed.
    from stdnum import isbn as isbn_validator

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not isbn_validator.is_valid(value):
            print(f'Invalid ISBN: {value}')

    return field