diff --git a/Pipfile b/Pipfile index 7130fb7..3c9ee0f 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,7 @@ verify_ssl = true [packages] pandas = "*" +python-stdnum = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 9ca2850..51a67ac 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "fba383031295c98ef4ac605996144102d686f2d38c5a20f33ee0bebae0713200" + "sha256": "276cc93ee297b83cefb775fe3ab89c4f0780d81e69d4143cc95ec745efda2a4c" }, "pipfile-spec": 6, "requires": { @@ -18,31 +18,24 @@ "default": { "numpy": { "hashes": [ - "sha256:0778076e764e146d3078b17c24c4d89e0ecd4ac5401beff8e1c87879043a0633", - "sha256:141c7102f20abe6cf0d54c4ced8d565b86df4d3077ba2343b61a6db996cefec7", - "sha256:14270a1ee8917d11e7753fb54fc7ffd1934f4d529235beec0b275e2ccf00333b", - "sha256:27e11c7a8ec9d5838bc59f809bfa86efc8a4fd02e58960fa9c49d998e14332d5", - "sha256:2a04dda79606f3d2f760384c38ccd3d5b9bb79d4c8126b67aff5eb09a253763e", - "sha256:3c26010c1b51e1224a3ca6b8df807de6e95128b0908c7e34f190e7775455b0ca", - "sha256:52c40f1a4262c896420c6ea1c6fda62cf67070e3947e3307f5562bd783a90336", - "sha256:6e4f8d9e8aa79321657079b9ac03f3cf3fd067bf31c1cca4f56d49543f4356a5", - "sha256:7242be12a58fec245ee9734e625964b97cf7e3f2f7d016603f9e56660ce479c7", - "sha256:7dc253b542bfd4b4eb88d9dbae4ca079e7bf2e2afd819ee18891a43db66c60c7", - "sha256:94f5bd885f67bbb25c82d80184abbf7ce4f6c3c3a41fbaa4182f034bba803e69", - "sha256:a89e188daa119ffa0d03ce5123dee3f8ffd5115c896c2a9d4f0dbb3d8b95bfa3", - "sha256:ad3399da9b0ca36e2f24de72f67ab2854a62e623274607e37e0ce5f5d5fa9166", - "sha256:b0348be89275fd1d4c44ffa39530c41a21062f52299b1e3ee7d1c61f060044b8", - "sha256:b5554368e4ede1856121b0dfa35ce71768102e4aa55e526cb8de7f374ff78722", - "sha256:cbddc56b2502d3f87fda4f98d948eb5b11f36ff3902e17cb6cc44727f2200525", - "sha256:d79f18f41751725c56eceab2a886f021d70fd70a6188fd386e29a045945ffc10", - "sha256:dc2ca26a19ab32dc475dbad9dfe723d3a64c835f4c23f625c2b6566ca32b9f29", - 
"sha256:dd9bcd4f294eb0633bb33d1a74febdd2b9018b8b8ed325f861fffcd2c7660bb8", - "sha256:e8baab1bc7c9152715844f1faca6744f2416929de10d7639ed49555a85549f52", - "sha256:ec31fe12668af687b99acf1567399632a7c47b0e17cfb9ae47c098644ef36797", - "sha256:f12b4f7e2d8f9da3141564e6737d79016fe5336cc92de6814eba579744f65b0a", - "sha256:f58ac38d5ca045a377b3b377c84df8175ab992c970a53332fa8ac2373df44ff7" + "sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e", + "sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd", + "sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473", + "sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a", + "sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658", + "sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61", + "sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4", + "sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302", + "sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a", + "sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094", + "sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511", + "sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a", + "sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554", + "sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54", + "sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c", + "sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0" ], - "version": "==1.16.4" + "version": "==1.17.0" }, "pandas": { "hashes": [ @@ -71,6 +64,14 @@ ], "version": "==2.8.0" }, + "python-stdnum": { + "hashes": [ + "sha256:d5f0af1bee9ddd9a20b398b46ce062dbd4d41fcc9646940f2667256a44df3854", + "sha256:f445ec32bf5246c90389204cabba465f494545371c29a83fa2d30e6c872a6763" + ], + "index": "pypi", + "version": "==1.11" + }, "pytz": { "hashes": [ 
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 223121d..5571526 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -1,3 +1,4 @@
+import csv_metadata_quality.check as check
 import csv_metadata_quality.fix as fix
 import pandas as pd
 
@@ -13,5 +14,11 @@ def run():
 
         df[column] = df[column].apply(fix.whitespace)
 
+        if column == 'dc.identifier.issn':
+            df[column] = df[column].apply(check.issn)
+
+        if column == 'dc.identifier.isbn':
+            df[column] = df[column].apply(check.isbn)
+
     # Write
     df.to_csv('/tmp/omg.fixed.csv', index=False)
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
new file mode 100755
index 0000000..09effb9
--- /dev/null
+++ b/csv_metadata_quality/check.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+def issn(field):
+    """Check if an ISSN is valid.
+
+    Prints the ISSN if invalid. Multi-value fields are split on "||" and each
+    value is checked separately.
+
+    Returns the field unchanged so that the result of Series.apply() can be
+    assigned back to the column without losing data.
+
+    stdnum's is_valid() function never raises an exception.
+
+    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
+    """
+
+    from stdnum import issn
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return field
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+
+        if not issn.is_valid(value):
+            print(f'Invalid ISSN: {value}')
+
+    return field
+
+
+def isbn(field):
+    """Check if an ISBN is valid.
+
+    Prints the ISBN if invalid. Multi-value fields are split on "||" and each
+    value is checked separately.
+
+    Returns the field unchanged so that the result of Series.apply() can be
+    assigned back to the column without losing data.
+
+    stdnum's is_valid() function never raises an exception.
+
+    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid
+    """
+
+    from stdnum import isbn
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return field
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+
+        if not isbn.is_valid(value):
+            print(f'Invalid ISBN: {value}')
+
+    return field