mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-25 07:10:17 +01:00
Add ISSN and ISBN checks using python-stdnum
This commit is contained in:
parent
30a4b0005f
commit
e160b17fb0
1
Pipfile
1
Pipfile
@ -7,6 +7,7 @@ verify_ssl = true
|
|||||||
|
|
||||||
[packages]
|
[packages]
|
||||||
pandas = "*"
|
pandas = "*"
|
||||||
|
python-stdnum = "*"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.7"
|
python_version = "3.7"
|
||||||
|
51
Pipfile.lock
generated
51
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "fba383031295c98ef4ac605996144102d686f2d38c5a20f33ee0bebae0713200"
|
"sha256": "276cc93ee297b83cefb775fe3ab89c4f0780d81e69d4143cc95ec745efda2a4c"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -18,31 +18,24 @@
|
|||||||
"default": {
|
"default": {
|
||||||
"numpy": {
|
"numpy": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:0778076e764e146d3078b17c24c4d89e0ecd4ac5401beff8e1c87879043a0633",
|
"sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e",
|
||||||
"sha256:141c7102f20abe6cf0d54c4ced8d565b86df4d3077ba2343b61a6db996cefec7",
|
"sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd",
|
||||||
"sha256:14270a1ee8917d11e7753fb54fc7ffd1934f4d529235beec0b275e2ccf00333b",
|
"sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473",
|
||||||
"sha256:27e11c7a8ec9d5838bc59f809bfa86efc8a4fd02e58960fa9c49d998e14332d5",
|
"sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a",
|
||||||
"sha256:2a04dda79606f3d2f760384c38ccd3d5b9bb79d4c8126b67aff5eb09a253763e",
|
"sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658",
|
||||||
"sha256:3c26010c1b51e1224a3ca6b8df807de6e95128b0908c7e34f190e7775455b0ca",
|
"sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61",
|
||||||
"sha256:52c40f1a4262c896420c6ea1c6fda62cf67070e3947e3307f5562bd783a90336",
|
"sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4",
|
||||||
"sha256:6e4f8d9e8aa79321657079b9ac03f3cf3fd067bf31c1cca4f56d49543f4356a5",
|
"sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302",
|
||||||
"sha256:7242be12a58fec245ee9734e625964b97cf7e3f2f7d016603f9e56660ce479c7",
|
"sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a",
|
||||||
"sha256:7dc253b542bfd4b4eb88d9dbae4ca079e7bf2e2afd819ee18891a43db66c60c7",
|
"sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094",
|
||||||
"sha256:94f5bd885f67bbb25c82d80184abbf7ce4f6c3c3a41fbaa4182f034bba803e69",
|
"sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511",
|
||||||
"sha256:a89e188daa119ffa0d03ce5123dee3f8ffd5115c896c2a9d4f0dbb3d8b95bfa3",
|
"sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a",
|
||||||
"sha256:ad3399da9b0ca36e2f24de72f67ab2854a62e623274607e37e0ce5f5d5fa9166",
|
"sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554",
|
||||||
"sha256:b0348be89275fd1d4c44ffa39530c41a21062f52299b1e3ee7d1c61f060044b8",
|
"sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54",
|
||||||
"sha256:b5554368e4ede1856121b0dfa35ce71768102e4aa55e526cb8de7f374ff78722",
|
"sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c",
|
||||||
"sha256:cbddc56b2502d3f87fda4f98d948eb5b11f36ff3902e17cb6cc44727f2200525",
|
"sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0"
|
||||||
"sha256:d79f18f41751725c56eceab2a886f021d70fd70a6188fd386e29a045945ffc10",
|
|
||||||
"sha256:dc2ca26a19ab32dc475dbad9dfe723d3a64c835f4c23f625c2b6566ca32b9f29",
|
|
||||||
"sha256:dd9bcd4f294eb0633bb33d1a74febdd2b9018b8b8ed325f861fffcd2c7660bb8",
|
|
||||||
"sha256:e8baab1bc7c9152715844f1faca6744f2416929de10d7639ed49555a85549f52",
|
|
||||||
"sha256:ec31fe12668af687b99acf1567399632a7c47b0e17cfb9ae47c098644ef36797",
|
|
||||||
"sha256:f12b4f7e2d8f9da3141564e6737d79016fe5336cc92de6814eba579744f65b0a",
|
|
||||||
"sha256:f58ac38d5ca045a377b3b377c84df8175ab992c970a53332fa8ac2373df44ff7"
|
|
||||||
],
|
],
|
||||||
"version": "==1.16.4"
|
"version": "==1.17.0"
|
||||||
},
|
},
|
||||||
"pandas": {
|
"pandas": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
@ -71,6 +64,14 @@
|
|||||||
],
|
],
|
||||||
"version": "==2.8.0"
|
"version": "==2.8.0"
|
||||||
},
|
},
|
||||||
|
"python-stdnum": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:d5f0af1bee9ddd9a20b398b46ce062dbd4d41fcc9646940f2667256a44df3854",
|
||||||
|
"sha256:f445ec32bf5246c90389204cabba465f494545371c29a83fa2d30e6c872a6763"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==1.11"
|
||||||
|
},
|
||||||
"pytz": {
|
"pytz": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
|
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import csv_metadata_quality.check as check
|
||||||
import csv_metadata_quality.fix as fix
|
import csv_metadata_quality.fix as fix
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
@ -13,5 +14,11 @@ def run():
|
|||||||
|
|
||||||
df[column] = df[column].apply(fix.whitespace)
|
df[column] = df[column].apply(fix.whitespace)
|
||||||
|
|
||||||
|
if column == 'dc.identifier.issn':
|
||||||
|
df[column] = df[column].apply(check.issn)
|
||||||
|
|
||||||
|
if column == 'dc.identifier.isbn':
|
||||||
|
df[column] = df[column].apply(check.isbn)
|
||||||
|
|
||||||
# Write
|
# Write
|
||||||
df.to_csv('/tmp/omg.fixed.csv', index=False)
|
df.to_csv('/tmp/omg.fixed.csv', index=False)
|
||||||
|
42
csv_metadata_quality/check.py
Executable file
42
csv_metadata_quality/check.py
Executable file
@ -0,0 +1,42 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def issn(field):
    """Check if an ISSN is valid.

    Prints each invalid ISSN to stdout as "Invalid ISSN: <value>". A field
    may hold several values separated by "||"; every value is checked on
    its own.

    The field is returned unchanged so that the function can be passed to
    pandas' apply() and the result assigned back to the column without
    replacing the column's contents with None.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: A single cell value: a string, or a missing value (NaN/None).

    Returns:
        The field exactly as it was passed in.
    """

    # Skip fields with missing values, handing them back untouched. This is
    # checked before the import so empty cells never need stdnum at all.
    if pd.isna(field):
        return field

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import issn as stdnum_issn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_issn.is_valid(value):
            print(f'Invalid ISSN: {value}')

    return field
|
||||||
|
|
||||||
|
|
||||||
|
def isbn(field):
    """Check if an ISBN is valid.

    Prints each invalid ISBN to stdout as "Invalid ISBN: <value>". A field
    may hold several values separated by "||"; every value is checked on
    its own.

    The field is returned unchanged so that the function can be passed to
    pandas' apply() and the result assigned back to the column without
    replacing the column's contents with None.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: A single cell value: a string, or a missing value (NaN/None).

    Returns:
        The field exactly as it was passed in.
    """

    # Skip fields with missing values: a NaN cell is a float and has no
    # split() method, so without this guard an empty cell raises
    # AttributeError. Checked before the import so empty cells never need
    # stdnum at all.
    if pd.isna(field):
        return field

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import isbn as stdnum_isbn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_isbn.is_valid(value):
            print(f'Invalid ISBN: {value}')

    return field
|
Loading…
Reference in New Issue
Block a user