1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Add ISSN and ISBN checks using python-stdnum

This commit is contained in:
2019-07-26 23:14:10 +03:00
parent 30a4b0005f
commit e160b17fb0
4 changed files with 76 additions and 25 deletions

View File

@ -1,3 +1,4 @@
import csv_metadata_quality.check as check
import csv_metadata_quality.fix as fix
import pandas as pd
@ -13,5 +14,11 @@ def run():
df[column] = df[column].apply(fix.whitespace)
if column == 'dc.identifier.issn':
df[column] = df[column].apply(check.issn)
if column == 'dc.identifier.isbn':
df[column] = df[column].apply(check.isbn)
# Write
df.to_csv('/tmp/omg.fixed.csv', index=False)

42
csv_metadata_quality/check.py Executable file
View File

@ -0,0 +1,42 @@
import pandas as pd
def issn(field):
    """Check if an ISSN is valid.

    Splits the field on the "||" multi-value separator and prints every
    value that is not a valid ISSN.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Returns the field unchanged, so the function can be used with
    Series.apply() and assigned back to the column without losing data.
    """
    # Skip fields with missing values (they have no split() method)
    if pd.isna(field):
        return field

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import issn as stdnum_issn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_issn.is_valid(value):
            print(f'Invalid ISSN: {value}')

    return field
def isbn(field):
    """Check if an ISBN is valid.

    Splits the field on the "||" multi-value separator and prints every
    value that is not a valid ISBN.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Returns the field unchanged, so the function can be used with
    Series.apply() and assigned back to the column without losing data.
    """
    # Skip fields with missing values; calling split() on a NaN cell would
    # raise AttributeError.
    if pd.isna(field):
        return field

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import isbn as stdnum_isbn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_isbn.is_valid(value):
            print(f'Invalid ISBN: {value}')

    return field