mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-08 06:06:00 +02:00
Add ISSN and ISBN checks using python-stdnum
This commit is contained in:
@ -1,3 +1,4 @@
|
||||
import csv_metadata_quality.check as check
|
||||
import csv_metadata_quality.fix as fix
|
||||
import pandas as pd
|
||||
|
||||
@ -13,5 +14,11 @@ def run():
|
||||
|
||||
df[column] = df[column].apply(fix.whitespace)
|
||||
|
||||
if column == 'dc.identifier.issn':
|
||||
df[column] = df[column].apply(check.issn)
|
||||
|
||||
if column == 'dc.identifier.isbn':
|
||||
df[column] = df[column].apply(check.isbn)
|
||||
|
||||
# Write
|
||||
df.to_csv('/tmp/omg.fixed.csv', index=False)
|
||||
|
42
csv_metadata_quality/check.py
Executable file
42
csv_metadata_quality/check.py
Executable file
@ -0,0 +1,42 @@
|
||||
import pandas as pd
|
||||
|
||||
def issn(field):
    """Check if an ISSN is valid.

    Prints the ISSN if invalid. A field may hold several values separated
    by "||"; each value is checked on its own.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: The cell value to check, or a missing value (NaN/None).

    Returns:
        None when the field is missing; otherwise the field, unchanged.
        Returning the field (instead of nothing) means that using this
        function with pandas' Series.apply() and assigning the result back
        to the column does not wipe out the column's contents.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import issn as stdnum_issn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_issn.is_valid(value):
            print(f'Invalid ISSN: {value}')

    return field
|
||||
|
||||
|
||||
def isbn(field):
    """Check if an ISBN is valid.

    Prints the ISBN if invalid. A field may hold several values separated
    by "||"; each value is checked on its own.

    stdnum's is_valid() function never raises an exception.

    See: https://arthurdejong.org/python-stdnum/doc/1.11/index.html#stdnum.module.is_valid

    Args:
        field: The cell value to check, or a missing value (NaN/None).

    Returns:
        None when the field is missing; otherwise the field, unchanged.
        Returning the field (instead of nothing) means that using this
        function with pandas' Series.apply() and assigning the result back
        to the column does not wipe out the column's contents.
    """

    # Skip fields with missing values: a NaN cell is a float and has no
    # split() method, so without this guard an empty cell raises
    # AttributeError.
    if pd.isna(field):
        return None

    # Imported under an alias so the module does not shadow this function's
    # own name inside its body.
    from stdnum import isbn as stdnum_isbn

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        if not stdnum_isbn.is_valid(value):
            print(f'Invalid ISBN: {value}')

    return field
|
Reference in New Issue
Block a user