mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
Add date validation
I'm only concerned with validating issue dates here. In DSpace they are generally always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory they could be any valid ISO8601 format). This also checks for cases where the date is missing and where the metadata has specified multiple dates like "1990||1991", as this is valid, but there is no practical value for it in our system.
This commit is contained in:
parent
73b4061c7b
commit
196bb434fa
@ -1,6 +1,7 @@
|
||||
import csv_metadata_quality.check as check
|
||||
import csv_metadata_quality.fix as fix
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
def main():
|
||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||
@ -22,5 +23,10 @@ def main():
|
||||
if column == 'dc.identifier.isbn':
|
||||
df[column] = df[column].apply(check.isbn)
|
||||
|
||||
# check if column is a date column like dc.date.issued
|
||||
match = re.match(r'^.*?date.*$', column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.date)
|
||||
|
||||
# Write
|
||||
df.to_csv('/tmp/test.fixed.csv', index=False)
|
||||
|
@ -73,3 +73,54 @@ def separators(field):
|
||||
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def date(field):
    """Check if a date is valid.

    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
    could technically even include time as long as it is ISO8601. Only the
    YYYY, YYYY-MM, and YYYY-MM-DD forms are accepted here.

    Also checks for other invalid cases like missing and multiple dates.

    Prints a message if the date is missing, multi-valued, or invalid.

    Args:
        field: The raw cell value: a string, or a missing value (None/NaN).

    Returns:
        None if the value is missing; otherwise ``field`` unchanged. The
        value is returned even when it is invalid so that applying this
        check to a DataFrame column never erases the original data.
    """
    from datetime import datetime

    if pd.isna(field):
        print('Missing date.')

        return None

    # We don't allow multi-value date fields ("||" is the value separator)
    if '||' in field:
        print(f'Multiple dates not allowed: {field}')

        return field

    # Accept the first format that parses: YYYY, YYYY-MM, or YYYY-MM-DD
    for date_format in ('%Y', '%Y-%m', '%Y-%m-%d'):
        try:
            datetime.strptime(field, date_format)
        except ValueError:
            continue

        return field

    print(f'Invalid date: {field}')

    # Report only: hand the value back so the caller keeps the original text
    return field
|
||||
|
@ -62,3 +62,46 @@ def test_check_valid_separators():
|
||||
result = check.separators(value)
|
||||
|
||||
assert result == value
|
||||
|
||||
|
||||
def test_check_missing_date(capsys):
    '''A missing (None) date must be reported on stdout.'''

    check.date(None)

    stdout, _ = capsys.readouterr()
    assert stdout == f'Missing date.\n'
|
||||
|
||||
|
||||
def test_check_multiple_dates(capsys):
    '''A "||"-separated multi-value date must be reported on stdout.'''

    multi_value = '1990||1991'
    check.date(multi_value)

    stdout, _ = capsys.readouterr()
    assert stdout == f'Multiple dates not allowed: {multi_value}\n'
|
||||
|
||||
|
||||
def test_check_invalid_date(capsys):
    '''A malformed ISO8601 date must be reported on stdout.'''

    bad_date = '1990-0'
    check.date(bad_date)

    stdout, _ = capsys.readouterr()
    assert stdout == f'Invalid date: {bad_date}\n'
|
||||
|
||||
|
||||
def test_check_valid_date():
    '''A valid ISO8601 date must be returned unchanged.'''

    good_date = '1990'

    assert check.date(good_date) == good_date
|
||||
|
Loading…
Reference in New Issue
Block a user