Add date validation

I'm only concerned with validating issue dates here. In DSpace they
are almost always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory
they could be any valid ISO8601 format).

This also checks for cases where the date is missing and where the
metadata has specified multiple dates like "1990||1991", as this is
valid, but there is no practical value for it in our system.
This commit is contained in:
Alan Orth 2019-07-28 16:11:36 +03:00
parent 73b4061c7b
commit 196bb434fa
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 100 additions and 0 deletions

View File

@ -1,6 +1,7 @@
import csv_metadata_quality.check as check
import csv_metadata_quality.fix as fix
import pandas as pd
import re
def main():
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
@ -22,5 +23,10 @@ def main():
if column == 'dc.identifier.isbn':
df[column] = df[column].apply(check.isbn)
# check if column is a date column like dc.date.issued
match = re.match(r'^.*?date.*$', column)
if match is not None:
df[column] = df[column].apply(check.date)
# Write
df.to_csv('/tmp/test.fixed.csv', index=False)

View File

@ -73,3 +73,54 @@ def separators(field):
return field
def date(field):
    """Check if a date is valid.

    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it
    could technically even include time as long as it is ISO8601. Only the
    three date-only layouts (YYYY, YYYY-MM, YYYY-MM-DD) are accepted here.

    Also checks for other invalid cases like missing and multiple dates.

    Prints a message to stdout if the date is missing, if the field holds
    multiple "||"-separated values, or if the value matches none of the
    accepted layouts.

    Returns the field unchanged in every case. This is a check, not a fix:
    the result can be assigned back to the data (for example through
    ``Series.apply``) without losing the values that failed validation.
    """
    from datetime import datetime

    if pd.isna(field):
        print('Missing date.')

        return field

    # We don't allow multi-value date fields, even though "||" separated
    # values are otherwise valid metadata.
    if len(field.split('||')) > 1:
        print(f'Multiple dates not allowed: {field}')

        return field

    # Accept the first layout that parses; strptime raises ValueError when
    # the value does not match the layout.
    for date_format in ('%Y', '%Y-%m', '%Y-%m-%d'):
        try:
            datetime.strptime(field, date_format)
        except ValueError:
            continue

        return field

    print(f'Invalid date: {field}')

    # Hand the original value back so that reporting a problem never
    # replaces the offending value with None.
    return field

View File

@ -62,3 +62,46 @@ def test_check_valid_separators():
result = check.separators(value)
assert result == value
def test_check_missing_date(capsys):
    '''A missing (None) date must be reported on stdout.'''

    check.date(None)

    stdout = capsys.readouterr().out
    assert stdout == 'Missing date.\n'
def test_check_multiple_dates(capsys):
    '''A "||"-separated multi-value date must be reported on stdout.'''

    multi_value = '1990||1991'
    check.date(multi_value)

    stdout = capsys.readouterr().out
    assert stdout == f'Multiple dates not allowed: {multi_value}\n'
def test_check_invalid_date(capsys):
    '''A value matching no accepted date layout must be reported on stdout.'''

    malformed = '1990-0'
    check.date(malformed)

    stdout = capsys.readouterr().out
    assert stdout == f'Invalid date: {malformed}\n'
def test_check_valid_date():
    '''A plain four-digit year is valid and comes back unchanged.'''

    year = '1990'

    assert check.date(year) == year