mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-12 16:16:02 +02:00
Add date validation
I'm only concerned with validating issue dates here. In DSpace they are generally YYYY, YYYY-MM, or YYYY-MM-DD (though in theory they could be any valid ISO8601 format). This also checks for cases where the date is missing and where the metadata specifies multiple dates like "1990||1991"; this is valid, but it has no practical value in our system.
This commit is contained in:
@ -73,3 +73,54 @@ def separators(field):
|
||||
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def date(field):
    """Check if a date is valid.

    In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01. Only
    those three shapes are accepted here: the value must parse with
    ``datetime.strptime`` as ``%Y``, ``%Y-%m``, or ``%Y-%m-%d``.

    Also checks for other invalid cases like missing and multiple dates.

    Prints a message if the date is missing, multi-valued, or invalid. This
    function only reports problems; it never modifies the value.

    Args:
        field: The date value to check. A missing value is anything that
            ``pd.isna`` reports as NA; otherwise a string is expected.

    Returns:
        None if the date is missing, otherwise ``field`` unchanged (whether
        or not it turned out to be valid).
    """
    from datetime import datetime

    if pd.isna(field):
        print('Missing date.')

        return

    # We don't allow multi-value date fields ("||" is the value separator)
    if '||' in field:
        print(f'Multiple dates not allowed: {field}')

        return field

    # Try the accepted formats from least to most specific; the first one
    # that parses means the date is valid.
    for date_format in ('%Y', '%Y-%m', '%Y-%m-%d'):
        try:
            datetime.strptime(field, date_format)
        except ValueError:
            continue

        return field

    print(f'Invalid date: {field}')

    # Hand the value back untouched so that a caller assigning the result
    # does not replace an invalid date with None.
    return field
|
||||
|
Reference in New Issue
Block a user