From 196bb434faf87cb19bd99a5e27b8eb97e587aa24 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 28 Jul 2019 16:11:36 +0300 Subject: [PATCH] Add date validation I'm only concerned with validating issue dates here. In DSpace they are almost always YYYY, YYYY-MM, or YYYY-MM-DD (though in theory they could be any valid ISO8601 format). This also checks for cases where the date is missing and where the metadata has specified multiple dates like "1990||1991", as this is valid, but there is no practical value for it in our system. --- csv_metadata_quality/app.py | 6 +++++ csv_metadata_quality/check.py | 51 +++++++++++++++++++++++++++++++++++ tests/test_check.py | 43 +++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 13dbcc0..7b5f3e7 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -1,6 +1,7 @@ import csv_metadata_quality.check as check import csv_metadata_quality.fix as fix import pandas as pd +import re def main(): # Read all fields as strings so dates don't get converted from 1998 to 1998.0 @@ -22,5 +23,10 @@ def main(): if column == 'dc.identifier.isbn': df[column] = df[column].apply(check.isbn) + # check if column is a date column like dc.date.issued + match = re.match(r'^.*?date.*$', column) + if match is not None: + df[column] = df[column].apply(check.date) + # Write df.to_csv('/tmp/test.fixed.csv', index=False) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 559d7e6..0527153 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -73,3 +73,54 @@ def separators(field): return field + + +def date(field): + """Check if a date is valid. + + In DSpace the issue date is usually 1990, 1990-01, or 1990-01-01, but it + could technically even include time as long as it is ISO8601. + + Also checks for other invalid cases like missing and multiple dates. + + Prints the date if invalid. 
+ """ + from datetime import datetime + + if pd.isna(field): + print(f'Missing date.') + + return + + # Try to split multi-value field on "||" separator + multiple_dates = field.split('||') + + # We don't allow multi-value date fields + if len(multiple_dates) > 1: + print(f'Multiple dates not allowed: {field}') + + return field + + try: + # Check if date is valid YYYY format + datetime.strptime(field, '%Y') + + return field + except ValueError: + pass + + try: + # Check if date is valid YYYY-MM format + datetime.strptime(field, '%Y-%m') + + return field + except ValueError: + pass + + try: + # Check if date is valid YYYY-MM-DD format + datetime.strptime(field, '%Y-%m-%d') + + return field + except ValueError: + print(f'Invalid date: {field}') diff --git a/tests/test_check.py b/tests/test_check.py index 1f85eb9..d04b221 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -62,3 +62,46 @@ def test_check_valid_separators(): result = check.separators(value) assert result == value + + +def test_check_missing_date(capsys): + '''Test checking missing date.''' + + value = None + + check.date(value) + + captured = capsys.readouterr() + assert captured.out == f'Missing date.\n' + + +def test_check_multiple_dates(capsys): + '''Test checking multiple dates.''' + + value = '1990||1991' + + check.date(value) + + captured = capsys.readouterr() + assert captured.out == f'Multiple dates not allowed: {value}\n' + + +def test_check_invalid_date(capsys): + '''Test checking invalid ISO8601 date.''' + + value = '1990-0' + + check.date(value) + + captured = capsys.readouterr() + assert captured.out == f'Invalid date: {value}\n' + + +def test_check_valid_date(): + '''Test checking valid ISO8601 date.''' + + value = '1990' + + result = check.date(value) + + assert result == value