From 6ba16d5d4c78a86170438773b12e6591921eb04c Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Wed, 6 Oct 2021 19:32:40 +0300
Subject: [PATCH] csv_metadata_quality/check.py: Fix duplicate checker

Fix the incorrect type field regex, and improve the title regex to
consider dcterms.title and dc.title (along with the DSpace language
variants like dc.title[en_US]), but ignore dc.title.alternative.

See: https://regex101.com/r/I4m06F/1
---
 csv_metadata_quality/check.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index 5d9c461..91ea225 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -321,10 +321,16 @@ def duplicate_items(df):
     #
     # Index(['dcterms.title[en_US]'], dtype='object')
     #
-    title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
-    type_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
+    # But, we need to consider that dc.title.alternative might come before the
+    # main title in the CSV, so use a negative lookahead to eliminate that.
+    #
+    # See: https://regex101.com/r/elyXkW/1
+    title_column_name = df.filter(
+        regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
+    ).columns[0]
+    type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
     date_column_name = df.filter(
-        regex=r"dcterms\.issued|dc\.date\.accessioned"
+        regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
     ).columns[0]
 
     items_count_total = df[title_column_name].count()