mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
csv_metadata_quality/check.py: Fix duplicate checker
Fix the incorrect type field regex, and improve the title regex to consider dcterms.title and dc.title (along with the DSpace language variants like dc.title[en_US]), but ignore dc.title.alternative. See: https://regex101.com/r/I4m06F/1
This commit is contained in:
parent
81069259ba
commit
6ba16d5d4c
@@ -321,10 +321,16 @@ def duplicate_items(df):
|
||||
#
|
||||
# Index(['dcterms.title[en_US]'], dtype='object')
|
||||
#
|
||||
title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
|
||||
type_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
|
||||
# But, we need to consider that dc.title.alternative might come before the
|
||||
# main title in the CSV, so use a negative lookahead to eliminate that.
|
||||
#
|
||||
# See: https://regex101.com/r/elyXkW/1
|
||||
title_column_name = df.filter(
|
||||
regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
|
||||
).columns[0]
|
||||
type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
|
||||
date_column_name = df.filter(
|
||||
regex=r"dcterms\.issued|dc\.date\.accessioned"
|
||||
regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
|
||||
).columns[0]
|
||||
|
||||
items_count_total = df[title_column_name].count()
|
||||
|
Loading…
Reference in New Issue
Block a user