1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-22 12:12:18 +01:00

csv_metadata_quality/check.py: Fix duplicate checker

Fix the incorrect type field regex, and improve the title regex to
consider dcterms.title and dc.title (along with the DSpace language
variants like dc.title[en_US]), but ignore dc.title.alternative.

See: https://regex101.com/r/I4m06F/1
This commit is contained in:
Alan Orth 2021-10-06 19:32:40 +03:00
parent 81069259ba
commit 6ba16d5d4c
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -321,10 +321,16 @@ def duplicate_items(df):
#
# Index(['dcterms.title[en_US]'], dtype='object')
#
title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
type_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
# But, we need to consider that dc.title.alternative might come before the
# main title in the CSV, so use a negative lookahead to eliminate that.
#
# See: https://regex101.com/r/elyXkW/1
title_column_name = df.filter(
regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
).columns[0]
type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
date_column_name = df.filter(
regex=r"dcterms\.issued|dc\.date\.accessioned"
regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
).columns[0]
items_count_total = df[title_column_name].count()