mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
csv_metadata_quality/check.py: Fix duplicate checker
Fix the incorrect type field regex, and improve the title regex to consider dcterms.title and dc.title (along with the DSpace language variants like dc.title[en_US]), but ignore dc.title.alternative. See: https://regex101.com/r/I4m06F/1
This commit is contained in:
parent
81069259ba
commit
6ba16d5d4c
@ -321,10 +321,16 @@ def duplicate_items(df):
|
|||||||
#
|
#
|
||||||
# Index(['dcterms.title[en_US]'], dtype='object')
|
# Index(['dcterms.title[en_US]'], dtype='object')
|
||||||
#
|
#
|
||||||
title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
|
# But, we need to consider that dc.title.alternative might come before the
|
||||||
type_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
|
# main title in the CSV, so use a negative lookahead to eliminate that.
|
||||||
|
#
|
||||||
|
# See: https://regex101.com/r/elyXkW/1
|
||||||
|
title_column_name = df.filter(
|
||||||
|
regex=r"^(dc|dcterms)\.title(?!\.alternative).*$"
|
||||||
|
).columns[0]
|
||||||
|
type_column_name = df.filter(regex=r"^(dcterms\.type|dc\.type).*$").columns[0]
|
||||||
date_column_name = df.filter(
|
date_column_name = df.filter(
|
||||||
regex=r"dcterms\.issued|dc\.date\.accessioned"
|
regex=r"^(dcterms\.issued|dc\.date\.accessioned).*$"
|
||||||
).columns[0]
|
).columns[0]
|
||||||
|
|
||||||
items_count_total = df[title_column_name].count()
|
items_count_total = df[title_column_name].count()
|
||||||
|
Loading…
Reference in New Issue
Block a user