From 9f2dc0a0f52ad3c7e7db5f342999cd1643d6ed82 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 17 Mar 2021 09:53:07 +0200 Subject: [PATCH] Add support for detecting duplicate items This uses the title, type, and date issued as a sort of "key" when determining if an item already exists in the data set. --- csv_metadata_quality/app.py | 12 ++++++++++ csv_metadata_quality/check.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 9e81de6..47bd4dc 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -151,6 +151,18 @@ def run(argv): if match is not None: df[column].apply(check.spdx_license_identifier) + ### End individual column checks ### + + # Check: duplicate items + # We extract just the title, type, and date issued columns to analyze + duplicates_df = df.filter( + regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued" + ) + check.duplicate_items(duplicates_df) + + # Delete the temporary duplicates DataFrame + del duplicates_df + ## # Perform some checks on rows so we can consider items as a whole rather # than simple on a field-by-field basis. This allows us to check whether diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 6663df6..07b2919 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -304,3 +304,44 @@ def spdx_license_identifier(field): pass return + + +def duplicate_items(df): + """Attempt to identify duplicate items. + + First we check the total number of titles and compare it with the number of + unique titles. If there are less unique titles than total titles we expand + the search by creating a key (of sorts) for each item that includes their + title, type, and date issued, and compare it with all the others. 
If there + are multiple occurrences of the same title, type, date string then it's a + very good indicator that the items are duplicates. + """ + + # Extract the names of the title, type, and date issued columns so we can + # reference them later. First we filter columns by likely patterns, then + # we extract the name from the first item of the resulting object, ie: + # + # Index(['dcterms.title[en_US]'], dtype='object') + # + title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0] + type_column_name = df.filter(regex=r"dcterms\.type|dc\.type").columns[0] + date_column_name = df.filter( + regex=r"dcterms\.issued|dc\.date\.issued" + ).columns[0] + + items_count_total = df[title_column_name].count() + items_count_unique = df[title_column_name].nunique() + + if items_count_unique < items_count_total: + # Create a list to hold our items while we check for duplicates + items = list() + + for index, row in df.iterrows(): + item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}" + + if item_title_type_date in items: + print( + f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}" + ) + else: + items.append(item_title_type_date)