From 9f2dc0a0f52ad3c7e7db5f342999cd1643d6ed82 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Wed, 17 Mar 2021 09:53:07 +0200 Subject: [PATCH] Add support for detecting duplicate items This uses the title, type, and date issued as a sort of "key" when determining if an item already exists in the data set. --- csv_metadata_quality/app.py | 12 ++++++++++ csv_metadata_quality/check.py | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 9e81de6..47bd4dc 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -151,6 +151,18 @@ def run(argv): if match is not None: df[column].apply(check.spdx_license_identifier) + ### End individual column checks ### + + # Check: duplicate items + # We extract just the title, type, and date issued columns to analyze + duplicates_df = df.filter( + regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued" + ) + check.duplicate_items(duplicates_df) + + # Delete the temporary duplicates DataFrame + del duplicates_df + ## # Perform some checks on rows so we can consider items as a whole rather # than simple on a field-by-field basis. This allows us to check whether diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 6663df6..07b2919 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -304,3 +304,44 @@ def spdx_license_identifier(field): pass return + + +def duplicate_items(df): + """Attempt to identify duplicate items. + + First we check the total number of titles and compare it with the number of + unique titles. If there are less unique titles than total titles we expand + the search by creating a key (of sorts) for each item that includes their + title, type, and date issued, and compare it with all the others. 
If there + are multiple occurrences of the same title, type, date string then it's a + very good indicator that the items are duplicates. + """ + + # Extract the names of the title, type, and date issued columns so we can + # reference them later. First we filter columns by likely patterns, then + # we extract the name from the first item of the resulting object, ie: + # + # Index(['dcterms.title[en_US]'], dtype='object') + # + title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0] + type_column_name = df.filter(regex=r"dcterms\.type|dc\.type").columns[0] + date_column_name = df.filter( + regex=r"dcterms\.issued|dc\.date\.issued" + ).columns[0] + + items_count_total = df[title_column_name].count() + items_count_unique = df[title_column_name].nunique() + + if items_count_unique < items_count_total: + # Create a list to hold our items while we check for duplicates + items = list() + + for index, row in df.iterrows(): + item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}" + + if item_title_type_date in items: + print( + f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}" + ) + else: + items.append(item_title_type_date)