Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when determining if an item already exists in the data set.
2025-07-27 00:08:03 +02:00 · 2021-03-17 09:53:07 +02:00
parent 14010896a5
commit 9f2dc0a0f5
2 changed files with 53 additions and 0 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -151,6 +151,18 @@ def run(argv):
        if match is not None:
            df[column].apply(check.spdx_license_identifier)

+    ### End individual column checks ###
+
+    # Check: duplicate items
+    # We extract just the title, type, and date issued columns to analyze
+    duplicates_df = df.filter(
+        regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
+    )
+    check.duplicate_items(duplicates_df)
+
+    # Delete the temporary duplicates DataFrame
+    del duplicates_df
+
    ##
    # Perform some checks on rows so we can consider items as a whole rather
    # than simply on a field-by-field basis. This allows us to check whether
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -304,3 +304,44 @@ def spdx_license_identifier(field):
            pass

    return
+
+
+def duplicate_items(df):
+    """Attempt to identify duplicate items.
+
+    First we check the total number of titles and compare it with the number of
+    unique titles. If there are less unique titles than total titles we expand
+    the search by creating a key (of sorts) for each item that includes their
+    title, type, and date issued, and compare it with all the others. If there
+    are multiple occurrences of the same title, type, date string then it's a
+    very good indicator that the items are duplicates.
+    """
+
+    # Extract the names of the title, type, and date issued columns so we can
+    # reference them later. First we filter columns by likely patterns, then
+    # we extract the name from the first item of the resulting object, ie:
+    #
+    #   Index(['dcterms.title[en_US]'], dtype='object')
+    #
+    title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
+    type_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
+    date_column_name = df.filter(
+        regex=r"dcterms\.issued|dc\.date\.accessioned"
+    ).columns[0]
+
+    items_count_total = df[title_column_name].count()
+    items_count_unique = df[title_column_name].nunique()
+
+    if items_count_unique < items_count_total:
+        # Create a list to hold our items while we check for duplicates
+        items = list()
+
+        for index, row in df.iterrows():
+            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"
+
+            if item_title_type_date in items:
+                print(
+                    f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}"
+                )
+            else:
+                items.append(item_title_type_date)