Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when determining if an item already exists in the data set.
2025-07-06 06:21:36 +02:00 · 2021-03-17 09:53:07 +02:00
parent 14010896a5
commit 9f2dc0a0f5
2 changed files with 53 additions and 0 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -151,6 +151,18 @@ def run(argv):
        if match is not None:
            df[column].apply(check.spdx_license_identifier)

+    ### End individual column checks ###
+
+    # Check: duplicate items
+    # We extract just the title, type, and date issued columns to analyze
+    duplicates_df = df.filter(
+        regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
+    )
+    check.duplicate_items(duplicates_df)
+
+    # Delete the temporary duplicates DataFrame
+    del duplicates_df
+
    ##
    # Perform some checks on rows so we can consider items as a whole rather
    # than simply on a field-by-field basis. This allows us to check whether