1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-01-24 11:13:22 +01:00

Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when
determining if an item already exists in the data set.
This commit is contained in:
Alan Orth 2021-03-17 09:53:07 +02:00
parent 14010896a5
commit 9f2dc0a0f5
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 53 additions and 0 deletions

View File

@ -151,6 +151,18 @@ def run(argv):
if match is not None:
df[column].apply(check.spdx_license_identifier)
### End individual column checks ###
# Check: duplicate items
# We extract just the title, type, and date issued columns to analyze
duplicates_df = df.filter(
regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
)
check.duplicate_items(duplicates_df)
# Delete the temporary duplicates DataFrame
del duplicates_df
##
# Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether

View File

@ -304,3 +304,44 @@ def spdx_license_identifier(field):
pass
return
def duplicate_items(df):
    """Attempt to identify duplicate items.

    First we check the total number of titles and compare it with the number
    of unique titles. If there are fewer unique titles than total titles we
    expand the search by creating a key (of sorts) for each item that includes
    its title, type, and date issued, and compare it with all the others. If
    there are multiple occurrences of the same title, type, and date then it's
    a very good indicator that the items are duplicates.

    :param df: DataFrame containing (at least) a title, a type, and a date
               issued column. Column names may carry a language suffix, ie
               "dcterms.title[en_US]".
    :returns: None. A warning is printed for every row whose title, type, and
              date issued were already seen on an earlier row.
    """

    # Extract the names of the title, type, and date issued columns so we can
    # reference them later. First we filter columns by likely patterns, then
    # we extract the name from the first item of the resulting object, ie:
    #
    #   Index(['dcterms.title[en_US]'], dtype='object')
    #
    # If any of the three is absent the filtered Index is empty and indexing
    # it raises IndexError; in that case we cannot build a key, so we warn
    # and skip the check instead of aborting the whole run.
    try:
        title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
        type_column_name = df.filter(regex=r"dcterms\.type|dc\.type").columns[0]
        date_column_name = df.filter(
            regex=r"dcterms\.issued|dc\.date\.issued"
        ).columns[0]
    except IndexError:
        print(
            f"{Fore.YELLOW}Could not check for duplicates: {Fore.RESET}title, type, or date issued column missing"
        )

        return

    # Both count() and nunique() ignore missing values, so this is a cheap
    # way to tell whether any title occurs more than once.
    items_count_total = df[title_column_name].count()
    items_count_unique = df[title_column_name].nunique()

    # Every title is unique, so there cannot be any duplicate items
    if items_count_unique >= items_count_total:
        return

    # Keys of the items seen so far. A set gives constant-time membership
    # tests, and a tuple keeps the three parts of the key separate so that
    # ("ab", "c") can never collide with ("a", "bc").
    seen_items = set()

    for title, item_type, date_issued in zip(
        df[title_column_name], df[type_column_name], df[date_column_name]
    ):
        # Stringify the values so that missing values (NaN) compare equal
        item_key = (str(title), str(item_type), str(date_issued))

        if item_key in seen_items:
            print(
                f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{title}"
            )
        else:
            seen_items.add(item_key)