mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-01-24 19:23:22 +01:00
Add support for detecting duplicate items
This uses the title, type, and date issued as a sort of "key" when determining if an item already exists in the data set.
This commit is contained in:
parent
14010896a5
commit
9f2dc0a0f5
@ -151,6 +151,18 @@ def run(argv):
|
|||||||
if match is not None:
|
if match is not None:
|
||||||
df[column].apply(check.spdx_license_identifier)
|
df[column].apply(check.spdx_license_identifier)
|
||||||
|
|
||||||
|
### End individual column checks ###
|
||||||
|
|
||||||
|
# Check: duplicate items
|
||||||
|
# We extract just the title, type, and date issued columns to analyze
|
||||||
|
duplicates_df = df.filter(
|
||||||
|
regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
|
||||||
|
)
|
||||||
|
check.duplicate_items(duplicates_df)
|
||||||
|
|
||||||
|
# Delete the temporary duplicates DataFrame
|
||||||
|
del duplicates_df
|
||||||
|
|
||||||
##
|
##
|
||||||
# Perform some checks on rows so we can consider items as a whole rather
|
# Perform some checks on rows so we can consider items as a whole rather
|
||||||
# than simple on a field-by-field basis. This allows us to check whether
|
# than simple on a field-by-field basis. This allows us to check whether
|
||||||
|
@ -304,3 +304,44 @@ def spdx_license_identifier(field):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def duplicate_items(df):
    """Attempt to identify duplicate items.

    First we check the total number of titles and compare it with the number of
    unique titles. If there are fewer unique titles than total titles we expand
    the search by creating a key (of sorts) for each item that includes their
    title, type, and date issued, and compare it with all the others. If there
    are multiple occurrences of the same title, type, date string then it's a
    very good indicator that the items are duplicates.

    Prints a warning for each suspected duplicate item; returns None.
    """

    # Extract the names of the title, type, and date issued columns so we can
    # reference them later. First we filter columns by likely patterns, then
    # we extract the name from the first item of the resulting object, ie:
    #
    #   Index(['dcterms.title[en_US]'], dtype='object')
    #
    # NOTE: .columns[0] raises IndexError if no column matches — the caller is
    # expected to pass a frame pre-filtered to title/type/date-issued columns.
    title_column_name = df.filter(regex=r"dcterms\.title|dc\.title").columns[0]
    # Fix: match the *type* column with the type patterns — the original
    # reused the title regex here, making the key title+title+date, so items
    # sharing a title but differing in type were falsely flagged.
    type_column_name = df.filter(regex=r"dcterms\.type|dc\.type").columns[0]
    # Fix: match dc.date.issued, which is what the caller extracts into this
    # frame — dc.date.accessioned is never present here, so the old regex
    # raised IndexError for datasets using the dc.date.issued field.
    date_column_name = df.filter(
        regex=r"dcterms\.issued|dc\.date\.issued"
    ).columns[0]

    # count() excludes missing titles; nunique() counts distinct non-null
    # titles — if they differ, at least one title repeats.
    items_count_total = df[title_column_name].count()
    items_count_unique = df[title_column_name].nunique()

    # Only do the row-by-row pass when some title actually repeats.
    if items_count_unique < items_count_total:
        # Composite keys seen so far. A set gives O(1) membership checks
        # instead of the original list's O(n) scan per row.
        items = set()

        for _, row in df.iterrows():
            # The "key" for this item: title + type + date issued.
            item_title_type_date = f"{row[title_column_name]}{row[type_column_name]}{row[date_column_name]}"

            if item_title_type_date in items:
                print(
                    f"{Fore.YELLOW}Possible duplicate ({title_column_name}): {Fore.RESET}{row[title_column_name]}"
                )
            else:
                items.add(item_title_type_date)
Loading…
x
Reference in New Issue
Block a user