1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-20 11:03:26 +02:00

Add support for detecting duplicate items

This uses the title, type, and date issued as a sort of "key" when
determining if an item already exists in the data set.
This commit is contained in:
2021-03-17 09:53:07 +02:00
parent 14010896a5
commit 9f2dc0a0f5
2 changed files with 53 additions and 0 deletions
csv_metadata_quality

@ -151,6 +151,18 @@ def run(argv):
if match is not None:
df[column].apply(check.spdx_license_identifier)
### End individual column checks ###
# Check: duplicate items
# We extract just the title, type, and date issued columns to analyze
duplicates_df = df.filter(
regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
)
check.duplicate_items(duplicates_df)
# Delete the temporary duplicates DataFrame
del duplicates_df
##
# Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether