Mirror of https://github.com/ilri/csv-metadata-quality.git (synced 2025-05-08 06:06:00 +02:00)

Add fix for duplicate metadata values

This commit is contained in:
2019-07-29 18:05:03 +03:00
parent d7888d59a8
commit 1e444cf040
4 changed files with 38 additions and 1 deletions

View File

@@ -40,6 +40,9 @@ def main(argv):
# Run whitespace fix again after fixing invalid separators
df[column] = df[column].apply(fix.whitespace)
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates)
# Check: invalid ISSN
match = re.match(r'^.*?issn.*$', column)
if match is not None:

View File

@@ -107,3 +107,30 @@ def unnecessary_unicode(field):
field = re.sub(pattern, '', field)
return field
def duplicates(field):
    """Remove duplicate values from a multi-value metadata field.

    The field is split on the "||" separator, duplicates are dropped
    (first occurrence wins, original order preserved), and the surviving
    values are re-joined with "||". A missing value (NaN) is returned
    as None so pandas' missing-value semantics are preserved.
    """
    # Leave missing values untouched
    if pd.isna(field):
        return

    # Collect values in order, keeping only the first occurrence of each
    seen = []
    for candidate in field.split('||'):
        if candidate in seen:
            # Report the value we are about to discard
            print(f'Dropping duplicate value: {candidate}')
        else:
            seen.append(candidate)

    # Re-assemble the surviving values into a single multi-value field
    return '||'.join(seen)