Add fix for duplicate metadata values

Alan Orth 2019-07-29 18:05:03 +03:00
parent d7888d59a8
commit 1e444cf040
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
4 changed files with 38 additions and 1 deletion

@@ -52,7 +52,6 @@ You can enable "unsafe fixes" with the `--unsafe-fixes` option. This will attemp
 - Reporting / summary
 - Real logging
-- Detect and fix duplicate values like "Alan||Alan"
 
 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).

@@ -40,6 +40,9 @@ def main(argv):
         # Run whitespace fix again after fixing invalid separators
         df[column] = df[column].apply(fix.whitespace)
 
+        # Fix: duplicate metadata values
+        df[column] = df[column].apply(fix.duplicates)
+
         # Check: invalid ISSN
         match = re.match(r'^.*?issn.*$', column)
         if match is not None:
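For context, a minimal sketch (not part of this commit) of what the per-column apply above does, assuming the package is importable as `csv_metadata_quality.fix` and using a made-up column name:

import pandas as pd

import csv_metadata_quality.fix as fix  # assumed import path, as used by the application code

# Hypothetical single-column frame with a duplicated multi-value field and a missing value
df = pd.DataFrame({'dc.contributor.author': ['Alan||Alan', 'Kenya||Tanzania', None]})

# Same pattern as the hunk above: run the fix element-wise over the column
df['dc.contributor.author'] = df['dc.contributor.author'].apply(fix.duplicates)

print(df['dc.contributor.author'].tolist())
# ['Alan', 'Kenya||Tanzania', None]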

@@ -107,3 +107,30 @@ def unnecessary_unicode(field):
         field = re.sub(pattern, '', field)
 
     return field
+
+
+def duplicates(field):
+    """Remove duplicate metadata values."""
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    values = field.split('||')
+
+    # Initialize an empty list to hold the de-duplicated values
+    new_values = list()
+
+    # Iterate over all values
+    for value in values:
+        # Check if each value exists in our list of values already
+        if value not in new_values:
+            new_values.append(value)
+        else:
+            print(f'Dropping duplicate value: {value}')
+
+    # Create a new field consisting of all values joined with "||"
+    new_field = '||'.join(new_values)
+
+    return new_field
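A quick usage sketch for the new function itself (illustrative only, not part of the diff), again assuming it is importable as `fix.duplicates`:

import csv_metadata_quality.fix as fix  # assumed import path

fix.duplicates('Alan||Alan')        # prints "Dropping duplicate value: Alan" and returns 'Alan'
fix.duplicates('Kenya||Tanzania')   # distinct values pass through unchanged: 'Kenya||Tanzania'
fix.duplicates('Alan||Orth||Alan')  # first occurrence wins, order preserved: 'Alan||Orth'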

@@ -39,3 +39,11 @@ def test_fix_unnecessary_unicode():
     value = 'Alan Orth'
 
     assert fix.unnecessary_unicode(value) == 'Alan Orth'
+
+
+def test_fix_duplicates():
+    '''Test fixing duplicate metadata values.'''
+
+    value = 'Kenya||Kenya'
+
+    assert fix.duplicates(value) == 'Kenya'
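A couple of extra assertions one might add in the same style (a sketch, not included in this commit), covering the no-duplicate and missing-value paths; the test names are hypothetical and the import path is assumed:

import csv_metadata_quality.fix as fix  # assumed import, as in the existing test module


def test_fix_duplicates_preserves_distinct_values():
    '''Distinct multi-values should be kept and re-joined unchanged.'''
    value = 'Kenya||Tanzania'
    assert fix.duplicates(value) == 'Kenya||Tanzania'


def test_fix_duplicates_skips_missing_values():
    '''Missing (NaN) values should be skipped, returning None.'''
    value = float('nan')
    assert fix.duplicates(value) is None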