mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-19 04:27:03 +01:00
Add fix for duplicate metadata values
This commit is contained in:
parent
d7888d59a8
commit
1e444cf040
@ -52,7 +52,6 @@ You can enable "unsafe fixes" with the `--unsafe-fixes` option. This will attemp
|
||||
|
||||
- Reporting / summary
|
||||
- Real logging
|
||||
- Detect and fix duplicate values like "Alan||Alan"
|
||||
|
||||
## License
|
||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||
|
@ -40,6 +40,9 @@ def main(argv):
|
||||
# Run whitespace fix again after fixing invalid separators
|
||||
df[column] = df[column].apply(fix.whitespace)
|
||||
|
||||
# Fix: duplicate metadata values
|
||||
df[column] = df[column].apply(fix.duplicates)
|
||||
|
||||
# Check: invalid ISSN
|
||||
match = re.match(r'^.*?issn.*$', column)
|
||||
if match is not None:
|
||||
|
@ -107,3 +107,30 @@ def unnecessary_unicode(field):
|
||||
field = re.sub(pattern, '', field)
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def duplicates(field):
    """Remove duplicate metadata values.

    A multi-value field is split on the "||" separator and every value that
    has already been seen earlier in the same field is dropped. The first
    occurrence of each value is kept, in its original position.

    Args:
        field: The field to clean, with multiple values separated by "||".
            Missing values (as detected by ``pd.isna``) are skipped.

    Returns:
        The field re-joined with "||" with repeated values removed, or
        ``None`` when the field is missing.

    Side effects:
        Prints a message for each duplicate value that is dropped.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return None

    # Values already kept. A set gives constant-time membership tests, while
    # the list preserves first-occurrence order for the output.
    seen = set()
    new_values = []

    # Try to split multi-value field on "||" separator and walk the values
    for value in field.split('||'):
        if value in seen:
            print(f'Dropping duplicate value: {value}')
            continue

        seen.add(value)
        new_values.append(value)

    # Create a new field consisting of all kept values joined with "||"
    return '||'.join(new_values)
|
||||
|
@ -39,3 +39,11 @@ def test_fix_unnecessary_unicode():
|
||||
value = 'Alan Orth'
|
||||
|
||||
assert fix.unnecessary_unicode(value) == 'Alan Orth'
|
||||
|
||||
|
||||
def test_fix_duplicates():
    """Check that a repeated metadata value is collapsed to a single one."""

    field = 'Kenya||Kenya'
    expected = 'Kenya'

    assert fix.duplicates(field) == expected
|
||||
|
Loading…
Reference in New Issue
Block a user