diff --git a/README.md b/README.md
index e06b6ff..e35a6fa 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,6 @@ You can enable "unsafe fixes" with the `--unsafe-fixes` option. This will attemp
 
 - Reporting / summary
 - Real logging
-- Detect and fix duplicate values like "Alan||Alan"
 
 ## License
 This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 3d7745a..c80940a 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -40,6 +40,9 @@ def main(argv):
             # Run whitespace fix again after fixing invalid separators
             df[column] = df[column].apply(fix.whitespace)
 
+        # Fix: duplicate metadata values
+        df[column] = df[column].apply(fix.duplicates)
+
         # Check: invalid ISSN
         match = re.match(r'^.*?issn.*$', column)
         if match is not None:
diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py
index ac2c7b2..2d40558 100755
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -107,3 +107,36 @@ def unnecessary_unicode(field):
         field = re.sub(pattern, '', field)
 
     return field
+
+
+def duplicates(field):
+    """Remove duplicate metadata values.
+
+    Split a multi-value field on the "||" separator and drop every value
+    that has already been seen, keeping the first occurrence of each value
+    in its original position.
+
+    Return None for missing values, otherwise the de-duplicated values
+    joined with "||".
+    """
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Track values we have already seen in a set (constant-time lookups) and
+    # keep a list alongside it to preserve the original order
+    seen = set()
+    new_values = []
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+        if value in seen:
+            print(f'Dropping duplicate value: {value}')
+            continue
+
+        seen.add(value)
+        new_values.append(value)
+
+    # Create a new field consisting of all values joined with "||"
+    return '||'.join(new_values)
diff --git a/tests/test_fix.py b/tests/test_fix.py
index e02514a..421aa41 100644
--- a/tests/test_fix.py
+++ b/tests/test_fix.py
@@ -39,3 +39,19 @@ def test_fix_unnecessary_unicode():
     value = 'Alan​ Orth'
 
     assert fix.unnecessary_unicode(value) == 'Alan Orth'
+
+
+def test_fix_duplicates():
+    '''Test fixing duplicate metadata values.'''
+
+    value = 'Kenya||Kenya'
+
+    assert fix.duplicates(value) == 'Kenya'
+
+
+def test_fix_duplicates_preserves_order():
+    '''Test that de-duplication keeps the first occurrence of each value.'''
+
+    value = 'Kenya||Uganda||Kenya||Tanzania||Uganda'
+
+    assert fix.duplicates(value) == 'Kenya||Uganda||Tanzania'