diff --git a/README.md b/README.md index 10c108e..de9b33d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Requires Python 3.6 or greater (3.8 recommended). CSV and Excel support comes fr - Experimental validation of titles and abstracts against item's Dublin Core language field - Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option) - Fix leading, trailing, and excessive (ie, more than one) whitespace -- Fix invalid multi-value separators (`|`) using `--unsafe-fixes` +- Fix invalid and unnecessary multi-value separators (`|`) using `--unsafe-fixes` - Fix problematic newlines (line feeds) using `--unsafe-fixes` - Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc - Check for "suspicious" characters that indicate encoding or copy/paste issues, for example "foreˆt" should be "forêt" @@ -56,6 +56,8 @@ You can enable several "unsafe" fixes with the `--unsafe-fixes` option. Currentl ### Invalid Multi-Value Separators This is considered "unsafe" because it is *theoretically* possible for a single `|` character to be used legitimately in a metadata value, though in my experience it is always a typo. For example, if a user mistakenly writes `Kenya|Tanzania` when attempting to indicate two countries, the result will be one metadata value with the literal text `Kenya|Tanzania`. The `--unsafe-fixes` option will correct the invalid multi-value separator so that there are two metadata values, ie `Kenya||Tanzania`. +This will also remove unnecessary trailing multi-value separators, so that, for example, `Kenya||Tanzania||` becomes `Kenya||Tanzania`. + ### Newlines This is considered "unsafe" because some systems give special importance to vertical space and render it properly. 
DSpace does not support rendering newlines in its XMLUI and has, at times, suffered from parsing errors that cause the import process to fail if an input file had newlines. The `--unsafe-fixes` option strips Unix line feeds (U+000A). diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index b348eb0..1183d9b 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -103,13 +103,13 @@ def run(argv): # Fix: unnecessary Unicode df[column] = df[column].apply(fix.unnecessary_unicode) - # Check: invalid multi-value separator + # Check: invalid and unnecessary multi-value separators df[column] = df[column].apply(check.separators, field_name=column) # Check: suspicious characters df[column] = df[column].apply(check.suspicious_characters, field_name=column) - # Fix: invalid multi-value separator + # Fix: invalid and unnecessary multi-value separators if args.unsafe_fixes: df[column] = df[column].apply(fix.separators, field_name=column) # Run whitespace fix again after fixing invalid separators diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 4336dcd..ef43a21 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -57,7 +57,11 @@ def isbn(field): def separators(field, field_name): - """Check for invalid multi-value separators (ie "|" or "|||"). + """Check for invalid and unnecessary multi-value separators, for example: + + value|value + value|||value + value||value|| Prints the field with the invalid multi-value separator. 
""" @@ -70,10 +74,16 @@ def separators(field, field_name): # Try to split multi-value field on "||" separator for value in field.split("||"): + # Check if the current value is blank + if value == "": + print(f"Unnecessary multi-value separator ({field_name}): {field}") + + continue # After splitting, see if there are any remaining "|" characters match = re.findall(r"^.*?\|.*$", value) + # Check if there was a match if match: print(f"Invalid multi-value separator ({field_name}): {field}") diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index c2c232d..7ef12fb 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -42,7 +42,14 @@ def whitespace(field, field_name): def separators(field, field_name): - """Fix for invalid multi-value separators (ie "|").""" + """Fix for invalid and unnecessary multi-value separators, for example: + + value|value + value|||value + value||value|| + + Prints the field with the invalid multi-value separator. + """ # Skip fields with missing values if pd.isna(field): @@ -53,6 +60,12 @@ def separators(field, field_name): # Try to split multi-value field on "||" separator for value in field.split("||"): + # Check if the value is blank and skip it + if value == "": + print(f"Fixing unnecessary multi-value separator ({field_name}): {field}") + + continue + # After splitting, see if there are any remaining "|" characters pattern = re.compile(r"\|") match = re.findall(pattern, value)