diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 1183d9b..7d33965 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -4,6 +4,7 @@ import signal import sys import pandas as pd +from colorama import Fore import csv_metadata_quality.check as check import csv_metadata_quality.experimental as experimental @@ -77,7 +78,7 @@ def run(argv): if column == exclude and skip is False: skip = True if skip: - print(f"Skipping {column}") + print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}") continue diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 16a881f..f5a7eda 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -3,6 +3,7 @@ from datetime import datetime, timedelta import pandas as pd import requests import requests_cache +from colorama import Fore from pycountry import languages @@ -26,7 +27,7 @@ def issn(field): for value in field.split("||"): if not issn.is_valid(value): - print(f"Invalid ISSN: {value}") + print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") return field @@ -51,7 +52,7 @@ def isbn(field): for value in field.split("||"): if not isbn.is_valid(value): - print(f"Invalid ISBN: {value}") + print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") return field @@ -76,7 +77,9 @@ def separators(field, field_name): for value in field.split("||"): # Check if the current value is blank if value == "": - print(f"Unnecessary multi-value separator ({field_name}): {field}") + print( + f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}" + ) continue @@ -85,7 +88,9 @@ def separators(field, field_name): # Check if there was a match if match: - print(f"Invalid multi-value separator ({field_name}): {field}") + print( + f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}" + ) return field @@ -102,7 +107,7 @@ def date(field, field_name): """ if pd.isna(field): - print(f"Missing date ({field_name}).") + print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}") return @@ -111,7 +116,9 @@ def date(field, field_name): # We don't allow multi-value date fields if len(multiple_dates) > 1: - print(f"Multiple dates not allowed ({field_name}): {field}") + print( + f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" + ) return field @@ -145,7 +152,7 @@ def date(field, field_name): return field except ValueError: - print(f"Invalid date ({field_name}): {field}") + print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") return field @@ -178,9 +185,7 @@ def suspicious_characters(field, field_name): # character and spanning enough of the rest to give a preview, # but not too much to cause the line to break in terminals with # a default of 80 characters width. - suspicious_character_msg = ( - f"Suspicious character ({field_name}): {field_subset}" - ) + suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" print(f"{suspicious_character_msg:1.80}") return field @@ -205,16 +210,16 @@ def language(field): # can check it against ISO 639-1 or ISO 639-3 accordingly. if len(value) == 2: if not languages.get(alpha_2=value): - print(f"Invalid ISO 639-1 language: {value}") + print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}") pass elif len(value) == 3: if not languages.get(alpha_3=value): - print(f"Invalid ISO 639-3 language: {value}") + print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}") pass else: - print(f"Invalid language: {value}") + print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") return field @@ -256,7 +261,7 @@ def agrovoc(field, field_name): # check if there are any results if len(data["results"]) == 0: - print(f"Invalid AGROVOC ({field_name}): {value}") + print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") return field @@ -309,6 +314,6 @@ def filename_extension(field): break if filename_extension_match is False: - print(f"Filename with uncommon extension: {value}") + print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") return field diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py index de6c857..ba45273 100644 --- a/csv_metadata_quality/experimental.py +++ b/csv_metadata_quality/experimental.py @@ -1,4 +1,5 @@ import pandas as pd +from colorama import Fore def correct_language(row): @@ -10,10 +11,11 @@ def correct_language(row): language and returns the value in the language field if it does match. """ - from pycountry import languages - import langid import re + import langid + from pycountry import languages + # Initialize some variables at global scope so that we can set them in the # loop scope below and still be able to access them afterwards. language = "" @@ -83,12 +85,12 @@ def correct_language(row): detected_language = languages.get(alpha_2=langid_classification[0]) if len(language) == 2 and language != detected_language.alpha_2: print( - f"Possibly incorrect language {language} (detected {detected_language.alpha_2}): {title}" + f"{Fore.YELLOW}Possibly incorrect language {language} (detected {detected_language.alpha_2}): {Fore.RESET}{title}" ) elif len(language) == 3 and language != detected_language.alpha_3: print( - f"Possibly incorrect language {language} (detected {detected_language.alpha_3}): {title}" + f"{Fore.YELLOW}Possibly incorrect language {language} (detected {detected_language.alpha_3}): {Fore.RESET}{title}" ) else: diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index 7ef12fb..5c3da7f 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -2,6 +2,7 @@ import re from unicodedata import normalize import pandas as pd +from colorama import Fore from csv_metadata_quality.util import is_nfc @@ -29,7 +30,9 @@ def whitespace(field, field_name): match = re.findall(pattern, value) if match: - print(f"Removing excessive whitespace ({field_name}): {value}") + print( + f"{Fore.GREEN}Removing excessive whitespace ({field_name}): {Fore.RESET}{value}" + ) value = re.sub(pattern, " ", value) # Save cleaned value @@ -62,7 +65,9 @@ def separators(field, field_name): for value in field.split("||"): # Check if the value is blank and skip it if value == "": - print(f"Fixing unnecessary multi-value separator ({field_name}): {field}") + print( + f"{Fore.GREEN}Fixing unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}" + ) continue @@ -71,7 +76,9 @@ def separators(field, field_name): match = re.findall(pattern, value) if match: - print(f"Fixing invalid multi-value separator ({field_name}): {value}") + print( + f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}" + ) value = re.sub(pattern, "||", value) @@ -107,7 +114,7 @@ def unnecessary_unicode(field): match = re.findall(pattern, field) if match: - print(f"Removing unnecessary Unicode (U+200B): {field}") + print(f"{Fore.GREEN}Removing unnecessary Unicode (U+200B): {Fore.RESET}{field}") field = re.sub(pattern, "", field) # Check for replacement characters (U+FFFD) @@ -115,7 +122,7 @@ def unnecessary_unicode(field): match = re.findall(pattern, field) if match: - print(f"Removing unnecessary Unicode (U+FFFD): {field}") + print(f"{Fore.GREEN}Removing unnecessary Unicode (U+FFFD): {Fore.RESET}{field}") field = re.sub(pattern, "", field) # Check for no-break spaces (U+00A0) @@ -123,7 +130,9 @@ def unnecessary_unicode(field): match = re.findall(pattern, field) if match: - print(f"Replacing unnecessary Unicode (U+00A0): {field}") + print( + f"{Fore.GREEN}Replacing unnecessary Unicode (U+00A0): {Fore.RESET}{field}" + ) field = re.sub(pattern, " ", field) # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen @@ -131,7 +140,9 @@ def unnecessary_unicode(field): match = re.findall(pattern, field) if match: - print(f"Replacing unnecessary Unicode (U+00AD): {field}") + print( + f"{Fore.GREEN}Replacing unnecessary Unicode (U+00AD): {Fore.RESET}{field}" + ) field = re.sub(pattern, "-", field) return field @@ -156,7 +167,9 @@ def duplicates(field, field_name): if value not in new_values: new_values.append(value) else: - print(f"Removing duplicate value ({field_name}): {value}") + print( + f"{Fore.GREEN}Removing duplicate value ({field_name}): {Fore.RESET}{value}" + ) # Create a new field consisting of all values joined with "||" new_field = "||".join(new_values) @@ -189,7 +202,7 @@ def newlines(field): match = re.findall(r"\n", field) if match: - print(f"Removing newline: {field}") + print(f"{Fore.GREEN}Removing newline: {Fore.RESET}{field}") field = field.replace("\n", "") return field @@ -213,7 +226,9 @@ def comma_space(field, field_name): match = re.findall(r",\w", field) if match: - print(f"Adding space after comma ({field_name}): {field}") + print( + f"{Fore.GREEN}Adding space after comma ({field_name}): {Fore.RESET}{field}" + ) field = re.sub(r",(\w)", r", \1", field) return field @@ -234,7 +249,7 @@ def normalize_unicode(field, field_name): # Check if the current string is using normalized Unicode (NFC) if not is_nfc(field): - print(f"Normalizing Unicode ({field_name}): {field}") + print(f"{Fore.GREEN}Normalizing Unicode ({field_name}): {Fore.RESET}{field}") field = normalize("NFC", field) return field diff --git a/tests/test_check.py b/tests/test_check.py index e7f9430..4ce3c34 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -1,4 +1,5 @@ import pandas as pd +from colorama import Fore import csv_metadata_quality.check as check import csv_metadata_quality.experimental as experimental @@ -12,7 +13,7 @@ def test_check_invalid_issn(capsys): check.issn(value) captured = capsys.readouterr() - assert captured.out == f"Invalid ISSN: {value}\n" + assert captured.out == f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}\n" def test_check_valid_issn(): @@ -33,7 +34,7 @@ def test_check_invalid_isbn(capsys): check.isbn(value) captured = capsys.readouterr() - assert captured.out == f"Invalid ISBN: {value}\n" + assert captured.out == f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}\n" def test_check_valid_isbn(): @@ -56,7 +57,10 @@ def test_check_invalid_separators(capsys): check.separators(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Invalid multi-value separator ({field_name}): {value}\n" + assert ( + captured.out + == f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n" + ) def test_check_unnecessary_separators(capsys): @@ -70,7 +74,8 @@ def test_check_unnecessary_separators(capsys): captured = capsys.readouterr() assert ( - captured.out == f"Unnecessary multi-value separator ({field_name}): {field}\n" + captured.out + == f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n" ) @@ -96,7 +101,7 @@ def test_check_missing_date(capsys): check.date(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Missing date ({field_name}).\n" + assert captured.out == f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}\n" def test_check_multiple_dates(capsys): @@ -109,7 +114,10 @@ def test_check_multiple_dates(capsys): check.date(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Multiple dates not allowed ({field_name}): {value}\n" + assert ( + captured.out + == f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{value}\n" + ) def test_check_invalid_date(capsys): @@ -122,7 +130,9 @@ def test_check_invalid_date(capsys): check.date(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Invalid date ({field_name}): {value}\n" + assert ( + captured.out == f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{value}\n" + ) def test_check_valid_date(): @@ -147,7 +157,10 @@ def test_check_suspicious_characters(capsys): check.suspicious_characters(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Suspicious character ({field_name}): ˆt\n" + assert ( + captured.out + == f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}ˆt\n" + ) def test_check_valid_iso639_1_language(): @@ -178,7 +191,9 @@ def test_check_invalid_iso639_1_language(capsys): check.language(value) captured = capsys.readouterr() - assert captured.out == f"Invalid ISO 639-1 language: {value}\n" + assert ( + captured.out == f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}\n" + ) def test_check_invalid_iso639_3_language(capsys): @@ -189,7 +204,9 @@ def test_check_invalid_iso639_3_language(capsys): check.language(value) captured = capsys.readouterr() - assert captured.out == f"Invalid ISO 639-3 language: {value}\n" + assert ( + captured.out == f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}\n" + ) def test_check_invalid_language(capsys): @@ -200,7 +217,7 @@ def test_check_invalid_language(capsys): check.language(value) captured = capsys.readouterr() - assert captured.out == f"Invalid language: {value}\n" + assert captured.out == f"{Fore.RED}Invalid language: {Fore.RESET}{value}\n" def test_check_invalid_agrovoc(capsys): @@ -212,7 +229,10 @@ def test_check_invalid_agrovoc(capsys): check.agrovoc(value, field_name) captured = capsys.readouterr() - assert captured.out == f"Invalid AGROVOC ({field_name}): {value}\n" + assert ( + captured.out + == f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}\n" + ) def test_check_valid_agrovoc(): @@ -234,7 +254,10 @@ def test_check_uncommon_filename_extension(capsys): check.filename_extension(value) captured = capsys.readouterr() - assert captured.out == f"Filename with uncommon extension: {value}\n" + assert ( + captured.out + == f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}\n" + ) def test_check_common_filename_extension(): @@ -262,7 +285,7 @@ def test_check_incorrect_iso_639_1_language(capsys): captured = capsys.readouterr() assert ( captured.out - == f"Possibly incorrect language {language} (detected en): {title}\n" + == f"{Fore.YELLOW}Possibly incorrect language {language} (detected en): {Fore.RESET}{title}\n" ) @@ -281,7 +304,7 @@ def test_check_incorrect_iso_639_3_language(capsys): captured = capsys.readouterr() assert ( captured.out - == f"Possibly incorrect language {language} (detected eng): {title}\n" + == f"{Fore.YELLOW}Possibly incorrect language {language} (detected eng): {Fore.RESET}{title}\n" )