1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-10-18 14:34:17 +02:00

Compare commits

..

2 Commits

Author SHA1 Message Date
a7fc5a246c
Colorize output
Some checks failed
continuous-integration/drone/push Build is failing
Messages will be colorized:

- Red for errors
- Yellow for warnings or information
- Green for fixes
2021-02-21 13:01:25 +02:00
7fb8acb866
Add colorama for colored output
Red for errors, yellow for warnings or information, and green for
fixes.
2021-02-21 13:00:31 +02:00
7 changed files with 95 additions and 48 deletions

View File

@ -4,6 +4,7 @@ import signal
import sys import sys
import pandas as pd import pandas as pd
from colorama import Fore
import csv_metadata_quality.check as check import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental import csv_metadata_quality.experimental as experimental
@ -77,7 +78,7 @@ def run(argv):
if column == exclude and skip is False: if column == exclude and skip is False:
skip = True skip = True
if skip: if skip:
print(f"Skipping {column}") print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
continue continue

View File

@ -3,6 +3,7 @@ from datetime import datetime, timedelta
import pandas as pd import pandas as pd
import requests import requests
import requests_cache import requests_cache
from colorama import Fore
from pycountry import languages from pycountry import languages
@ -26,7 +27,7 @@ def issn(field):
for value in field.split("||"): for value in field.split("||"):
if not issn.is_valid(value): if not issn.is_valid(value):
print(f"Invalid ISSN: {value}") print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
return field return field
@ -51,7 +52,7 @@ def isbn(field):
for value in field.split("||"): for value in field.split("||"):
if not isbn.is_valid(value): if not isbn.is_valid(value):
print(f"Invalid ISBN: {value}") print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
return field return field
@ -76,7 +77,9 @@ def separators(field, field_name):
for value in field.split("||"): for value in field.split("||"):
# Check if the current value is blank # Check if the current value is blank
if value == "": if value == "":
print(f"Unnecessary multi-value separator ({field_name}): {field}") print(
f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}"
)
continue continue
@ -85,7 +88,9 @@ def separators(field, field_name):
# Check if there was a match # Check if there was a match
if match: if match:
print(f"Invalid multi-value separator ({field_name}): {field}") print(
f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}"
)
return field return field
@ -102,7 +107,7 @@ def date(field, field_name):
""" """
if pd.isna(field): if pd.isna(field):
print(f"Missing date ({field_name}).") print(f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}")
return return
@ -111,7 +116,9 @@ def date(field, field_name):
# We don't allow multi-value date fields # We don't allow multi-value date fields
if len(multiple_dates) > 1: if len(multiple_dates) > 1:
print(f"Multiple dates not allowed ({field_name}): {field}") print(
f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
)
return field return field
@ -145,7 +152,7 @@ def date(field, field_name):
return field return field
except ValueError: except ValueError:
print(f"Invalid date ({field_name}): {field}") print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
return field return field
@ -178,9 +185,7 @@ def suspicious_characters(field, field_name):
# character and spanning enough of the rest to give a preview, # character and spanning enough of the rest to give a preview,
# but not too much to cause the line to break in terminals with # but not too much to cause the line to break in terminals with
# a default of 80 characters width. # a default of 80 characters width.
suspicious_character_msg = ( suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
f"Suspicious character ({field_name}): {field_subset}"
)
print(f"{suspicious_character_msg:1.80}") print(f"{suspicious_character_msg:1.80}")
return field return field
@ -205,16 +210,16 @@ def language(field):
# can check it against ISO 639-1 or ISO 639-3 accordingly. # can check it against ISO 639-1 or ISO 639-3 accordingly.
if len(value) == 2: if len(value) == 2:
if not languages.get(alpha_2=value): if not languages.get(alpha_2=value):
print(f"Invalid ISO 639-1 language: {value}") print(f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}")
pass pass
elif len(value) == 3: elif len(value) == 3:
if not languages.get(alpha_3=value): if not languages.get(alpha_3=value):
print(f"Invalid ISO 639-3 language: {value}") print(f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}")
pass pass
else: else:
print(f"Invalid language: {value}") print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
return field return field
@ -256,7 +261,7 @@ def agrovoc(field, field_name):
# check if there are any results # check if there are any results
if len(data["results"]) == 0: if len(data["results"]) == 0:
print(f"Invalid AGROVOC ({field_name}): {value}") print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
return field return field
@ -309,6 +314,6 @@ def filename_extension(field):
break break
if filename_extension_match is False: if filename_extension_match is False:
print(f"Filename with uncommon extension: {value}") print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
return field return field

View File

@ -1,4 +1,5 @@
import pandas as pd import pandas as pd
from colorama import Fore
def correct_language(row): def correct_language(row):
@ -10,10 +11,11 @@ def correct_language(row):
language and returns the value in the language field if it does match. language and returns the value in the language field if it does match.
""" """
from pycountry import languages
import langid
import re import re
import langid
from pycountry import languages
# Initialize some variables at global scope so that we can set them in the # Initialize some variables at global scope so that we can set them in the
# loop scope below and still be able to access them afterwards. # loop scope below and still be able to access them afterwards.
language = "" language = ""
@ -83,12 +85,12 @@ def correct_language(row):
detected_language = languages.get(alpha_2=langid_classification[0]) detected_language = languages.get(alpha_2=langid_classification[0])
if len(language) == 2 and language != detected_language.alpha_2: if len(language) == 2 and language != detected_language.alpha_2:
print( print(
f"Possibly incorrect language {language} (detected {detected_language.alpha_2}): {title}" f"{Fore.YELLOW}Possibly incorrect language {language} (detected {detected_language.alpha_2}): {Fore.RESET}{title}"
) )
elif len(language) == 3 and language != detected_language.alpha_3: elif len(language) == 3 and language != detected_language.alpha_3:
print( print(
f"Possibly incorrect language {language} (detected {detected_language.alpha_3}): {title}" f"{Fore.YELLOW}Possibly incorrect language {language} (detected {detected_language.alpha_3}): {Fore.RESET}{title}"
) )
else: else:

View File

@ -2,6 +2,7 @@ import re
from unicodedata import normalize from unicodedata import normalize
import pandas as pd import pandas as pd
from colorama import Fore
from csv_metadata_quality.util import is_nfc from csv_metadata_quality.util import is_nfc
@ -29,7 +30,9 @@ def whitespace(field, field_name):
match = re.findall(pattern, value) match = re.findall(pattern, value)
if match: if match:
print(f"Removing excessive whitespace ({field_name}): {value}") print(
f"{Fore.GREEN}Removing excessive whitespace ({field_name}): {Fore.RESET}{value}"
)
value = re.sub(pattern, " ", value) value = re.sub(pattern, " ", value)
# Save cleaned value # Save cleaned value
@ -62,7 +65,9 @@ def separators(field, field_name):
for value in field.split("||"): for value in field.split("||"):
# Check if the value is blank and skip it # Check if the value is blank and skip it
if value == "": if value == "":
print(f"Fixing unnecessary multi-value separator ({field_name}): {field}") print(
f"{Fore.GREEN}Fixing unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}"
)
continue continue
@ -71,7 +76,9 @@ def separators(field, field_name):
match = re.findall(pattern, value) match = re.findall(pattern, value)
if match: if match:
print(f"Fixing invalid multi-value separator ({field_name}): {value}") print(
f"{Fore.RED}Fixing invalid multi-value separator ({field_name}): {Fore.RESET}{value}"
)
value = re.sub(pattern, "||", value) value = re.sub(pattern, "||", value)
@ -107,7 +114,7 @@ def unnecessary_unicode(field):
match = re.findall(pattern, field) match = re.findall(pattern, field)
if match: if match:
print(f"Removing unnecessary Unicode (U+200B): {field}") print(f"{Fore.GREEN}Removing unnecessary Unicode (U+200B): {Fore.RESET}{field}")
field = re.sub(pattern, "", field) field = re.sub(pattern, "", field)
# Check for replacement characters (U+FFFD) # Check for replacement characters (U+FFFD)
@ -115,7 +122,7 @@ def unnecessary_unicode(field):
match = re.findall(pattern, field) match = re.findall(pattern, field)
if match: if match:
print(f"Removing unnecessary Unicode (U+FFFD): {field}") print(f"{Fore.GREEN}Removing unnecessary Unicode (U+FFFD): {Fore.RESET}{field}")
field = re.sub(pattern, "", field) field = re.sub(pattern, "", field)
# Check for no-break spaces (U+00A0) # Check for no-break spaces (U+00A0)
@ -123,7 +130,9 @@ def unnecessary_unicode(field):
match = re.findall(pattern, field) match = re.findall(pattern, field)
if match: if match:
print(f"Replacing unnecessary Unicode (U+00A0): {field}") print(
f"{Fore.GREEN}Replacing unnecessary Unicode (U+00A0): {Fore.RESET}{field}"
)
field = re.sub(pattern, " ", field) field = re.sub(pattern, " ", field)
# Check for soft hyphens (U+00AD), sometimes preceded with a normal hyphen # Check for soft hyphens (U+00AD), sometimes preceded with a normal hyphen
@ -131,7 +140,9 @@ def unnecessary_unicode(field):
match = re.findall(pattern, field) match = re.findall(pattern, field)
if match: if match:
print(f"Replacing unnecessary Unicode (U+00AD): {field}") print(
f"{Fore.GREEN}Replacing unnecessary Unicode (U+00AD): {Fore.RESET}{field}"
)
field = re.sub(pattern, "-", field) field = re.sub(pattern, "-", field)
return field return field
@ -156,7 +167,9 @@ def duplicates(field, field_name):
if value not in new_values: if value not in new_values:
new_values.append(value) new_values.append(value)
else: else:
print(f"Removing duplicate value ({field_name}): {value}") print(
f"{Fore.GREEN}Removing duplicate value ({field_name}): {Fore.RESET}{value}"
)
# Create a new field consisting of all values joined with "||" # Create a new field consisting of all values joined with "||"
new_field = "||".join(new_values) new_field = "||".join(new_values)
@ -189,7 +202,7 @@ def newlines(field):
match = re.findall(r"\n", field) match = re.findall(r"\n", field)
if match: if match:
print(f"Removing newline: {field}") print(f"{Fore.GREEN}Removing newline: {Fore.RESET}{field}")
field = field.replace("\n", "") field = field.replace("\n", "")
return field return field
@ -213,7 +226,9 @@ def comma_space(field, field_name):
match = re.findall(r",\w", field) match = re.findall(r",\w", field)
if match: if match:
print(f"Adding space after comma ({field_name}): {field}") print(
f"{Fore.GREEN}Adding space after comma ({field_name}): {Fore.RESET}{field}"
)
field = re.sub(r",(\w)", r", \1", field) field = re.sub(r",(\w)", r", \1", field)
return field return field
@ -234,7 +249,7 @@ def normalize_unicode(field, field_name):
# Check if the current string is using normalized Unicode (NFC) # Check if the current string is using normalized Unicode (NFC)
if not is_nfc(field): if not is_nfc(field):
print(f"Normalizing Unicode ({field_name}): {field}") print(f"{Fore.GREEN}Normalizing Unicode ({field_name}): {Fore.RESET}{field}")
field = normalize("NFC", field) field = normalize("NFC", field)
return field return field

4
poetry.lock generated
View File

@ -159,7 +159,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
name = "colorama" name = "colorama"
version = "0.4.4" version = "0.4.4"
description = "Cross-platform colored terminal text." description = "Cross-platform colored terminal text."
category = "dev" category = "main"
optional = false optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
@ -765,7 +765,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.8" python-versions = "^3.8"
content-hash = "63f2c6ef09652c4f8407660ff7b4690c8a07e5501eb8fc8c477f485de5888fcf" content-hash = "8c4ba410bbdc930d2d74f7864470a18827029a5697869833959708d7425460ae"
[metadata.files] [metadata.files]
agate = [ agate = [

View File

@ -16,6 +16,7 @@ requests = "^2.23.0"
requests-cache = "^0.5.2" requests-cache = "^0.5.2"
pycountry = "^19.8.18" pycountry = "^19.8.18"
langid = "^1.1.6" langid = "^1.1.6"
colorama = "^0.4.4"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pytest = "^6.1.1" pytest = "^6.1.1"

View File

@ -1,4 +1,5 @@
import pandas as pd import pandas as pd
from colorama import Fore
import csv_metadata_quality.check as check import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental import csv_metadata_quality.experimental as experimental
@ -12,7 +13,7 @@ def test_check_invalid_issn(capsys):
check.issn(value) check.issn(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid ISSN: {value}\n" assert captured.out == f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}\n"
def test_check_valid_issn(): def test_check_valid_issn():
@ -33,7 +34,7 @@ def test_check_invalid_isbn(capsys):
check.isbn(value) check.isbn(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid ISBN: {value}\n" assert captured.out == f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}\n"
def test_check_valid_isbn(): def test_check_valid_isbn():
@ -56,7 +57,10 @@ def test_check_invalid_separators(capsys):
check.separators(value, field_name) check.separators(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid multi-value separator ({field_name}): {value}\n" assert (
captured.out
== f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_unnecessary_separators(capsys): def test_check_unnecessary_separators(capsys):
@ -70,7 +74,8 @@ def test_check_unnecessary_separators(capsys):
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
captured.out == f"Unnecessary multi-value separator ({field_name}): {field}\n" captured.out
== f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n"
) )
@ -96,7 +101,7 @@ def test_check_missing_date(capsys):
check.date(value, field_name) check.date(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Missing date ({field_name}).\n" assert captured.out == f"{Fore.RED}Missing date ({field_name}).{Fore.RESET}\n"
def test_check_multiple_dates(capsys): def test_check_multiple_dates(capsys):
@ -109,7 +114,10 @@ def test_check_multiple_dates(capsys):
check.date(value, field_name) check.date(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Multiple dates not allowed ({field_name}): {value}\n" assert (
captured.out
== f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_invalid_date(capsys): def test_check_invalid_date(capsys):
@ -122,7 +130,9 @@ def test_check_invalid_date(capsys):
check.date(value, field_name) check.date(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid date ({field_name}): {value}\n" assert (
captured.out == f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_valid_date(): def test_check_valid_date():
@ -147,7 +157,10 @@ def test_check_suspicious_characters(capsys):
check.suspicious_characters(value, field_name) check.suspicious_characters(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Suspicious character ({field_name}): ˆt\n" assert (
captured.out
== f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}ˆt\n"
)
def test_check_valid_iso639_1_language(): def test_check_valid_iso639_1_language():
@ -178,7 +191,9 @@ def test_check_invalid_iso639_1_language(capsys):
check.language(value) check.language(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid ISO 639-1 language: {value}\n" assert (
captured.out == f"{Fore.RED}Invalid ISO 639-1 language: {Fore.RESET}{value}\n"
)
def test_check_invalid_iso639_3_language(capsys): def test_check_invalid_iso639_3_language(capsys):
@ -189,7 +204,9 @@ def test_check_invalid_iso639_3_language(capsys):
check.language(value) check.language(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid ISO 639-3 language: {value}\n" assert (
captured.out == f"{Fore.RED}Invalid ISO 639-3 language: {Fore.RESET}{value}\n"
)
def test_check_invalid_language(capsys): def test_check_invalid_language(capsys):
@ -200,7 +217,7 @@ def test_check_invalid_language(capsys):
check.language(value) check.language(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid language: {value}\n" assert captured.out == f"{Fore.RED}Invalid language: {Fore.RESET}{value}\n"
def test_check_invalid_agrovoc(capsys): def test_check_invalid_agrovoc(capsys):
@ -212,7 +229,10 @@ def test_check_invalid_agrovoc(capsys):
check.agrovoc(value, field_name) check.agrovoc(value, field_name)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Invalid AGROVOC ({field_name}): {value}\n" assert (
captured.out
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}\n"
)
def test_check_valid_agrovoc(): def test_check_valid_agrovoc():
@ -234,7 +254,10 @@ def test_check_uncommon_filename_extension(capsys):
check.filename_extension(value) check.filename_extension(value)
captured = capsys.readouterr() captured = capsys.readouterr()
assert captured.out == f"Filename with uncommon extension: {value}\n" assert (
captured.out
== f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}\n"
)
def test_check_common_filename_extension(): def test_check_common_filename_extension():
@ -262,7 +285,7 @@ def test_check_incorrect_iso_639_1_language(capsys):
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
captured.out captured.out
== f"Possibly incorrect language {language} (detected en): {title}\n" == f"{Fore.YELLOW}Possibly incorrect language {language} (detected en): {Fore.RESET}{title}\n"
) )
@ -281,7 +304,7 @@ def test_check_incorrect_iso_639_3_language(capsys):
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
captured.out captured.out
== f"Possibly incorrect language {language} (detected eng): {title}\n" == f"{Fore.YELLOW}Possibly incorrect language {language} (detected eng): {Fore.RESET}{title}\n"
) )