diff --git a/Pipfile b/Pipfile index 8d4d1d3..ccded6b 100644 --- a/Pipfile +++ b/Pipfile @@ -20,6 +20,7 @@ requests = "*" requests-cache = "*" pycountry = "*" csv-metadata-quality = {editable = true,path = "."} +langid = "*" [requires] python_version = "3.7" diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index f8afdeb..9d8a2ce 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -6,6 +6,7 @@ import sys import pandas as pd import csv_metadata_quality.check as check +import csv_metadata_quality.experimental as experimental import csv_metadata_quality.fix as fix from csv_metadata_quality.version import VERSION @@ -17,6 +18,11 @@ def parse_args(argv): "-a", help="Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country", ) + parser.add_argument( + "--experimental-checks", + "-e", + help="Enable experimental checks like language detection", action="store_true" + ) parser.add_argument( "--input-file", "-i", @@ -137,6 +143,24 @@ def run(argv): if column == "filename": df[column] = df[column].apply(check.filename_extension) + ## + # Perform some checks on rows so we can consider items as a whole rather + # than simply on a field-by-field basis. This allows us to check whether + # the language used in the title and abstract matches the language indi- + # cated in the language field, for example. + # + # This is slower and apparently frowned upon in the Pandas community be- + # cause it requires iterating over rows rather than using apply over a + # column. For now it will have to do. 
+ ## + + if args.experimental_checks: + # Transpose the DataFrame so we can consider each row as a column + df_transposed = df.T + + for column in df_transposed.columns: + experimental.correct_language(df_transposed[column]) + # Write df.to_csv(args.output_file, index=False) diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py new file mode 100644 index 0000000..de6c857 --- /dev/null +++ b/csv_metadata_quality/experimental.py @@ -0,0 +1,95 @@ +import pandas as pd + + +def correct_language(row): + """Analyze the text used in the title, abstract, and citation fields to pre- + dict the language being used and compare it with the item's dc.language.iso + field. + + Function prints an error if the language field does not match the detected + language and returns the value in the language field if it does match. + """ + + from pycountry import languages + import langid + import re + + # Initialize some variables at function scope so that we can set them in the + # loop below and still be able to access them afterwards. + language = "" + sample_strings = list() + title = None + + # Iterate over the labels of the current row's values. Before we transposed + # the DataFrame these were the columns in the CSV, e.g. dc.title and dc.type. 
+ for label in row.axes[0]: + # Skip fields with missing values + if pd.isna(row[label]): + continue + + # Check if current row has multiple language values (separated by "||") + match = re.match(r"^.*?language.*$", label) + if match is not None: + # Skip fields with multiple language values + if "||" in row[label]: + return + + language = row[label] + + # Extract title if it is present + match = re.match(r"^.*?title.*$", label) + if match is not None: + title = row[label] + # Append title to sample strings + sample_strings.append(row[label]) + + # Extract abstract if it is present + match = re.match(r"^.*?abstract.*$", label) + if match is not None: + sample_strings.append(row[label]) + + # Extract citation if it is present + match = re.match(r"^.*?citation.*$", label) + if match is not None: + sample_strings.append(row[label]) + + # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction + if language != "": + # Check language value like "es" + if len(language) == 2: + if not languages.get(alpha_2=language): + return + # Check language value like "spa" + elif len(language) == 3: + if not languages.get(alpha_3=language): + return + # Language value is something else like "Span", do not proceed + else: + return + # Language is blank, do not proceed + else: + return + + # Concatenate all sample strings into one string + sample_text = " ".join(sample_strings) + + # Restrict the langid detection space to reduce false positives + langid.set_languages( + ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "vi", "zh"] + ) + langid_classification = langid.classify(sample_text) + + # langid returns an ISO 639-1 (alpha 2) representation of the detected language, but the current item's language field might be ISO 639-3 (alpha 3) so we should use a pycountry Language object to compare both representations and give appropriate error messages that match the format used in the input file. 
+ detected_language = languages.get(alpha_2=langid_classification[0]) + if len(language) == 2 and language != detected_language.alpha_2: + print( + f"Possibly incorrect language {language} (detected {detected_language.alpha_2}): {title}" + ) + + elif len(language) == 3 and language != detected_language.alpha_3: + print( + f"Possibly incorrect language {language} (detected {detected_language.alpha_3}): {title}" + ) + + else: + return language diff --git a/data/test.csv b/data/test.csv index b197da6..39cf9e7 100644 --- a/data/test.csv +++ b/data/test.csv @@ -24,3 +24,5 @@ Invalid country,2019-08-01,,,,,KENYAA, Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,, "Missing space,after comma",2019-08-27,,,,,, +Incorrect ISO 639-1 language,2019-09-26,,,es,,, +Incorrect ISO 639-3 language,2019-09-26,,,spa,,, diff --git a/tests/test_check.py b/tests/test_check.py index 1bdc414..0ad968f 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -1,4 +1,6 @@ import csv_metadata_quality.check as check +import csv_metadata_quality.experimental as experimental +import pandas as pd def test_check_invalid_issn(capsys): @@ -223,3 +225,65 @@ def test_check_common_filename_extension(): result = check.filename_extension(value) assert result == value + + +def test_check_incorrect_iso_639_1_language(capsys): + '''Test incorrect ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'es' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + experimental.correct_language(series) + + captured = capsys.readouterr() + assert captured.out == f'Possibly incorrect language {language} (detected en): 
{title}\n' + + +def test_check_incorrect_iso_639_3_language(capsys): + '''Test incorrect ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'spa' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + experimental.correct_language(series) + + captured = capsys.readouterr() + assert captured.out == f'Possibly incorrect language {language} (detected eng): {title}\n' + + +def test_check_correct_iso_639_1_language(): + '''Test correct ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'en' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + result = experimental.correct_language(series) + + assert result == language + + +def test_check_correct_iso_639_3_language(): + '''Test correct ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'eng' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + result = experimental.correct_language(series) + + assert result == language