From 8435ee242d8fc65683991bb52da29b869a208ff8 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 24 Sep 2019 18:55:05 +0300 Subject: [PATCH] Experimental language detection using langid Works decently well assuming the title, abstract, and citation fields are an accurate representation of the language as identified by the language field. Handles ISO 639-1 (alpha 2) and ISO 639-3 (alpha 3) values seamlessly. This includes updated pipenv environment, test data, pytest tests for both correct and incorrect ISO 639-1 and ISO 639-3 languages, and a new command line option "-e". --- Pipfile | 1 + csv_metadata_quality/app.py | 24 +++++++ csv_metadata_quality/experimental.py | 95 ++++++++++++++++++++++++++++ data/test.csv | 2 + tests/test_check.py | 64 +++++++++++++++++++ 5 files changed, 186 insertions(+) create mode 100644 csv_metadata_quality/experimental.py diff --git a/Pipfile b/Pipfile index 8d4d1d3..ccded6b 100644 --- a/Pipfile +++ b/Pipfile @@ -20,6 +20,7 @@ requests = "*" requests-cache = "*" pycountry = "*" csv-metadata-quality = {editable = true,path = "."} +langid = "*" [requires] python_version = "3.7" diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index f8afdeb..9d8a2ce 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -6,6 +6,7 @@ import sys import pandas as pd import csv_metadata_quality.check as check +import csv_metadata_quality.experimental as experimental import csv_metadata_quality.fix as fix from csv_metadata_quality.version import VERSION @@ -17,6 +18,11 @@ def parse_args(argv): "-a", help="Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country", ) + parser.add_argument( + "--experimental-checks", + "-e", + help="Enable experimental checks like language detection", action="store_true" + ) parser.add_argument( "--input-file", "-i", @@ -137,6 +143,24 @@ def run(argv): if column == "filename": df[column] = df[column].apply(check.filename_extension) + ## + 
# Perform some checks on rows so we can consider items as a whole rather + # than simply on a field-by-field basis. This allows us to check whether + # the language used in the title and abstract matches the language indi- + # cated in the language field, for example. + # + # This is slower and apparently frowned upon in the Pandas community be- + # cause it requires iterating over rows rather than using apply over a + # column. For now it will have to do. + ## + + if args.experimental_checks: + # Transpose the DataFrame so we can consider each row as a column + df_transposed = df.T + + for column in df_transposed.columns: + experimental.correct_language(df_transposed[column]) + # Write df.to_csv(args.output_file, index=False) diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py new file mode 100644 index 0000000..de6c857 --- /dev/null +++ b/csv_metadata_quality/experimental.py @@ -0,0 +1,95 @@ +import pandas as pd + + +def correct_language(row): + """Analyze the text used in the title, abstract, and citation fields to pre- + dict the language being used and compare it with the item's dc.language.iso + field. + + Function prints an error if the language field does not match the detected + language and returns the value in the language field if it does match. + """ + + from pycountry import languages + import langid + import re + + # Initialize some variables at function scope so that we can set them in the + # loop scope below and still be able to access them afterwards. + language = "" + sample_strings = list() + title = None + + # Iterate over the labels of the current row's values. Before we transposed + # the DataFrame these were the columns in the CSV, i.e. dc.title and dc.type. 
+ for label in row.axes[0]: + # Skip fields with missing values + if pd.isna(row[label]): + continue + + # Check if current row has multiple language values (separated by "||") + match = re.match(r"^.*?language.*$", label) + if match is not None: + # Skip fields with multiple language values + if "||" in row[label]: + return + + language = row[label] + + # Extract title if it is present + match = re.match(r"^.*?title.*$", label) + if match is not None: + title = row[label] + # Append title to sample strings + sample_strings.append(row[label]) + + # Extract abstract if it is present + match = re.match(r"^.*?abstract.*$", label) + if match is not None: + sample_strings.append(row[label]) + + # Extract citation if it is present + match = re.match(r"^.*?citation.*$", label) + if match is not None: + sample_strings.append(row[label]) + + # Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction + if language != "": + # Check language value like "es" + if len(language) == 2: + if not languages.get(alpha_2=language): + return + # Check language value like "spa" + elif len(language) == 3: + if not languages.get(alpha_3=language): + return + # Language value is something else like "Span", do not proceed + else: + return + # Language is blank, do not proceed + else: + return + + # Concatenate all sample strings into one string + sample_text = " ".join(sample_strings) + + # Restrict the langid detection space to reduce false positives + langid.set_languages( + ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "vi", "zh"] + ) + langid_classification = langid.classify(sample_text) + + # langid returns an ISO 639-1 (alpha 2) representation of the detected language, but the current item's language field might be ISO 639-3 (alpha 3) so we should use a pycountry Language object to compare both representations and give appropriate error messages that match the format used in the input file. 
+ detected_language = languages.get(alpha_2=langid_classification[0]) + if len(language) == 2 and language != detected_language.alpha_2: + print( + f"Possibly incorrect language {language} (detected {detected_language.alpha_2}): {title}" + ) + + elif len(language) == 3 and language != detected_language.alpha_3: + print( + f"Possibly incorrect language {language} (detected {detected_language.alpha_3}): {title}" + ) + + else: + return language diff --git a/data/test.csv b/data/test.csv index b197da6..39cf9e7 100644 --- a/data/test.csv +++ b/data/test.csv @@ -24,3 +24,5 @@ Invalid country,2019-08-01,,,,,KENYAA, Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-­92-­9043-­823-­6,,,, "Missing space,after comma",2019-08-27,,,,,, +Incorrect ISO 639-1 language,2019-09-26,,,es,,, +Incorrect ISO 639-3 language,2019-09-26,,,spa,,, diff --git a/tests/test_check.py b/tests/test_check.py index 1bdc414..0ad968f 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -1,4 +1,6 @@ import csv_metadata_quality.check as check +import csv_metadata_quality.experimental as experimental +import pandas as pd def test_check_invalid_issn(capsys): @@ -223,3 +225,65 @@ def test_check_common_filename_extension(): result = check.filename_extension(value) assert result == value + + +def test_check_incorrect_iso_639_1_language(capsys): + '''Test incorrect ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'es' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + experimental.correct_language(series) + + captured = capsys.readouterr() + assert captured.out == f'Possibly incorrect language {language} (detected en): 
{title}\n' + + +def test_check_incorrect_iso_639_3_language(capsys): + '''Test incorrect ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'spa' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + experimental.correct_language(series) + + captured = capsys.readouterr() + assert captured.out == f'Possibly incorrect language {language} (detected eng): {title}\n' + + +def test_check_correct_iso_639_1_language(): + '''Test correct ISO 639-1 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'en' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + result = experimental.correct_language(series) + + assert result == language + + +def test_check_correct_iso_639_3_language(): + '''Test correct ISO 639-3 language, as determined by comparing the item's language field with the actual language predicted in the item's title.''' + + title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle' + language = 'eng' + + # Create a dictionary to mimic Pandas series + row = {'dc.title': title, 'dc.language.iso': language} + series = pd.Series(row) + + result = experimental.correct_language(series) + + assert result == language