mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-08 06:06:00 +02:00
Experimental language detection using langid
Works decently well, assuming the title, abstract, and citation fields are an accurate representation of the language identified by the language field. Handles ISO 639-1 (alpha 2) and ISO 639-3 (alpha 3) values seamlessly. This commit includes an updated pipenv environment, test data, pytest tests for both correct and incorrect ISO 639-1 and ISO 639-3 languages, and a new command line option "-e".
This commit is contained in:
@ -1,4 +1,6 @@
|
||||
import csv_metadata_quality.check as check
|
||||
import csv_metadata_quality.experimental as experimental
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def test_check_invalid_issn(capsys):
|
||||
@ -223,3 +225,65 @@ def test_check_common_filename_extension():
|
||||
result = check.filename_extension(value)
|
||||
|
||||
assert result == value
|
||||
|
||||
|
||||
def test_check_incorrect_iso_639_1_language(capsys):
    '''An English title tagged with the ISO 639-1 code "es" should make
    correct_language() print a "Possibly incorrect language" message that
    names the detected language as "en".'''

    title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle'
    language = 'es'

    # Build the pandas Series straight from the field/value mapping
    series = pd.Series({'dc.title': title, 'dc.language.iso': language})

    experimental.correct_language(series)

    # Everything the call wrote to stdout must be exactly the warning line
    expected = f'Possibly incorrect language {language} (detected en): {title}\n'
    assert capsys.readouterr().out == expected
|
||||
|
||||
|
||||
def test_check_incorrect_iso_639_3_language(capsys):
    '''An English title tagged with the ISO 639-3 code "spa" should make
    correct_language() print a "Possibly incorrect language" message that
    names the detected language as "eng".'''

    title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle'
    language = 'spa'

    # Build the pandas Series straight from the field/value mapping
    series = pd.Series({'dc.title': title, 'dc.language.iso': language})

    experimental.correct_language(series)

    # Everything the call wrote to stdout must be exactly the warning line
    expected = f'Possibly incorrect language {language} (detected eng): {title}\n'
    assert capsys.readouterr().out == expected
|
||||
|
||||
|
||||
def test_check_correct_iso_639_1_language():
    '''An English title tagged with the ISO 639-1 code "en" should make
    correct_language() return the language field's value.'''

    title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle'
    language = 'en'

    # Build the pandas Series straight from the field/value mapping
    series = pd.Series({'dc.title': title, 'dc.language.iso': language})

    assert experimental.correct_language(series) == language
|
||||
|
||||
|
||||
def test_check_correct_iso_639_3_language():
    '''An English title tagged with the ISO 639-3 code "eng" should make
    correct_language() return the language field's value.'''

    title = 'A randomised vaccine field trial in Kenya demonstrates protection against wildebeest-associated malignant catarrhal fever in cattle'
    language = 'eng'

    # Build the pandas Series straight from the field/value mapping
    series = pd.Series({'dc.title': title, 'dc.language.iso': language})

    assert experimental.correct_language(series) == language
|
||||
|
Reference in New Issue
Block a user