mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-01-24 11:13:22 +01:00
Alan Orth
8435ee242d
Works decently well assuming the title, abstract, and citation fields are an accurate representation of the language as identified by the language field. Handles ISO 639-1 (alpha 2) and ISO 639-3 (alpha 3) values seamlessly. This includes updated pipenv environment, test data, pytest tests for both correct and incorrect ISO 639-1 and ISO 639-3 languages, and a new command line option "-e".
96 lines
3.5 KiB
Python
96 lines
3.5 KiB
Python
import pandas as pd
|
|
|
|
|
|
def correct_language(row):
    """Compare an item's language field with the language detected in its text.

    The values of the title, abstract, and citation fields are concatenated
    and classified with langid. The detected language is then compared with
    the value of the item's language field (any label containing "language",
    e.g. dc.language.iso), which may be an ISO 639-1 (alpha 2) or an
    ISO 639-3 (alpha 3) code.

    Args:
        row: the item to check. It is indexed by field name (``row[label]``)
            and exposes its labels as ``row.axes[0]`` -- presumably one
            column of the transposed DataFrame, i.e. a pandas Series.

    Returns:
        The value of the language field if it matches the detected language.
        None if a mismatch was found (a "Possibly incorrect language"
        message is printed), or if the item was skipped because its language
        field is blank, holds multiple values separated by "||", is not a
        valid two- or three-letter code, or because the item has no title,
        abstract, or citation text to analyze.
    """

    import re

    # Initialize these before the loop so that they can be set inside it and
    # still be read afterwards.
    language = ""
    sample_strings = []
    title = None

    # Iterate over the labels of the current row's values. Before we transposed
    # the DataFrame these were the columns in the CSV, ie dc.title and dc.type.
    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        if re.match(r"^.*?language.*$", label) is not None:
            # Skip items with multiple language values (separated by "||")
            if "||" in value:
                return None

            language = value

        # The title is kept separately because it identifies the item in the
        # message printed below. str() keeps the join below from failing on
        # cell values that are not strings.
        if re.match(r"^.*?title.*$", label) is not None:
            title = value
            sample_strings.append(str(value))

        if re.match(r"^.*?abstract.*$", label) is not None:
            sample_strings.append(str(value))

        if re.match(r"^.*?citation.*$", label) is not None:
            sample_strings.append(str(value))

    # Only values shaped like "es" (ISO 639-1) or "spa" (ISO 639-3) can be
    # checked. Blank values and anything else, like "Span", are skipped.
    if len(language) not in (2, 3):
        return None

    # Concatenate all sample strings into one string
    sample_text = " ".join(sample_strings)

    # Without any text there is nothing to detect a language from: whatever
    # the classifier answered would not reflect the item, and could trigger a
    # bogus warning. Skip the item instead.
    if not sample_text.strip():
        return None

    # Imported just before first use so that items skipped above never need
    # the third-party language libraries.
    from pycountry import languages
    import langid

    # Make sure the language is a valid ISO 639-1/639-3 code before
    # proceeding with language prediction.
    if len(language) == 2:
        known_language = languages.get(alpha_2=language)
    else:
        known_language = languages.get(alpha_3=language)

    if not known_language:
        return None

    # Restrict the langid detection space to reduce false positives
    langid.set_languages(
        ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "vi", "zh"]
    )
    langid_classification = langid.classify(sample_text)

    # langid returns an ISO 639-1 (alpha 2) code, but the item's language
    # field might be ISO 639-3 (alpha 3). Use a pycountry Language object to
    # get the detected language in the same representation as the input file
    # so that the comparison and the message both match its format.
    detected_language = languages.get(alpha_2=langid_classification[0])
    if len(language) == 2:
        detected_code = detected_language.alpha_2
    else:
        detected_code = detected_language.alpha_3

    if language != detected_code:
        print(
            f"Possibly incorrect language {language} (detected {detected_code}): {title}"
        )
        return None

    return language