1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-09 22:56:01 +02:00

Experimental language detection using langid

Works decently well assuming the title, abstract, and citation fields
are an accurate representation of the language as identified by the
language field. Handles ISO 639-1 (alpha 2) and ISO 639-3 (alpha 3)
values seamlessly.

This includes an updated pipenv environment, test data, pytest tests
for both correct and incorrect ISO 639-1 and ISO 639-3 languages,
and a new command line option "-e".
This commit is contained in:
2019-09-24 18:55:05 +03:00
parent 7ac1c6f554
commit 8435ee242d
5 changed files with 186 additions and 0 deletions

View File

@ -6,6 +6,7 @@ import sys
import pandas as pd
import csv_metadata_quality.check as check
import csv_metadata_quality.experimental as experimental
import csv_metadata_quality.fix as fix
from csv_metadata_quality.version import VERSION
@ -17,6 +18,11 @@ def parse_args(argv):
"-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country",
)
parser.add_argument(
"--experimental-checks",
"-e",
help="Enable experimental checks like language detection", action="store_true"
)
parser.add_argument(
"--input-file",
"-i",
@ -137,6 +143,24 @@ def run(argv):
if column == "filename":
df[column] = df[column].apply(check.filename_extension)
##
# Perform some checks on rows so we can consider items as a whole rather
# than simply on a field-by-field basis. This allows us to check whether
# the language used in the title and abstract matches the language indi-
# cated in the language field, for example.
#
# This is slower and apparently frowned upon in the Pandas community be-
# cause it requires iterating over rows rather than using apply over a
# column. For now it will have to do.
##
if args.experimental_checks:
# Transpose the DataFrame so we can consider each row as a column
df_transposed = df.T
for column in df_transposed.columns:
experimental.correct_language(df_transposed[column])
# Write
df.to_csv(args.output_file, index=False)