1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-16 17:43:54 +02:00

Improve exclude function

When a user explicitly requests that a field be excluded with -x, we
skip that field in most checks. Until now, that did not include the
item-based checks, which use a transposed dataframe, because we don't
know the metadata field names (labels) until we iterate over them.

Now the excludes are respected for item-based checks.
This commit is contained in:
2022-09-02 15:59:22 +03:00
parent 1f76247353
commit 040e56fc76
6 changed files with 54 additions and 24 deletions

@ -8,7 +8,7 @@ from colorama import Fore
from pycountry import languages
def correct_language(row):
def correct_language(row, exclude):
"""Analyze the text used in the title, abstract, and citation fields to pre-
dict the language being used and compare it with the item's dc.language.iso
field.
@ -39,7 +39,8 @@ def correct_language(row):
language = row[label]
# Extract title if it is present
# Extract title if it is present (note that we don't allow excluding
# the title here because it complicates things).
match = re.match(r"^.*?title.*$", label)
if match is not None:
title = row[label]
@ -48,12 +49,12 @@ def correct_language(row):
# Extract abstract if it is present
match = re.match(r"^.*?abstract.*$", label)
if match is not None:
if match is not None and label not in exclude:
sample_strings.append(row[label])
# Extract citation if it is present
match = re.match(r"^.*?[cC]itation.*$", label)
if match is not None:
if match is not None and label not in exclude:
sample_strings.append(row[label])
# Make sure language is not blank and is valid ISO 639-1/639-3 before proceeding with language prediction