Add support for dropping invalid AGROVOC terms

Validation itself is enabled with --agrovoc-fields <field.name>; pass -d
(--drop-invalid-agrovoc) as well to drop the terms that fail validation.
This commit is contained in:
Alan Orth 2021-12-23 12:43:10 +02:00
parent 7763a021c5
commit a7727b8431
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 32 additions and 5 deletions

View File

@ -21,6 +21,12 @@ def parse_args(argv):
"-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
)
parser.add_argument(
"--drop-invalid-agrovoc",
"-d",
help="After validating metadata values against AGROVOC, drop invalid values.",
action="store_true",
)
parser.add_argument(
"--experimental-checks",
"-e",
@ -123,12 +129,14 @@ def run(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject
# Check: invalid AGROVOC subject and optionally drop them
if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","):
if column == field:
df[column].apply(check.agrovoc, field_name=column)
df[column] = df[column].apply(
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
)
# Check: invalid language
match = re.match(r"^.*?language.*$", column)

View File

@ -188,7 +188,7 @@ def language(field):
return
def agrovoc(field, field_name):
def agrovoc(field, field_name, drop):
"""Check subject terms against AGROVOC REST API.
Function constructor expects the field as well as the field name because
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
# prune old cache entries
requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
# Try to split multi-value field on "||" separator
for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
# check if there are any results
if len(data["results"]) == 0:
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
if drop:
print(
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
else:
print(
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
return
# value is invalid AGROVOC, but we are not dropping
values.append(value)
else:
# value is valid AGROVOC so save it
values.append(value)
# Create a new field consisting of all values joined with "||"
new_field = "||".join(values)
return new_field
def filename_extension(field):