Add support for dropping invalid AGROVOC terms

Validation itself still requires --agrovoc-fields <field.name>; add -d
(--drop-invalid-agrovoc) to also drop the terms that fail validation.
This commit is contained in:
Alan Orth 2021-12-23 12:43:10 +02:00
parent 7763a021c5
commit a7727b8431
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 32 additions and 5 deletions

View File

@ -21,6 +21,12 @@ def parse_args(argv):
"-a", "-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country", help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
) )
parser.add_argument(
"--drop-invalid-agrovoc",
"-d",
help="After validating metadata values against AGROVOC, drop invalid values.",
action="store_true",
)
parser.add_argument( parser.add_argument(
"--experimental-checks", "--experimental-checks",
"-e", "-e",
@ -123,12 +129,14 @@ def run(argv):
# Fix: duplicate metadata values # Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column) df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject # Check: invalid AGROVOC subject and optionally drop them
if args.agrovoc_fields: if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC # Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","): for field in args.agrovoc_fields.split(","):
if column == field: if column == field:
df[column].apply(check.agrovoc, field_name=column) df[column] = df[column].apply(
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
)
# Check: invalid language # Check: invalid language
match = re.match(r"^.*?language.*$", column) match = re.match(r"^.*?language.*$", column)

View File

@ -188,7 +188,7 @@ def language(field):
return return
def agrovoc(field, field_name): def agrovoc(field, field_name, drop):
"""Check subject terms against AGROVOC REST API. """Check subject terms against AGROVOC REST API.
Function constructor expects the field as well as the field name because Function constructor expects the field as well as the field name because
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
# prune old cache entries # prune old cache entries
requests_cache.remove_expired_responses() requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
# Try to split multi-value field on "||" separator # Try to split multi-value field on "||" separator
for value in field.split("||"): for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
# check if there are any results # check if there are any results
if len(data["results"]) == 0: if len(data["results"]) == 0:
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") if drop:
print(
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
else:
print(
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
return # value is invalid AGROVOC, but we are not dropping
values.append(value)
else:
# value is valid AGROVOC so save it
values.append(value)
# Create a new field consisting of all values joined with "||"
new_field = "||".join(values)
return new_field
def filename_extension(field): def filename_extension(field):