mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 11:42:20 +01:00
Add support for dropping invalid AGROVOC terms
Requires --agrovoc-fields <field.name> to perform the actual validation, and -d (--drop-invalid-agrovoc) to drop the values that fail it.
This commit is contained in:
parent
7763a021c5
commit
a7727b8431
@ -21,6 +21,12 @@ def parse_args(argv):
|
||||
"-a",
|
||||
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--drop-invalid-agrovoc",
|
||||
"-d",
|
||||
help="After validating metadata values against AGROVOC, drop invalid values.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--experimental-checks",
|
||||
"-e",
|
||||
@ -123,12 +129,14 @@ def run(argv):
|
||||
# Fix: duplicate metadata values
|
||||
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
||||
|
||||
# Check: invalid AGROVOC subject
|
||||
# Check: invalid AGROVOC subject and optionally drop them
|
||||
if args.agrovoc_fields:
|
||||
# Identify fields the user wants to validate against AGROVOC
|
||||
for field in args.agrovoc_fields.split(","):
|
||||
if column == field:
|
||||
df[column].apply(check.agrovoc, field_name=column)
|
||||
df[column] = df[column].apply(
|
||||
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
|
||||
)
|
||||
|
||||
# Check: invalid language
|
||||
match = re.match(r"^.*?language.*$", column)
|
||||
|
@ -188,7 +188,7 @@ def language(field):
|
||||
return
|
||||
|
||||
|
||||
def agrovoc(field, field_name):
|
||||
def agrovoc(field, field_name, drop):
|
||||
"""Check subject terms against AGROVOC REST API.
|
||||
|
||||
Function constructor expects the field as well as the field name because
|
||||
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
|
||||
# prune old cache entries
|
||||
requests_cache.remove_expired_responses()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
|
||||
|
||||
# check if there are any results
|
||||
if len(data["results"]) == 0:
|
||||
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
|
||||
if drop:
|
||||
print(
|
||||
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||
)
|
||||
|
||||
return
|
||||
# value is invalid AGROVOC, but we are not dropping
|
||||
values.append(value)
|
||||
else:
|
||||
# value is valid AGROVOC so save it
|
||||
values.append(value)
|
||||
|
||||
# Create a new field consisting of all values joined with "||"
|
||||
new_field = "||".join(values)
|
||||
|
||||
return new_field
|
||||
|
||||
|
||||
def filename_extension(field):
|
||||
|
Loading…
Reference in New Issue
Block a user