mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-21 19:52:18 +01:00
Add support for dropping invalid AGROVOC terms
Requires --agrovoc-fields <field.name> to perform the actual validation, and -d (--drop-invalid-agrovoc) to drop the invalid terms.
This commit is contained in:
parent
7763a021c5
commit
a7727b8431
@ -21,6 +21,12 @@ def parse_args(argv):
|
|||||||
"-a",
|
"-a",
|
||||||
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--drop-invalid-agrovoc",
|
||||||
|
"-d",
|
||||||
|
help="After validating metadata values against AGROVOC, drop invalid values.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--experimental-checks",
|
"--experimental-checks",
|
||||||
"-e",
|
"-e",
|
||||||
@ -123,12 +129,14 @@ def run(argv):
|
|||||||
# Fix: duplicate metadata values
|
# Fix: duplicate metadata values
|
||||||
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
||||||
|
|
||||||
# Check: invalid AGROVOC subject
|
# Check: invalid AGROVOC subjects and optionally drop them
|
||||||
if args.agrovoc_fields:
|
if args.agrovoc_fields:
|
||||||
# Identify fields the user wants to validate against AGROVOC
|
# Identify fields the user wants to validate against AGROVOC
|
||||||
for field in args.agrovoc_fields.split(","):
|
for field in args.agrovoc_fields.split(","):
|
||||||
if column == field:
|
if column == field:
|
||||||
df[column].apply(check.agrovoc, field_name=column)
|
df[column] = df[column].apply(
|
||||||
|
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
|
||||||
|
)
|
||||||
|
|
||||||
# Check: invalid language
|
# Check: invalid language
|
||||||
match = re.match(r"^.*?language.*$", column)
|
match = re.match(r"^.*?language.*$", column)
|
||||||
|
@ -188,7 +188,7 @@ def language(field):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def agrovoc(field, field_name):
|
def agrovoc(field, field_name, drop):
|
||||||
"""Check subject terms against AGROVOC REST API.
|
"""Check subject terms against AGROVOC REST API.
|
||||||
|
|
||||||
Function constructor expects the field as well as the field name because
|
Function constructor expects the field as well as the field name because
|
||||||
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
|
|||||||
# prune old cache entries
|
# prune old cache entries
|
||||||
requests_cache.remove_expired_responses()
|
requests_cache.remove_expired_responses()
|
||||||
|
|
||||||
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
|
values = list()
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||||
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
|
|||||||
|
|
||||||
# check if there are any results
|
# check if there are any results
|
||||||
if len(data["results"]) == 0:
|
if len(data["results"]) == 0:
|
||||||
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
|
if drop:
|
||||||
|
print(
|
||||||
|
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||||
|
)
|
||||||
|
|
||||||
return
|
# value is invalid AGROVOC, but we are not dropping
|
||||||
|
values.append(value)
|
||||||
|
else:
|
||||||
|
# value is valid AGROVOC so save it
|
||||||
|
values.append(value)
|
||||||
|
|
||||||
|
# Create a new field consisting of all values joined with "||"
|
||||||
|
new_field = "||".join(values)
|
||||||
|
|
||||||
|
return new_field
|
||||||
|
|
||||||
|
|
||||||
def filename_extension(field):
|
def filename_extension(field):
|
||||||
|
Loading…
Reference in New Issue
Block a user