1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-06-09 13:25:08 +02:00

Rework AGROVOC validation

AGROVOC validation is now disabled by default, but can be enabled
on a field-by-field basis. For example, countries and regions are
also present in AGROVOC, so validation of fields containing these
values can be enabled using the new `--agrovoc-fields` option.

I reworked the script output to show the field name when printing
an invalid term so that the user knows which field contains the term.
This commit is contained in:
Alan Orth 2019-08-01 23:51:58 +03:00
parent 576b3a3638
commit bf876a046a
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 12 additions and 5 deletions

View File

@ -7,6 +7,7 @@ import re
def parse_args(argv):
parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.')
parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country')
parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8'))
parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8'))
parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true')
@ -48,9 +49,11 @@ def run(argv):
df[column] = df[column].apply(fix.duplicates)
# Check: invalid AGROVOC subject
match = re.match(r'.*?dc\.subject.*$', column)
if match is not None:
df[column] = df[column].apply(check.agrovoc)
if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(','):
if column == field:
df[column] = df[column].apply(check.agrovoc, field_name=column)
# Check: invalid language
match = re.match(r'^.*?language.*$', column)

View File

@ -186,9 +186,13 @@ def language(field):
return field
def agrovoc(field):
def agrovoc(field, field_name):
"""Check subject terms against AGROVOC REST API.
Function constructor expects the field as well as the field name because
many fields can now be validated against AGROVOC and we want to be able
to inform the user in which field the invalid term is.
Logic copied from agrovoc-lookup.py.
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
@ -235,6 +239,6 @@ def agrovoc(field):
# check if there are any results
if len(data['results']) == 0:
print(f'Invalid AGROVOC subject: {value}')
print(f'Invalid AGROVOC ({field_name}): {value}')
return field