mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 04:02:19 +01:00
Rework AGROVOC validation
AGROVOC validation is now disabled by default, but can be enabled on a field-by-field basis using the new `--agrovoc-fields` option. This is useful because AGROVOC is not limited to subject terms: countries and regions, for example, are also present in AGROVOC, so validation can be enabled for fields holding those values as well. I also reworked the script output to show the field name when printing an invalid term, so that the user knows which field the term is in.
This commit is contained in:
parent
576b3a3638
commit
bf876a046a
@ -7,6 +7,7 @@ import re
|
||||
|
||||
def parse_args(argv):
|
||||
parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.')
|
||||
parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country')
|
||||
parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8'))
|
||||
parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8'))
|
||||
parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true')
|
||||
@ -48,9 +49,11 @@ def run(argv):
|
||||
df[column] = df[column].apply(fix.duplicates)
|
||||
|
||||
# Check: invalid AGROVOC subject
|
||||
match = re.match(r'.*?dc\.subject.*$', column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.agrovoc)
|
||||
if args.agrovoc_fields:
|
||||
# Identify fields the user wants to validate against AGROVOC
|
||||
for field in args.agrovoc_fields.split(','):
|
||||
if column == field:
|
||||
df[column] = df[column].apply(check.agrovoc, field_name=column)
|
||||
|
||||
# Check: invalid language
|
||||
match = re.match(r'^.*?language.*$', column)
|
||||
|
@ -186,9 +186,13 @@ def language(field):
|
||||
return field
|
||||
|
||||
|
||||
def agrovoc(field):
|
||||
def agrovoc(field, field_name):
|
||||
"""Check subject terms against AGROVOC REST API.
|
||||
|
||||
Function constructor expects the field as well as the field name because
|
||||
many fields can now be validated against AGROVOC and we want to be able
|
||||
to inform the user in which field the invalid term is.
|
||||
|
||||
Logic copied from agrovoc-lookup.py.
|
||||
|
||||
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
|
||||
@ -235,6 +239,6 @@ def agrovoc(field):
|
||||
|
||||
# check if there are any results
|
||||
if len(data['results']) == 0:
|
||||
print(f'Invalid AGROVOC subject: {value}')
|
||||
print(f'Invalid AGROVOC ({field_name}): {value}')
|
||||
|
||||
return field
|
||||
|
Loading…
Reference in New Issue
Block a user