mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Rework AGROVOC validation
AGROVOC validation is now disabled by default, but can be enabled on a field-by-field basis. For example, countries and regions are also present in AGROVOC, so fields containing those values can be validated as well by listing them in the new `--agrovoc-fields` option. I also reworked the script output to show the field name when printing an invalid term, so that the user knows which field the term is in.
This commit is contained in:
parent
576b3a3638
commit
bf876a046a
@ -7,6 +7,7 @@ import re
|
|||||||
|
|
||||||
def parse_args(argv):
|
def parse_args(argv):
|
||||||
parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.')
|
parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.')
|
||||||
|
parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country')
|
||||||
parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8'))
|
parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8'))
|
||||||
parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8'))
|
parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8'))
|
||||||
parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true')
|
parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true')
|
||||||
@ -48,9 +49,11 @@ def run(argv):
|
|||||||
df[column] = df[column].apply(fix.duplicates)
|
df[column] = df[column].apply(fix.duplicates)
|
||||||
|
|
||||||
# Check: invalid AGROVOC subject
|
# Check: invalid AGROVOC subject
|
||||||
match = re.match(r'.*?dc\.subject.*$', column)
|
if args.agrovoc_fields:
|
||||||
if match is not None:
|
# Identify fields the user wants to validate against AGROVOC
|
||||||
df[column] = df[column].apply(check.agrovoc)
|
for field in args.agrovoc_fields.split(','):
|
||||||
|
if column == field:
|
||||||
|
df[column] = df[column].apply(check.agrovoc, field_name=column)
|
||||||
|
|
||||||
# Check: invalid language
|
# Check: invalid language
|
||||||
match = re.match(r'^.*?language.*$', column)
|
match = re.match(r'^.*?language.*$', column)
|
||||||
|
@ -186,9 +186,13 @@ def language(field):
|
|||||||
return field
|
return field
|
||||||
|
|
||||||
|
|
||||||
def agrovoc(field):
|
def agrovoc(field, field_name):
|
||||||
"""Check subject terms against AGROVOC REST API.
|
"""Check subject terms against AGROVOC REST API.
|
||||||
|
|
||||||
|
Function constructor expects the field as well as the field name because
|
||||||
|
many fields can now be validated against AGROVOC and we want to be able
|
||||||
|
to inform the user in which field the invalid term is.
|
||||||
|
|
||||||
Logic copied from agrovoc-lookup.py.
|
Logic copied from agrovoc-lookup.py.
|
||||||
|
|
||||||
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
|
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
|
||||||
@ -235,6 +239,6 @@ def agrovoc(field):
|
|||||||
|
|
||||||
# check if there are any results
|
# check if there are any results
|
||||||
if len(data['results']) == 0:
|
if len(data['results']) == 0:
|
||||||
print(f'Invalid AGROVOC subject: {value}')
|
print(f'Invalid AGROVOC ({field_name}): {value}')
|
||||||
|
|
||||||
return field
|
return field
|
||||||
|
Loading…
Reference in New Issue
Block a user