From bf876a046a9542e8e655d3151c309a6bf064daaa Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 1 Aug 2019 23:51:58 +0300 Subject: [PATCH] Rework AGROVOC validation AGROVOC validation is now disabled by default, but can be enabled on a field-by-field basis. For example, countries and regions are also present in AGROVOC. Fields with these values can be enabled using the new `--agrovoc-fields` option. I reworked the script output to show the field name when printing an invalid term so that the user knows in which field the term is. --- csv_metadata_quality/app.py | 9 ++++++--- csv_metadata_quality/check.py | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 4c27ecd..953f961 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -7,6 +7,7 @@ import re def parse_args(argv): parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.') + parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country') parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8')) parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8')) parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true') @@ -48,9 +49,11 @@ def run(argv): df[column] = df[column].apply(fix.duplicates) # Check: invalid AGROVOC subject - match = re.match(r'.*?dc\.subject.*$', column) - if match is not None: - df[column] = df[column].apply(check.agrovoc) + if args.agrovoc_fields: + # Identify fields the user wants to validate against AGROVOC + for field in args.agrovoc_fields.split(','): + if column == field: + df[column] = df[column].apply(check.agrovoc, field_name=column) # Check: invalid language match = re.match(r'^.*?language.*$', column) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 899bf84..f057bc9 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -186,9 +186,13 @@ def language(field): return field -def agrovoc(field): +def agrovoc(field, field_name): """Check subject terms against AGROVOC REST API. + Function constructor expects the field as well as the field name because + many fields can now be validated against AGROVOC and we want to be able + to inform the user in which field the invalid term is. + Logic copied from agrovoc-lookup.py. See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py @@ -235,6 +239,6 @@ def agrovoc(field): # check if there are any results if len(data['results']) == 0: - print(f'Invalid AGROVOC subject: {value}') + print(f'Invalid AGROVOC ({field_name}): {value}') return field