diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 4c27ecd..953f961 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -7,6 +7,7 @@ import re def parse_args(argv): parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.') + parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country') parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8')) parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8')) parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true') @@ -48,9 +49,11 @@ def run(argv): df[column] = df[column].apply(fix.duplicates) # Check: invalid AGROVOC subject - match = re.match(r'.*?dc\.subject.*$', column) - if match is not None: - df[column] = df[column].apply(check.agrovoc) + if args.agrovoc_fields: + # Identify fields the user wants to validate against AGROVOC + for field in args.agrovoc_fields.split(','): + if column == field: + df[column] = df[column].apply(check.agrovoc, field_name=column) # Check: invalid language match = re.match(r'^.*?language.*$', column) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 899bf84..f057bc9 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -186,9 +186,13 @@ def language(field): return field -def agrovoc(field): +def agrovoc(field, field_name): """Check subject terms against AGROVOC REST API. + The function takes the field as well as the field name because + many fields can now be validated against AGROVOC and we want to be able + to tell the user which field contains the invalid term. + Logic copied from agrovoc-lookup.py. 
See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py @@ -235,6 +239,6 @@ def agrovoc(field): # check if there are any results if len(data['results']) == 0: - print(f'Invalid AGROVOC subject: {value}') + print(f'Invalid AGROVOC ({field_name}): {value}') return field