diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index d025be9..36d8aa7 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -21,6 +21,12 @@ def parse_args(argv): "-a", help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country", ) + parser.add_argument( + "--drop-invalid-agrovoc", + "-d", + help="After validating metadata values against AGROVOC, drop invalid values.", + action="store_true", + ) parser.add_argument( "--experimental-checks", "-e", @@ -123,12 +129,14 @@ def run(argv): # Fix: duplicate metadata values df[column] = df[column].apply(fix.duplicates, field_name=column) - # Check: invalid AGROVOC subject + # Check: invalid AGROVOC subject and optionally drop them if args.agrovoc_fields: # Identify fields the user wants to validate against AGROVOC for field in args.agrovoc_fields.split(","): if column == field: - df[column].apply(check.agrovoc, field_name=column) + df[column] = df[column].apply( + check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc + ) # Check: invalid language match = re.match(r"^.*?language.*$", column) diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 6a2f89f..7de9374 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -188,7 +188,7 @@ def language(field): return -def agrovoc(field, field_name): +def agrovoc(field, field_name, drop): """Check subject terms against AGROVOC REST API. Function constructor expects the field as well as the field name because @@ -219,6 +219,9 @@ def agrovoc(field, field_name): # prune old cache entries requests_cache.remove_expired_responses() + # Initialize an empty list to hold the validated AGROVOC values + values = list() + # Try to split multi-value field on "||" separator for value in field.split("||"): request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" @@ -231,9 +234,25 @@ def agrovoc(field, field_name): # check if there are any results if len(data["results"]) == 0: - print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") + if drop: + print( + f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}" + ) + else: + print( + f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}" + ) - return + # value is invalid AGROVOC, but we are not dropping + values.append(value) + else: + # value is valid AGROVOC so save it + values.append(value) + + # Create a new field consisting of all values joined with "||" + new_field = "||".join(values) + + return new_field def filename_extension(field):