1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-18 10:21:54 +02:00

Add support for dropping invalid AGROVOC terms

Requires --agrovoc-fields <field.name> to perform the actual validation,
and -d (--drop-invalid-agrovoc) to drop the terms that fail it.
This commit is contained in:
2021-12-23 12:43:10 +02:00
parent 7763a021c5
commit a7727b8431
2 changed files with 32 additions and 5 deletions
csv_metadata_quality

@ -21,6 +21,12 @@ def parse_args(argv):
"-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
)
parser.add_argument(
"--drop-invalid-agrovoc",
"-d",
help="After validating metadata values against AGROVOC, drop invalid values.",
action="store_true",
)
parser.add_argument(
"--experimental-checks",
"-e",
@ -123,12 +129,14 @@ def run(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject
# Check: invalid AGROVOC subject and optionally drop them
if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","):
if column == field:
df[column].apply(check.agrovoc, field_name=column)
df[column] = df[column].apply(
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
)
# Check: invalid language
match = re.match(r"^.*?language.*$", column)