1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-29 00:58:19 +01:00

Compare commits

..

4 Commits

Author SHA1 Message Date
e1b270cf83
CHANGELOG.md: add note about dropping invalid AGROVOC values
All checks were successful
continuous-integration/drone/push Build is passing
2021-12-23 12:47:42 +02:00
b7efe2de40
data/test.csv: update invalid AGROVOC entry
Now that we can drop invalid AGROVOC values we should have a valid
value and an invalid value here. Depending on how the checker is
invoked we will either print a warning or drop the invalid value.
2021-12-23 12:45:38 +02:00
c43095139a
tests/test_check.py: add tests for dropping invalid AGROVOC
2021-12-23 12:44:32 +02:00
a7727b8431
Add support for dropping invalid AGROVOC terms
Requires --agrovoc-fields <field.name> to do the actual validation,
and -d to drop invalid ones.
2021-12-23 12:43:55 +02:00
5 changed files with 69 additions and 14 deletions

View File

@ -5,15 +5,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased ## Unreleased
## Changed ### Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding - Perform fix for "unnecessary" Unicode characters after we try to fix encoding
issues with ftfy issues with ftfy
- ftfy heuristics to use `is_bad()` instead of `sequence_weirdness()` - ftfy heuristics to use `is_bad()` instead of `sequence_weirdness()`
- ftfy `fix_text()` to *not* change “smart quotes” to "ASCII quotes" - ftfy `fix_text()` to *not* change “smart quotes” to "ASCII quotes"
## Updated ### Updated
- Python dependencies - Python dependencies
### Added
- Ability to drop invalid AGROVOC values with `-d` when checking AGROVOC values
with `-a <field.name>`
## [0.5.0] - 2021-12-08 ## [0.5.0] - 2021-12-08
### Added ### Added
- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy) - Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)

View File

@ -21,6 +21,12 @@ def parse_args(argv):
"-a", "-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country", help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
) )
parser.add_argument(
"--drop-invalid-agrovoc",
"-d",
help="After validating metadata values against AGROVOC, drop invalid values.",
action="store_true",
)
parser.add_argument( parser.add_argument(
"--experimental-checks", "--experimental-checks",
"-e", "-e",
@ -123,12 +129,14 @@ def run(argv):
# Fix: duplicate metadata values # Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column) df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject # Check: invalid AGROVOC subject and optionally drop them
if args.agrovoc_fields: if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC # Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","): for field in args.agrovoc_fields.split(","):
if column == field: if column == field:
df[column].apply(check.agrovoc, field_name=column) df[column] = df[column].apply(
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
)
# Check: invalid language # Check: invalid language
match = re.match(r"^.*?language.*$", column) match = re.match(r"^.*?language.*$", column)

View File

@ -188,7 +188,7 @@ def language(field):
return return
def agrovoc(field, field_name): def agrovoc(field, field_name, drop):
"""Check subject terms against AGROVOC REST API. """Check subject terms against AGROVOC REST API.
Function constructor expects the field as well as the field name because Function constructor expects the field as well as the field name because
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
# prune old cache entries # prune old cache entries
requests_cache.remove_expired_responses() requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
# Try to split multi-value field on "||" separator # Try to split multi-value field on "||" separator
for value in field.split("||"): for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
# check if there are any results # check if there are any results
if len(data["results"]) == 0: if len(data["results"]) == 0:
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") if drop:
print(
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
else:
print(
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
return # value is invalid AGROVOC, but we are not dropping
values.append(value)
else:
# value is valid AGROVOC so save it
values.append(value)
# Create a new field consisting of all values joined with "||"
new_field = "||".join(values)
return new_field
def filename_extension(field): def filename_extension(field):

View File

@ -16,7 +16,7 @@ Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,, Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,, Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,, Invalid language,2019-07-29,,,Span,,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,, Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,
Newline (LF),2019-07-30,,,,"TANZA Newline (LF),2019-07-30,,,,"TANZA
NIA",,,,,,, NIA",,,,,,,
Missing date,,,,,,,,,,,, Missing date,,,,,,,,,,,,

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 FOREST LIVESTOCK||FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA

View File

@ -179,18 +179,41 @@ def test_check_invalid_language(capsys):
def test_check_invalid_agrovoc(capsys): def test_check_invalid_agrovoc(capsys):
"""Test invalid AGROVOC subject.""" """Test invalid AGROVOC subject. Invalid values *will not* be dropped."""
value = "FOREST" valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
field_name = "dcterms.subject" field_name = "dcterms.subject"
drop = False
check.agrovoc(value, field_name) new_value = check.agrovoc(value, field_name, drop)
captured = capsys.readouterr() captured = capsys.readouterr()
assert ( assert (
captured.out captured.out
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}\n" == f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
) )
assert new_value == value
def test_check_invalid_agrovoc_dropped(capsys):
"""Test invalid AGROVOC subjects. Invalid values *will* be dropped."""
valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
field_name = "dcterms.subject"
drop = True
new_value = check.agrovoc(value, field_name, drop)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
)
assert new_value == valid_agrovoc
def test_check_valid_agrovoc(): def test_check_valid_agrovoc():
@ -198,10 +221,11 @@ def test_check_valid_agrovoc():
value = "FORESTS" value = "FORESTS"
field_name = "dcterms.subject" field_name = "dcterms.subject"
drop = False
result = check.agrovoc(value, field_name) result = check.agrovoc(value, field_name, drop)
assert result == None assert result == "FORESTS"
def test_check_uncommon_filename_extension(capsys): def test_check_uncommon_filename_extension(capsys):