mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-29 00:58:19 +01:00
Compare commits
4 Commits
7763a021c5
...
e1b270cf83
Author | SHA1 | Date | |
---|---|---|---|
e1b270cf83 | |||
b7efe2de40 | |||
c43095139a | |||
a7727b8431 |
@ -5,15 +5,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
## Changed
|
### Changed
|
||||||
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding
|
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding
|
||||||
issues with ftfy
|
issues with ftfy
|
||||||
- ftfy heuristics to use `is_bad()` instead of `sequence_weirdness()`
|
- ftfy heuristics to use `is_bad()` instead of `sequence_weirdness()`
|
||||||
- ftfy `fix_text()` to *not* change “smart quotes” to "ASCII quotes"
|
- ftfy `fix_text()` to *not* change “smart quotes” to "ASCII quotes"
|
||||||
|
|
||||||
## Updated
|
### Updated
|
||||||
- Python dependencies
|
- Python dependencies
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Ability to drop invalid AGROVOC values with `-d` when checking AGROVOC values
|
||||||
|
with `-a <field.name>`
|
||||||
|
|
||||||
## [0.5.0] - 2021-12-08
|
## [0.5.0] - 2021-12-08
|
||||||
### Added
|
### Added
|
||||||
- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)
|
- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)
|
||||||
|
@ -21,6 +21,12 @@ def parse_args(argv):
|
|||||||
"-a",
|
"-a",
|
||||||
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--drop-invalid-agrovoc",
|
||||||
|
"-d",
|
||||||
|
help="After validating metadata values against AGROVOC, drop invalid values.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--experimental-checks",
|
"--experimental-checks",
|
||||||
"-e",
|
"-e",
|
||||||
@ -123,12 +129,14 @@ def run(argv):
|
|||||||
# Fix: duplicate metadata values
|
# Fix: duplicate metadata values
|
||||||
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
||||||
|
|
||||||
# Check: invalid AGROVOC subject
|
# Check: invalid AGROVOC subject and optionally drop them
|
||||||
if args.agrovoc_fields:
|
if args.agrovoc_fields:
|
||||||
# Identify fields the user wants to validate against AGROVOC
|
# Identify fields the user wants to validate against AGROVOC
|
||||||
for field in args.agrovoc_fields.split(","):
|
for field in args.agrovoc_fields.split(","):
|
||||||
if column == field:
|
if column == field:
|
||||||
df[column].apply(check.agrovoc, field_name=column)
|
df[column] = df[column].apply(
|
||||||
|
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
|
||||||
|
)
|
||||||
|
|
||||||
# Check: invalid language
|
# Check: invalid language
|
||||||
match = re.match(r"^.*?language.*$", column)
|
match = re.match(r"^.*?language.*$", column)
|
||||||
|
@ -188,7 +188,7 @@ def language(field):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def agrovoc(field, field_name):
|
def agrovoc(field, field_name, drop):
|
||||||
"""Check subject terms against AGROVOC REST API.
|
"""Check subject terms against AGROVOC REST API.
|
||||||
|
|
||||||
Function constructor expects the field as well as the field name because
|
Function constructor expects the field as well as the field name because
|
||||||
@ -219,6 +219,9 @@ def agrovoc(field, field_name):
|
|||||||
# prune old cache entries
|
# prune old cache entries
|
||||||
requests_cache.remove_expired_responses()
|
requests_cache.remove_expired_responses()
|
||||||
|
|
||||||
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
|
values = list()
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||||
@ -231,9 +234,25 @@ def agrovoc(field, field_name):
|
|||||||
|
|
||||||
# check if there are any results
|
# check if there are any results
|
||||||
if len(data["results"]) == 0:
|
if len(data["results"]) == 0:
|
||||||
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
|
if drop:
|
||||||
|
print(
|
||||||
|
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
|
||||||
|
)
|
||||||
|
|
||||||
return
|
# value is invalid AGROVOC, but we are not dropping
|
||||||
|
values.append(value)
|
||||||
|
else:
|
||||||
|
# value is valid AGROVOC so save it
|
||||||
|
values.append(value)
|
||||||
|
|
||||||
|
# Create a new field consisting of all values joined with "||"
|
||||||
|
new_field = "||".join(values)
|
||||||
|
|
||||||
|
return new_field
|
||||||
|
|
||||||
|
|
||||||
def filename_extension(field):
|
def filename_extension(field):
|
||||||
|
@ -16,7 +16,7 @@ Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,
|
|||||||
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,
|
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,
|
||||||
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,
|
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,
|
||||||
Invalid language,2019-07-29,,,Span,,,,,,,,
|
Invalid language,2019-07-29,,,Span,,,,,,,,
|
||||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,,
|
Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,
|
||||||
Newline (LF),2019-07-30,,,,"TANZA
|
Newline (LF),2019-07-30,,,,"TANZA
|
||||||
NIA",,,,,,,
|
NIA",,,,,,,
|
||||||
Missing date,,,,,,,,,,,,
|
Missing date,,,,,,,,,,,,
|
||||||
|
|
@ -179,18 +179,41 @@ def test_check_invalid_language(capsys):
|
|||||||
|
|
||||||
|
|
||||||
def test_check_invalid_agrovoc(capsys):
|
def test_check_invalid_agrovoc(capsys):
|
||||||
"""Test invalid AGROVOC subject."""
|
"""Test invalid AGROVOC subject. Invalid values *will not* be dropped."""
|
||||||
|
|
||||||
value = "FOREST"
|
valid_agrovoc = "LIVESTOCK"
|
||||||
|
invalid_agrovoc = "FOREST"
|
||||||
|
value = f"{valid_agrovoc}||{invalid_agrovoc}"
|
||||||
field_name = "dcterms.subject"
|
field_name = "dcterms.subject"
|
||||||
|
drop = False
|
||||||
|
|
||||||
check.agrovoc(value, field_name)
|
new_value = check.agrovoc(value, field_name, drop)
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert (
|
assert (
|
||||||
captured.out
|
captured.out
|
||||||
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}\n"
|
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
|
||||||
)
|
)
|
||||||
|
assert new_value == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_invalid_agrovoc_dropped(capsys):
|
||||||
|
"""Test invalid AGROVOC subjects. Invalid values *will* be dropped."""
|
||||||
|
|
||||||
|
valid_agrovoc = "LIVESTOCK"
|
||||||
|
invalid_agrovoc = "FOREST"
|
||||||
|
value = f"{valid_agrovoc}||{invalid_agrovoc}"
|
||||||
|
field_name = "dcterms.subject"
|
||||||
|
drop = True
|
||||||
|
|
||||||
|
new_value = check.agrovoc(value, field_name, drop)
|
||||||
|
|
||||||
|
captured = capsys.readouterr()
|
||||||
|
assert (
|
||||||
|
captured.out
|
||||||
|
== f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
|
||||||
|
)
|
||||||
|
assert new_value == valid_agrovoc
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_agrovoc():
|
def test_check_valid_agrovoc():
|
||||||
@ -198,10 +221,11 @@ def test_check_valid_agrovoc():
|
|||||||
|
|
||||||
value = "FORESTS"
|
value = "FORESTS"
|
||||||
field_name = "dcterms.subject"
|
field_name = "dcterms.subject"
|
||||||
|
drop = False
|
||||||
|
|
||||||
result = check.agrovoc(value, field_name)
|
result = check.agrovoc(value, field_name, drop)
|
||||||
|
|
||||||
assert result == None
|
assert result == "FORESTS"
|
||||||
|
|
||||||
|
|
||||||
def test_check_uncommon_filename_extension(capsys):
|
def test_check_uncommon_filename_extension(capsys):
|
||||||
|
Loading…
Reference in New Issue
Block a user