1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-29 00:58:19 +01:00

Compare commits

e1b270cf83acbd5c8a893b05cd8360dd42aa3823..7763a021c5c8b7e13bfda774e1de0919bbc0cdfb

No commits in common. "e1b270cf83acbd5c8a893b05cd8360dd42aa3823" and "7763a021c5c8b7e13bfda774e1de0919bbc0cdfb" have entirely different histories.

5 changed files with 14 additions and 69 deletions

View File

@ -5,19 +5,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## Unreleased
### Changed
## Changed
- Perform fix for "unnecessary" Unicode characters after we try to fix encoding
issues with ftfy
- ftfy heuristics to use `is_bad()` instead of `sequence_weirdness()`
- ftfy `fix_text()` to *not* change “smart quotes” to "ASCII quotes"
### Updated
## Updated
- Python dependencies
### Added
- Ability to drop invalid AGROVOC values with `-d` when checking AGROVOC values
with `-a <field.name>`
## [0.5.0] - 2021-12-08
### Added
- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)

View File

@ -21,12 +21,6 @@ def parse_args(argv):
"-a",
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
)
parser.add_argument(
"--drop-invalid-agrovoc",
"-d",
help="After validating metadata values against AGROVOC, drop invalid values.",
action="store_true",
)
parser.add_argument(
"--experimental-checks",
"-e",
@ -129,14 +123,12 @@ def run(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates, field_name=column)
# Check: invalid AGROVOC subject and optionally drop them
# Check: invalid AGROVOC subject
if args.agrovoc_fields:
# Identify fields the user wants to validate against AGROVOC
for field in args.agrovoc_fields.split(","):
if column == field:
df[column] = df[column].apply(
check.agrovoc, field_name=column, drop=args.drop_invalid_agrovoc
)
df[column].apply(check.agrovoc, field_name=column)
# Check: invalid language
match = re.match(r"^.*?language.*$", column)

View File

@ -188,7 +188,7 @@ def language(field):
return
def agrovoc(field, field_name, drop):
def agrovoc(field, field_name):
"""Check subject terms against AGROVOC REST API.
Function constructor expects the field as well as the field name because
@ -219,9 +219,6 @@ def agrovoc(field, field_name, drop):
# prune old cache entries
requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
# Try to split multi-value field on "||" separator
for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
@ -234,25 +231,9 @@ def agrovoc(field, field_name, drop):
# check if there are any results
if len(data["results"]) == 0:
if drop:
print(
f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
else:
print(
f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}"
)
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
# value is invalid AGROVOC, but we are not dropping
values.append(value)
else:
# value is valid AGROVOC so save it
values.append(value)
# Create a new field consisting of all values joined with "||"
new_field = "||".join(values)
return new_field
return
def filename_extension(field):

View File

@ -16,7 +16,7 @@ Suspicious character||foreˆt,2019-07-29,,,,,,,,,,,
Invalid ISO 639-1 (alpha 2) language,2019-07-29,,,jp,,,,,,,,
Invalid ISO 639-3 (alpha 3) language,2019-07-29,,,chi,,,,,,,,
Invalid language,2019-07-29,,,Span,,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,LIVESTOCK||FOREST,,,,,,,
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,,,,,,
Newline (LF),2019-07-30,,,,"TANZA
NIA",,,,,,,
Missing date,,,,,,,,,,,,

1 dc.title dcterms.issued dc.identifier.issn dc.identifier.isbn dcterms.language dcterms.subject cg.coverage.country filename dcterms.license dcterms.type dcterms.bibliographicCitation cg.identifier.doi cg.coverage.region
16 Invalid ISO 639-1 (alpha 2) language 2019-07-29 jp
17 Invalid ISO 639-3 (alpha 3) language 2019-07-29 chi
18 Invalid language 2019-07-29 Span
19 Invalid AGROVOC subject 2019-07-29 LIVESTOCK||FOREST FOREST
20 Newline (LF) 2019-07-30 TANZA NIA
21 Missing date
22 Invalid country 2019-08-01 KENYAA

View File

@ -179,41 +179,18 @@ def test_check_invalid_language(capsys):
def test_check_invalid_agrovoc(capsys):
"""Test invalid AGROVOC subject. Invalid values *will not* be dropped."""
"""Test invalid AGROVOC subject."""
valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
value = "FOREST"
field_name = "dcterms.subject"
drop = False
new_value = check.agrovoc(value, field_name, drop)
check.agrovoc(value, field_name)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
== f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}\n"
)
assert new_value == value
def test_check_invalid_agrovoc_dropped(capsys):
"""Test invalid AGROVOC subjects. Invalid values *will* be dropped."""
valid_agrovoc = "LIVESTOCK"
invalid_agrovoc = "FOREST"
value = f"{valid_agrovoc}||{invalid_agrovoc}"
field_name = "dcterms.subject"
drop = True
new_value = check.agrovoc(value, field_name, drop)
captured = capsys.readouterr()
assert (
captured.out
== f"{Fore.GREEN}Dropping invalid AGROVOC ({field_name}): {Fore.RESET}{invalid_agrovoc}\n"
)
assert new_value == valid_agrovoc
def test_check_valid_agrovoc():
@ -221,11 +198,10 @@ def test_check_valid_agrovoc():
value = "FORESTS"
field_name = "dcterms.subject"
drop = False
result = check.agrovoc(value, field_name, drop)
result = check.agrovoc(value, field_name)
assert result == "FORESTS"
assert result == None
def test_check_uncommon_filename_extension(capsys):