mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-22 05:45:02 +01:00
Fix references to ISO 639
It turns out that ISO 639-1 defines the two-letter language codes (alpha2) and ISO 639-2 defines the three-letter language codes (alpha3). See: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
This commit is contained in:
parent
b5899001b7
commit
d9fc09f121
@ -6,7 +6,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
|
|||||||
## Functionality
|
## Functionality
|
||||||
|
|
||||||
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
|
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
|
||||||
- Validate languages against ISO 639-2 and ISO 639-3
|
- Validate languages against ISO 639-1 (alpha2) and ISO 639-2 (alpha3)
|
||||||
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
|
- Validate subjects against the AGROVOC REST API (see the `--agrovoc-fields` option)
|
||||||
- Fix leading, trailing, and excessive (ie, more than one) whitespace
|
- Fix leading, trailing, and excessive (ie, more than one) whitespace
|
||||||
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
|
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
|
||||||
|
@ -165,7 +165,7 @@ def suspicious_characters(field, field_name):
|
|||||||
|
|
||||||
|
|
||||||
def language(field):
|
def language(field):
|
||||||
"""Check if a language is valid ISO 639-2 or ISO 639-3.
|
"""Check if a language is valid ISO 639-1 or ISO 639-2.
|
||||||
|
|
||||||
Prints the value if it is invalid.
|
Prints the value if it is invalid.
|
||||||
"""
|
"""
|
||||||
@ -182,15 +182,15 @@ def language(field):
|
|||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
|
|
||||||
# After splitting, check if language value is 2 or 3 characters so we
|
# After splitting, check if language value is 2 or 3 characters so we
|
||||||
# can check it against ISO 639-2 or ISO 639-3 accordingly.
|
# can check it against ISO 639-1 or ISO 639-2 accordingly.
|
||||||
if len(value) == 2:
|
if len(value) == 2:
|
||||||
if not languages.get(alpha_2=value):
|
if not languages.get(alpha_2=value):
|
||||||
print(f"Invalid ISO 639-2 language: {value}")
|
print(f"Invalid ISO 639-1 language: {value}")
|
||||||
|
|
||||||
pass
|
pass
|
||||||
elif len(value) == 3:
|
elif len(value) == 3:
|
||||||
if not languages.get(alpha_3=value):
|
if not languages.get(alpha_3=value):
|
||||||
print(f"Invalid ISO 639-3 language: {value}")
|
print(f"Invalid ISO 639-2 language: {value}")
|
||||||
|
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -128,8 +128,8 @@ def test_check_suspicious_characters(capsys):
|
|||||||
assert captured.out == f'Suspicious character ({field_name}): ˆt\n'
|
assert captured.out == f'Suspicious character ({field_name}): ˆt\n'
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_iso639_2_language():
|
def test_check_valid_iso639_1_language():
|
||||||
'''Test valid ISO 639-2 language.'''
|
'''Test valid ISO 639-1 language.'''
|
||||||
|
|
||||||
value = 'ja'
|
value = 'ja'
|
||||||
|
|
||||||
@ -138,8 +138,8 @@ def test_check_valid_iso639_2_language():
|
|||||||
assert result == value
|
assert result == value
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_iso639_3_language():
|
def test_check_valid_iso639_2_language():
|
||||||
'''Test invalid ISO 639-3 language.'''
|
'''Test valid ISO 639-2 language.'''
|
||||||
|
|
||||||
value = 'eng'
|
value = 'eng'
|
||||||
|
|
||||||
@ -148,26 +148,26 @@ def test_check_valid_iso639_3_language():
|
|||||||
assert result == value
|
assert result == value
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_iso639_2_language(capsys):
|
def test_check_invalid_iso639_1_language(capsys):
|
||||||
'''Test invalid ISO 639-2 language.'''
|
'''Test invalid ISO 639-1 language.'''
|
||||||
|
|
||||||
value = 'jp'
|
value = 'jp'
|
||||||
|
|
||||||
check.language(value)
|
check.language(value)
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert captured.out == f'Invalid ISO 639-2 language: {value}\n'
|
assert captured.out == f'Invalid ISO 639-1 language: {value}\n'
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_iso639_3_language(capsys):
|
def test_check_invalid_iso639_2_language(capsys):
|
||||||
'''Test invalid ISO 639-3 language.'''
|
'''Test invalid ISO 639-2 language.'''
|
||||||
|
|
||||||
value = 'chi'
|
value = 'chi'
|
||||||
|
|
||||||
check.language(value)
|
check.language(value)
|
||||||
|
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert captured.out == f'Invalid ISO 639-3 language: {value}\n'
|
assert captured.out == f'Invalid ISO 639-2 language: {value}\n'
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_language(capsys):
|
def test_check_invalid_language(capsys):
|
||||||
|
Loading…
Reference in New Issue
Block a user