1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-07-15 09:11:19 +02:00

Add support for validating languages

Each value is validated against ISO 639-2 or ISO 639-3 depending on
the length of the language code (two or three characters). A value
of any other length is reported as invalid.

Does not currently have any support for generic values like "Other".
This commit is contained in:
Alan Orth 2019-07-29 18:59:42 +03:00
parent 1978fa7b48
commit a36454a3ac
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
5 changed files with 117 additions and 15 deletions

View File

@ -8,6 +8,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
- Read/write CSV files
- Read Excel files
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
- Validate languages against ISO 639-2 and ISO 639-3
- Fix leading, trailing, and excessive whitespace
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc

View File

@ -43,6 +43,11 @@ def main(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates)
# Check: invalid language
match = re.match(r'^.*?language.*$', column)
if match is not None:
df[column] = df[column].apply(check.language)
# Check: invalid ISSN
match = re.match(r'^.*?issn.*$', column)
if match is not None:

View File

@ -149,3 +149,43 @@ def suspicious_characters(field):
print(f'Suspicious character: {field}')
return field
def language(field):
    """Report language codes that the iso-639 library does not recognise.

    The field is split on the "||" multi-value separator and every value
    is checked by its length: two-character values are looked up with the
    library's ``part1`` key, three-character values with ``part3``. Values
    of any other length are reported as invalid without a lookup.

    Nothing is raised for an invalid value; a message is printed instead.

    Returns the field unchanged, or None when the field is missing.

    NOTE(review): two-letter codes are normally referred to as ISO 639-1,
    but the printed message labels them "ISO 639-2". The wording is kept
    as-is here -- confirm before changing the message text.
    """

    from iso639 import languages

    # Empty cells have nothing to validate
    if pd.isna(field):
        return

    # need to handle "Other" values here...

    # Code length -> (iso-639 lookup keyword, message prefix)
    lookups = {
        2: ('part1', 'Invalid ISO 639-2 language'),
        3: ('part3', 'Invalid ISO 639-3 language'),
    }

    for code in field.split('||'):
        lookup = lookups.get(len(code))

        # Neither two nor three characters, so no lookup is attempted
        if lookup is None:
            print(f'Invalid language: {code}')
            continue

        keyword, prefix = lookup

        try:
            languages.get(**{keyword: code})
        except KeyError:
            # The library signals an unknown code with KeyError
            print(f'{prefix}: {code}')

    return field

View File

@ -1,15 +1,18 @@
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn
Leading space,2019-07-29,,
Trailing space ,2019-07-29,,
Excessive space,2019-07-29,,
Miscellaenous ||whitespace | issues ,2019-07-29,,
Duplicate||Duplicate,2019-07-29,,
Invalid ISSN,2019-07-29,2321-2302,
Invalid ISBN,2019-07-29,,978-0-306-40615-6
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7
Invalid date,2019-07-260,,
Multiple dates,2019-07-26||2019-01-10,,
Invalid multi-value separator,,0378-5955|0024-9319,
Unnecessary Unicode,2019-07-29,,
Suspicious character||foreˆt,2019-07-29,,
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso
Leading space,2019-07-29,,,
Trailing space ,2019-07-29,,,
Excessive space,2019-07-29,,,
Miscellaenous ||whitespace | issues ,2019-07-29,,,
Duplicate||Duplicate,2019-07-29,,,
Invalid ISSN,2019-07-29,2321-2302,,
Invalid ISBN,2019-07-29,,978-0-306-40615-6,
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,
Invalid date,2019-07-260,,,
Multiple dates,2019-07-26||2019-01-10,,,
Invalid multi-value separator,,0378-5955|0024-9319,,
Unnecessary Unicode,2019-07-29,,,
Suspicious character||foreˆt,2019-07-29,,,
Invalid ISO 639-2 language,2019-07-29,,,jp
Invalid ISO 639-3 language,2019-07-29,,,chi
Invalid language,2019-07-29,,,Span

1 dc.contributor.author birthdate dc.identifier.issn dc.identifier.isbn dc.language.iso
2 Leading space 2019-07-29
3 Trailing space 2019-07-29
4 Excessive space 2019-07-29
5 Miscellaenous ||whitespace | issues 2019-07-29
6 Duplicate||Duplicate 2019-07-29
7 Invalid ISSN 2019-07-29 2321-2302
8 Invalid ISBN 2019-07-29 978-0-306-40615-6
9 Multiple valid ISSNs 2019-07-29 0378-5955||0024-9319
10 Multiple valid ISBNs 2019-07-29 99921-58-10-7||978-0-306-40615-7
11 Invalid date 2019-07-260
12 Multiple dates 2019-07-26||2019-01-10
13 Invalid multi-value separator 0378-5955|0024-9319
14 Unnecessary Unicode​ 2019-07-29
15 Suspicious character||foreˆt 2019-07-29
16 Invalid ISO 639-2 language 2019-07-29 jp
17 Invalid ISO 639-3 language 2019-07-29 chi
18 Invalid language 2019-07-29 Span

View File

@ -116,3 +116,56 @@ def test_check_suspicious_characters(capsys):
captured = capsys.readouterr()
assert captured.out == f'Suspicious character: {value}\n'
def test_check_valid_iso639_2_language():
    '''A recognised two-letter language code is returned unchanged.'''

    code = 'ja'

    assert check.language(code) == code
def test_check_valid_iso639_3_language():
    '''A recognised three-letter language code is returned unchanged.'''

    code = 'eng'

    assert check.language(code) == code
def test_check_invalid_iso639_2_language(capsys):
    '''An unrecognised two-letter code is reported on stdout.'''

    code = 'jp'
    expected = f'Invalid ISO 639-2 language: {code}\n'

    check.language(code)

    # Only the single warning line should have been printed
    assert capsys.readouterr().out == expected
def test_check_invalid_iso639_3_language(capsys):
    '''An unrecognised three-letter code is reported on stdout.'''

    code = 'chi'
    expected = f'Invalid ISO 639-3 language: {code}\n'

    check.language(code)

    # Only the single warning line should have been printed
    assert capsys.readouterr().out == expected
def test_check_invalid_language(capsys):
    '''A value that is neither two nor three characters is reported.'''

    code = 'Span'
    expected = f'Invalid language: {code}\n'

    check.language(code)

    # Only the single warning line should have been printed
    assert capsys.readouterr().out == expected