mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-01-24 19:23:22 +01:00
Add support for validating languages
Each language value is validated against ISO 639-2 or ISO 639-3 depending on its length (two or three characters); a value of any other length is reported as an invalid language. Generic values like "Other" are not supported yet.
This commit is contained in:
parent
1978fa7b48
commit
a36454a3ac
@ -8,6 +8,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht
|
||||
- Read/write CSV files
|
||||
- Read Excel files
|
||||
- Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
|
||||
- Validate languages against ISO 639-2 and ISO 639-3
|
||||
- Fix leading, trailing, and excessive whitespace
|
||||
- Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
|
||||
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc
|
||||
|
@ -43,6 +43,11 @@ def main(argv):
|
||||
# Fix: duplicate metadata values
|
||||
df[column] = df[column].apply(fix.duplicates)
|
||||
|
||||
# Check: invalid language
|
||||
match = re.match(r'^.*?language.*$', column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.language)
|
||||
|
||||
# Check: invalid ISSN
|
||||
match = re.match(r'^.*?issn.*$', column)
|
||||
if match is not None:
|
||||
|
@ -149,3 +149,43 @@ def suspicious_characters(field):
|
||||
print(f'Suspicious character: {field}')
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def _is_unknown_iso639_code(part, code):
    """Return True when the iso-639 library has no entry for ``code``.

    ``part`` is the keyword handed to ``languages.get`` and selects which
    table is searched (``part1`` or ``part3``). A ``KeyError`` from the
    lookup is what signals an unknown code.
    """

    # Imported lazily so that inputs which need no lookup (missing values,
    # codes of the wrong length) never touch the third-party dependency.
    from iso639 import languages

    try:
        languages.get(**{part: code})
    except KeyError:
        return True

    return False


def language(field):
    """Check if a language is valid ISO 639-2 or ISO 639-3.

    The field is split on the "||" multi-value separator and every value is
    checked on its own: two-character values are looked up in the iso-639
    library's ``part1`` table, three-character values in its ``part3``
    table, and values of any other length are rejected without a lookup.

    Prints the value if it is invalid.

    Returns the field unchanged, or None when the field is a missing value.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    # need to handle "Other" values here...

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        # NOTE(review): in the iso-639 library "part1" appears to hold the
        # two-letter ISO 639-1 codes, so the "ISO 639-2" label below looks
        # like a misnomer. The text is kept as is because the existing tests
        # assert on the exact message.
        if len(value) == 2:
            if _is_unknown_iso639_code('part1', value):
                print(f'Invalid ISO 639-2 language: {value}')
        elif len(value) == 3:
            if _is_unknown_iso639_code('part3', value):
                print(f'Invalid ISO 639-3 language: {value}')
        else:
            print(f'Invalid language: {value}')

    return field
|
||||
|
@ -1,15 +1,18 @@
|
||||
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn
|
||||
Leading space,2019-07-29,,
|
||||
Trailing space ,2019-07-29,,
|
||||
Excessive space,2019-07-29,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,
|
||||
Duplicate||Duplicate,2019-07-29,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7
|
||||
Invalid date,2019-07-260,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,
|
||||
Invalid multi-value separator,,0378-5955|0024-9319,
|
||||
Unnecessary Unicode,2019-07-29,,
|
||||
Suspicious character||foreˆt,2019-07-29,,
|
||||
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso
|
||||
Leading space,2019-07-29,,,
|
||||
Trailing space ,2019-07-29,,,
|
||||
Excessive space,2019-07-29,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,
|
||||
Invalid date,2019-07-260,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,
|
||||
Invalid multi-value separator,,0378-5955|0024-9319,,
|
||||
Unnecessary Unicode,2019-07-29,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,
|
||||
Invalid ISO 639-2 language,2019-07-29,,,jp
|
||||
Invalid ISO 639-3 language,2019-07-29,,,chi
|
||||
Invalid language,2019-07-29,,,Span
|
||||
|
|
@ -116,3 +116,56 @@ def test_check_suspicious_characters(capsys):
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert captured.out == f'Suspicious character: {value}\n'
|
||||
|
||||
|
||||
def test_check_valid_iso639_2_language():
    '''A valid two-letter language code is handed back unchanged.'''

    code = 'ja'

    # The checker returns its input, so a round trip must be lossless.
    assert check.language(code) == code
|
||||
|
||||
|
||||
def test_check_valid_iso639_3_language():
    '''Test valid ISO 639-3 language.'''

    value = 'eng'

    result = check.language(value)

    # The checker returns the field it was given.
    assert result == value
|
||||
|
||||
|
||||
def test_check_invalid_iso639_2_language(capsys):
    '''An unknown two-letter language code is reported on stdout.'''

    code = 'jp'
    expected = f'Invalid ISO 639-2 language: {code}\n'

    check.language(code)

    # Only the message printed by the checker should have reached stdout.
    assert capsys.readouterr().out == expected
|
||||
|
||||
|
||||
def test_check_invalid_iso639_3_language(capsys):
    '''An unknown three-letter language code is reported on stdout.'''

    code = 'chi'
    expected = f'Invalid ISO 639-3 language: {code}\n'

    check.language(code)

    # Only the message printed by the checker should have reached stdout.
    assert capsys.readouterr().out == expected
|
||||
|
||||
|
||||
def test_check_invalid_language(capsys):
    '''A value that is neither two nor three characters long is reported.'''

    text = 'Span'
    expected = f'Invalid language: {text}\n'

    check.language(text)

    # Only the message printed by the checker should have reached stdout.
    assert capsys.readouterr().out == expected
|
||||
|
Loading…
x
Reference in New Issue
Block a user