1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 06:06:00 +02:00

Use pycountry instead of iso-639 for languages

The latter is a fork that hasn't been updated since 2016, while the
original (pycountry) still seems to be well maintained, with recent
database updates as well as tests for Python 3.7.

Also, pycountry supports ISO 3166-2 (administrative zones), which
we could eventually use for sub regions.
This commit is contained in:
2019-07-30 16:39:26 +03:00
parent a85b410ab9
commit 3c798fb504
3 changed files with 19 additions and 24 deletions

View File

@ -157,7 +157,7 @@ def language(field):
Prints the value if it is invalid.
"""
from iso639 import languages
from pycountry import languages
# Skip fields with missing values
if pd.isna(field):
@ -169,19 +169,14 @@ def language(field):
for value in field.split('||'):
# After splitting, check if language value is 2 or 3 characters so we
# can check it against ISO 639-2 or ISO 639-3 accordingly. In iso-639
# library ISO 639-2 is "part1" and ISO 639-3 is "part3".
# can check it against ISO 639-2 or ISO 639-3 accordingly.
if len(value) == 2:
try:
languages.get(part1=value)
except KeyError:
if not languages.get(alpha_2=value):
print(f'Invalid ISO 639-2 language: {value}')
pass
elif len(value) == 3:
try:
languages.get(part3=value)
except KeyError:
if not languages.get(alpha_3=value):
print(f'Invalid ISO 639-3 language: {value}')
pass