From 3c798fb504839c588d4a6485d1b7807406b78725 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 30 Jul 2019 16:39:26 +0300 Subject: [PATCH] Use pycountry instead of iso-639 for languages The latter is a fork that hasn't been updated since 2016 and the original still seems to be well maintained, with recent database updates as well as tests for Python 3.7. Also, pycountry supports ISO 3166-2 (administrative zones), which we could eventually use for sub regions. --- Pipfile | 2 +- Pipfile.lock | 28 ++++++++++++++-------------- csv_metadata_quality/check.py | 13 ++++--------- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/Pipfile b/Pipfile index a6a4b07..f95af42 100644 --- a/Pipfile +++ b/Pipfile @@ -12,9 +12,9 @@ flake8 = "*" pandas = "*" python-stdnum = "*" xlrd = "*" -iso-639 = "*" requests = "*" requests-cache = "*" +pycountry = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index edb8cf4..936a0f0 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c5c86b4dae011bcbf6705514d97aa55e0a59dd8b7927c38e34103d77eca13cc7" + "sha256": "1c4130ed98fb55545244ba2926f2b4246dc86af7545cb892a45311426f934cae" }, "pipfile-spec": 6, "requires": { @@ -37,13 +37,6 @@ ], "version": "==2.8" }, - "iso-639": { - "hashes": [ - "sha256:dc9cd4b880b898d774c47fe9775167404af8a85dd889d58f9008035109acce49" - ], - "index": "pypi", - "version": "==0.4.5" - }, "numpy": { "hashes": [ "sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e", @@ -85,6 +78,13 @@ "index": "pypi", "version": "==0.25.0" }, + "pycountry": { + "hashes": [ + "sha256:68e58bfd3bedeea49ba9d4b38f2bd5e042f9753628eba9a819fb03f551d89096" + ], + "index": "pypi", + "version": "==19.7.15" + }, "python-dateutil": { "hashes": [ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", @@ -235,10 +235,10 @@ }, "packaging": { "hashes": [ - "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", 
- "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" + "sha256:a7ac867b97fdc07ee80a8058fe4435ccd274ecc3b0ed61d852d7d53055528cf9", + "sha256:c491ca87294da7cc01902edbe30a5bc6c4c28172b5138ab4e4aa1b9d7bfaeafe" ], - "version": "==19.0" + "version": "==19.1" }, "parso": { "hashes": [ @@ -314,10 +314,10 @@ }, "pyparsing": { "hashes": [ - "sha256:43c5486cefefa536c9aab528881c992328f020eefe4f6d06332449c365218580", - "sha256:d6c5ffe9d0305b9b977f7a642d36b9370954d1da7ada4c62393382cbadad4265" + "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", + "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" ], - "version": "==2.4.1.1" + "version": "==2.4.2" }, "pytest": { "hashes": [ diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 7be9f25..899bf84 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -157,7 +157,7 @@ def language(field): Prints the value if it is invalid. """ - from iso639 import languages + from pycountry import languages # Skip fields with missing values if pd.isna(field): @@ -169,19 +169,14 @@ def language(field): for value in field.split('||'): # After splitting, check if language value is 2 or 3 characters so we - # can check it against ISO 639-2 or ISO 639-3 accordingly. In iso-639 - # library ISO 639-2 is "part1" and ISO 639-3 is "part3". + # can check it against ISO 639-1 or ISO 639-3 accordingly. if len(value) == 2: - try: - languages.get(part1=value) - except KeyError: + if not languages.get(alpha_2=value): print(f'Invalid ISO 639-2 language: {value}') pass elif len(value) == 3: - try: - languages.get(part3=value) - except KeyError: + if not languages.get(alpha_3=value): print(f'Invalid ISO 639-3 language: {value}') pass