1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-12-21 19:52:18 +01:00

Use py3langid instead of langid

Faster and more modern code for Python 3 as a drop-in replacement.

See: https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html
This commit is contained in:
Alan Orth 2023-12-28 14:11:21 +03:00
parent fb341dd9fa
commit a21ffb0fa8
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 3 additions and 2 deletions

View File

@@ -15,6 +15,7 @@ fields
 ### Changed
 - Don't run newline fix on description fields
 - Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
+- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
 ### Updated
 - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)

View File

@@ -2,8 +2,8 @@
 import re
-import langid
 import pandas as pd
+import py3langid as langid
 from colorama import Fore
 from pycountry import languages

View File

@@ -16,11 +16,11 @@ pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
 python-stdnum = "^1.18"
 requests = "^2.28.2"
 requests-cache = "^1.0.0"
-langid = "^1.1.6"
 colorama = "^0.4.6"
 ftfy = "^6.1.1"
 country-converter = "~1.1.0"
 pycountry = "^23.12.7"
+py3langid = "^0.2.2"
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.1"