1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-17 11:37:03 +01:00

Use py3langid instead of langid

Faster and more modern code for Python 3 as a drop-in replacement.

See: https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html
This commit is contained in:
Alan Orth 2023-12-28 14:11:21 +03:00
parent fb341dd9fa
commit a21ffb0fa8
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 3 additions and 2 deletions

View File

@@ -15,6 +15,7 @@ fields
### Changed
- Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
+- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html)
### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)

View File

@@ -2,8 +2,8 @@
import re
-import langid
import pandas as pd
+import py3langid as langid
from colorama import Fore
from pycountry import languages

View File

@@ -16,11 +16,11 @@ pandas = {version = "^2.0.2", extras = ["feather", "performance"]}
python-stdnum = "^1.18"
requests = "^2.28.2"
requests-cache = "^1.0.0"
-langid = "^1.1.6"
colorama = "^0.4.6"
ftfy = "^6.1.1"
country-converter = "~1.1.0"
pycountry = "^23.12.7"
+py3langid = "^0.2.2"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"