From a21ffb0fa8714abb78618d5192385e8111751755 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 28 Dec 2023 14:11:21 +0300 Subject: [PATCH] Use py3langid instead of langid Faster and more modern code for Python 3 as a drop-in replacement. See: https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html --- CHANGELOG.md | 1 + csv_metadata_quality/experimental.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0911091..2e9473e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ fields ### Changed - Don't run newline fix on description fields - Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once +- Use py3langid instead of langid, see: [How to make language detection with langid.py faster](https://adrien.barbaresi.eu/blog/language-detection-langid-py-faster.html) ### Updated - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) diff --git a/csv_metadata_quality/experimental.py b/csv_metadata_quality/experimental.py index f9c32e8..f6add42 100644 --- a/csv_metadata_quality/experimental.py +++ b/csv_metadata_quality/experimental.py @@ -2,8 +2,8 @@ import re -import langid import pandas as pd +import py3langid as langid from colorama import Fore from pycountry import languages diff --git a/pyproject.toml b/pyproject.toml index 3c06fb5..2f9af85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,11 @@ pandas = {version = "^2.0.2", extras = ["feather", "performance"]} python-stdnum = "^1.18" requests = "^2.28.2" requests-cache = "^1.0.0" -langid = "^1.1.6" colorama = "^0.4.6" ftfy = "^6.1.1" country-converter = "~1.1.0" pycountry = "^23.12.7" +py3langid = "^0.2.2" [tool.poetry.group.dev.dependencies] pytest = "^7.2.1"