1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-27 08:08:19 +01:00

Compare commits

...

7 Commits

Author SHA1 Message Date
1c03999582
Merge pull request #24 from ilri/renovate/actions-checkout-4.x
All checks were successful
continuous-integration/drone/push Build is passing
Update actions/checkout action to v4
2023-10-15 23:39:45 +03:00
1f637f32cd
Rework requests-cache
We should only be running this once per invocation, not for every
row we check. This should be more efficient, but it means that we
don't cache responses when running via pytest, which is actually
probably a good thing.
2023-10-15 23:37:38 +03:00
b8241e919d
poetry.lock: run poetry update 2023-10-15 23:22:48 +03:00
b8dc19cc3f
csv_metadata_quality/check.py: enable requests-cache
This was disabled at some point. We also need to use the new delete
method instead of the old remove_expired_responses one.
2023-10-15 23:21:58 +03:00
93c9b739ac
csv_metadata_quality/check.py: use HTTPS
Use HTTPS for AGROVOC REST API.
2023-10-15 22:38:45 +03:00
4ed2786703
pyproject.toml: update pycountry
Use the latest branch in my fork that has iso-codes 4.15.0.
2023-10-15 21:53:09 +03:00
renovate[bot]
8728789183
Update actions/checkout action to v4
All checks were successful
continuous-integration/drone/push Build is passing
2023-09-04 14:26:25 +00:00
6 changed files with 657 additions and 509 deletions

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v4
- name: Install poetry - name: Install poetry
run: pipx install poetry run: pipx install poetry
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4

View File

@ -14,6 +14,7 @@ fields
### Changed ### Changed
- Don't run newline fix on description fields - Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
### Updated ### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)

View File

@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-only # SPDX-License-Identifier: GPL-3.0-only
import argparse import argparse
import os
import re import re
import signal import signal
import sys import sys
from datetime import timedelta
import pandas as pd import pandas as pd
import requests_cache
from colorama import Fore from colorama import Fore
import csv_metadata_quality.check as check import csv_metadata_quality.check as check
@ -84,6 +87,19 @@ def run(argv):
else: else:
exclude = list() exclude = list()
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working
# directory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
requests_cache.delete()
for column in df.columns: for column in df.columns:
if column in exclude: if column in exclude:
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}") print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")

View File

@ -1,14 +1,12 @@
# SPDX-License-Identifier: GPL-3.0-only # SPDX-License-Identifier: GPL-3.0-only
import logging import logging
import os
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
import country_converter as coco import country_converter as coco
import pandas as pd import pandas as pd
import requests import requests
import requests_cache
from colorama import Fore from colorama import Fore
from pycountry import languages from pycountry import languages
from stdnum import isbn as stdnum_isbn from stdnum import isbn as stdnum_isbn
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
if pd.isna(field): if pd.isna(field):
return return
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working
# directory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
# requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values # Initialize an empty list to hold the validated AGROVOC values
values = list() values = list()
# Try to split multi-value field on "||" separator # Try to split multi-value field on "||" separator
for value in field.split("||"): for value in field.split("||"):
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_params = {"query": value} request_params = {"query": value}
request = requests.get(request_url, params=request_params) request = requests.get(request_url, params=request_params)

1128
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -20,7 +20,7 @@ langid = "^1.1.6"
colorama = "^0.4.6" colorama = "^0.4.6"
ftfy = "^6.1.1" ftfy = "^6.1.1"
country-converter = "~1.0.0" country-converter = "~1.0.0"
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.13.0"} pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.2.1" pytest = "^7.2.1"