mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-21 21:35:02 +01:00
Rework requests-cache
We should only install the requests cache once per invocation, not for every row we check. This should be more efficient, but because the cache is now installed in run() rather than in check.agrovoc(), responses are no longer cached when running via pytest, which is probably a good thing.
This commit is contained in:
parent
b8241e919d
commit
1f637f32cd
@ -14,6 +14,7 @@ fields
|
||||
|
||||
### Changed
|
||||
- Don't run newline fix on description fields
|
||||
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||
|
||||
### Updated
|
||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||
|
@ -1,11 +1,14 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
|
||||
import csv_metadata_quality.check as check
|
||||
@ -84,6 +87,19 @@ def run(argv):
|
||||
else:
|
||||
exclude = list()
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working
|
||||
# directory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
requests_cache.delete()
|
||||
|
||||
for column in df.columns:
|
||||
if column in exclude:
|
||||
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
|
||||
|
@ -1,14 +1,12 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import country_converter as coco
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
@ -203,19 +201,6 @@ def agrovoc(field, field_name, drop):
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working
|
||||
# directory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
requests_cache.delete()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user