From 1f637f32cd9435ca32405631dadcef6e57f795f7 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Sun, 15 Oct 2023 23:37:38 +0300 Subject: [PATCH] Rework requests-cache We should only be running this once per invocation, not for every row we check. This should be more efficient, but it means that we don't cache responses when running via pytest, which is actually probably a good thing. --- CHANGELOG.md | 1 + csv_metadata_quality/app.py | 16 ++++++++++++++++ csv_metadata_quality/check.py | 15 --------------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60a038c..0911091 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ fields ### Changed - Don't run newline fix on description fields +- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once ### Updated - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index df78571..5ab9eff 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: GPL-3.0-only import argparse +import os import re import signal import sys +from datetime import timedelta import pandas as pd +import requests_cache from colorama import Fore import csv_metadata_quality.check as check @@ -84,6 +87,19 @@ def run(argv): else: exclude = list() + # enable transparent request cache with thirty days expiry + expire_after = timedelta(days=30) + # Allow overriding the location of the requests cache, just in case we are + # running in an environment where we can't write to the current working di- + # rectory (for example from csv-metadata-quality-web). + REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") + requests_cache.install_cache( + f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after + ) + + # prune old cache entries + requests_cache.delete() + for column in df.columns: if column in exclude: print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}") diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 350d9b0..f0ee2c1 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: GPL-3.0-only import logging -import os import re from datetime import datetime, timedelta import country_converter as coco import pandas as pd import requests -import requests_cache from colorama import Fore from pycountry import languages from stdnum import isbn as stdnum_isbn @@ -203,19 +201,6 @@ def agrovoc(field, field_name, drop): if pd.isna(field): return - # enable transparent request cache with thirty days expiry - expire_after = timedelta(days=30) - # Allow overriding the location of the requests cache, just in case we are - # running in an environment where we can't write to the current working di- - # rectory (for example from csv-metadata-quality-web). - REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") - requests_cache.install_cache( - f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after - ) - - # prune old cache entries - requests_cache.delete() - # Initialize an empty list to hold the validated AGROVOC values values = list()