Rework requests-cache

We should only install the requests cache once per invocation, not for every row we check. This should be more efficient, but it means that responses are no longer cached when running via pytest, which is probably a good thing.
2025-08-06 13:05:41 +02:00 · 2023-10-15 23:37:38 +03:00
parent b8241e919d
commit 1f637f32cd
3 changed files with 17 additions and 15 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -14,6 +14,7 @@ fields
 ### Changed
 - Don't run newline fix on description fields
 - Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
 ### Updated
 - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-3.0-only
 import argparse
 import os
 import re
 import signal
 import sys
 from datetime import timedelta
 import pandas as pd
 import requests_cache
 from colorama import Fore
 import csv_metadata_quality.check as check
@ -84,6 +87,19 @@ def run(argv):
    else:
        exclude = list()
    # enable transparent request cache with thirty days expiry
    expire_after = timedelta(days=30)
    # Allow overriding the location of the requests cache, just in case we are
    # running in an environment where we can't write to the current working di-
    # rectory (for example from csv-metadata-quality-web).
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
    requests_cache.install_cache(
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
    )
    # prune old cache entries
    requests_cache.delete()
    for column in df.columns:
        if column in exclude:
            print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -1,14 +1,12 @@
 # SPDX-License-Identifier: GPL-3.0-only
 import logging
 import os
 import re
 from datetime import datetime, timedelta
 import country_converter as coco
 import pandas as pd
 import requests
 import requests_cache
 from colorama import Fore
 from pycountry import languages
 from stdnum import isbn as stdnum_isbn
@ -203,19 +201,6 @@ def agrovoc(field, field_name, drop):
    if pd.isna(field):
        return
    # enable transparent request cache with thirty days expiry
    expire_after = timedelta(days=30)
    # Allow overriding the location of the requests cache, just in case we are
    # running in an environment where we can't write to the current working di-
    # rectory (for example from csv-metadata-quality-web).
    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
    requests_cache.install_cache(
        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
    )
    # prune old cache entries
    requests_cache.delete()
    # Initialize an empty list to hold the validated AGROVOC values
    values = list()