diff --git a/CHANGELOG.md b/CHANGELOG.md index 60a038c..0911091 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ fields ### Changed - Don't run newline fix on description fields +- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once ### Updated - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index df78571..5ab9eff 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: GPL-3.0-only import argparse +import os import re import signal import sys +from datetime import timedelta import pandas as pd +import requests_cache from colorama import Fore import csv_metadata_quality.check as check @@ -84,6 +87,19 @@ def run(argv): else: exclude = list() + # enable transparent request cache with thirty days expiry + expire_after = timedelta(days=30) + # Allow overriding the location of the requests cache, just in case we are + # running in an environment where we can't write to the current working di- + # rectory (for example from csv-metadata-quality-web). + REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") + requests_cache.install_cache( + f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after + ) + + # prune old cache entries + requests_cache.delete() + for column in df.columns: if column in exclude: print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}") diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index 350d9b0..f0ee2c1 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: GPL-3.0-only import logging -import os import re from datetime import datetime, timedelta import country_converter as coco import pandas as pd import requests -import requests_cache from colorama import Fore from pycountry import languages from stdnum import isbn as stdnum_isbn @@ -203,19 +201,6 @@ def agrovoc(field, field_name, drop): if pd.isna(field): return - # enable transparent request cache with thirty days expiry - expire_after = timedelta(days=30) - # Allow overriding the location of the requests cache, just in case we are - # running in an environment where we can't write to the current working di- - # rectory (for example from csv-metadata-quality-web). - REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") - requests_cache.install_cache( - f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after - ) - - # prune old cache entries - requests_cache.delete() - # Initialize an empty list to hold the validated AGROVOC values values = list()