From 1f637f32cd9435ca32405631dadcef6e57f795f7 Mon Sep 17 00:00:00 2001
From: Alan Orth <alan.orth@gmail.com>
Date: Sun, 15 Oct 2023 23:37:38 +0300
Subject: [PATCH] Rework requests-cache

We should only install the requests cache once per invocation, not
for every row we check. This should be more efficient, but it means
that we don't cache responses when running via pytest, which is
probably a good thing.
---
 CHANGELOG.md                  |  1 +
 csv_metadata_quality/app.py   | 16 ++++++++++++++++
 csv_metadata_quality/check.py | 15 ---------------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60a038c..0911091 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ fields
 
 ### Changed
 - Don't run newline fix on description fields
+- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
 
 ### Updated
 - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index df78571..5ab9eff 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-3.0-only
 
 import argparse
+import os
 import re
 import signal
 import sys
+from datetime import timedelta
 
 import pandas as pd
+import requests_cache
 from colorama import Fore
 
 import csv_metadata_quality.check as check
@@ -84,6 +87,19 @@ def run(argv):
     else:
         exclude = list()
 
+    # enable transparent request cache with thirty days expiry
+    expire_after = timedelta(days=30)
+    # Allow overriding the location of the requests cache, just in case we are
+    # running in an environment where we can't write to the current working
+    # directory (for example from csv-metadata-quality-web).
+    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
+    requests_cache.install_cache(
+        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
+    )
+
+    # prune old cache entries
+    requests_cache.delete()
+
     for column in df.columns:
         if column in exclude:
             print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index 350d9b0..f0ee2c1 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -1,14 +1,12 @@
 # SPDX-License-Identifier: GPL-3.0-only
 
 import logging
-import os
 import re
 from datetime import datetime, timedelta
 
 import country_converter as coco
 import pandas as pd
 import requests
-import requests_cache
 from colorama import Fore
 from pycountry import languages
 from stdnum import isbn as stdnum_isbn
@@ -203,19 +201,6 @@ def agrovoc(field, field_name, drop):
     if pd.isna(field):
         return
 
-    # enable transparent request cache with thirty days expiry
-    expire_after = timedelta(days=30)
-    # Allow overriding the location of the requests cache, just in case we are
-    # running in an environment where we can't write to the current working di-
-    # rectory (for example from csv-metadata-quality-web).
-    REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
-    requests_cache.install_cache(
-        f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
-    )
-
-    # prune old cache entries
-    requests_cache.delete()
-
     # Initialize an empty list to hold the validated AGROVOC values
     values = list()