1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-27 16:18:19 +01:00

Compare commits

..

No commits in common. "1c03999582dc99f71838a4683fcf6cc44a607d12" and "bf904648096bb7468d836898a139011a4822ba25" have entirely different histories.

6 changed files with 509 additions and 657 deletions

View File

@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Install poetry - name: Install poetry
run: pipx install poetry run: pipx install poetry
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4

View File

@@ -14,7 +14,6 @@ fields
### Changed ### Changed
- Don't run newline fix on description fields - Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
### Updated ### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i) - Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)

View File

@@ -1,14 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-only # SPDX-License-Identifier: GPL-3.0-only
import argparse import argparse
import os
import re import re
import signal import signal
import sys import sys
from datetime import timedelta
import pandas as pd import pandas as pd
import requests_cache
from colorama import Fore from colorama import Fore
import csv_metadata_quality.check as check import csv_metadata_quality.check as check
@@ -87,19 +84,6 @@ def run(argv):
else: else:
exclude = list() exclude = list()
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
requests_cache.delete()
for column in df.columns: for column in df.columns:
if column in exclude: if column in exclude:
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}") print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")

View File

@@ -1,12 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-only # SPDX-License-Identifier: GPL-3.0-only
import logging import logging
import os
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
import country_converter as coco import country_converter as coco
import pandas as pd import pandas as pd
import requests import requests
import requests_cache
from colorama import Fore from colorama import Fore
from pycountry import languages from pycountry import languages
from stdnum import isbn as stdnum_isbn from stdnum import isbn as stdnum_isbn
@@ -201,12 +203,25 @@ def agrovoc(field, field_name, drop):
if pd.isna(field): if pd.isna(field):
return return
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working di-
# rectory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
# requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values # Initialize an empty list to hold the validated AGROVOC values
values = list() values = list()
# Try to split multi-value field on "||" separator # Try to split multi-value field on "||" separator
for value in field.split("||"): for value in field.split("||"):
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search" request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_params = {"query": value} request_params = {"query": value}
request = requests.get(request_url, params=request_params) request = requests.get(request_url, params=request_params)

1128
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,7 @@ langid = "^1.1.6"
colorama = "^0.4.6" colorama = "^0.4.6"
ftfy = "^6.1.1" ftfy = "^6.1.1"
country-converter = "~1.0.0" country-converter = "~1.0.0"
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"} pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.13.0"}
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
pytest = "^7.2.1" pytest = "^7.2.1"