mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-27 08:08:19 +01:00
Compare commits
No commits in common. "1c03999582dc99f71838a4683fcf6cc44a607d12" and "bf904648096bb7468d836898a139011a4822ba25" have entirely different histories.
1c03999582
...
bf90464809
2
.github/workflows/python-app.yml
vendored
2
.github/workflows/python-app.yml
vendored
@ -15,7 +15,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install poetry
|
||||
run: pipx install poetry
|
||||
- uses: actions/setup-python@v4
|
||||
|
@ -14,7 +14,6 @@ fields
|
||||
|
||||
### Changed
|
||||
- Don't run newline fix on description fields
|
||||
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||
|
||||
### Updated
|
||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||
|
@ -1,14 +1,11 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
|
||||
import pandas as pd
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
|
||||
import csv_metadata_quality.check as check
|
||||
@ -87,19 +84,6 @@ def run(argv):
|
||||
else:
|
||||
exclude = list()
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
requests_cache.delete()
|
||||
|
||||
for column in df.columns:
|
||||
if column in exclude:
|
||||
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
|
||||
|
@ -1,12 +1,14 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-only
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import country_converter as coco
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_cache
|
||||
from colorama import Fore
|
||||
from pycountry import languages
|
||||
from stdnum import isbn as stdnum_isbn
|
||||
@ -201,12 +203,25 @@ def agrovoc(field, field_name, drop):
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# enable transparent request cache with thirty days expiry
|
||||
expire_after = timedelta(days=30)
|
||||
# Allow overriding the location of the requests cache, just in case we are
|
||||
# running in an environment where we can't write to the current working di-
|
||||
# rectory (for example from csv-metadata-quality-web).
|
||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||
requests_cache.install_cache(
|
||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||
)
|
||||
|
||||
# prune old cache entries
|
||||
# requests_cache.remove_expired_responses()
|
||||
|
||||
# Initialize an empty list to hold the validated AGROVOC values
|
||||
values = list()
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
for value in field.split("||"):
|
||||
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||
request_params = {"query": value}
|
||||
|
||||
request = requests.get(request_url, params=request_params)
|
||||
|
1128
poetry.lock
generated
1128
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -20,7 +20,7 @@ langid = "^1.1.6"
|
||||
colorama = "^0.4.6"
|
||||
ftfy = "^6.1.1"
|
||||
country-converter = "~1.0.0"
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
|
||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.13.0"}
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.2.1"
|
||||
|
Loading…
Reference in New Issue
Block a user