1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-27 08:08:19 +01:00

Compare commits

..

No commits in common. "1c03999582dc99f71838a4683fcf6cc44a607d12" and "bf904648096bb7468d836898a139011a4822ba25" have entirely different histories.

6 changed files with 509 additions and 657 deletions

View File

@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v4

View File

@ -14,7 +14,6 @@ fields
### Changed
- Don't run newline fix on description fields
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
### Updated
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)

View File

@ -1,14 +1,11 @@
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import os
import re
import signal
import sys
from datetime import timedelta
import pandas as pd
import requests_cache
from colorama import Fore
import csv_metadata_quality.check as check
@ -87,19 +84,6 @@ def run(argv):
else:
exclude = list()
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working
# directory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
requests_cache.delete()
for column in df.columns:
if column in exclude:
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")

View File

@ -1,12 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-only
import logging
import os
import re
from datetime import datetime, timedelta
import country_converter as coco
import pandas as pd
import requests
import requests_cache
from colorama import Fore
from pycountry import languages
from stdnum import isbn as stdnum_isbn
@ -201,12 +203,25 @@ def agrovoc(field, field_name, drop):
if pd.isna(field):
return
# enable transparent request cache with thirty days expiry
expire_after = timedelta(days=30)
# Allow overriding the location of the requests cache, just in case we are
# running in an environment where we can't write to the current working
# directory (for example from csv-metadata-quality-web).
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
requests_cache.install_cache(
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
)
# prune old cache entries
# requests_cache.remove_expired_responses()
# Initialize an empty list to hold the validated AGROVOC values
values = list()
# Try to split multi-value field on "||" separator
for value in field.split("||"):
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
request_params = {"query": value}
request = requests.get(request_url, params=request_params)

1128
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -20,7 +20,7 @@ langid = "^1.1.6"
colorama = "^0.4.6"
ftfy = "^6.1.1"
country-converter = "~1.0.0"
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.13.0"}
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.1"