mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-27 16:18:19 +01:00
Compare commits
7 Commits
bf90464809
...
1c03999582
Author | SHA1 | Date | |
---|---|---|---|
1c03999582 | |||
1f637f32cd | |||
b8241e919d | |||
b8dc19cc3f | |||
93c9b739ac | |||
4ed2786703 | |||
|
8728789183 |
2
.github/workflows/python-app.yml
vendored
2
.github/workflows/python-app.yml
vendored
@ -15,7 +15,7 @@ jobs:
|
|||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- name: Install poetry
|
- name: Install poetry
|
||||||
run: pipx install poetry
|
run: pipx install poetry
|
||||||
- uses: actions/setup-python@v4
|
- uses: actions/setup-python@v4
|
||||||
|
@ -14,6 +14,7 @@ fields
|
|||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Don't run newline fix on description fields
|
- Don't run newline fix on description fields
|
||||||
|
- Install requests-cache in main run() function instead of check.agrovoc() function so we only incur the overhead once
|
||||||
|
|
||||||
### Updated
|
### Updated
|
||||||
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
- Python dependencies, including Pandas 2.0.0 and [Arrow-backed dtypes](https://datapythonista.me/blog/pandas-20-and-the-arrow-revolution-part-i)
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import requests_cache
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
|
|
||||||
import csv_metadata_quality.check as check
|
import csv_metadata_quality.check as check
|
||||||
@ -84,6 +87,19 @@ def run(argv):
|
|||||||
else:
|
else:
|
||||||
exclude = list()
|
exclude = list()
|
||||||
|
|
||||||
|
# enable transparent request cache with thirty days expiry
|
||||||
|
expire_after = timedelta(days=30)
|
||||||
|
# Allow overriding the location of the requests cache, just in case we are
|
||||||
|
# running in an environment where we can't write to the current working di-
|
||||||
|
# rectory (for example from csv-metadata-quality-web).
|
||||||
|
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||||
|
requests_cache.install_cache(
|
||||||
|
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||||
|
)
|
||||||
|
|
||||||
|
# prune old cache entries
|
||||||
|
requests_cache.delete()
|
||||||
|
|
||||||
for column in df.columns:
|
for column in df.columns:
|
||||||
if column in exclude:
|
if column in exclude:
|
||||||
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
|
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
|
||||||
|
@ -1,14 +1,12 @@
|
|||||||
# SPDX-License-Identifier: GPL-3.0-only
|
# SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
import country_converter as coco
|
import country_converter as coco
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import requests_cache
|
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pycountry import languages
|
from pycountry import languages
|
||||||
from stdnum import isbn as stdnum_isbn
|
from stdnum import isbn as stdnum_isbn
|
||||||
@ -203,25 +201,12 @@ def agrovoc(field, field_name, drop):
|
|||||||
if pd.isna(field):
|
if pd.isna(field):
|
||||||
return
|
return
|
||||||
|
|
||||||
# enable transparent request cache with thirty days expiry
|
|
||||||
expire_after = timedelta(days=30)
|
|
||||||
# Allow overriding the location of the requests cache, just in case we are
|
|
||||||
# running in an environment where we can't write to the current working di-
|
|
||||||
# rectory (for example from csv-metadata-quality-web).
|
|
||||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
|
||||||
requests_cache.install_cache(
|
|
||||||
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
|
||||||
)
|
|
||||||
|
|
||||||
# prune old cache entries
|
|
||||||
# requests_cache.remove_expired_responses()
|
|
||||||
|
|
||||||
# Initialize an empty list to hold the validated AGROVOC values
|
# Initialize an empty list to hold the validated AGROVOC values
|
||||||
values = list()
|
values = list()
|
||||||
|
|
||||||
# Try to split multi-value field on "||" separator
|
# Try to split multi-value field on "||" separator
|
||||||
for value in field.split("||"):
|
for value in field.split("||"):
|
||||||
request_url = "http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
request_url = "https://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search"
|
||||||
request_params = {"query": value}
|
request_params = {"query": value}
|
||||||
|
|
||||||
request = requests.get(request_url, params=request_params)
|
request = requests.get(request_url, params=request_params)
|
||||||
|
1128
poetry.lock
generated
1128
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -20,7 +20,7 @@ langid = "^1.1.6"
|
|||||||
colorama = "^0.4.6"
|
colorama = "^0.4.6"
|
||||||
ftfy = "^6.1.1"
|
ftfy = "^6.1.1"
|
||||||
country-converter = "~1.0.0"
|
country-converter = "~1.0.0"
|
||||||
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.13.0"}
|
pycountry = {git = "https://github.com/alanorth/pycountry", rev = "iso-codes-4.15.0"}
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
pytest = "^7.2.1"
|
pytest = "^7.2.1"
|
||||||
|
Loading…
Reference in New Issue
Block a user