From 1f65a2830713f37d18ab3cad8a9a4b4ee88c30a6 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Tue, 30 Jul 2019 00:30:31 +0300 Subject: [PATCH] Add support for validating subjects against AGROVOC Checks values in the dc.subject or dcterms.subject field against the AGROVOC REST API hosted by FAO. Code borrowed from agrovoc-lookup.py. See: http://agrovoc.uniroma2.it/agrovoc/agrovoc/en/ See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py --- .gitignore | 1 + Pipfile | 2 ++ Pipfile.lock | 46 ++++++++++++++++++++++++++++- README.md | 1 + csv_metadata_quality/app.py | 5 ++++ csv_metadata_quality/check.py | 54 +++++++++++++++++++++++++++++++++++ tests/test_check.py | 21 ++++++++++++++ 7 files changed, 129 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bee8a64..a51c439 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ __pycache__ +*.sqlite diff --git a/Pipfile b/Pipfile index 9d51023..a6a4b07 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,8 @@ pandas = "*" python-stdnum = "*" xlrd = "*" iso-639 = "*" +requests = "*" +requests-cache = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 1f79336..edb8cf4 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d5c625fc9bd915e3199fc4f617c157ee002795fa44d9fa6f0a298749acc134e6" + "sha256": "c5c86b4dae011bcbf6705514d97aa55e0a59dd8b7927c38e34103d77eca13cc7" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,27 @@ ] }, "default": { + "certifi": { + "hashes": [ + "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", + "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" + ], + "version": "==2019.6.16" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "idna": { + "hashes": [ + 
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "version": "==2.8" + }, "iso-639": { "hashes": [ "sha256:dc9cd4b880b898d774c47fe9775167404af8a85dd889d58f9008035109acce49" @@ -86,6 +107,22 @@ ], "version": "==2019.1" }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "index": "pypi", + "version": "==2.22.0" + }, + "requests-cache": { + "hashes": [ + "sha256:6822f788c5ee248995c4bfbd725de2002ad710182ba26a666e85b64981866060", + "sha256:73a7211870f7d67af5fd81cad2f67cfe1cd3eb4ee6a85155e07613968cc72dfc" + ], + "index": "pypi", + "version": "==0.5.0" + }, "six": { "hashes": [ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", @@ -93,6 +130,13 @@ ], "version": "==1.12.0" }, + "urllib3": { + "hashes": [ + "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", + "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" + ], + "version": "==1.25.3" + }, "xlrd": { "hashes": [ "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2", diff --git a/README.md b/README.md index c57b865..c22bf7e 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Requires Python 3.6 or greater. 
CSV and Excel support comes from the [Pandas](ht
 - Read Excel files
 - Validate dates, ISSNs, ISBNs, and multi-value separators ("||")
 - Validate languages against ISO 639-2 and ISO 639-3
+- Validate subjects against AGROVOC REST API
 - Fix leading, trailing, and excessive whitespace
 - Fix invalid multi-value separators (`|`) using `--unsafe-fixes`
 - Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc
diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 49e6640..e30d4cd 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -43,6 +43,11 @@ def main(argv):
         # Fix: duplicate metadata values
         df[column] = df[column].apply(fix.duplicates)
 
+        # Check: invalid AGROVOC subject (dc.subject or dcterms.subject)
+        match = re.match(r'^.*?(dc|dcterms)\.subject.*$', column)
+        if match is not None:
+            df[column] = df[column].apply(check.agrovoc)
+
         # Check: invalid language
         match = re.match(r'^.*?language.*$', column)
         if match is not None:
diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py
index 7502a53..f3b0e16 100755
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -189,3 +189,57 @@ def language(field):
             print(f'Invalid language: {value}')
 
     return field
+
+
+def agrovoc(field):
+    """Check subject terms against AGROVOC REST API.
+
+    Logic copied from agrovoc-lookup.py.
+
+    See: https://github.com/ilri/DSpace/blob/5_x-prod/agrovoc-lookup.py
+
+    Prints a warning for each invalid value; the field itself is not modified.
+    """
+
+    from datetime import timedelta
+    import re
+    import requests
+    import requests_cache
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # match lines beginning with words, paying attention to subjects with
+    # special characters like spaces, quotes, dashes, parentheses, etc:
+    # SUBJECT
+    # ANOTHER SUBJECT
+    # XANTHOMONAS CAMPESTRIS PV. MANIHOTIS
+    # WOMEN'S PARTICIPATION
+    # COMMUNITY-BASED FOREST MANAGEMENT
+    # INTERACCIÓN GENOTIPO AMBIENTE
+    # COCOA (PLANT)
+    pattern = re.compile(r'^[\w\-\.\'\(\)]+?[\w\s\-\.\'\(\)]+$')
+    request_url = 'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search'
+
+    # enable transparent request cache with thirty days expiry and prune old
+    # cache entries once per field instead of once per value
+    expire_after = timedelta(days=30)
+    requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after)
+    requests_cache.core.remove_expired_responses()
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+        if not pattern.match(value):
+            continue
+
+        # let requests URL-encode the term and give up on a stalled server
+        request = requests.get(request_url, params={'query': value}, timeout=30)
+
+        if request.status_code == requests.codes.ok:
+            data = request.json()
+            # check if there is 1 result, ie an exact subject term match
+            if len(data['results']) != 1:
+                print(f'Invalid AGROVOC subject: {value}')
+
+    return field
diff --git a/tests/test_check.py b/tests/test_check.py
index 527e50b..4880ccd 100644
--- a/tests/test_check.py
+++ b/tests/test_check.py
@@ -169,3 +169,24 @@ def test_check_invalid_language(capsys):
 
     captured = capsys.readouterr()
     assert captured.out == f'Invalid language: {value}\n'
+
+
+def test_check_invalid_agrovoc(capsys):
+    '''Test invalid AGROVOC subject.'''
+
+    value = 'FOREST'
+
+    check.agrovoc(value)
+
+    captured = capsys.readouterr()
+    assert captured.out == f'Invalid AGROVOC subject: {value}\n'
+
+
+def test_check_valid_agrovoc():
+    '''Test valid AGROVOC subject.'''
+
+    value = 'FORESTS'
+
+    result = check.agrovoc(value)
+
+    assert result == value