mirror of
https://github.com/ilri/dspace-statistics-api.git
synced 2024-11-22 06:15:02 +01:00
Add communities and collections support to API
The basic logic is similar to items, where you can request single item statistics with a UUID, all item statistics, and item statistics for a list of items (optionally with a date range). Most of the item code was re-purposed to work on "elements", which can be items, communities, or collections depending on the request, with the use of Falcon's `before` hooks to set the statistics scope so we know how to behave for the current request. Other than the minor difference in facet fields, another issue I had with communities and collections is that the owningComm and owningColl fields are multi-valued (unlike items' id field). This means that, when you facet the results of your query, Solr returns ids that seem unrelated, but are actually present in the field, so I had to make sure I checked all returned ids to see if they were in the user's POSTed elements list. TODO: - Add tests - Revise docstrings - Refactor items.py as it is now generic
This commit is contained in:
parent
fba6f1ead1
commit
3339bf8d9c
@ -3,7 +3,8 @@ import psycopg2.extras
|
|||||||
|
|
||||||
from .database import DatabaseManager
|
from .database import DatabaseManager
|
||||||
from .items import get_downloads, get_views
|
from .items import get_downloads, get_views
|
||||||
from .util import validate_items_post_parameters
|
from .util import set_statistics_scope
|
||||||
|
from .util import validate_post_parameters
|
||||||
|
|
||||||
|
|
||||||
class RootResource:
|
class RootResource:
|
||||||
@ -14,7 +15,8 @@ class RootResource:
|
|||||||
resp.body = f.read()
|
resp.body = f.read()
|
||||||
|
|
||||||
|
|
||||||
class AllItemsResource:
|
class AllStatisticsResource:
|
||||||
|
@falcon.before(set_statistics_scope)
|
||||||
def on_get(self, req, resp):
|
def on_get(self, req, resp):
|
||||||
"""Handles GET requests"""
|
"""Handles GET requests"""
|
||||||
# Return HTTPBadRequest if id parameter is not present and valid
|
# Return HTTPBadRequest if id parameter is not present and valid
|
||||||
@ -26,26 +28,26 @@ class AllItemsResource:
|
|||||||
db.set_session(readonly=True)
|
db.set_session(readonly=True)
|
||||||
|
|
||||||
with db.cursor() as cursor:
|
with db.cursor() as cursor:
|
||||||
# get total number of items so we can estimate the pages
|
# get total number of communities/collections/items so we can estimate the pages
|
||||||
cursor.execute("SELECT COUNT(id) FROM items")
|
cursor.execute(f"SELECT COUNT(id) FROM {req.context.statistics_scope}")
|
||||||
pages = round(cursor.fetchone()[0] / limit)
|
pages = round(cursor.fetchone()[0] / limit)
|
||||||
|
|
||||||
# get statistics and use limit and offset to page through results
|
# get statistics and use limit and offset to page through results
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
"SELECT id, views, downloads FROM items ORDER BY id LIMIT %s OFFSET %s",
|
f"SELECT id, views, downloads FROM {req.context.statistics_scope} ORDER BY id LIMIT %s OFFSET %s",
|
||||||
[limit, offset],
|
[limit, offset],
|
||||||
)
|
)
|
||||||
|
|
||||||
# create a list to hold dicts of item stats
|
# create a list to hold dicts of stats
|
||||||
statistics = list()
|
statistics = list()
|
||||||
|
|
||||||
# iterate over results and build statistics object
|
# iterate over results and build statistics object
|
||||||
for item in cursor:
|
for result in cursor:
|
||||||
statistics.append(
|
statistics.append(
|
||||||
{
|
{
|
||||||
"id": str(item["id"]),
|
"id": str(result["id"]),
|
||||||
"views": item["views"],
|
"views": result["views"],
|
||||||
"downloads": item["downloads"],
|
"downloads": result["downloads"],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -58,9 +60,15 @@ class AllItemsResource:
|
|||||||
|
|
||||||
resp.media = message
|
resp.media = message
|
||||||
|
|
||||||
@falcon.before(validate_items_post_parameters)
|
@falcon.before(set_statistics_scope)
|
||||||
|
@falcon.before(validate_post_parameters)
|
||||||
def on_post(self, req, resp):
|
def on_post(self, req, resp):
|
||||||
"""Handles POST requests"""
|
"""Handles POST requests.
|
||||||
|
|
||||||
|
Uses two `before` hooks to set the statistics "scope" and validate the
|
||||||
|
POST parameters. The "scope" is the type of statistics we want, which
|
||||||
|
will be items, communities, or collections, depending on the request.
|
||||||
|
"""
|
||||||
|
|
||||||
# Build the Solr date string, ie: [* TO *]
|
# Build the Solr date string, ie: [* TO *]
|
||||||
if req.context.dateFrom and req.context.dateTo:
|
if req.context.dateFrom and req.context.dateTo:
|
||||||
@ -74,10 +82,10 @@ class AllItemsResource:
|
|||||||
|
|
||||||
# Helper variables to make working with pages/items/results easier and
|
# Helper variables to make working with pages/items/results easier and
|
||||||
# to make the code easier to understand
|
# to make the code easier to understand
|
||||||
number_of_items: int = len(req.context.items)
|
number_of_elements: int = len(req.context.elements)
|
||||||
pages: int = int(number_of_items / req.context.limit)
|
pages: int = int(number_of_elements / req.context.limit)
|
||||||
first_item: int = req.context.page * req.context.limit
|
first_element: int = req.context.page * req.context.limit
|
||||||
last_item: int = first_item + req.context.limit
|
last_element: int = first_element + req.context.limit
|
||||||
# Get a subset of the POSTed items based on our limit. Note that Python
|
# Get a subset of the POSTed items based on our limit. Note that Python
|
||||||
# list slicing and indexing are both zero based, but the first and last
|
# list slicing and indexing are both zero based, but the first and last
|
||||||
# items in a slice can be confusing. See this ASCII diagram:
|
# items in a slice can be confusing. See this ASCII diagram:
|
||||||
@ -88,20 +96,24 @@ class AllItemsResource:
|
|||||||
# Slice position: 0 1 2 3 4 5 6
|
# Slice position: 0 1 2 3 4 5 6
|
||||||
# Index position: 0 1 2 3 4 5
|
# Index position: 0 1 2 3 4 5
|
||||||
#
|
#
|
||||||
# So if we have a list items with 240 items:
|
# So if we have a list of items with 240 items:
|
||||||
#
|
#
|
||||||
# 1st set: items[0:100] would give items at indexes 0 to 99
|
# 1st set: items[0:100] would give items at indexes 0 to 99
|
||||||
# 2nd set: items[100:200] would give items at indexes 100 to 199
|
# 2nd set: items[100:200] would give items at indexes 100 to 199
|
||||||
# 3rd set: items[200:300] would give items at indexes 200 to 239
|
# 3rd set: items[200:300] would give items at indexes 200 to 239
|
||||||
items_subset: list = req.context.items[first_item:last_item]
|
elements_subset: list = req.context.elements[first_element:last_element]
|
||||||
|
|
||||||
views: dict = get_views(solr_date_string, items_subset)
|
views: dict = get_views(
|
||||||
downloads: dict = get_downloads(solr_date_string, items_subset)
|
solr_date_string, elements_subset, req.context.views_facet_field
|
||||||
|
)
|
||||||
|
downloads: dict = get_downloads(
|
||||||
|
solr_date_string, elements_subset, req.context.downloads_facet_field
|
||||||
|
)
|
||||||
|
|
||||||
# create a list to hold dicts of item stats
|
# create a list to hold dicts of stats
|
||||||
statistics = list()
|
statistics = list()
|
||||||
|
|
||||||
# iterate over views dict to extract views and use the item id as an
|
# iterate over views dict to extract views and use the element id as an
|
||||||
# index to the downloads dict to extract downloads.
|
# index to the downloads dict to extract downloads.
|
||||||
for k, v in views.items():
|
for k, v in views.items():
|
||||||
statistics.append({"id": k, "views": v, "downloads": downloads[k]})
|
statistics.append({"id": k, "views": v, "downloads": downloads[k]})
|
||||||
@ -117,8 +129,9 @@ class AllItemsResource:
|
|||||||
resp.media = message
|
resp.media = message
|
||||||
|
|
||||||
|
|
||||||
class ItemResource:
|
class SingleStatisticsResource:
|
||||||
def on_get(self, req, resp, item_id):
|
@falcon.before(set_statistics_scope)
|
||||||
|
def on_get(self, req, resp, id_):
|
||||||
"""Handles GET requests"""
|
"""Handles GET requests"""
|
||||||
|
|
||||||
# Adapt Python’s uuid.UUID type to PostgreSQL’s uuid
|
# Adapt Python’s uuid.UUID type to PostgreSQL’s uuid
|
||||||
@ -131,18 +144,19 @@ class ItemResource:
|
|||||||
with db.cursor() as cursor:
|
with db.cursor() as cursor:
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
"SELECT views, downloads FROM items WHERE id=%s", [str(item_id)]
|
f"SELECT views, downloads FROM {req.context.database} WHERE id=%s",
|
||||||
|
[str(id_)],
|
||||||
)
|
)
|
||||||
if cursor.rowcount == 0:
|
if cursor.rowcount == 0:
|
||||||
raise falcon.HTTPNotFound(
|
raise falcon.HTTPNotFound(
|
||||||
title="Item not found",
|
title=f"{req.context.statistics_scope} not found",
|
||||||
description=f'The item with id "{str(item_id)}" was not found.',
|
description=f'The {req.context.statistics_scope} with id "{str(id_)}" was not found.',
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
results = cursor.fetchone()
|
results = cursor.fetchone()
|
||||||
|
|
||||||
statistics = {
|
statistics = {
|
||||||
"id": str(item_id),
|
"id": str(id_),
|
||||||
"views": results["views"],
|
"views": results["views"],
|
||||||
"downloads": results["downloads"],
|
"downloads": results["downloads"],
|
||||||
}
|
}
|
||||||
@ -152,7 +166,17 @@ class ItemResource:
|
|||||||
|
|
||||||
api = application = falcon.API()
|
api = application = falcon.API()
|
||||||
api.add_route("/", RootResource())
|
api.add_route("/", RootResource())
|
||||||
api.add_route("/items", AllItemsResource())
|
|
||||||
api.add_route("/item/{item_id:uuid}", ItemResource())
|
# Item routes
|
||||||
|
api.add_route("/items", AllStatisticsResource())
|
||||||
|
api.add_route("/item/{id_:uuid}", SingleStatisticsResource())
|
||||||
|
|
||||||
|
# Community routes
|
||||||
|
api.add_route("/communities", AllStatisticsResource())
|
||||||
|
api.add_route("/community/{id_:uuid}", SingleStatisticsResource())
|
||||||
|
|
||||||
|
# Collection routes
|
||||||
|
api.add_route("/collections", AllStatisticsResource())
|
||||||
|
api.add_route("/collection/{id_:uuid}", SingleStatisticsResource())
|
||||||
|
|
||||||
# vim: set sw=4 ts=4 expandtab:
|
# vim: set sw=4 ts=4 expandtab:
|
||||||
|
@ -4,25 +4,27 @@ from .config import SOLR_SERVER
|
|||||||
from .util import get_statistics_shards
|
from .util import get_statistics_shards
|
||||||
|
|
||||||
|
|
||||||
def get_views(solr_date_string: str, items: list):
|
def get_views(solr_date_string: str, elements: list, facetField: str):
|
||||||
"""
|
"""
|
||||||
Get view statistics for a list of items from Solr.
|
Get view statistics for a list of elements from Solr. Depending on the req-
|
||||||
|
uest this could be items, communities, or collections.
|
||||||
|
|
||||||
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
|
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
|
||||||
:parameter items (list): a list of item IDs
|
:parameter elements (list): a list of IDs
|
||||||
:returns: A dict of item IDs and views
|
:parameter facetField (str): Solr field to facet by, for example "id"
|
||||||
|
:returns: A dict of IDs and views
|
||||||
"""
|
"""
|
||||||
shards = get_statistics_shards()
|
shards = get_statistics_shards()
|
||||||
|
|
||||||
# Join the UUIDs with "OR" and escape the hyphens for Solr
|
# Join the UUIDs with "OR" and escape the hyphens for Solr
|
||||||
solr_items_string: str = " OR ".join(items).replace("-", r"\-")
|
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
|
||||||
|
|
||||||
solr_query_params = {
|
solr_query_params = {
|
||||||
"q": f"id:({solr_items_string})",
|
"q": f"{facetField}:({solr_elements_string})",
|
||||||
"fq": f"type:2 AND isBot:false AND statistics_type:view AND time:{solr_date_string}",
|
"fq": f"type:2 AND isBot:false AND statistics_type:view AND time:{solr_date_string}",
|
||||||
"fl": "id",
|
"fl": facetField,
|
||||||
"facet": "true",
|
"facet": "true",
|
||||||
"facet.field": "id",
|
"facet.field": facetField,
|
||||||
"facet.mincount": 1,
|
"facet.mincount": 1,
|
||||||
"shards": shards,
|
"shards": shards,
|
||||||
"rows": 0,
|
"rows": 0,
|
||||||
@ -38,41 +40,53 @@ def get_views(solr_date_string: str, items: list):
|
|||||||
|
|
||||||
# Solr returns facets as a dict of dicts (see the json.nl parameter)
|
# Solr returns facets as a dict of dicts (see the json.nl parameter)
|
||||||
views = res.json()["facet_counts"]["facet_fields"]
|
views = res.json()["facet_counts"]["facet_fields"]
|
||||||
# iterate over the 'id' dict and get the item ids and views
|
# iterate over the facetField dict and get the ids and views
|
||||||
for item_id, item_views in views["id"].items():
|
for id_, views in views[facetField].items():
|
||||||
data[item_id] = item_views
|
# For items we can rely on Solr returning facets for *only* the ids
|
||||||
|
# in our query, but for communities and collections, the owningComm and
|
||||||
|
# owningColl fields are multi-value so Solr will return facets with the
|
||||||
|
# values in our query as well as *any others* that happen to be present
|
||||||
|
# in the field (which looks like Solr returning unrelated results until
|
||||||
|
# you realize that the field is multi-value and this is correct).
|
||||||
|
#
|
||||||
|
# To work around this I make sure that each id in the returned dict is
|
||||||
|
# present in the elements list POSTed by the user.
|
||||||
|
if id_ in elements:
|
||||||
|
data[id_] = views
|
||||||
|
|
||||||
# Check if any items have missing stats so we can set them to 0
|
# Check if any ids have missing stats so we can set them to 0
|
||||||
if len(data) < len(items):
|
if len(data) < len(elements):
|
||||||
# List comprehension to get a list of item ids (keys) in the data
|
# List comprehension to get a list of ids (keys) in the data
|
||||||
data_ids = [k for k, v in data.items()]
|
data_ids = [k for k, v in data.items()]
|
||||||
for item_id in items:
|
for element_id in elements:
|
||||||
if item_id not in data_ids:
|
if element_id not in data_ids:
|
||||||
data[item_id] = 0
|
data[element_id] = 0
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def get_downloads(solr_date_string: str, items: list):
|
def get_downloads(solr_date_string: str, elements: list, facetField: str):
|
||||||
"""
|
"""
|
||||||
Get download statistics for a list of items from Solr.
|
Get download statistics for a list of items from Solr. Depending on the req-
|
||||||
|
uest this could be items, communities, or collections.
|
||||||
|
|
||||||
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
|
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
|
||||||
:parameter items (list): a list of item IDs
|
:parameter elements (list): a list of IDs
|
||||||
:returns: A dict of item IDs and downloads
|
:parameter facetField (str): Solr field to facet by, for example "id"
|
||||||
|
:returns: A dict of IDs and downloads
|
||||||
"""
|
"""
|
||||||
shards = get_statistics_shards()
|
shards = get_statistics_shards()
|
||||||
|
|
||||||
# Join the UUIDs with "OR" and escape the hyphens for Solr
|
# Join the UUIDs with "OR" and escape the hyphens for Solr
|
||||||
solr_items_string: str = " OR ".join(items).replace("-", r"\-")
|
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
|
||||||
|
|
||||||
solr_query_params = {
|
solr_query_params = {
|
||||||
"q": f"owningItem:({solr_items_string})",
|
"q": f"{facetField}:({solr_elements_string})",
|
||||||
"fq": f"type:0 AND isBot:false AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}",
|
"fq": f"type:0 AND isBot:false AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}",
|
||||||
"fl": "owningItem",
|
"fl": facetField,
|
||||||
"facet": "true",
|
"facet": "true",
|
||||||
"facet.field": "owningItem",
|
"facet.field": facetField,
|
||||||
"facet.mincount": 1,
|
"facet.mincount": 1,
|
||||||
"shards": shards,
|
"shards": shards,
|
||||||
"rows": 0,
|
"rows": 0,
|
||||||
@ -88,17 +102,20 @@ def get_downloads(solr_date_string: str, items: list):
|
|||||||
|
|
||||||
# Solr returns facets as a dict of dicts (see the json.nl parameter)
|
# Solr returns facets as a dict of dicts (see the json.nl parameter)
|
||||||
downloads = res.json()["facet_counts"]["facet_fields"]
|
downloads = res.json()["facet_counts"]["facet_fields"]
|
||||||
# Iterate over the 'owningItem' dict and get the item ids and downloads
|
# Iterate over the facetField dict and get the ids and downloads
|
||||||
for item_id, item_downloads in downloads["owningItem"].items():
|
for id_, downloads in downloads[facetField].items():
|
||||||
data[item_id] = item_downloads
|
# Make sure that each id in the returned dict is present in the
|
||||||
|
# elements list POSTed by the user.
|
||||||
|
if id_ in elements:
|
||||||
|
data[id_] = downloads
|
||||||
|
|
||||||
# Check if any items have missing stats so we can set them to 0
|
# Check if any elements have missing stats so we can set them to 0
|
||||||
if len(data) < len(items):
|
if len(data) < len(elements):
|
||||||
# List comprehension to get a list of item ids (keys) in the data
|
# List comprehension to get a list of ids (keys) in the data
|
||||||
data_ids = [k for k, v in data.items()]
|
data_ids = [k for k, v in data.items()]
|
||||||
for item_id in items:
|
for element_id in elements:
|
||||||
if item_id not in data_ids:
|
if element_id not in data_ids:
|
||||||
data[item_id] = 0
|
data[element_id] = 0
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
@ -74,8 +74,9 @@ def is_valid_date(date):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def validate_items_post_parameters(req, resp, resource, params):
|
def validate_post_parameters(req, resp, resource, params):
|
||||||
"""Check the POSTed request parameters for the `/items` endpoint.
|
"""Check the POSTed request parameters for the `/items`, `/communities` and
|
||||||
|
`/collections` endpoints.
|
||||||
|
|
||||||
Meant to be used as a `before` hook.
|
Meant to be used as a `before` hook.
|
||||||
"""
|
"""
|
||||||
@ -125,14 +126,64 @@ def validate_items_post_parameters(req, resp, resource, params):
|
|||||||
else:
|
else:
|
||||||
req.context.page = 0
|
req.context.page = 0
|
||||||
|
|
||||||
# Parse the list of items from the POST request body
|
# Parse the list of elements from the POST request body
|
||||||
if "items" in doc:
|
if req.context.statistics_scope in doc:
|
||||||
if isinstance(doc["items"], list) and len(doc["items"]) > 0:
|
if (
|
||||||
req.context.items = doc["items"]
|
isinstance(doc[req.context.statistics_scope], list)
|
||||||
|
and len(doc[req.context.statistics_scope]) > 0
|
||||||
|
):
|
||||||
|
req.context.elements = doc[req.context.statistics_scope]
|
||||||
else:
|
else:
|
||||||
raise falcon.HTTPBadRequest(
|
raise falcon.HTTPBadRequest(
|
||||||
title="Invalid parameter",
|
title="Invalid parameter",
|
||||||
description='The "items" parameter is invalid. The value must be a comma-separated list of item UUIDs.',
|
description=f'The "{req.context.statistics_scope}" parameter is invalid. The value must be a comma-separated list of UUIDs.',
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
req.context.items = list()
|
req.context.elements = list()
|
||||||
|
|
||||||
|
|
||||||
|
def set_statistics_scope(req, resp, resource, params):
    """Set the statistics scope of the request from the request path.

    Meant to be used as a Falcon `before` hook. The following attributes are
    set on `req.context`:

    - `statistics_scope`: the first path segment, one of "item", "items",
      "community", "communities", "collection", or "collections".
    - `database` (GET requests only): the name interpolated into the SQL
      queries of the GET handlers ("items", "communities", or "collections").
    - `views_facet_field` and `downloads_facet_field` (POST requests to the
      plural scopes only): the Solr fields to query and facet by.

    :parameter req: the request; `path`, `method`, and `context` are used
    :parameter resp: the response (unused)
    :parameter resource: the resource handling the request (unused)
    :parameter params: URI template parameters (unused)
    :raises IndexError: if the request path does not start with one of the
        known scopes. This is not expected to happen as long as the hook is
        only attached to routes matching those patterns.
    """

    # Name used by the GET handlers for each singular and plural scope.
    databases = {
        "item": "items",
        "items": "items",
        "community": "communities",
        "communities": "communities",
        "collection": "collections",
        "collections": "collections",
    }

    # Solr (views, downloads) facet fields for each plural scope. Singular
    # scopes have no entry, so a POST with a singular scope sets nothing.
    facet_fields = {
        "items": ("id", "owningItem"),
        "communities": ("owningComm", "owningComm"),
        "collections": ("owningColl", "owningColl"),
    }

    # Extract the scope from the request path.
    #
    # Note: this regex is ordered so that the plural forms are tried before
    # the singular ones. There is no trailing anchor, so "item" listed first
    # would also match the start of "/items".
    scope = re.findall(
        r"^/(communities|community|collections|collection|items|item)", req.path
    )[0]
    req.context.statistics_scope = scope

    # GET requests return statistics directly from the database, so they only
    # need the scope and the database name.
    if req.method == "GET":
        req.context.database = databases[scope]

        return

    # POST requests for plural items, communities, or collections include a
    # list of element ids in the request body, so we need to set the Solr
    # facet fields in order to get the live results.
    if req.method == "POST" and scope in facet_fields:
        views_facet_field, downloads_facet_field = facet_fields[scope]
        req.context.views_facet_field = views_facet_field
        req.context.downloads_facet_field = downloads_facet_field
|
||||||
|
Loading…
Reference in New Issue
Block a user