Add communities and collections support to API

The basic logic is similar to items, where you can request statistics
for a single item with a UUID, statistics for all items, and statistics
for a list of items (optionally with a date range). Most of the item
code was re-purposed to work on "elements", which can be items,
communities, or collections depending on the request, with the use of
Falcon's `before` hooks to set the statistics scope so we know how to
behave for the current request.

Other than the minor difference in facet fields, another issue I
had with communities and collections is that the owningComm and
owningColl fields are multi-valued (unlike items' id field). This
means that, when you facet the results of your query, Solr returns
ids that seem unrelated, but are actually present in the field, so
I had to make sure I checked all returned ids to see if they were
in the user's POSTed elements list.

TODO:
  - Add tests
  - Revise docstrings
  - Refactor items.py as it is now generic
This commit is contained in:
Alan Orth 2020-12-20 16:14:46 +02:00
parent fba6f1ead1
commit 3339bf8d9c
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
3 changed files with 164 additions and 72 deletions

View File

@ -3,7 +3,8 @@ import psycopg2.extras
from .database import DatabaseManager
from .items import get_downloads, get_views
from .util import validate_items_post_parameters
from .util import set_statistics_scope
from .util import validate_post_parameters
class RootResource:
@ -14,7 +15,8 @@ class RootResource:
resp.body = f.read()
class AllItemsResource:
class AllStatisticsResource:
@falcon.before(set_statistics_scope)
def on_get(self, req, resp):
"""Handles GET requests"""
# Return HTTPBadRequest if id parameter is not present and valid
@ -26,26 +28,26 @@ class AllItemsResource:
db.set_session(readonly=True)
with db.cursor() as cursor:
# get total number of items so we can estimate the pages
cursor.execute("SELECT COUNT(id) FROM items")
# get total number of communities/collections/items so we can estimate the pages
cursor.execute(f"SELECT COUNT(id) FROM {req.context.statistics_scope}")
pages = round(cursor.fetchone()[0] / limit)
# get statistics and use limit and offset to page through results
cursor.execute(
"SELECT id, views, downloads FROM items ORDER BY id LIMIT %s OFFSET %s",
f"SELECT id, views, downloads FROM {req.context.statistics_scope} ORDER BY id LIMIT %s OFFSET %s",
[limit, offset],
)
# create a list to hold dicts of item stats
# create a list to hold dicts of stats
statistics = list()
# iterate over results and build statistics object
for item in cursor:
for result in cursor:
statistics.append(
{
"id": str(item["id"]),
"views": item["views"],
"downloads": item["downloads"],
"id": str(result["id"]),
"views": result["views"],
"downloads": result["downloads"],
}
)
@ -58,9 +60,15 @@ class AllItemsResource:
resp.media = message
@falcon.before(validate_items_post_parameters)
@falcon.before(set_statistics_scope)
@falcon.before(validate_post_parameters)
def on_post(self, req, resp):
"""Handles POST requests"""
"""Handles POST requests.
Uses two `before` hooks to set the statistics "scope" and validate the
POST parameters. The "scope" is the type of statistics we want, which
will be items, communities, or collections, depending on the request.
"""
# Build the Solr date string, ie: [* TO *]
if req.context.dateFrom and req.context.dateTo:
@ -74,10 +82,10 @@ class AllItemsResource:
# Helper variables to make working with pages/items/results easier and
# to make the code easier to understand
number_of_items: int = len(req.context.items)
pages: int = int(number_of_items / req.context.limit)
first_item: int = req.context.page * req.context.limit
last_item: int = first_item + req.context.limit
number_of_elements: int = len(req.context.elements)
pages: int = int(number_of_elements / req.context.limit)
first_element: int = req.context.page * req.context.limit
last_element: int = first_element + req.context.limit
# Get a subset of the POSTed items based on our limit. Note that Python
# list slicing and indexing are both zero based, but the first and last
# items in a slice can be confusing. See this ASCII diagram:
@ -88,20 +96,24 @@ class AllItemsResource:
# Slice position: 0 1 2 3 4 5 6
# Index position: 0 1 2 3 4 5
#
# So if we have a list items with 240 items:
# So if we have a list of items with 240 items:
#
# 1st set: items[0:100] would give items at indexes 0 to 99
# 2nd set: items[100:200] would give items at indexes 100 to 199
# 3rd set: items[200:300] would give items at indexes 200 to 239
items_subset: list = req.context.items[first_item:last_item]
elements_subset: list = req.context.elements[first_element:last_element]
views: dict = get_views(solr_date_string, items_subset)
downloads: dict = get_downloads(solr_date_string, items_subset)
views: dict = get_views(
solr_date_string, elements_subset, req.context.views_facet_field
)
downloads: dict = get_downloads(
solr_date_string, elements_subset, req.context.downloads_facet_field
)
# create a list to hold dicts of item stats
# create a list to hold dicts of stats
statistics = list()
# iterate over views dict to extract views and use the item id as an
# iterate over views dict to extract views and use the element id as an
# index to the downloads dict to extract downloads.
for k, v in views.items():
statistics.append({"id": k, "views": v, "downloads": downloads[k]})
@ -117,8 +129,9 @@ class AllItemsResource:
resp.media = message
class ItemResource:
def on_get(self, req, resp, item_id):
class SingleStatisticsResource:
@falcon.before(set_statistics_scope)
def on_get(self, req, resp, id_):
"""Handles GET requests"""
# Adapt Python's uuid.UUID type to PostgreSQL's uuid
@ -131,18 +144,19 @@ class ItemResource:
with db.cursor() as cursor:
cursor = db.cursor()
cursor.execute(
"SELECT views, downloads FROM items WHERE id=%s", [str(item_id)]
f"SELECT views, downloads FROM {req.context.database} WHERE id=%s",
[str(id_)],
)
if cursor.rowcount == 0:
raise falcon.HTTPNotFound(
title="Item not found",
description=f'The item with id "{str(item_id)}" was not found.',
title=f"{req.context.statistics_scope} not found",
description=f'The {req.context.statistics_scope} with id "{str(id_)}" was not found.',
)
else:
results = cursor.fetchone()
statistics = {
"id": str(item_id),
"id": str(id_),
"views": results["views"],
"downloads": results["downloads"],
}
@ -152,7 +166,17 @@ class ItemResource:
api = application = falcon.API()
api.add_route("/", RootResource())
api.add_route("/items", AllItemsResource())
api.add_route("/item/{item_id:uuid}", ItemResource())
# Item routes
api.add_route("/items", AllStatisticsResource())
api.add_route("/item/{id_:uuid}", SingleStatisticsResource())
# Community routes
api.add_route("/communities", AllStatisticsResource())
api.add_route("/community/{id_:uuid}", SingleStatisticsResource())
# Collection routes
api.add_route("/collections", AllStatisticsResource())
api.add_route("/collection/{id_:uuid}", SingleStatisticsResource())
# vim: set sw=4 ts=4 expandtab:

View File

@ -4,25 +4,27 @@ from .config import SOLR_SERVER
from .util import get_statistics_shards
def get_views(solr_date_string: str, items: list):
def get_views(solr_date_string: str, elements: list, facetField: str):
"""
Get view statistics for a list of items from Solr.
Get view statistics for a list of elements from Solr. Depending on the req-
uest this could be items, communities, or collections.
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
:parameter items (list): a list of item IDs
:returns: A dict of item IDs and views
:parameter elements (list): a list of IDs
:parameter facetField (str): Solr field to facet by, for example "id"
:returns: A dict of IDs and views
"""
shards = get_statistics_shards()
# Join the UUIDs with "OR" and escape the hyphens for Solr
solr_items_string: str = " OR ".join(items).replace("-", r"\-")
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
solr_query_params = {
"q": f"id:({solr_items_string})",
"q": f"{facetField}:({solr_elements_string})",
"fq": f"type:2 AND isBot:false AND statistics_type:view AND time:{solr_date_string}",
"fl": "id",
"fl": facetField,
"facet": "true",
"facet.field": "id",
"facet.field": facetField,
"facet.mincount": 1,
"shards": shards,
"rows": 0,
@ -38,41 +40,53 @@ def get_views(solr_date_string: str, items: list):
# Solr returns facets as a dict of dicts (see the json.nl parameter)
views = res.json()["facet_counts"]["facet_fields"]
# iterate over the 'id' dict and get the item ids and views
for item_id, item_views in views["id"].items():
data[item_id] = item_views
# iterate over the facetField dict and get the ids and views
for id_, views in views[facetField].items():
# For items we can rely on Solr returning facets for *only* the ids
# in our query, but for communities and collections, the owningComm and
# owningColl fields are multi-value so Solr will return facets with the
# values in our query as well as *any others* that happen to be present
# in the field (which looks like Solr returning unrelated results until
# you realize that the field is multi-value and this is correct).
#
# To work around this I make sure that each id in the returned dict is
# present in the elements list POSTed by the user.
if id_ in elements:
data[id_] = views
# Check if any items have missing stats so we can set them to 0
if len(data) < len(items):
# List comprehension to get a list of item ids (keys) in the data
# Check if any ids have missing stats so we can set them to 0
if len(data) < len(elements):
# List comprehension to get a list of ids (keys) in the data
data_ids = [k for k, v in data.items()]
for item_id in items:
if item_id not in data_ids:
data[item_id] = 0
for element_id in elements:
if element_id not in data_ids:
data[element_id] = 0
continue
return data
def get_downloads(solr_date_string: str, items: list):
def get_downloads(solr_date_string: str, elements: list, facetField: str):
"""
Get download statistics for a list of items from Solr.
Get download statistics for a list of elements from Solr. Depending on the
request this could be items, communities, or collections.
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
:parameter items (list): a list of item IDs
:returns: A dict of item IDs and downloads
:parameter elements (list): a list of IDs
:parameter facetField (str): Solr field to facet by, for example "id"
:returns: A dict of IDs and downloads
"""
shards = get_statistics_shards()
# Join the UUIDs with "OR" and escape the hyphens for Solr
solr_items_string: str = " OR ".join(items).replace("-", r"\-")
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
solr_query_params = {
"q": f"owningItem:({solr_items_string})",
"q": f"{facetField}:({solr_elements_string})",
"fq": f"type:0 AND isBot:false AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}",
"fl": "owningItem",
"fl": facetField,
"facet": "true",
"facet.field": "owningItem",
"facet.field": facetField,
"facet.mincount": 1,
"shards": shards,
"rows": 0,
@ -88,17 +102,20 @@ def get_downloads(solr_date_string: str, items: list):
# Solr returns facets as a dict of dicts (see the json.nl parameter)
downloads = res.json()["facet_counts"]["facet_fields"]
# Iterate over the 'owningItem' dict and get the item ids and downloads
for item_id, item_downloads in downloads["owningItem"].items():
data[item_id] = item_downloads
# Iterate over the facetField dict and get the ids and downloads
for id_, downloads in downloads[facetField].items():
# Make sure that each id in the returned dict is present in the
# elements list POSTed by the user.
if id_ in elements:
data[id_] = downloads
# Check if any items have missing stats so we can set them to 0
if len(data) < len(items):
# List comprehension to get a list of item ids (keys) in the data
# Check if any elements have missing stats so we can set them to 0
if len(data) < len(elements):
# List comprehension to get a list of ids (keys) in the data
data_ids = [k for k, v in data.items()]
for item_id in items:
if item_id not in data_ids:
data[item_id] = 0
for element_id in elements:
if element_id not in data_ids:
data[element_id] = 0
continue
return data

View File

@ -74,8 +74,9 @@ def is_valid_date(date):
)
def validate_items_post_parameters(req, resp, resource, params):
"""Check the POSTed request parameters for the `/items` endpoint.
def validate_post_parameters(req, resp, resource, params):
"""Check the POSTed request parameters for the `/items`, `/communities` and
`/collections` endpoints.
Meant to be used as a `before` hook.
"""
@ -125,14 +126,64 @@ def validate_items_post_parameters(req, resp, resource, params):
else:
req.context.page = 0
# Parse the list of items from the POST request body
if "items" in doc:
if isinstance(doc["items"], list) and len(doc["items"]) > 0:
req.context.items = doc["items"]
# Parse the list of elements from the POST request body
if req.context.statistics_scope in doc:
if (
isinstance(doc[req.context.statistics_scope], list)
and len(doc[req.context.statistics_scope]) > 0
):
req.context.elements = doc[req.context.statistics_scope]
else:
raise falcon.HTTPBadRequest(
title="Invalid parameter",
description='The "items" parameter is invalid. The value must be a comma-separated list of item UUIDs.',
description=f'The "{req.context.statistics_scope}" parameter is invalid. The value must be a comma-separated list of UUIDs.',
)
else:
req.context.items = list()
req.context.elements = list()
# Database table holding the pre-computed statistics for each scope. The
# singular and plural spellings of a scope share the same table.
_SCOPE_DATABASES = {
    "item": "items",
    "items": "items",
    "community": "communities",
    "communities": "communities",
    "collection": "collections",
    "collections": "collections",
}

# Solr facet fields, as (views, downloads), for each plural scope. Only the
# plural scopes are listed because only they get facet fields on POST.
_SCOPE_FACET_FIELDS = {
    "items": ("id", "owningItem"),
    "communities": ("owningComm", "owningComm"),
    "collections": ("owningColl", "owningColl"),
}


def set_statistics_scope(req, resp, resource, params):
    """Set the statistics scope of the request on ``req.context``.

    Meant to be used as a `before` hook.

    The scope is taken from the start of ``req.path`` and stored verbatim in
    ``req.context.statistics_scope``; it is one of "item", "items",
    "community", "communities", "collection", or "collections".

    Depending on the request method, further attributes are set:

    - GET: ``req.context.database`` is set to "items", "communities", or
      "collections" (singular and plural scopes map to the same value).
    - POST with a plural scope: ``req.context.views_facet_field`` and
      ``req.context.downloads_facet_field`` are set to the Solr fields to
      facet by.

    :parameter req: the request; its ``path``, ``method`` and ``context``
        attributes are used
    :parameter resp: unused (part of the hook signature)
    :parameter resource: unused (part of the hook signature)
    :parameter params: unused (part of the hook signature)
    :raises IndexError: if ``req.path`` does not start with one of the known
        scope prefixes
    """

    # Extract the scope from the request path. The alternation is ordered so
    # that the plural forms are tried before their singular prefixes (for
    # example "items" before "item"). The hook is only expected to run for
    # paths that match; for any other path the [0] below raises IndexError.
    req.context.statistics_scope = re.findall(
        r"^/(communities|community|collections|collection|items|item)", req.path
    )[0]
    scope = req.context.statistics_scope

    # GET requests only need the scope and the database, so return early.
    if req.method == "GET":
        req.context.database = _SCOPE_DATABASES[scope]
        return

    # POST requests for a plural scope need the Solr facet fields. Any other
    # method/scope combination leaves the context untouched beyond the scope.
    if req.method == "POST" and scope in _SCOPE_FACET_FIELDS:
        views_field, downloads_field = _SCOPE_FACET_FIELDS[scope]
        req.context.views_facet_field = views_field
        req.context.downloads_facet_field = downloads_field