diff --git a/dspace_statistics_api/app.py b/dspace_statistics_api/app.py index 43e81c2..71580a7 100644 --- a/dspace_statistics_api/app.py +++ b/dspace_statistics_api/app.py @@ -3,7 +3,8 @@ import psycopg2.extras from .database import DatabaseManager from .items import get_downloads, get_views -from .util import validate_items_post_parameters +from .util import set_statistics_scope +from .util import validate_post_parameters class RootResource: @@ -14,7 +15,8 @@ class RootResource: resp.body = f.read() -class AllItemsResource: +class AllStatisticsResource: + @falcon.before(set_statistics_scope) def on_get(self, req, resp): """Handles GET requests""" # Return HTTPBadRequest if id parameter is not present and valid @@ -26,26 +28,26 @@ class AllItemsResource: db.set_session(readonly=True) with db.cursor() as cursor: - # get total number of items so we can estimate the pages - cursor.execute("SELECT COUNT(id) FROM items") + # get total number of communities/collections/items so we can estimate the pages + cursor.execute(f"SELECT COUNT(id) FROM {req.context.statistics_scope}") pages = round(cursor.fetchone()[0] / limit) # get statistics and use limit and offset to page through results cursor.execute( - "SELECT id, views, downloads FROM items ORDER BY id LIMIT %s OFFSET %s", + f"SELECT id, views, downloads FROM {req.context.statistics_scope} ORDER BY id LIMIT %s OFFSET %s", [limit, offset], ) - # create a list to hold dicts of item stats + # create a list to hold dicts of stats statistics = list() # iterate over results and build statistics object - for item in cursor: + for result in cursor: statistics.append( { - "id": str(item["id"]), - "views": item["views"], - "downloads": item["downloads"], + "id": str(result["id"]), + "views": result["views"], + "downloads": result["downloads"], } ) @@ -58,9 +60,15 @@ class AllItemsResource: resp.media = message - @falcon.before(validate_items_post_parameters) + @falcon.before(set_statistics_scope) + 
@falcon.before(validate_post_parameters) def on_post(self, req, resp): - """Handles POST requests""" + """Handles POST requests. + + Uses two `before` hooks to set the statistics "scope" and validate the + POST parameters. The "scope" is the type of statistics we want, which + will be items, communities, or collections, depending on the request. + """ # Build the Solr date string, ie: [* TO *] if req.context.dateFrom and req.context.dateTo: @@ -74,10 +82,10 @@ class AllItemsResource: # Helper variables to make working with pages/items/results easier and # to make the code easier to understand - number_of_items: int = len(req.context.items) - pages: int = int(number_of_items / req.context.limit) - first_item: int = req.context.page * req.context.limit - last_item: int = first_item + req.context.limit + number_of_elements: int = len(req.context.elements) + pages: int = int(number_of_elements / req.context.limit) + first_element: int = req.context.page * req.context.limit + last_element: int = first_element + req.context.limit # Get a subset of the POSTed items based on our limit. Note that Python # list slicing and indexing are both zero based, but the first and last # items in a slice can be confusing. 
See this ASCII diagram: @@ -88,20 +96,24 @@ class AllItemsResource: # Slice position: 0 1 2 3 4 5 6 # Index position: 0 1 2 3 4 5 # - # So if we have a list items with 240 items: + # So if we have a list of items with 240 items: # # 1st set: items[0:100] would give items at indexes 0 to 99 # 2nd set: items[100:200] would give items at indexes 100 to 199 # 3rd set: items[200:300] would give items at indexes 200 to 239 - items_subset: list = req.context.items[first_item:last_item] + elements_subset: list = req.context.elements[first_element:last_element] - views: dict = get_views(solr_date_string, items_subset) - downloads: dict = get_downloads(solr_date_string, items_subset) + views: dict = get_views( + solr_date_string, elements_subset, req.context.views_facet_field + ) + downloads: dict = get_downloads( + solr_date_string, elements_subset, req.context.downloads_facet_field + ) - # create a list to hold dicts of item stats + # create a list to hold dicts of stats statistics = list() - # iterate over views dict to extract views and use the item id as an + # iterate over views dict to extract views and use the element id as an # index to the downloads dict to extract downloads. 
for k, v in views.items(): statistics.append({"id": k, "views": v, "downloads": downloads[k]}) @@ -117,8 +129,9 @@ class AllItemsResource: resp.media = message -class ItemResource: - def on_get(self, req, resp, item_id): +class SingleStatisticsResource: + @falcon.before(set_statistics_scope) + def on_get(self, req, resp, id_): """Handles GET requests""" # Adapt Python’s uuid.UUID type to PostgreSQL’s uuid @@ -131,18 +144,19 @@ class ItemResource: with db.cursor() as cursor: cursor = db.cursor() cursor.execute( - "SELECT views, downloads FROM items WHERE id=%s", [str(item_id)] + f"SELECT views, downloads FROM {req.context.database} WHERE id=%s", + [str(id_)], ) if cursor.rowcount == 0: raise falcon.HTTPNotFound( - title="Item not found", - description=f'The item with id "{str(item_id)}" was not found.', + title=f"{req.context.statistics_scope} not found", + description=f'The {req.context.statistics_scope} with id "{str(id_)}" was not found.', ) else: results = cursor.fetchone() statistics = { - "id": str(item_id), + "id": str(id_), "views": results["views"], "downloads": results["downloads"], } @@ -152,7 +166,17 @@ class ItemResource: api = application = falcon.API() api.add_route("/", RootResource()) -api.add_route("/items", AllItemsResource()) -api.add_route("/item/{item_id:uuid}", ItemResource()) + +# Item routes +api.add_route("/items", AllStatisticsResource()) +api.add_route("/item/{id_:uuid}", SingleStatisticsResource()) + +# Community routes +api.add_route("/communities", AllStatisticsResource()) +api.add_route("/community/{id_:uuid}", SingleStatisticsResource()) + +# Collection routes +api.add_route("/collections", AllStatisticsResource()) +api.add_route("/collection/{id_:uuid}", SingleStatisticsResource()) # vim: set sw=4 ts=4 expandtab: diff --git a/dspace_statistics_api/items.py b/dspace_statistics_api/items.py index 8ed80dd..6a90735 100644 --- a/dspace_statistics_api/items.py +++ b/dspace_statistics_api/items.py @@ -4,25 +4,27 @@ from .config import 
SOLR_SERVER from .util import get_statistics_shards -def get_views(solr_date_string: str, items: list): +def get_views(solr_date_string: str, elements: list, facetField: str): """ - Get view statistics for a list of items from Solr. + Get view statistics for a list of elements from Solr. Depending on the req- + uest this could be items, communities, or collections. :parameter solr_date_string (str): Solr date string, for example "[* TO *]" - :parameter items (list): a list of item IDs - :returns: A dict of item IDs and views + :parameter elements (list): a list of IDs + :parameter facetField (str): Solr field to facet by, for example "id" + :returns: A dict of IDs and views """ shards = get_statistics_shards() # Join the UUIDs with "OR" and escape the hyphens for Solr - solr_items_string: str = " OR ".join(items).replace("-", r"\-") + solr_elements_string: str = " OR ".join(elements).replace("-", r"\-") solr_query_params = { - "q": f"id:({solr_items_string})", + "q": f"{facetField}:({solr_elements_string})", "fq": f"type:2 AND isBot:false AND statistics_type:view AND time:{solr_date_string}", - "fl": "id", + "fl": facetField, "facet": "true", - "facet.field": "id", + "facet.field": facetField, "facet.mincount": 1, "shards": shards, "rows": 0, @@ -38,41 +40,53 @@ def get_views(solr_date_string: str, items: list): # Solr returns facets as a dict of dicts (see the json.nl parameter) views = res.json()["facet_counts"]["facet_fields"] - # iterate over the 'id' dict and get the item ids and views - for item_id, item_views in views["id"].items(): - data[item_id] = item_views + # iterate over the facetField dict and get the ids and views + for id_, views in views[facetField].items(): + # For items we can rely on Solr returning facets for *only* the ids + # in our query, but for communities and collections, the owningComm and + # owningColl fields are multi-value so Solr will return facets with the + # values in our query as well as *any others* that happen to be present + # in 
the field (which looks like Solr returning unrelated results until + you realize that the field is multi-value and this is correct). + # + # To work around this I make sure that each id in the returned dict is + # present in the elements list POSTed by the user. + if id_ in elements: + data[id_] = views - # Check if any items have missing stats so we can set them to 0 - if len(data) < len(items): - # List comprehension to get a list of item ids (keys) in the data + # Check if any ids have missing stats so we can set them to 0 + if len(data) < len(elements): + # List comprehension to get a list of ids (keys) in the data data_ids = [k for k, v in data.items()] - for item_id in items: - if item_id not in data_ids: - data[item_id] = 0 + for element_id in elements: + if element_id not in data_ids: + data[element_id] = 0 continue return data -def get_downloads(solr_date_string: str, items: list): +def get_downloads(solr_date_string: str, elements: list, facetField: str): """ - Get download statistics for a list of items from Solr. + Get download statistics for a list of elements from Solr. Depending on the req- + uest this could be items, communities, or collections. 
:parameter solr_date_string (str): Solr date string, for example "[* TO *]" - :parameter items (list): a list of item IDs - :returns: A dict of item IDs and downloads + :parameter elements (list): a list of IDs + :parameter facetField (str): Solr field to facet by, for example "id" + :returns: A dict of IDs and downloads """ shards = get_statistics_shards() # Join the UUIDs with "OR" and escape the hyphens for Solr - solr_items_string: str = " OR ".join(items).replace("-", r"\-") + solr_elements_string: str = " OR ".join(elements).replace("-", r"\-") solr_query_params = { - "q": f"owningItem:({solr_items_string})", + "q": f"{facetField}:({solr_elements_string})", "fq": f"type:0 AND isBot:false AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}", - "fl": "owningItem", + "fl": facetField, "facet": "true", - "facet.field": "owningItem", + "facet.field": facetField, "facet.mincount": 1, "shards": shards, "rows": 0, @@ -88,17 +102,20 @@ def get_downloads(solr_date_string: str, items: list): # Solr returns facets as a dict of dicts (see the json.nl parameter) downloads = res.json()["facet_counts"]["facet_fields"] - # Iterate over the 'owningItem' dict and get the item ids and downloads - for item_id, item_downloads in downloads["owningItem"].items(): - data[item_id] = item_downloads + # Iterate over the facetField dict and get the ids and downloads + for id_, downloads in downloads[facetField].items(): + # Make sure that each id in the returned dict is present in the + # elements list POSTed by the user. 
+ if id_ in elements: + data[id_] = downloads - # Check if any items have missing stats so we can set them to 0 - if len(data) < len(items): - # List comprehension to get a list of item ids (keys) in the data + # Check if any elements have missing stats so we can set them to 0 + if len(data) < len(elements): + # List comprehension to get a list of ids (keys) in the data data_ids = [k for k, v in data.items()] - for item_id in items: - if item_id not in data_ids: - data[item_id] = 0 + for element_id in elements: + if element_id not in data_ids: + data[element_id] = 0 continue return data diff --git a/dspace_statistics_api/util.py b/dspace_statistics_api/util.py index 8c51ca5..12c2092 100644 --- a/dspace_statistics_api/util.py +++ b/dspace_statistics_api/util.py @@ -74,8 +74,9 @@ def is_valid_date(date): ) -def validate_items_post_parameters(req, resp, resource, params): - """Check the POSTed request parameters for the `/items` endpoint. +def validate_post_parameters(req, resp, resource, params): + """Check the POSTed request parameters for the `/items`, `/communities` and + `/collections` endpoints. Meant to be used as a `before` hook. """ @@ -125,14 +126,64 @@ def validate_items_post_parameters(req, resp, resource, params): else: req.context.page = 0 - # Parse the list of items from the POST request body - if "items" in doc: - if isinstance(doc["items"], list) and len(doc["items"]) > 0: - req.context.items = doc["items"] + # Parse the list of elements from the POST request body + if req.context.statistics_scope in doc: + if ( + isinstance(doc[req.context.statistics_scope], list) + and len(doc[req.context.statistics_scope]) > 0 + ): + req.context.elements = doc[req.context.statistics_scope] else: raise falcon.HTTPBadRequest( title="Invalid parameter", - description='The "items" parameter is invalid. The value must be a comma-separated list of item UUIDs.', + description=f'The "{req.context.statistics_scope}" parameter is invalid. 
The value must be a comma-separated list of UUIDs.', ) else: - req.context.items = list() + req.context.elements = list() + + +def set_statistics_scope(req, resp, resource, params): + """Set the statistics scope (item, collection, or community) of the request + as well as the appropriate database (for GET requests) and Solr facet fields + (for POST requests). + + Meant to be used as a `before` hook. + """ + + # Extract the scope from the request path. This is *guaranteed* to be one + # of the following values because we only send requests matching these few + # patterns to routes using this set_statistics_scope hook. + # + # Note: this regex is ordered so that "items" and "collections" match before + # "item" and "collection". + req.context.statistics_scope = re.findall( + r"^/(communities|community|collections|collection|items|item)", req.path + )[0] + + # Set the correct database based on the statistics_scope. The database is + # used for all GET requests where statistics are returned directly from the + # database. In this case we can return early. + if req.method == "GET": + if re.findall(r"^(item|items)$", req.context.statistics_scope): + req.context.database = "items" + elif re.findall(r"^(community|communities)$", req.context.statistics_scope): + req.context.database = "communities" + elif re.findall(r"^(collection|collections)$", req.context.statistics_scope): + req.context.database = "collections" + + # GET requests only need the scope and the database so we can return now + return + + # If the current request is for a plural items, communities, or collections + # that includes a list of element ids POSTed with the request body then we + # need to set the Solr facet field so we can get the live results. 
+ if req.method == "POST": + if req.context.statistics_scope == "items": + req.context.views_facet_field = "id" + req.context.downloads_facet_field = "owningItem" + elif req.context.statistics_scope == "communities": + req.context.views_facet_field = "owningComm" + req.context.downloads_facet_field = "owningComm" + elif req.context.statistics_scope == "collections": + req.context.views_facet_field = "owningColl" + req.context.downloads_facet_field = "owningColl"