Add communities and collections support to API

The basic logic is similar to items, where you can request single item statistics with a UUID, all item statistics, and item statistics for a list of items (optionally with a date range). Most of the item code was re-purposed to work on "elements", which can be items, communities, or collections depending on the request, with the use of Falcon's `before` hooks to set the statistics scope so we know how to behave for the current request. Other than the minor difference in facet fields, another issue I had with communities and collections is that the owningComm and owningColl fields are multi-valued (unlike items' id field). This means that, when you facet the results of your query, Solr returns ids that seem unrelated, but are actually present in the field, so I had to make sure I checked all returned ids to see if they were in the user's POSTed elements list.

TODO:
- Add tests
- Revise docstrings
- Refactor items.py as it is now generic
2025-05-28 06:12:32 +02:00 · 2020-12-20 16:14:46 +02:00
parent fba6f1ead1
commit 3339bf8d9c
3 changed files with 164 additions and 72 deletions
--- a/dspace_statistics_api/app.py
+++ b/dspace_statistics_api/app.py
@ -3,7 +3,8 @@ import psycopg2.extras

 from .database import DatabaseManager
 from .items import get_downloads, get_views
-from .util import validate_items_post_parameters
+from .util import set_statistics_scope
+from .util import validate_post_parameters


 class RootResource:
@ -14,7 +15,8 @@ class RootResource:
            resp.body = f.read()


-class AllItemsResource:
+class AllStatisticsResource:
+    @falcon.before(set_statistics_scope)
    def on_get(self, req, resp):
        """Handles GET requests"""
        # Return HTTPBadRequest if id parameter is not present and valid
@ -26,26 +28,26 @@ class AllItemsResource:
            db.set_session(readonly=True)

            with db.cursor() as cursor:
-                # get total number of items so we can estimate the pages
-                cursor.execute("SELECT COUNT(id) FROM items")
+                # get total number of communities/collections/items so we can estimate the pages
+                cursor.execute(f"SELECT COUNT(id) FROM {req.context.statistics_scope}")
                pages = round(cursor.fetchone()[0] / limit)

                # get statistics and use limit and offset to page through results
                cursor.execute(
-                    "SELECT id, views, downloads FROM items ORDER BY id LIMIT %s OFFSET %s",
+                    f"SELECT id, views, downloads FROM {req.context.statistics_scope} ORDER BY id LIMIT %s OFFSET %s",
                    [limit, offset],
                )

-                # create a list to hold dicts of item stats
+                # create a list to hold dicts of stats
                statistics = list()

                # iterate over results and build statistics object
-                for item in cursor:
+                for result in cursor:
                    statistics.append(
                        {
-                            "id": str(item["id"]),
-                            "views": item["views"],
-                            "downloads": item["downloads"],
+                            "id": str(result["id"]),
+                            "views": result["views"],
+                            "downloads": result["downloads"],
                        }
                    )

@ -58,9 +60,15 @@ class AllItemsResource:

        resp.media = message

-    @falcon.before(validate_items_post_parameters)
+    @falcon.before(set_statistics_scope)
+    @falcon.before(validate_post_parameters)
    def on_post(self, req, resp):
-        """Handles POST requests"""
+        """Handles POST requests.
+
+        Uses two `before` hooks to set the statistics "scope" and validate the
+        POST parameters. The "scope" is the type of statistics we want, which
+        will be items, communities, or collections, depending on the request.
+        """

        # Build the Solr date string, ie: [* TO *]
        if req.context.dateFrom and req.context.dateTo:
@ -74,10 +82,10 @@ class AllItemsResource:

        # Helper variables to make working with pages/items/results easier and
        # to make the code easier to understand
-        number_of_items: int = len(req.context.items)
-        pages: int = int(number_of_items / req.context.limit)
-        first_item: int = req.context.page * req.context.limit
-        last_item: int = first_item + req.context.limit
+        number_of_elements: int = len(req.context.elements)
+        pages: int = int(number_of_elements / req.context.limit)
+        first_element: int = req.context.page * req.context.limit
+        last_element: int = first_element + req.context.limit
        # Get a subset of the POSTed items based on our limit. Note that Python
        # list slicing and indexing are both zero based, but the first and last
        # items in a slice can be confusing. See this ASCII diagram:
@ -88,20 +96,24 @@ class AllItemsResource:
        # Slice position: 0   1   2   3   4   5   6
        # Index position:   0   1   2   3   4   5
        #
-        # So if we have a list items with 240 items:
+        # So if we have a list of items with 240 items:
        #
        #   1st set: items[0:100] would give items at indexes 0 to 99
        #   2nd set: items[100:200] would give items at indexes 100 to 199
        #   3rd set: items[200:300] would give items at indexes 200 to 239
-        items_subset: list = req.context.items[first_item:last_item]
+        elements_subset: list = req.context.elements[first_element:last_element]

-        views: dict = get_views(solr_date_string, items_subset)
-        downloads: dict = get_downloads(solr_date_string, items_subset)
+        views: dict = get_views(
+            solr_date_string, elements_subset, req.context.views_facet_field
+        )
+        downloads: dict = get_downloads(
+            solr_date_string, elements_subset, req.context.downloads_facet_field
+        )

-        # create a list to hold dicts of item stats
+        # create a list to hold dicts of stats
        statistics = list()

-        # iterate over views dict to extract views and use the item id as an
+        # iterate over views dict to extract views and use the element id as an
        # index to the downloads dict to extract downloads.
        for k, v in views.items():
            statistics.append({"id": k, "views": v, "downloads": downloads[k]})
@ -117,8 +129,9 @@ class AllItemsResource:
        resp.media = message


-class ItemResource:
-    def on_get(self, req, resp, item_id):
+class SingleStatisticsResource:
+    @falcon.before(set_statistics_scope)
+    def on_get(self, req, resp, id_):
        """Handles GET requests"""

        # Adapt Python’s uuid.UUID type to PostgreSQL’s uuid
@ -131,18 +144,19 @@ class ItemResource:
            with db.cursor() as cursor:
                cursor = db.cursor()
                cursor.execute(
-                    "SELECT views, downloads FROM items WHERE id=%s", [str(item_id)]
+                    f"SELECT views, downloads FROM {req.context.database} WHERE id=%s",
+                    [str(id_)],
                )
                if cursor.rowcount == 0:
                    raise falcon.HTTPNotFound(
-                        title="Item not found",
-                        description=f'The item with id "{str(item_id)}" was not found.',
+                        title=f"{req.context.statistics_scope} not found",
+                        description=f'The {req.context.statistics_scope} with id "{str(id_)}" was not found.',
                    )
                else:
                    results = cursor.fetchone()

                    statistics = {
-                        "id": str(item_id),
+                        "id": str(id_),
                        "views": results["views"],
                        "downloads": results["downloads"],
                    }
@ -152,7 +166,17 @@ class ItemResource:

 api = application = falcon.API()
 api.add_route("/", RootResource())
-api.add_route("/items", AllItemsResource())
-api.add_route("/item/{item_id:uuid}", ItemResource())
+
+# Item routes
+api.add_route("/items", AllStatisticsResource())
+api.add_route("/item/{id_:uuid}", SingleStatisticsResource())
+
+# Community routes
+api.add_route("/communities", AllStatisticsResource())
+api.add_route("/community/{id_:uuid}", SingleStatisticsResource())
+
+# Collection routes
+api.add_route("/collections", AllStatisticsResource())
+api.add_route("/collection/{id_:uuid}", SingleStatisticsResource())

 # vim: set sw=4 ts=4 expandtab:
--- a/dspace_statistics_api/items.py
+++ b/dspace_statistics_api/items.py
@ -4,25 +4,27 @@ from .config import SOLR_SERVER
 from .util import get_statistics_shards


-def get_views(solr_date_string: str, items: list):
+def get_views(solr_date_string: str, elements: list, facetField: str):
    """
-    Get view statistics for a list of items from Solr.
+    Get view statistics for a list of elements from Solr. Depending on the req-
+    uest this could be items, communities, or collections.

    :parameter solr_date_string (str): Solr date string, for example "[* TO *]"
-    :parameter items (list): a list of item IDs
-    :returns: A dict of item IDs and views
+    :parameter elements (list): a list of IDs
+    :parameter facetField (str): Solr field to facet by, for example "id"
+    :returns: A dict of IDs and views
    """
    shards = get_statistics_shards()

    # Join the UUIDs with "OR" and escape the hyphens for Solr
-    solr_items_string: str = " OR ".join(items).replace("-", r"\-")
+    solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")

    solr_query_params = {
-        "q": f"id:({solr_items_string})",
+        "q": f"{facetField}:({solr_elements_string})",
        "fq": f"type:2 AND isBot:false AND statistics_type:view AND time:{solr_date_string}",
-        "fl": "id",
+        "fl": facetField,
        "facet": "true",
-        "facet.field": "id",
+        "facet.field": facetField,
        "facet.mincount": 1,
        "shards": shards,
        "rows": 0,
@ -38,41 +40,53 @@ def get_views(solr_date_string: str, items: list):

    # Solr returns facets as a dict of dicts (see the json.nl parameter)
    views = res.json()["facet_counts"]["facet_fields"]
-    # iterate over the 'id' dict and get the item ids and views
-    for item_id, item_views in views["id"].items():
-        data[item_id] = item_views
+    # iterate over the facetField dict and get the ids and views
+    for id_, views in views[facetField].items():
+        # For items we can rely on Solr returning facets for *only* the ids
+        # in our query, but for communities and collections, the owningComm and
+        # owningColl fields are multi-value so Solr will return facets with the
+        # values in our query as well as *any others* that happen to be present
+        # in the field (which looks like Solr returning unrelated results until
+        # you realize that the field is multi-value and this is correct).
+        #
+        # To work around this I make sure that each id in the returned dict is
+        # present in the elements list POSTed by the user.
+        if id_ in elements:
+            data[id_] = views

-    # Check if any items have missing stats so we can set them to 0
-    if len(data) < len(items):
-        # List comprehension to get a list of item ids (keys) in the data
+    # Check if any ids have missing stats so we can set them to 0
+    if len(data) < len(elements):
+        # List comprehension to get a list of ids (keys) in the data
        data_ids = [k for k, v in data.items()]
-        for item_id in items:
-            if item_id not in data_ids:
-                data[item_id] = 0
+        for element_id in elements:
+            if element_id not in data_ids:
+                data[element_id] = 0
                continue

    return data


-def get_downloads(solr_date_string: str, items: list):
+def get_downloads(solr_date_string: str, elements: list, facetField: str):
    """
-    Get download statistics for a list of items from Solr.
+    Get download statistics for a list of items from Solr. Depending on the req-
+    uest this could be items, communities, or collections.

    :parameter solr_date_string (str): Solr date string, for example "[* TO *]"
-    :parameter items (list): a list of item IDs
-    :returns: A dict of item IDs and downloads
+    :parameter elements (list): a list of IDs
+    :parameter facetField (str): Solr field to facet by, for example "id"
+    :returns: A dict of IDs and downloads
    """
    shards = get_statistics_shards()

    # Join the UUIDs with "OR" and escape the hyphens for Solr
-    solr_items_string: str = " OR ".join(items).replace("-", r"\-")
+    solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")

    solr_query_params = {
-        "q": f"owningItem:({solr_items_string})",
+        "q": f"{facetField}:({solr_elements_string})",
        "fq": f"type:0 AND isBot:false AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}",
-        "fl": "owningItem",
+        "fl": facetField,
        "facet": "true",
-        "facet.field": "owningItem",
+        "facet.field": facetField,
        "facet.mincount": 1,
        "shards": shards,
        "rows": 0,
@ -88,17 +102,20 @@ def get_downloads(solr_date_string: str, items: list):

    # Solr returns facets as a dict of dicts (see the json.nl parameter)
    downloads = res.json()["facet_counts"]["facet_fields"]
-    # Iterate over the 'owningItem' dict and get the item ids and downloads
-    for item_id, item_downloads in downloads["owningItem"].items():
-        data[item_id] = item_downloads
+    # Iterate over the facetField dict and get the ids and downloads
+    for id_, downloads in downloads[facetField].items():
+        # Make sure that each id in the returned dict is present in the
+        # elements list POSTed by the user.
+        if id_ in elements:
+            data[id_] = downloads

-    # Check if any items have missing stats so we can set them to 0
-    if len(data) < len(items):
-        # List comprehension to get a list of item ids (keys) in the data
+    # Check if any elements have missing stats so we can set them to 0
+    if len(data) < len(elements):
+        # List comprehension to get a list of ids (keys) in the data
        data_ids = [k for k, v in data.items()]
-        for item_id in items:
-            if item_id not in data_ids:
-                data[item_id] = 0
+        for element_id in elements:
+            if element_id not in data_ids:
+                data[element_id] = 0
                continue

    return data
--- a/dspace_statistics_api/util.py
+++ b/dspace_statistics_api/util.py
@ -74,8 +74,9 @@ def is_valid_date(date):
        )


-def validate_items_post_parameters(req, resp, resource, params):
-    """Check the POSTed request parameters for the `/items` endpoint.
+def validate_post_parameters(req, resp, resource, params):
+    """Check the POSTed request parameters for the `/items`, `/communities` and
+    `/collections` endpoints.

    Meant to be used as a `before` hook.
    """
@ -125,14 +126,64 @@ def validate_items_post_parameters(req, resp, resource, params):
    else:
        req.context.page = 0

-    # Parse the list of items from the POST request body
-    if "items" in doc:
-        if isinstance(doc["items"], list) and len(doc["items"]) > 0:
-            req.context.items = doc["items"]
+    # Parse the list of elements from the POST request body
+    if req.context.statistics_scope in doc:
+        if (
+            isinstance(doc[req.context.statistics_scope], list)
+            and len(doc[req.context.statistics_scope]) > 0
+        ):
+            req.context.elements = doc[req.context.statistics_scope]
        else:
            raise falcon.HTTPBadRequest(
                title="Invalid parameter",
-                description='The "items" parameter is invalid. The value must be a comma-separated list of item UUIDs.',
+                description=f'The "{req.context.statistics_scope}" parameter is invalid. The value must be a comma-separated list of UUIDs.',
            )
    else:
-        req.context.items = list()
+        req.context.elements = list()
+
+
+def set_statistics_scope(req, resp, resource, params):
+    """Set the statistics scope (item, collection, or community) of the request
+    as well as the appropriate database (for GET requests) and Solr facet fields
+    (for POST requests).
+
+    Meant to be used as a `before` hook.
+    """
+
+    # Extract the scope from the request path. This is *guaranteed* to be one
+    # of the following values because we only send requests matching these few
+    # patterns to routes using this set_statistics_scope hook.
+    #
+    # Note: this regex is ordered so that "items" and "collections" match before
+    # "item" and "collection".
+    req.context.statistics_scope = re.findall(
+        r"^/(communities|community|collections|collection|items|item)", req.path
+    )[0]
+
+    # Set the correct database based on the statistics_scope. The database is
+    # used for all GET requests where statistics are returned directly from the
+    # database. In this case we can return early.
+    if req.method == "GET":
+        if re.findall(r"^(item|items)$", req.context.statistics_scope):
+            req.context.database = "items"
+        elif re.findall(r"^(community|communities)$", req.context.statistics_scope):
+            req.context.database = "communities"
+        elif re.findall(r"^(collection|collections)$", req.context.statistics_scope):
+            req.context.database = "collections"
+
+        # GET requests only need the scope and the database so we can return now
+        return
+
+    # If the current request is for a plural items, communities, or collections
+    # that includes a list of element ids POSTed with the request body then we
+    # need to set the Solr facet field so we can get the live results.
+    if req.method == "POST":
+        if req.context.statistics_scope == "items":
+            req.context.views_facet_field = "id"
+            req.context.downloads_facet_field = "owningItem"
+        elif req.context.statistics_scope == "communities":
+            req.context.views_facet_field = "owningComm"
+            req.context.downloads_facet_field = "owningComm"
+        elif req.context.statistics_scope == "collections":
+            req.context.views_facet_field = "owningColl"
+            req.context.downloads_facet_field = "owningColl"