1
0
mirror of https://github.com/ilri/dspace-statistics-api.git synced 2025-07-06 14:31:38 +02:00

Use uv build backend

uv's build backend expects our module to be in src.
This commit is contained in:
2025-07-03 14:43:07 +03:00
parent 7b8ff7c02f
commit f26b55bc5f
9 changed files with 3 additions and 4 deletions

View File

@ -1,254 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
import json
import math
import falcon
from falcon_swagger_ui import register_swaggerui_app
from .config import DSPACE_STATISTICS_API_URL, VERSION
from .database import DatabaseManager
from .stats import get_downloads, get_views
from .util import set_statistics_scope, validate_post_parameters
class RootResource:
def on_get(self, req, resp):
resp.status = falcon.HTTP_200
resp.content_type = "text/html"
docs_html = (
"<!DOCTYPE html>"
'<html lang="en-US">'
" <head>"
' <meta charset="UTF-8">'
" <title>DSpace Statistics API</title>"
" </head>"
" <body>"
f" <h1>DSpace Statistics API {VERSION}</h1>"
f" <p>This site is running the <a href=\"https://github.com/ilri/dspace-statistics-api\" title=\"DSpace Statistics API project\">DSpace Statistics API</a>. For more information see the project's README.md or the interactive <a href=\"{DSPACE_STATISTICS_API_URL + '/swagger'}\">Swagger UI</a> built into this API.</p>"
" </body>"
"</html"
)
resp.text = docs_html
class StatusResource:
def on_get(self, req, resp):
message = {"version": VERSION}
resp.status = falcon.HTTP_200
resp.media = message
class OpenAPIJSONResource:
def on_get(self, req, resp):
resp.status = falcon.HTTP_200
resp.content_type = "text/html"
with open("dspace_statistics_api/docs/openapi.json", "r") as f:
# Load the openapi.json schema
data = json.load(f)
# Swagger assumes your API is at the root of the current host unless
# you configure a "servers" block in the schema. The problem is that
# I want this to work in both development and production, so we need
# to make this configurable.
#
# If the DSPACE_STATISTICS_API_URL is configured then we will add a
# server entry to the openapi.json schema before sending it.
if DSPACE_STATISTICS_API_URL != "":
data["servers"] = [{"url": DSPACE_STATISTICS_API_URL}]
# Set the version in the schema so Swagger UI can display it
data["info"]["version"] = VERSION
resp.text = json.dumps(data)
class AllStatisticsResource:
@falcon.before(set_statistics_scope)
def on_get(self, req, resp):
"""Handles GET requests"""
# Return HTTPBadRequest if id parameter is not present and valid
limit = req.get_param_as_int("limit", min_value=1, max_value=100) or 100
page = req.get_param_as_int("page", min_value=0) or 0
offset = limit * page
with DatabaseManager() as db:
db.set_read_only(True)
with db.cursor() as cursor:
# get total number of communities/collections/items so we can estimate the pages
cursor.execute(f"SELECT COUNT(id) FROM {req.context.statistics_scope}")
pages = math.ceil(cursor.fetchone()['count'] / limit)
# get statistics and use limit and offset to page through results
cursor.execute(
f"SELECT id, views, downloads FROM {req.context.statistics_scope} ORDER BY id LIMIT %s OFFSET %s",
[limit, offset],
)
# create a list to hold dicts of stats
statistics = []
# iterate over results and build statistics object
for result in cursor:
statistics.append(
{
"id": str(result["id"]),
"views": result["views"],
"downloads": result["downloads"],
}
)
message = {
"currentPage": page,
"totalPages": pages,
"limit": limit,
"statistics": statistics,
}
resp.media = message
@falcon.before(set_statistics_scope)
@falcon.before(validate_post_parameters)
def on_post(self, req, resp):
"""Handles POST requests.
Uses two `before` hooks to set the statistics "scope" and validate the
POST parameters. The "scope" is the type of statistics we want, which
will be items, communities, or collections, depending on the request.
"""
# Build the Solr date string, ie: [* TO *]
if req.context.dateFrom and req.context.dateTo:
solr_date_string = f"[{req.context.dateFrom} TO {req.context.dateTo}]"
elif not req.context.dateFrom and req.context.dateTo:
solr_date_string = f"[* TO {req.context.dateTo}]"
elif req.context.dateFrom and not req.context.dateTo:
solr_date_string = f"[{req.context.dateFrom} TO *]"
else:
solr_date_string = "[* TO *]"
# Helper variables to make working with pages/items/results easier and
# to make the code easier to understand
number_of_elements: int = len(req.context.elements)
pages: int = math.ceil(number_of_elements / req.context.limit)
first_element: int = req.context.page * req.context.limit
last_element: int = first_element + req.context.limit
# Get a subset of the POSTed items based on our limit. Note that Python
# list slicing and indexing are both zero based, but the first and last
# items in a slice can be confusing. See this ASCII diagram:
#
# +---+---+---+---+---+---+
# | P | y | t | h | o | n |
# +---+---+---+---+---+---+
# Slice position: 0 1 2 3 4 5 6
# Index position: 0 1 2 3 4 5
#
# So if we have a list of items with 240 items:
#
# 1st set: items[0:100] would give items at indexes 0 to 99
# 2nd set: items[100:200] would give items at indexes 100 to 199
# 3rd set: items[200:300] would give items at indexes 200 to 239
elements_subset: list = req.context.elements[first_element:last_element]
views: dict = get_views(
solr_date_string, elements_subset, req.context.views_facet_field
)
downloads: dict = get_downloads(
solr_date_string, elements_subset, req.context.downloads_facet_field
)
# create a list to hold dicts of stats
statistics = []
# iterate over views dict to extract views and use the element id as an
# index to the downloads dict to extract downloads.
for k, v in views.items():
statistics.append({"id": k, "views": v, "downloads": downloads[k]})
message = {
"currentPage": req.context.page,
"totalPages": pages,
"limit": req.context.limit,
"statistics": statistics,
}
resp.status = falcon.HTTP_200
resp.media = message
class SingleStatisticsResource:
@falcon.before(set_statistics_scope)
def on_get(self, req, resp, id_):
"""Handles GET requests"""
with DatabaseManager() as db:
db.set_read_only(True)
with db.cursor() as cursor:
cursor = db.cursor()
cursor.execute(
f"SELECT views, downloads FROM {req.context.database} WHERE id=%s",
[str(id_)],
)
if cursor.rowcount == 0:
raise falcon.HTTPNotFound(
title=f"{req.context.statistics_scope} not found",
description=f'The {req.context.statistics_scope} with id "{str(id_)}" was not found.',
)
else:
results = cursor.fetchone()
statistics = {
"id": str(id_),
"views": results["views"],
"downloads": results["downloads"],
}
resp.media = statistics
app = application = falcon.App()
app.add_route("/", RootResource())
app.add_route("/status", StatusResource())
# Item routes
app.add_route("/items", AllStatisticsResource())
app.add_route("/item/{id_:uuid}", SingleStatisticsResource())
# Community routes
app.add_route("/communities", AllStatisticsResource())
app.add_route("/community/{id_:uuid}", SingleStatisticsResource())
# Collection routes
app.add_route("/collections", AllStatisticsResource())
app.add_route("/collection/{id_:uuid}", SingleStatisticsResource())
# Route to the Swagger UI Openapp schema
app.add_route("/docs/openapi.json", OpenAPIJSONResource())
# Path to host the Swagger UI. Keep in mind that Falcon will add a route for
# this automatically when we register Swagger and the path will be relative
# to the Falcon app like all other routes, not the absolute root.
SWAGGERUI_PATH = "/swagger"
# The *absolute* path to the OpenJSON schema. This must be absolute because
# it will be requested by the client and must resolve absolutely. Note: the
# name of this variable is misleading because it is actually the schema URL
# but we pass it into the register_swaggerui_app() function as the app_url
# parameter.
SWAGGERUI_API_URL = f"{DSPACE_STATISTICS_API_URL}/docs/openapi.json"
register_swaggerui_app(
app,
SWAGGERUI_PATH,
SWAGGERUI_API_URL,
config={
"supportedSubmitMethods": ["get", "post"],
},
uri_prefix=DSPACE_STATISTICS_API_URL,
)
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,23 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
import os
# Check if Solr connection information was provided in the environment
SOLR_SERVER = os.environ.get("SOLR_SERVER", "http://localhost:8080/solr")
DATABASE_NAME = os.environ.get("DATABASE_NAME", "dspacestatistics")
DATABASE_USER = os.environ.get("DATABASE_USER", "dspacestatistics")
DATABASE_PASS = os.environ.get("DATABASE_PASS", "dspacestatistics")
DATABASE_HOST = os.environ.get("DATABASE_HOST", "localhost")
DATABASE_PORT = os.environ.get("DATABASE_PORT", "5432")
# URL to DSpace Statistics API, which will be used as a prefix to API calls in
# the Swagger UI. An empty string will allow this to work out of the box in a
# local development environment, but for production it should be set to a value
# like "/rest/statistics", assuming that the statistics API is deployed next to
# the vanilla DSpace REST API.
DSPACE_STATISTICS_API_URL = os.environ.get("DSPACE_STATISTICS_API_URL", "")
VERSION = "1.4.6-dev"
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,37 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
import falcon
import psycopg
from .config import (
DATABASE_HOST,
DATABASE_NAME,
DATABASE_PASS,
DATABASE_PORT,
DATABASE_USER,
)
class DatabaseManager:
"""Manage database connection."""
def __init__(self):
self._connection_uri = f"dbname={DATABASE_NAME} user={DATABASE_USER} password={DATABASE_PASS} host={DATABASE_HOST} port={DATABASE_PORT}"
def __enter__(self):
try:
self._connection = psycopg.connect(
self._connection_uri, row_factory=psycopg.rows.dict_row
)
except psycopg.OperationalError:
title = "500 Internal Server Error"
description = "Could not connect to database"
raise falcon.HTTPInternalServerError(title, description)
return self._connection
def __exit__(self, exc_type, exc_value, exc_traceback):
self._connection.close()
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,616 +0,0 @@
{
"openapi": "3.0.3",
"info": {
"version": "1.4.6-dev",
"title": "DSpace Statistics API",
"description": "A [Falcon-based](https://falcon.readthedocs.io/) web application to make DSpace's item, community, and collection statistics available via a simple REST API. This Swagger interface is powered by [falcon-swagger-ui](https://github.com/rdidyk/falcon-swagger-ui).",
"license": {
"name": "GPLv3.0",
"url": "https://www.gnu.org/licenses/gpl-3.0.en.html"
}
},
"paths": {
"/item/{item_uuid}": {
"get": {
"summary": "Statistics for a specific item",
"operationId": "getItem",
"tags": [
"item"
],
"parameters": [
{
"name": "item_uuid",
"in": "path",
"required": true,
"description": "The UUID of the item to retrieve",
"schema": {
"type": "string",
"format": "uuid",
"example": "9596aeff-0b90-47d3-9fec-02d578920507"
}
}
],
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"404": {
"description": "Item not found"
}
}
}
},
"/items": {
"get": {
"summary": "Get statistics for all items",
"operationId": "getItems",
"tags": [
"items"
],
"parameters": [
{
"name": "limit",
"in": "query",
"description": "How many items to return at once (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100,
"example": 100
}
},
{
"name": "page",
"in": "query",
"description": "Page of results to start on (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0,
"example": 0
}
}
],
"responses": {
"200": {
"description": "A paged array of items",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"400": {
"description": "Bad request"
}
}
},
"post": {
"summary": "Get statistics for a list of items with an optional date range",
"operationId": "postItems",
"tags": [
"items"
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100
},
"page": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0
},
"dateFrom": {
"type": "string",
"format": "date"
},
"dateTo": {
"type": "string",
"format": "date"
},
"items": {
"type": "array",
"items": {
"type": "string",
"format": "uuid"
}
}
},
"example": {
"limit": 100,
"page": 0,
"dateFrom": "2020-01-01T00:00:00Z",
"dateTo": "2020-12-31T00:00:00Z",
"items": [
"f44cf173-2344-4eb2-8f00-ee55df32c76f",
"2324aa41-e9de-4a2b-bc36-16241464683e",
"8542f9da-9ce1-4614-abf4-f2e3fdb4b305",
"0fe573e7-042a-4240-a4d9-753b61233908"
]
}
}
}
}
},
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"currentPage": {
"type": "integer",
"format": "int32"
},
"limit": {
"type": "integer",
"format": "int32"
},
"totalPages": {
"type": "integer",
"format": "int32"
},
"statistics": {
"$ref": "#/components/schemas/ListOfElements"
}
}
}
}
}
},
"400": {
"description": "Bad request"
}
}
}
},
"/community/{community_uuid}": {
"get": {
"summary": "Statistics for a specific community",
"operationId": "getCommunity",
"tags": [
"community"
],
"parameters": [
{
"name": "community_uuid",
"in": "path",
"required": true,
"description": "The UUID of the community to retrieve",
"schema": {
"type": "string",
"format": "uuid",
"example": "bde7139c-d321-46bb-aef6-ae70799e5edb"
}
}
],
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"404": {
"description": "Community not found"
}
}
}
},
"/communities": {
"get": {
"summary": "Get statistics for all communities",
"operationId": "getCommunities",
"tags": [
"communities"
],
"parameters": [
{
"name": "limit",
"in": "query",
"description": "How many communities to return at once (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100,
"example": 100
}
},
{
"name": "page",
"in": "query",
"description": "Zero-based page of results to start on (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0,
"example": 0
}
}
],
"responses": {
"200": {
"description": "A paged array of communities",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"400": {
"description": "Bad request"
}
}
},
"post": {
"summary": "Get statistics for a list of communities with an optional date range",
"operationId": "postCommunities",
"tags": [
"communities"
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100
},
"page": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0
},
"dateFrom": {
"type": "string",
"format": "date"
},
"dateTo": {
"type": "string",
"format": "date"
},
"communities": {
"type": "array",
"items": {
"type": "string",
"format": "uuid"
}
}
},
"example": {
"limit": 100,
"page": 0,
"dateFrom": "2020-01-01T00:00:00Z",
"dateTo": "2020-12-31T00:00:00Z",
"communities": [
"bde7139c-d321-46bb-aef6-ae70799e5edb",
"8a8aeed1-077e-4360-bdf8-a5f3020193b1",
"47d0498a-203c-407d-afb8-1d44bf29badc",
"d3fe99a9-e27d-4035-9339-084c93228c82"
]
}
}
}
}
},
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"currentPage": {
"type": "integer",
"format": "int32"
},
"limit": {
"type": "integer",
"format": "int32"
},
"totalPages": {
"type": "integer",
"format": "int32"
},
"statistics": {
"$ref": "#/components/schemas/ListOfElements"
}
}
}
}
}
},
"400": {
"description": "Bad request"
}
}
}
},
"/collection/{collection_uuid}": {
"get": {
"summary": "Statistics for a specific collection",
"operationId": "getCollection",
"tags": [
"collection"
],
"parameters": [
{
"name": "collection_uuid",
"in": "path",
"required": true,
"description": "The UUID of the collection to retrieve",
"schema": {
"type": "string",
"format": "uuid",
"example": "49dc95d8-bf2f-4e68-b30f-41ea266c37ae"
}
}
],
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"404": {
"description": "Collection not found"
}
}
}
},
"/collections": {
"get": {
"summary": "Get statistics for all collections",
"operationId": "getCollections",
"tags": [
"collections"
],
"parameters": [
{
"name": "limit",
"in": "query",
"description": "How many collections to return at once (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100,
"example": 100
}
},
{
"name": "page",
"in": "query",
"description": "Zero-based page of results to start on (optional)",
"required": false,
"schema": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0,
"example": 0
}
}
],
"responses": {
"200": {
"description": "A paged array of collections",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
},
"400": {
"description": "Bad request"
}
}
},
"post": {
"summary": "Get statistics for a list of collections with an optional date range",
"operationId": "postCollections",
"tags": [
"collections"
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"limit": {
"type": "integer",
"format": "int32",
"minimum": 1,
"maximum": 100,
"default": 100
},
"page": {
"type": "integer",
"format": "int32",
"minimum": 0,
"default": 0
},
"dateFrom": {
"type": "string",
"format": "date"
},
"dateTo": {
"type": "string",
"format": "date"
},
"collections": {
"type": "array",
"items": {
"type": "string",
"format": "uuid"
}
}
},
"example": {
"limit": 100,
"page": 0,
"dateFrom": "2020-01-01T00:00:00Z",
"dateTo": "2020-12-31T00:00:00Z",
"collections": [
"5eeef6cf-b91b-42d0-9549-ea61bc8a758f",
"6aac3269-b4a9-4924-a24d-9e6ee2b410d2",
"551698dd-cd2b-4327-948e-54b5eb6deda5",
"39358713-bbaf-4149-a453-e2b18c09fd5d"
]
}
}
}
}
},
"responses": {
"200": {
"description": "Expected response to a valid request",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"currentPage": {
"type": "integer",
"format": "int32"
},
"limit": {
"type": "integer",
"format": "int32"
},
"totalPages": {
"type": "integer",
"format": "int32"
},
"statistics": {
"$ref": "#/components/schemas/ListOfElements"
}
}
}
}
}
},
"400": {
"description": "Bad request"
}
}
}
},
"/status": {
"get": {
"summary": "Get API status",
"operationId": "getStatus",
"tags": [
"status"
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"version": {
"type": "string",
"example": "1.4.0-dev"
}
}
}
}
}
},
"405": {
"description": "Method Not Allowed"
}
}
}
}
},
"components": {
"schemas": {
"SingleElementResponse": {
"type": "object",
"required": [
"id",
"views",
"downloads"
],
"properties": {
"id": {
"type": "string",
"format": "uuid"
},
"views": {
"type": "integer",
"example": 450
},
"downloads": {
"type": "integer",
"example": 1337
}
}
},
"ListOfElements": {
"type": "array",
"items": {
"$ref": "#/components/schemas/SingleElementResponse"
}
}
}
}
}

View File

@ -1,231 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
#
# indexer.py
#
# Connects to a DSpace Solr statistics core and ingests views and downloads for
# communities, collections, and items into a PostgreSQL database.
#
# This script is written for Python 3.6+ and requires several modules that you
# can install with pip (I recommend using a Python virtual environment):
#
# $ pip install psycopg
#
# See: https://wiki.duraspace.org/display/DSPACE/Solr
import math
import psycopg
import requests
from .config import SOLR_SERVER
from .database import DatabaseManager
from .util import get_statistics_shards
def index_views(indexType: str, facetField: str):
# get total number of distinct facets for items with a minimum of 1 view,
# otherwise Solr returns all kinds of weird ids that are actually not in
# the database. Also, stats are expensive, but we need stats.calcdistinct
# so we can get the countDistinct summary to calculate how many pages of
# results we have.
#
# see: https://lucene.apache.org/solr/guide/6_6/the-stats-component.html
solr_query_params = {
"q": f"type:2 AND {facetField}:/.{{36}}/",
"fq": "-isBot:true AND statistics_type:view",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"facet.limit": 1,
"facet.offset": 0,
"stats": "true",
"stats.field": facetField,
"stats.calcdistinct": "true",
"shards": shards,
"rows": 0,
"wt": "json",
}
solr_url = SOLR_SERVER + "/statistics/select"
res = requests.get(solr_url, params=solr_query_params)
try:
# get total number of distinct facets (countDistinct)
results_totalNumFacets = res.json()["stats"]["stats_fields"][facetField][
"countDistinct"
]
except TypeError:
print(f"{indexType}: no views, exiting.")
exit(0)
# divide results into "pages" and round up to next integer
results_per_page = 100
results_num_pages = math.ceil(results_totalNumFacets / results_per_page)
results_current_page = 0
with DatabaseManager() as db:
with db.cursor() as cursor:
# create an empty list to store values for batch insertion
data = []
while results_current_page <= results_num_pages:
# "pages" are zero based, but one based is more human readable
print(
f"{indexType}: indexing views (page {results_current_page + 1} of {results_num_pages + 1})"
)
solr_query_params = {
"q": f"type:2 AND {facetField}:/.{{36}}/",
"fq": "-isBot:true AND statistics_type:view",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"facet.limit": results_per_page,
"facet.offset": results_current_page * results_per_page,
"shards": shards,
"rows": 0,
"wt": "json",
"json.nl": "map", # return facets as a dict instead of a flat list
}
res = requests.get(solr_url, params=solr_query_params)
# Solr returns facets as a dict of dicts (see json.nl parameter)
views = res.json()["facet_counts"]["facet_fields"]
# iterate over the facetField dict and get the ids and views
for id_, views in views[facetField].items():
data.append((id_, views))
# do a batch insert of values from the current "page" of results
sql = f"INSERT INTO {indexType}(id, views) VALUES (%s, %s) ON CONFLICT(id) DO UPDATE SET views=excluded.views"
cursor.executemany(sql, data)
db.commit()
# clear all items from the list so we can populate it with the next batch
data.clear()
results_current_page += 1
def index_downloads(indexType: str, facetField: str):
# get the total number of distinct facets for items with at least 1 download
solr_query_params = {
"q": f"type:0 AND {facetField}:/.{{36}}/",
"fq": "-isBot:true AND statistics_type:view AND bundleName:ORIGINAL",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"facet.limit": 1,
"facet.offset": 0,
"stats": "true",
"stats.field": facetField,
"stats.calcdistinct": "true",
"shards": shards,
"rows": 0,
"wt": "json",
}
solr_url = SOLR_SERVER + "/statistics/select"
res = requests.get(solr_url, params=solr_query_params)
try:
# get total number of distinct facets (countDistinct)
results_totalNumFacets = res.json()["stats"]["stats_fields"][facetField][
"countDistinct"
]
except TypeError:
print(f"{indexType}: no downloads, exiting.")
exit(0)
results_per_page = 100
results_num_pages = math.ceil(results_totalNumFacets / results_per_page)
results_current_page = 0
with DatabaseManager() as db:
with db.cursor() as cursor:
# create an empty list to store values for batch insertion
data = []
while results_current_page <= results_num_pages:
# "pages" are zero based, but one based is more human readable
print(
f"{indexType}: indexing downloads (page {results_current_page + 1} of {results_num_pages + 1})"
)
solr_query_params = {
"q": f"type:0 AND {facetField}:/.{{36}}/",
"fq": "-isBot:true AND statistics_type:view AND bundleName:ORIGINAL",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"facet.limit": results_per_page,
"facet.offset": results_current_page * results_per_page,
"shards": shards,
"rows": 0,
"wt": "json",
"json.nl": "map", # return facets as a dict instead of a flat list
}
res = requests.get(solr_url, params=solr_query_params)
# Solr returns facets as a dict of dicts (see json.nl parameter)
downloads = res.json()["facet_counts"]["facet_fields"]
# iterate over the facetField dict and get the item ids and downloads
for id_, downloads in downloads[facetField].items():
data.append((id_, downloads))
# do a batch insert of values from the current "page" of results
sql = f"INSERT INTO {indexType}(id, downloads) VALUES (%s, %s) ON CONFLICT(id) DO UPDATE SET downloads=excluded.downloads"
cursor.executemany(sql, data)
db.commit()
# clear all items from the list so we can populate it with the next batch
data.clear()
results_current_page += 1
with DatabaseManager() as db:
with db.cursor() as cursor:
# create table to store item views and downloads
cursor.execute(
"""CREATE TABLE IF NOT EXISTS items
(id UUID PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)"""
)
# create table to store community views and downloads
cursor.execute(
"""CREATE TABLE IF NOT EXISTS communities
(id UUID PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)"""
)
# create table to store collection views and downloads
cursor.execute(
"""CREATE TABLE IF NOT EXISTS collections
(id UUID PRIMARY KEY, views INT DEFAULT 0, downloads INT DEFAULT 0)"""
)
# commit the table creation before closing the database connection
db.commit()
shards = get_statistics_shards()
# Index views and downloads for items, communities, and collections. Here the
# first parameter is the type of indexing to perform, and the second parameter
# is the field to facet by in Solr's statistics to get this information.
index_views("items", "id")
index_views("communities", "owningComm")
index_views("collections", "owningColl")
index_downloads("items", "owningItem")
index_downloads("communities", "owningComm")
index_downloads("collections", "owningColl")
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,126 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
import requests
from .config import SOLR_SERVER
from .util import get_statistics_shards
def get_views(solr_date_string: str, elements: list, facetField: str):
"""
Get view statistics for a list of elements from Solr. Depending on the req-
uest this could be items, communities, or collections.
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
:parameter elements (list): a list of IDs
:parameter facetField (str): Solr field to facet by, for example "id"
:returns: A dict of IDs and views
"""
shards = get_statistics_shards()
# Join the UUIDs with "OR" and escape the hyphens for Solr
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
solr_query_params = {
"q": f"{facetField}:({solr_elements_string})",
"fq": f"type:2 AND -isBot:true AND statistics_type:view AND time:{solr_date_string}",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"shards": shards,
"rows": 0,
"wt": "json",
"json.nl": "map", # return facets as a dict instead of a flat list
}
solr_url = SOLR_SERVER + "/statistics/select"
res = requests.get(solr_url, params=solr_query_params)
# Create an empty dict to store views
data = {}
# Solr returns facets as a dict of dicts (see the json.nl parameter)
views = res.json()["facet_counts"]["facet_fields"]
# iterate over the facetField dict and ids and views
for id_, views in views[facetField].items():
# For items we can rely on Solr returning facets for the *only* the ids
# in our query, but for communities and collections, the owningComm and
# owningColl fields are multi-value so Solr will return facets with the
# values in our query as well as *any others* that happen to be present
# in the field (which looks like Solr returning unrelated results until
# you realize that the field is multi-value and this is correct).
#
# To work around this I make sure that each id in the returned dict are
# present in the elements list POSTed by the user.
if id_ in elements:
data[id_] = views
# Check if any ids have missing stats so we can set them to 0
if len(data) < len(elements):
# List comprehension to get a list of ids (keys) in the data
data_ids = [k for k, v in data.items()]
for element_id in elements:
if element_id not in data_ids:
data[element_id] = 0
continue
return data
def get_downloads(solr_date_string: str, elements: list, facetField: str):
"""
Get download statistics for a list of items from Solr. Depending on the req-
uest this could be items, communities, or collections.
:parameter solr_date_string (str): Solr date string, for example "[* TO *]"
:parameter elements (list): a list of IDs
:parameter facetField (str): Solr field to facet by, for example "id"
:returns: A dict of IDs and downloads
"""
shards = get_statistics_shards()
# Join the UUIDs with "OR" and escape the hyphens for Solr
solr_elements_string: str = " OR ".join(elements).replace("-", r"\-")
solr_query_params = {
"q": f"{facetField}:({solr_elements_string})",
"fq": f"type:0 AND -isBot:true AND statistics_type:view AND bundleName:ORIGINAL AND time:{solr_date_string}",
"fl": facetField,
"facet": "true",
"facet.field": facetField,
"facet.mincount": 1,
"shards": shards,
"rows": 0,
"wt": "json",
"json.nl": "map", # return facets as a dict instead of a flat list
}
solr_url = SOLR_SERVER + "/statistics/select"
res = requests.get(solr_url, params=solr_query_params)
# Create an empty dict to store downloads
data = {}
# Solr returns facets as a dict of dicts (see the json.nl parameter)
downloads = res.json()["facet_counts"]["facet_fields"]
# Iterate over the facetField dict and get the ids and downloads
for id_, downloads in downloads[facetField].items():
# Make sure that each id in the returned dict are present in the
# elements list POSTed by the user.
if id_ in elements:
data[id_] = downloads
# Check if any elements have missing stats so we can set them to 0
if len(data) < len(elements):
# List comprehension to get a list of ids (keys) in the data
data_ids = [k for k, v in data.items()]
for element_id in elements:
if element_id not in data_ids:
data[element_id] = 0
continue
return data
# vim: set sw=4 ts=4 expandtab:

View File

@ -1,193 +0,0 @@
# SPDX-License-Identifier: GPL-3.0-only
import datetime
import json
import re
import falcon
import requests
from .config import SOLR_SERVER
def get_statistics_shards():
"""Enumerate the cores in Solr to determine if statistics have been sharded into
yearly shards by DSpace's stats-util or not (for example: statistics-2018).
Returns:
str:A list of Solr statistics shards separated by commas.
"""
# Initialize an empty list for statistics core years
statistics_core_years = []
# URL for Solr status to check active cores
solr_query_params = {"action": "STATUS", "wt": "json"}
solr_url = SOLR_SERVER + "/admin/cores"
res = requests.get(solr_url, params=solr_query_params)
if res.status_code == requests.codes.ok:
data = res.json()
# Iterate over active cores from Solr's STATUS response (cores are in
# the status array of this response).
for core in data["status"]:
# Pattern to match, for example: statistics-2018
pattern = re.compile("^statistics-[0-9]{4}$")
if not pattern.match(core):
continue
# Append current core to list
statistics_core_years.append(core)
# Initialize a string to hold our shards (may end up being empty if the Solr
# core has not been processed by stats-util).
shards = str()
if len(statistics_core_years) > 0:
# Begin building a string of shards starting with the default one
shards = f"{SOLR_SERVER}/statistics"
for core in statistics_core_years:
# Create a comma-separated list of shards to pass to our Solr query
#
# See: https://wiki.apache.org/solr/DistributedSearch
shards += f",{SOLR_SERVER}/{core}"
# Return the string of shards, which may actually be empty. Solr doesn't
# seem to mind if the shards query parameter is empty and I haven't seen
# any negative performance impact so this should be fine.
return shards
def is_valid_date(date):
try:
# Solr date format is: 2020-01-01T00:00:00Z
# See: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
return True
except ValueError:
raise falcon.HTTPBadRequest(
title="Invalid parameter",
description=f"Invalid date format: {date}. The value must be in format: 2020-01-01T00:00:00Z.",
)
def validate_post_parameters(req, resp, resource, params):
"""Check the POSTed request parameters for the `/items`, `/communities` and
`/collections` endpoints.
Meant to be used as a `before` hook.
"""
# Only attempt to read the POSTed request if its length is not 0 (or
# rather, in the Python sense, if length is not a False-y value).
if req.content_length:
doc = json.load(req.bounded_stream)
else:
raise falcon.HTTPBadRequest(
title="Invalid request", description="Request body is empty."
)
# Parse date parameters from request body (will raise an HTTPBadRequest
# from is_valid_date() if any parameters are invalid)
if "dateFrom" in doc and is_valid_date(doc["dateFrom"]):
req.context.dateFrom = doc["dateFrom"]
else:
req.context.dateFrom = None
if "dateTo" in doc and is_valid_date(doc["dateTo"]):
req.context.dateTo = doc["dateTo"]
else:
req.context.dateTo = None
# Parse the limit parameter from the POST request body
if "limit" in doc:
if isinstance(doc["limit"], int) and 0 < doc["limit"] <= 100:
req.context.limit = doc["limit"]
else:
raise falcon.HTTPBadRequest(
title="Invalid parameter",
description='The "limit" parameter is invalid. The value must be an integer between 1 and 100.',
)
else:
req.context.limit = 100
# Parse the page parameter from the POST request body
if "page" in doc:
if isinstance(doc["page"], int) and doc["page"] >= 0:
req.context.page = doc["page"]
else:
raise falcon.HTTPBadRequest(
title="Invalid parameter",
description='The "page" parameter is invalid. The value must be at least 0.',
)
else:
req.context.page = 0
# Parse the list of elements from the POST request body
if req.context.statistics_scope in doc:
if (
isinstance(doc[req.context.statistics_scope], list)
and len(doc[req.context.statistics_scope]) > 0
):
req.context.elements = doc[req.context.statistics_scope]
else:
raise falcon.HTTPBadRequest(
title="Invalid parameter",
description=f'The "{req.context.statistics_scope}" parameter is invalid. The value must be a comma-separated list of UUIDs.',
)
else:
req.context.elements = []
def set_statistics_scope(req, resp, resource, params):
"""Set the statistics scope (item, collection, or community) of the request
as well as the appropriate database (for GET requests) and Solr facet fields
(for POST requests).
Meant to be used as a `before` hook.
"""
# Extract the scope from the request path. This is *guaranteed* to be one
# of the following values because we only send requests matching these few
# patterns to routes using this set_statistics_scope hook.
#
# Note: this regex is ordered so that "items" and "collections" match before
# "item" and "collection".
req.context.statistics_scope = re.findall(
r"^/(communities|community|collections|collection|items|item)", req.path
)[0]
# Set the correct database based on the statistics_scope. The database is
# used for all GET requests where statistics are returned directly from the
# database. In this case we can return early.
if req.method == "GET":
if re.findall(r"^(item|items)$", req.context.statistics_scope):
req.context.database = "items"
elif re.findall(r"^(community|communities)$", req.context.statistics_scope):
req.context.database = "communities"
elif re.findall(r"^(collection|collections)$", req.context.statistics_scope):
req.context.database = "collections"
# GET requests only need the scope and the database so we can return now
return
# If the current request is for a plural items, communities, or collections
# that includes a list of element ids POSTed with the request body then we
# need to set the Solr facet field so we can get the live results.
if req.method == "POST":
if req.context.statistics_scope == "items":
req.context.views_facet_field = "id"
req.context.downloads_facet_field = "owningItem"
elif req.context.statistics_scope == "communities":
req.context.views_facet_field = "owningComm"
req.context.downloads_facet_field = "owningComm"
elif req.context.statistics_scope == "collections":
req.context.views_facet_field = "owningColl"
req.context.downloads_facet_field = "owningColl"
# vim: set sw=4 ts=4 expandtab: