Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
c81baa1
wip
eikek Jan 16, 2026
8a43f74
more wip
eikek Jan 20, 2026
f67f639
record certain data in a resource_request_log table
eikek Jan 21, 2026
ce92bd4
Rename, add disk requests
eikek Jan 21, 2026
960d91d
fix bug and tests
eikek Jan 22, 2026
701c602
add more data
eikek Jan 22, 2026
d6a34ec
Fix table def, add 'kind' column
eikek Jan 22, 2026
065bcdd
Get cluster id from k8s object
eikek Jan 22, 2026
faefbf5
add since time
eikek Jan 22, 2026
4e7aec2
logging, query explore
eikek Jan 23, 2026
1e350eb
query wip
eikek Jan 28, 2026
ab00425
wip. view and data
eikek Jan 29, 2026
23bde6c
test view
eikek Jan 30, 2026
f17536d
usage query
eikek Jan 30, 2026
3addf90
setting limits on a resource pool
eikek Feb 4, 2026
293ce7f
consolidate migrations, add missing data, add resource_class costs
eikek Feb 6, 2026
cc5f9f4
extract node name
eikek Feb 6, 2026
79d1ec5
get gpu info and add resource class cost to snapshot
eikek Feb 10, 2026
92b2604
Get resource_pool_id from annotations
eikek Feb 10, 2026
e6522e0
re-enable test kubeconfig
eikek Feb 10, 2026
7d6839e
disable false warnings
eikek Feb 17, 2026
6e446e7
Add new module to pyprojects
eikek Feb 17, 2026
8925ef8
Remove note files
eikek Feb 17, 2026
e8b4c3e
try "just sql"
eikek Feb 18, 2026
269218b
try to make alembic happy
eikek Feb 18, 2026
530c1fd
lint issues
eikek Feb 18, 2026
de06a4d
add try-catch to ignore node info if not possible to get
eikek Feb 18, 2026
72cfeca
starting endpoints (seem not to register correctly)
eikek Feb 18, 2026
e80abe8
endpoints are responding
eikek Feb 18, 2026
ff84a02
resource class cost
eikek Feb 18, 2026
741110a
endpoints for querying usage data
eikek Feb 19, 2026
61eb219
doc string
eikek Feb 20, 2026
68a2b7b
tweak db code and endpoints
eikek Feb 20, 2026
d1619d0
whitespace
eikek Feb 20, 2026
f499162
fix alembic
eikek Feb 20, 2026
658a606
whitespace again
eikek Feb 20, 2026
b4ac6c1
remove empty change, quick test for gpu_count and product
eikek Feb 20, 2026
ae40d8b
Remove unused code
eikek Feb 20, 2026
fd8c928
pod should be capitalized
eikek Feb 24, 2026
3edfe03
Amend resource_class and resource_pool from amalthea session
eikek Feb 25, 2026
ebe9d8a
Change endpoints paths
eikek Feb 25, 2026
f0b8f72
Allow to query by start- and end-date
eikek Feb 25, 2026
9ced9fd
Flag for enabling/disabling resource request tracking
eikek Feb 25, 2026
0c4a037
filter only amalthea session pods
eikek Feb 25, 2026
568de0f
fix test
eikek Feb 26, 2026
9c7938f
Implement some review suggestions
eikek Feb 26, 2026
e41a62d
implement review suggestions
eikek Feb 27, 2026
ec0b587
Merge fixes
eikek Mar 4, 2026
358edab
fix alembic
eikek Mar 4, 2026
b1cdd34
run make lock again
eikek Mar 4, 2026
e29e385
Disable resource request tracking by default
eikek Mar 4, 2026
a21e7c7
Try fix error response api spec so that merging succeeds
eikek Mar 4, 2026
0f5d22e
fix api.spec.yaml
eikek Mar 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ API_SPECS := \
components/renku_data_services/data_connectors/apispec.py \
components/renku_data_services/search/apispec.py \
components/renku_data_services/notifications/apispec.py \
components/renku_data_services/capacity_reservation/apispec.py
components/renku_data_services/capacity_reservation/apispec.py \
components/renku_data_services/resource_usage/apispec.py

schemas: ${API_SPECS} ## Generate pydantic classes from apispec yaml files
@echo "generated classes based on ApiSpec"
Expand Down
9 changes: 9 additions & 0 deletions bases/renku_data_services/data_api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from renku_data_services.platform.blueprints import PlatformConfigBP, PlatformUrlRedirectBP
from renku_data_services.project.blueprints import ProjectsBP, ProjectSessionSecretBP
from renku_data_services.repositories.blueprints import RepositoriesBP
from renku_data_services.resource_usage.blueprints import ResourceUsageBP
from renku_data_services.search.blueprints import SearchBP
from renku_data_services.search.reprovision import SearchReprovision
from renku_data_services.search.solr_user_query import UsernameResolve
Expand Down Expand Up @@ -273,6 +274,13 @@ def register_all_handlers(app: Sanic, dm: DependencyManager) -> Sanic:
occurrence_repo=dm.occurrence_repo,
authenticator=dm.authenticator,
)
resource_usage = ResourceUsageBP(
name="resource_usage",
url_prefix=url_prefix,
rr_repo=dm.resource_requests_repo,
rr_svc=dm.resource_usage_service,
authenticator=dm.authenticator,
)
app.blueprint(
[
resource_pools.blueprint(),
Expand Down Expand Up @@ -302,6 +310,7 @@ def register_all_handlers(app: Sanic, dm: DependencyManager) -> Sanic:
platform_redirects.blueprint(),
notifications.blueprint(),
capacity_reservation.blueprint(),
resource_usage.blueprint(),
]
)
if builds is not None:
Expand Down
14 changes: 14 additions & 0 deletions bases/renku_data_services/data_api/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
DataConnectorSecretRepository,
)
from renku_data_services.git.gitlab import DummyGitlabAPI, EmptyGitlabAPI, GitlabAPI
from renku_data_services.k8s.client_interfaces import K8sClient
from renku_data_services.k8s.clients import (
K8sClusterClientsPool,
K8sPriorityClassClient,
Expand All @@ -62,6 +63,8 @@
ProjectSessionSecretRepository,
)
from renku_data_services.repositories.db import GitRepositoriesRepository
from renku_data_services.resource_usage.core import ResourceUsageService
from renku_data_services.resource_usage.db import ResourceRequestsRepo
from renku_data_services.search import query_manual
from renku_data_services.search.db import SearchUpdatesRepo
from renku_data_services.search.reprovision import SearchReprovision
Expand Down Expand Up @@ -118,6 +121,7 @@ class DependencyManager:

config: Config

k8s_client: K8sClient
user_store: base_models.UserStore
authenticator: base_models.Authenticator
gitlab_authenticator: base_models.Authenticator
Expand Down Expand Up @@ -158,6 +162,8 @@ class DependencyManager:
oauth_http_client_factory: OAuthHttpClientFactory
capacity_reservation_repo: CapacityReservationRepository
occurrence_repo: OccurrenceRepository
resource_requests_repo: ResourceRequestsRepo
resource_usage_service: ResourceUsageService

spec: dict[str, Any] = field(init=False, repr=False, default_factory=dict)
app_name: str = "renku_data_services"
Expand Down Expand Up @@ -187,6 +193,7 @@ def load_apispec() -> dict[str, Any]:
renku_data_services.search.__file__,
renku_data_services.notifications.__file__,
renku_data_services.capacity_reservation.__file__,
renku_data_services.resource_usage.__file__,
]

api_specs = []
Expand Down Expand Up @@ -417,8 +424,13 @@ def from_env(cls) -> DependencyManager:
occurrence_repo = OccurrenceRepository(
session_maker=config.db.async_session_maker,
)
resource_requests_repo = ResourceRequestsRepo(
session_maker=config.db.async_session_maker,
)
resource_usage_service = ResourceUsageService(repo=resource_requests_repo)
return cls(
config,
k8s_client=client,
authenticator=authenticator,
gitlab_authenticator=gitlab_authenticator,
gitlab_client=gitlab_client,
Expand Down Expand Up @@ -459,4 +471,6 @@ def from_env(cls) -> DependencyManager:
oauth_http_client_factory=oauth_http_client_factory,
capacity_reservation_repo=capacity_reservation_repo,
occurrence_repo=occurrence_repo,
resource_requests_repo=resource_requests_repo,
resource_usage_service=resource_usage_service,
)
7 changes: 5 additions & 2 deletions bases/renku_data_services/data_tasks/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class Config:
posthog: PosthogConfig
authz: AuthzConfig
keycloak: KeycloakConfig | None
k8s_config_root: str
dummy_stores: bool
max_retry_wait_seconds: int
main_log_interval_seconds: int
Expand All @@ -50,7 +51,7 @@ class Config:
x_short_task_period_s: int
short_task_period_s: int
long_task_period_s: int
k8s_config_root: str
enable_resource_request_tracking: bool

@classmethod
def from_env(cls) -> Config:
Expand All @@ -71,6 +72,7 @@ def from_env(cls) -> Config:

k8s_config_root = os.environ.get("K8S_CONFIG_ROOT", "/secrets/kube_configs")

enable_resource_request_tracking = os.environ.get("ENABLE_RESOURCE_REQUEST_TRACKING", "false").lower() == "true"
authz = AuthzConfig.from_env()

keycloak = None if dummy_stores else KeycloakConfig.from_env()
Expand All @@ -82,11 +84,12 @@ def from_env(cls) -> Config:
posthog=posthog_config,
authz=authz,
keycloak=keycloak,
k8s_config_root=k8s_config_root,
tcp_host=tcp_host,
tcp_port=tcp_port,
x_short_task_period_s=x_short_task_period,
short_task_period_s=short_task_period,
long_task_period_s=long_task_period,
k8s_config_root=k8s_config_root,
dummy_stores=dummy_stores,
enable_resource_request_tracking=enable_resource_request_tracking,
)
28 changes: 25 additions & 3 deletions bases/renku_data_services/data_tasks/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,28 @@

from dataclasses import dataclass

from renku_data_services.app_config import logging
from renku_data_services.authz.authz import Authz
from renku_data_services.capacity_reservation.db import CapacityReservationRepository, OccurrenceRepository
from renku_data_services.capacity_reservation.k8s_client import CapacityReservationK8sClient
from renku_data_services.capacity_reservation.tasks import CapacityReservationTasks
from renku_data_services.crc.db import ClusterRepository
from renku_data_services.data_tasks.config import Config
from renku_data_services.k8s.clients import K8sClusterClientsPool
from renku_data_services.k8s.config import KubeConfigEnv
from renku_data_services.k8s.config import KubeConfigEnv, get_clusters
from renku_data_services.k8s.db import K8sDbCache
from renku_data_services.metrics.core import StagingMetricsService
from renku_data_services.metrics.db import MetricsRepository
from renku_data_services.namespace.db import GroupRepository
from renku_data_services.notebooks.config import get_clusters
from renku_data_services.notebooks.constants import AMALTHEA_SESSION_GVK
from renku_data_services.project.db import ProjectRepository
from renku_data_services.resource_usage.core import (
DefaultResourcesRequestRecorder,
NoopResourcesRequestRecorder,
ResourceRequestsFetch,
ResourcesRequestRecorder,
)
from renku_data_services.resource_usage.db import ResourceRequestsRepo
from renku_data_services.search.db import SearchUpdatesRepo
from renku_data_services.session.db import SessionRepository
from renku_data_services.session.tasks import SessionTasks
Expand All @@ -25,6 +32,8 @@
from renku_data_services.users.kc_api import IKeycloakAPI, KeycloakAPI
from renku_data_services.users.models import UnsavedUserInfo

logger = logging.getLogger(__file__)


@dataclass
class DependencyManager:
Expand All @@ -40,6 +49,7 @@ class DependencyManager:
kc_api: IKeycloakAPI
session_tasks: SessionTasks
capacity_reservation_tasks: CapacityReservationTasks
resource_requests_recorder: ResourcesRequestRecorder

@classmethod
def from_env(cls, cfg: Config | None = None) -> "DependencyManager":
Expand Down Expand Up @@ -83,10 +93,11 @@ def from_env(cls, cfg: Config | None = None) -> "DependencyManager":
session_tasks = SessionTasks(session_environment_repo=session_environment_repo)
cluster_repo = ClusterRepository(session_maker=cfg.db.async_session_maker)
k8s_db_cache = K8sDbCache(cfg.db.async_session_maker)
default_kubeconfig = KubeConfigEnv()
k8s_client = K8sClusterClientsPool(
lambda: get_clusters(
kube_conf_root_dir=cfg.k8s_config_root,
default_kubeconfig=KubeConfigEnv(),
default_kubeconfig=default_kubeconfig,
cluster_repo=cluster_repo,
cache=k8s_db_cache,
kinds_to_cache=[AMALTHEA_SESSION_GVK],
Expand All @@ -100,6 +111,16 @@ def from_env(cls, cfg: Config | None = None) -> "DependencyManager":
),
k8s_client=cr_k8s_client,
)
Comment thread
eikek marked this conversation as resolved.

resource_requests_recorder: ResourcesRequestRecorder
if cfg.enable_resource_request_tracking:
resource_requests_recorder = DefaultResourcesRequestRecorder(
repo=ResourceRequestsRepo(cfg.db.async_session_maker), fetch=ResourceRequestsFetch(k8s_client)
)
else:
logger.warning("Resource request tracking is disabled!")
resource_requests_recorder = NoopResourcesRequestRecorder()

kc_api: IKeycloakAPI
if cfg.dummy_stores:
dummy_users = [
Expand Down Expand Up @@ -127,4 +148,5 @@ def from_env(cls, cfg: Config | None = None) -> "DependencyManager":
kc_api=kc_api,
session_tasks=session_tasks,
capacity_reservation_tasks=capacity_reservation_tasks,
resource_requests_recorder=resource_requests_recorder,
)
10 changes: 10 additions & 0 deletions bases/renku_data_services/data_tasks/task_defs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""The task definitions in form of coroutines."""

import asyncio
from datetime import timedelta

from authzed.api.v1 import (
Consistency,
Expand Down Expand Up @@ -433,6 +434,14 @@ async def cleanup_orphaned_capacity_reservations(dm: DependencyManager) -> None:
await asyncio.sleep(dm.config.x_short_task_period_s)


async def record_resource_requests(dm: DependencyManager) -> None:
"""Periodically record all resource requests."""
interval_seconds = 600
while True:
await dm.resource_requests_recorder.record_resource_requests(timedelta(seconds=interval_seconds))
await asyncio.sleep(interval_seconds)


def all_tasks(dm: DependencyManager) -> TaskDefininions:
"""A dict of task factories to be managed in main."""
# Impl. note: We pass the entire config to the coroutines, because
Expand All @@ -457,5 +466,6 @@ def all_tasks(dm: DependencyManager) -> TaskDefininions:
"activate_capacity_reservations": lambda: activate_capacity_reservations(dm),
"monitor_capacity_reservations": lambda: monitor_capacity_reservations(dm),
"cleanup_orphaned_capacity_reservations": lambda: cleanup_orphaned_capacity_reservations(dm),
"record_resource_requests": lambda: record_resource_requests(dm),
}
)
2 changes: 2 additions & 0 deletions components/renku_data_services/migrations/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from renku_data_services.notifications.orm import BaseORM as notifications
from renku_data_services.platform.orm import BaseORM as platform
from renku_data_services.project.orm import BaseORM as project
from renku_data_services.resource_usage.orm import BaseORM as resource_usage
from renku_data_services.search.orm import BaseORM as search
from renku_data_services.secrets.orm import BaseORM as secrets
from renku_data_services.session.orm import BaseORM as sessions
Expand All @@ -37,6 +38,7 @@
sessions.metadata,
storage.metadata,
users.metadata,
resource_usage.metadata,
]

run_migrations(all_metadata)
2 changes: 1 addition & 1 deletion components/renku_data_services/migrations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def include_object(
compare_to: SchemaItem | None,
) -> bool:
"""Prevents from alembic migrating the alembic_version tables."""
return type_ != "table" or name != "alembic_version"
return type_ != "table" or (name != "alembic_version" and name != "resource_requests_view")
Comment thread
olevski marked this conversation as resolved.


def combine_version_tables(conn: Connection, metadata_schema: str | None) -> None:
Expand Down
Loading