Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/8219.enhance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement all remaining TODOs in the resource handler file in the Manager API.
40 changes: 38 additions & 2 deletions docs/manager/rest-reference/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -7877,7 +7877,7 @@
}
},
"/resource/check-presets": {
"post": {
"get": {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you check why the open api schema generator changed this?

Copy link
Copy Markdown
Contributor Author

@kwonkwonn kwonkwonn Jan 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

c229676#diff-51adf4e1db91582bd045aea8a34e9565383e2c437c8cf7aa9eab64aebbdf9b3cR415-R416

    - cors.add(add_route("POST", "/check-presets", check_presets))
    + cors.add(add_route("GET", "/check-presets", check_presets))
    + cors.add(add_route("POST", "/check-presets", check_presets_legacy))

I guess that modifcation of OpenAPI would occured from here.
With intention of soft migration I've added Get handler for legacy Post handler which doesn't make sense As it doesn't modify any state of program.

but i still need to investigate why "post" were deleted instead of coexistencing.

"operationId": "resource.check_presets",
"tags": [
"resource"
Expand All @@ -7892,6 +7892,42 @@
"TokenAuth": []
}
],
"parameters": [
{
"name": "scaling_group",
"schema": {
"type": "string"
},
"required": true,
"in": "query"
},
{
"name": "group",
"schema": {
"type": "string",
"default": "default"
},
"required": true,
"in": "query"
}
],
"description": "\nReturns the list of all resource presets in the specified scaling group,\nwith additional information including allocatability of each preset,\namount of total remaining resources, and the current keypair resource limits.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: one of FROZEN, RUNNING\n"
},
"post": {
"operationId": "resource.check_presets_legacy",
"tags": [
"resource"
],
"responses": {
"200": {
"description": "Successful response"
}
},
"security": [
{
"TokenAuth": []
}
],
"requestBody": {
"content": {
"application/json": {
Expand All @@ -7915,7 +7951,7 @@
}
},
"parameters": [],
"description": "\nReturns the list of all resource presets in the current scaling group,\nwith additional information including allocatability of each preset,\namount of total remaining resources, and the current keypair resource limits.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: one of FROZEN, RUNNING\n"
"description": "\nReturns the list of all resource presets in the current scaling group,\nwith additional information including allocatability of each preset,\namount of total remaining resources, and the current keypair resource limits.\n\n.. deprecated::\n Use GET /check-presets instead. This POST endpoint is kept for backward compatibility.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: one of FROZEN, RUNNING\n"
}
},
"/resource/recalculate-usage": {
Expand Down
108 changes: 68 additions & 40 deletions src/ai/backend/manager/api/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@

import aiohttp_cors
import trafaret as t
import yarl
from aiohttp import web

from ai.backend.common import validators as tx
from ai.backend.common.types import LegacyResourceSlotState as ResourceSlotState
from ai.backend.logging import BraceStyleAdapter
from ai.backend.manager.errors.api import InvalidAPIParameters
from ai.backend.manager.services.agent.actions.get_watcher_status import GetWatcherStatusAction
from ai.backend.manager.services.agent.actions.recalculate_usage import RecalculateUsageAction
from ai.backend.manager.services.agent.actions.watcher_agent_restart import (
Expand Down Expand Up @@ -85,22 +83,78 @@ async def list_presets(request: web.Request) -> web.Response:
t.Key("group", default="default"): t.String,
})
)
async def check_presets(request: web.Request, params: Any) -> web.Response:
async def check_presets_legacy(request: web.Request, params: Any) -> web.Response:
"""
Returns the list of all resource presets in the current scaling group,
with additional information including allocatability of each preset,
amount of total remaining resources, and the current keypair resource limits.

.. deprecated::
Use GET /check-presets instead. This POST endpoint is kept for backward compatibility.
"""
root_ctx: RootContext = request.app["_root.context"]
access_key = request["keypair"]["access_key"]
resource_policy = request["keypair"]["resource_policy"]
domain_name = request["user"]["domain_name"]

log.info(
"CHECK_PRESETS (ak:{}, g:{}, sg:{})",
access_key,
params["group"],
params["scaling_group"],
)

result = await root_ctx.processors.resource_preset.check_presets.wait_for_complete(
CheckResourcePresetsAction(
access_key=access_key,
resource_policy=resource_policy,
domain_name=domain_name,
user_id=request["user"]["uuid"],
group=params["group"],
scaling_group=params["scaling_group"],
)
)

scaling_groups_json = {}
for sgname, sg_data in result.scaling_groups.items():
scaling_groups_json[sgname] = {
ResourceSlotState.OCCUPIED: sg_data[ResourceSlotState.OCCUPIED].to_json(),
ResourceSlotState.AVAILABLE: sg_data[ResourceSlotState.AVAILABLE].to_json(),
}

resp = {
"presets": result.presets,
"keypair_limits": result.keypair_limits.to_json(),
"keypair_using": result.keypair_using.to_json(),
"keypair_remaining": result.keypair_remaining.to_json(),
"group_limits": result.group_limits.to_json(),
"group_using": result.group_using.to_json(),
"group_remaining": result.group_remaining.to_json(),
"scaling_group_remaining": result.scaling_group_remaining.to_json(),
"scaling_groups": scaling_groups_json,
}

return web.json_response(resp, status=HTTPStatus.OK)


@server_status_required(READ_ALLOWED)
@auth_required
@check_api_params(
t.Dict({
t.Key("scaling_group"): t.String,
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scaling_group was previously optional (default None) and CheckResourcePresetsAction.scaling_group is typed as Optional[str]. Making it required here will reject existing requests that omit scaling_group and is inconsistent with list_presets() (which still treats it as optional). Restore the default/nullable validation for scaling_group and handle None in the action call.

Suggested change
t.Key("scaling_group"): t.String,
t.Key("scaling_group", default=None): t.Or(t.String, t.Null),

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modifying the existing schema doesn’t look ideal.
Although there’s a desire to enforce stricter value constraints, it’s not something we can change in the current API.
Introducing a new API for this purpose would be a better option.

Copy link
Copy Markdown
Contributor Author

@kwonkwonn kwonkwonn Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood
I will look for better way to handle it, while retainig a exsiting api handlers.

Another small question here:

cors.add(add_route("POST", "/check-presets", check_presets))

is being handled as "POST", although it doesn't effects current state,
best practice i think would be "GET" with parameter querier.

Current plan is adding new handler with "GET" with new function which we can use while transition.

t.Key("group", default="default"): t.String,
})
)
async def check_presets(request: web.Request, params: Any) -> web.Response:
"""
Returns the list of all resource presets in the specified scaling group,
with additional information including allocatability of each preset,
amount of total remaining resources, and the current keypair resource limits.
"""
root_ctx: RootContext = request.app["_root.context"]
try:
access_key = request["keypair"]["access_key"]
resource_policy = request["keypair"]["resource_policy"]
domain_name = request["user"]["domain_name"]
# TODO: uncomment when we implement scaling group.
# scaling_group = request.query.get('scaling_group')
# assert scaling_group is not None, 'scaling_group parameter is missing.'
except (json.decoder.JSONDecodeError, AssertionError) as e:
raise InvalidAPIParameters(extra_msg=str(e.args[0]))
access_key = request["keypair"]["access_key"]
resource_policy = request["keypair"]["resource_policy"]
domain_name = request["user"]["domain_name"]

log.info(
"CHECK_PRESETS (ak:{}, g:{}, sg:{})",
Expand All @@ -120,7 +174,6 @@ async def check_presets(request: web.Request, params: Any) -> web.Response:
)
)

# Convert ResourceSlot objects to JSON for API response
scaling_groups_json = {}
for sgname, sg_data in result.scaling_groups.items():
scaling_groups_json[sgname] = {
Expand Down Expand Up @@ -259,32 +312,6 @@ async def admin_month_stats(request: web.Request) -> web.Response:
return web.json_response(result.stats, status=HTTPStatus.OK)


# TODO: get_watcher_info overlaps with service-side method.
# Keeping it because it's used by vfolder.
async def get_watcher_info(request: web.Request, agent_id: str) -> dict:
"""
Get watcher information.

:return addr: address of agent watcher (eg: http://127.0.0.1:6009)
:return token: agent watcher token ("insecure" if not set in config server)
"""
root_ctx: RootContext = request.app["_root.context"]
token = root_ctx.config_provider.config.watcher.token
if token is None:
token = "insecure"
agent_ip = await root_ctx.etcd.get(f"nodes/agents/{agent_id}/ip")
raw_watcher_port = await root_ctx.etcd.get(
f"nodes/agents/{agent_id}/watcher_port",
)
watcher_port = 6099 if raw_watcher_port is None else int(raw_watcher_port)
# TODO: watcher scheme is assumed to be http
addr = yarl.URL(f"http://{agent_ip}:{watcher_port}")
return {
"addr": addr,
"token": token,
}


@server_status_required(READ_ALLOWED)
@superadmin_required
@check_api_params(
Expand Down Expand Up @@ -385,7 +412,8 @@ def create_app(
add_route = app.router.add_route
cors.add(add_route("GET", "/presets", list_presets))
cors.add(add_route("GET", "/container-registries", get_container_registries))
cors.add(add_route("POST", "/check-presets", check_presets))
cors.add(add_route("GET", "/check-presets", check_presets))
cors.add(add_route("POST", "/check-presets", check_presets_legacy))
cors.add(add_route("POST", "/recalculate-usage", recalculate_usage))
cors.add(add_route("GET", "/usage/month", usage_per_month))
cors.add(add_route("GET", "/usage/period", usage_per_period))
Expand Down
30 changes: 29 additions & 1 deletion src/ai/backend/manager/api/vfolder.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import attrs
import sqlalchemy as sa
import trafaret as t
import yarl
from aiohttp import web
from pydantic import (
AliasChoices,
Expand All @@ -48,7 +49,6 @@
VFolderUsageMode,
)
from ai.backend.logging import BraceStyleAdapter
from ai.backend.manager.api.resource import get_watcher_info
from ai.backend.manager.data.agent.types import AgentStatus
from ai.backend.manager.data.kernel.types import KernelStatus
from ai.backend.manager.data.model_serving.types import EndpointLifecycle
Expand Down Expand Up @@ -156,6 +156,34 @@
P = ParamSpec("P")


# NOTE: This function duplicates AgentService._get_watcher_info.
# Both access etcd directly for agent watcher information.
# Consider consolidating into a shared client/repository to eliminate
# this duplication and the direct etcd access pattern.
async def get_watcher_info(request: web.Request, agent_id: str) -> dict:
"""
Get watcher information.

:return addr: address of agent watcher (eg: http://127.0.0.1:6009)
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring example uses port 6009, but the default watcher port in this function is 6099. Please align the example with the actual default to avoid confusion during ops/debugging.

Suggested change
:return addr: address of agent watcher (eg: http://127.0.0.1:6009)
:return addr: address of agent watcher (eg: http://127.0.0.1:6099)

Copilot uses AI. Check for mistakes.
:return token: agent watcher token ("insecure" if not set in config server)
"""
root_ctx: RootContext = request.app["_root.context"]
token = root_ctx.config_provider.config.watcher.token
if token is None:
token = "insecure"
agent_ip = await root_ctx.etcd.get(f"nodes/agents/{agent_id}/ip")
raw_watcher_port = await root_ctx.etcd.get(
f"nodes/agents/{agent_id}/watcher_port",
)
watcher_port = 6099 if raw_watcher_port is None else int(raw_watcher_port)
# TODO: watcher scheme is assumed to be http
addr = yarl.URL(f"http://{agent_ip}:{watcher_port}")

Check warning

Code scanning / devskim

An HTTP-based URL without TLS was detected. Warning

Insecure URL
return {
"addr": addr,
"token": token,
}


class SuccessResponseModel(LegacyBaseResponseModel):
success: bool = Field(default=True)

Expand Down
Loading