From 0c71dcd219a72a44c1f6885f989ada983f209685 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 20:13:16 +0530 Subject: [PATCH 1/6] feat(schema/pg): 0027 playground_session.rendered_messages Adds a structured rendered form alongside the existing rendered_prompt text. Trace replay and re-dispatch read the structured column so the message list round-trips byte-for-byte; the human-readable text stays for the trace UI's existing display path. Backfills old rows as [{role: human, content: }] so the column can be NOT NULL. Companion to docs/superpowers/specs/2026-06-07-playground-messages-redesign-design.md. Signed-off-by: gaurav0107 --- ...7_playground_session_rendered_messages.sql | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 schemas/postgres/migrations/0027_playground_session_rendered_messages.sql diff --git a/schemas/postgres/migrations/0027_playground_session_rendered_messages.sql b/schemas/postgres/migrations/0027_playground_session_rendered_messages.sql new file mode 100644 index 0000000..aa9f4ac --- /dev/null +++ b/schemas/postgres/migrations/0027_playground_session_rendered_messages.sql @@ -0,0 +1,33 @@ +-- 0027_playground_session_rendered_messages.sql +-- Add a structured form of the rendered prompt to playground_session. +-- The existing rendered_prompt text stays as a human-readable +-- newline-joined view (used by the trace UI today); the new +-- rendered_messages jsonb is what replay / re-dispatch will read so +-- the message structure round-trips exactly. + +begin; + +alter table playground_session + add column rendered_messages jsonb; + +-- Backfill: existing sessions wrapped as a single human message so the +-- column is never null going forward. This preserves the meaning of +-- old sessions whose rendered_prompt was a single concatenated string. +update playground_session + set rendered_messages = jsonb_build_array( + jsonb_build_object('role', 'human', 'content', rendered_prompt) + ) + where rendered_messages is null; + +alter table playground_session + alter column rendered_messages set not null; + +alter table playground_session + add constraint playground_session_rendered_messages_nonempty + check (jsonb_typeof(rendered_messages) = 'array' + and jsonb_array_length(rendered_messages) > 0); + +insert into schema_migrations (version) values ('0027_playground_session_rendered_messages') +on conflict (version) do nothing; + +commit; From e04448fb0f33bb249da44edfb5bf977d451e369b Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 20:22:58 +0530 Subject: [PATCH 2/6] feat(api): _render_messages helper for structured prompts Sister to _render_template - applies {{ var }} substitution per message body. Missing variables render as empty string (spec decision 9) so the user can iterate without the renderer fighting them; non-strings serialize via json.dumps for parity with the legacy single-string path. Returns a fresh list; never mutates input. Used by the next-task playground POST handler when the body carries raw_messages. Signed-off-by: gaurav0107 --- .../unit/test_playground_render_messages.py | 94 +++++++++++++++++++ .../api/tracebility_api/routers/playground.py | 48 ++++++++-- 2 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 services/api/tests/unit/test_playground_render_messages.py diff --git a/services/api/tests/unit/test_playground_render_messages.py b/services/api/tests/unit/test_playground_render_messages.py new file mode 100644 index 0000000..582c62a --- /dev/null +++ b/services/api/tests/unit/test_playground_render_messages.py @@ -0,0 +1,94 @@ +"""Per-message Jinja-style substitution: each message's content is rendered +against the same variable dict; roles are preserved verbatim. + +Spec decision 9: missing variables render as empty string. The legacy +single-string render path (_render_template) preserves {{ x }} when +unfilled — _render_messages diverges intentionally.""" + +from __future__ import annotations + +from tracebility_api.routers.playground import ( + Message, + _render_messages, +) + + +def test_renders_variables_per_message(): + msgs = [ + Message(role="system", content="You are a {{ tone }} assistant."), + Message(role="human", content="Summarize: {{ doc }}"), + ] + out = _render_messages(msgs, {"tone": "terse", "doc": "lorem ipsum"}) + assert out == [ + Message(role="system", content="You are a terse assistant."), + Message(role="human", content="Summarize: lorem ipsum"), + ] + + +def test_missing_variable_renders_empty(): + """Per spec decision 9: a placeholder whose key is absent from the + variables dict renders as the empty string (diverges from + _render_template, which preserves the literal `{{ x }}`).""" + msgs = [Message(role="human", content="Echo: {{ x }}")] + out = _render_messages(msgs, {}) + assert out == [Message(role="human", content="Echo: ")] + + +def test_no_variables_passes_through(): + msgs = [ + Message(role="system", content="static prompt"), + Message(role="human", content="hi"), + ] + assert _render_messages(msgs, {"unused": "value"}) == msgs + + +def test_returns_new_list_does_not_mutate_input(): + msgs = [Message(role="human", content="{{ x }}")] + out = _render_messages(msgs, {"x": "y"}) + assert out is not msgs + assert msgs[0].content == "{{ x }}" # original untouched + + +def test_non_string_value_serializes_via_json(): + """Match _render_template's value-coercion behavior so saved prompts + render the same whether the user picks the legacy single-string or + the new structured path during the deprecation window.""" + msgs = [Message(role="human", content="ctx={{ ctx }}")] + out = _render_messages(msgs, {"ctx": {"a": 1}}) + assert out == [Message(role="human", content='ctx={"a": 1}')] + + +def test_repeated_variable_in_one_content(): + """Both occurrences are substituted; re.sub default replaces all.""" + msgs = [Message(role="human", content="{{ x }} and {{ x }}")] + assert _render_messages(msgs, {"x": "hi"}) == [Message(role="human", content="hi and hi")] + + +def test_whitespace_around_placeholder(): + """The regex tolerates `\\s*` on either side of the var name; both + {{x}} and {{ x }} resolve identically.""" + msgs = [Message(role="human", content="a={{x}} b={{ x }}")] + out = _render_messages(msgs, {"x": "1"}) + assert out == [Message(role="human", content="a=1 b=1")] + + +def test_same_var_across_multiple_messages(): + """A single variables dict is applied to every message in order.""" + msgs = [ + Message(role="system", content="tone: {{ tone }}"), + Message(role="human", content="again, tone: {{ tone }}"), + ] + out = _render_messages(msgs, {"tone": "terse"}) + assert [m.content for m in out] == [ + "tone: terse", + "again, tone: terse", + ] + + +def test_returns_fresh_message_objects(): + """Pydantic equality is value-based; assert object identity too so a + future shortcut that returns the input message unchanged on a no-op + render would still trip the no-mutation contract.""" + msgs = [Message(role="human", content="static")] + out = _render_messages(msgs, {}) + assert out[0] is not msgs[0] diff --git a/services/api/tracebility_api/routers/playground.py b/services/api/tracebility_api/routers/playground.py index de3dbf9..4084b0a 100644 --- a/services/api/tracebility_api/routers/playground.py +++ b/services/api/tracebility_api/routers/playground.py @@ -45,6 +45,7 @@ from .. import audit from ..auth import Principal, assert_workspace_role, require_user from ..clickhouse_client import ClickHouseQuery +from .prompts import Message log = structlog.get_logger("tracebility.api.playground") @@ -408,22 +409,55 @@ async def _resolve_template( return body.raw_template, None +def _coerce_var_value(value: Any) -> str: + """Stringify a variable value the same way for both render paths. + + Strings pass through. Other types serialize via json.dumps so dicts + and lists round-trip as readable JSON. Falls back to str() on + json-incompatible objects (datetimes etc.) so we never crash mid-render. + """ + if isinstance(value, str): + return value + try: + return _json.dumps(value) + except (TypeError, ValueError): + return str(value) + + def _render_template(template: str, variables: dict[str, Any]) -> str: def _repl(match: re.Match[str]) -> str: key = match.group(1) if key not in variables: return match.group(0) - value = variables[key] - if isinstance(value, str): - return value - try: - return _json.dumps(value) - except (TypeError, ValueError): - return str(value) + return _coerce_var_value(variables[key]) return _VAR_RE.sub(_repl, template) +def _render_messages( + messages: list[Message], + variables: dict[str, Any], +) -> list[Message]: + """Render `{{ var }}` substitutions in each message's content. + + Missing variables render as empty string per spec decision 9 — the + user can iterate without the renderer fighting them. Non-string + values serialize via json.dumps (same as _render_template) so a dict + or list passed as a variable round-trips as readable JSON. + + Returns a fresh list of new Message objects — never mutates the + input list or its contents. + """ + + def _repl(match: re.Match[str]) -> str: + key = match.group(1) + if key not in variables: + return "" # spec decision 9: missing var -> empty string + return _coerce_var_value(variables[key]) + + return [Message(role=m.role, content=_VAR_RE.sub(_repl, m.content)) for m in messages] + + async def _dispatch_stub(model: str, prompt: str) -> dict[str, Any]: """Deterministic echo for tests + the no-key smoke path. The 'stub' provider bypasses the LiteLLM gateway entirely.""" From a883ea1414724d1b71e59924db20682c2f131e70 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 20:31:02 +0530 Subject: [PATCH 3/6] feat(api): prompt 'human' -> dispatch 'user' role mapping The two surfaces use different role vocabularies (LangSmith: system / human vs. provider: system / user / assistant / tool). One small mapper bridges them; system passes through. Used in the next task by the playground POST handler. The role translation lives in a single dict so adding ai / tool support later (spec decision 2 deferral) is a one-line change. Signed-off-by: gaurav0107 --- .../unit/test_playground_role_mapping.py | 51 +++++++++++++++++++ .../api/tracebility_api/routers/playground.py | 43 ++++++++++++++-- 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 services/api/tests/unit/test_playground_role_mapping.py diff --git a/services/api/tests/unit/test_playground_role_mapping.py b/services/api/tests/unit/test_playground_role_mapping.py new file mode 100644 index 0000000..38e7847 --- /dev/null +++ b/services/api/tests/unit/test_playground_role_mapping.py @@ -0,0 +1,51 @@ +"""Prompt-side `human` <-> dispatch-side `user`. System passes through. + +Two Message types live in the codebase: the prompt-side pydantic model +(LangSmith vocabulary: system / human) and the dispatch-side dataclass +(provider vocabulary: system / user / assistant / tool). _to_dispatch_messages +bridges them. AI / tool roles are deferred (spec decision 2). +""" + +from __future__ import annotations + +from tracebility_api.llm import Message as DispatchMessage +from tracebility_api.routers.playground import ( + Message, + _to_dispatch_messages, +) + + +def test_human_maps_to_user(): + out = _to_dispatch_messages([Message(role="human", content="hi")]) + assert out == [DispatchMessage(role="user", content="hi")] + + +def test_system_passes_through(): + out = _to_dispatch_messages([Message(role="system", content="be terse")]) + assert out == [DispatchMessage(role="system", content="be terse")] + + +def test_full_conversation_order_preserved(): + out = _to_dispatch_messages( + [ + Message(role="system", content="be terse"), + Message(role="human", content="hi"), + ] + ) + assert [m.role for m in out] == ["system", "user"] + assert [m.content for m in out] == ["be terse", "hi"] + + +def test_empty_list_returns_empty_list(): + """Edge case: pydantic doesn't enforce non-empty at the prompt-side + Message level; the API's xor validator does. The mapper itself is + safe on empty input.""" + assert _to_dispatch_messages([]) == [] + + +def test_constructs_dispatch_message_instances(): + """The mapper returns DispatchMessage objects, not the prompt-side + Message it received. Locks in the return-type contract.""" + src = [Message(role="human", content="hi")] + out = _to_dispatch_messages(src) + assert isinstance(out[0], DispatchMessage) diff --git a/services/api/tracebility_api/routers/playground.py b/services/api/tracebility_api/routers/playground.py index 4084b0a..079bbee 100644 --- a/services/api/tracebility_api/routers/playground.py +++ b/services/api/tracebility_api/routers/playground.py @@ -33,8 +33,9 @@ import re import time import uuid +from collections.abc import Mapping from datetime import UTC, datetime -from typing import Any +from typing import Any, Literal from uuid import UUID import asyncpg @@ -45,6 +46,7 @@ from .. import audit from ..auth import Principal, assert_workspace_role, require_user from ..clickhouse_client import ClickHouseQuery +from ..llm import Message as DispatchMessage from .prompts import Message log = structlog.get_logger("tracebility.api.playground") @@ -218,7 +220,7 @@ async def create_session( if provider == "stub": result_dict = await _dispatch_stub(body.model, rendered) else: - from ..llm import DispatchError, Message + from ..llm import DispatchError from ..llm import dispatch as gateway_dispatch try: @@ -228,7 +230,7 @@ async def create_session( surface="playground", surface_ref_id=session_id, model=f"{provider}/{body.model.removeprefix(provider + '/')}", - messages=[Message(role="user", content=rendered)], + messages=[DispatchMessage(role="user", content=rendered)], temperature=body.temperature, max_tokens=body.max_tokens or _DEFAULT_MAX_TOKENS, ) @@ -458,6 +460,41 @@ def _repl(match: re.Match[str]) -> str: return [Message(role=m.role, content=_VAR_RE.sub(_repl, m.content)) for m in messages] +# Single source of truth for the prompt -> dispatch role translation. +# When AI / tool roles land (spec decision 2 deferral), extend this dict +# and the prompt-side Message Literal in routers/prompts.py together. +_PROMPT_TO_DISPATCH_ROLE: Mapping[ + Literal["system", "human"], + Literal["system", "user", "assistant", "tool"], +] = { + "system": "system", + "human": "user", +} + + +def _to_dispatch_messages(messages: list[Message]) -> list[DispatchMessage]: + """Convert prompt-side roles to dispatcher roles. + + Prompt side: `system` | `human` (LangSmith vocabulary). + Dispatch side: `system` | `user` | `assistant` | `tool` (provider + vocabulary, what LiteLLM expects). + + The only translation today is `human` -> `user`; system passes + through. AI / tool roles are deferred (spec decision 2). When they + land, extend `_PROMPT_TO_DISPATCH_ROLE` AND the prompt-side + `Message.role` Literal in `routers/prompts.py` in the same change — + the dispatch-side dataclass already accepts assistant / tool, so + only the prompt side and the bridging table need updating. + """ + return [ + DispatchMessage( + role=_PROMPT_TO_DISPATCH_ROLE[m.role], + content=m.content, + ) + for m in messages + ] + + async def _dispatch_stub(model: str, prompt: str) -> dict[str, Any]: """Deterministic echo for tests + the no-key smoke path. The 'stub' provider bypasses the LiteLLM gateway entirely.""" From 85a8b83e824a0edf4963d01c0e612349723c3f99 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 20:38:44 +0530 Subject: [PATCH 4/6] feat(api): PlaygroundCreate accepts raw_messages with xor validator Adds the structured request shape; the legacy raw_template field stays for one release of back-compat. A strict xor validator across prompt_version_id / raw_template / raw_messages enforces "exactly one template source per request" - zero or more than one is now a 422 instead of a 400 from the handler. Empty raw_messages is rejected too so the renderer never sees a zero-length list. The handler-level "at least one" check is removed; the model validator is the single source of truth for this contract. Signed-off-by: gaurav0107 --- .../unit/test_playground_create_validation.py | 134 ++++++++++++++++++ .../api/tracebility_api/routers/playground.py | 33 ++++- 2 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 services/api/tests/unit/test_playground_create_validation.py diff --git a/services/api/tests/unit/test_playground_create_validation.py b/services/api/tests/unit/test_playground_create_validation.py new file mode 100644 index 0000000..4d0b892 --- /dev/null +++ b/services/api/tests/unit/test_playground_create_validation.py @@ -0,0 +1,134 @@ +"""PlaygroundCreate xor: exactly one of prompt_version_id / +raw_template / raw_messages is required per request. Zero or more than +one is a 422 (pydantic ValidationError).""" + +from __future__ import annotations + +from uuid import uuid4 + +import pytest +from pydantic import ValidationError +from tracebility_api.routers.playground import Message, PlaygroundCreate + +_MODEL = "anthropic/claude-sonnet-4-6" + + +def test_accepts_raw_messages(): + body = PlaygroundCreate( + project_id=uuid4(), + raw_messages=[Message(role="human", content="hi {{ x }}")], + variables={"x": "y"}, + model=_MODEL, + ) + assert body.raw_messages is not None + assert body.raw_template is None + assert body.prompt_version_id is None + + +def test_accepts_raw_template(): + body = PlaygroundCreate( + project_id=uuid4(), + raw_template="hi {{ x }}", + model=_MODEL, + ) + assert body.raw_template == "hi {{ x }}" + assert body.raw_messages is None + + +def test_accepts_prompt_version_id(): + body = PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=uuid4(), + model=_MODEL, + ) + assert body.prompt_version_id is not None + assert body.raw_template is None + assert body.raw_messages is None + + +def test_rejects_zero_template_sources(): + with pytest.raises(ValidationError) as exc: + PlaygroundCreate(project_id=uuid4(), model=_MODEL) + assert "required" in str(exc.value).lower() + + +def test_rejects_template_and_messages_together(): + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + raw_template="hi", + raw_messages=[Message(role="human", content="hi")], + model=_MODEL, + ) + + +def test_rejects_prompt_id_and_raw_template_together(): + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=uuid4(), + raw_template="hi", + model=_MODEL, + ) + + +def test_rejects_prompt_id_and_raw_messages_together(): + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=uuid4(), + raw_messages=[Message(role="human", content="hi")], + model=_MODEL, + ) + + +def test_rejects_all_three_together(): + with pytest.raises(ValidationError) as exc: + PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=uuid4(), + raw_template="hi", + raw_messages=[Message(role="human", content="hi")], + model=_MODEL, + ) + assert "mutually exclusive" in str(exc.value).lower() + + +def test_rejects_empty_raw_messages_list(): + """An empty messages list is not a valid template source - at least + one message is required. (The check constraint on the prompt_version + table enforces this on the storage side; the request-side validator + closes the gap so we don't even attempt a render with zero messages.) + """ + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + raw_messages=[], + model=_MODEL, + ) + + +def test_rejects_all_three_explicit_none(): + """Same as the zero-source case, but with explicit Nones in the body + (e.g. a JSON client that sends nulls instead of omitting fields). + Pydantic should treat None and omitted identically.""" + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=None, + raw_template=None, + raw_messages=None, + model=_MODEL, + ) + + +def test_rejects_raw_messages_with_invalid_role(): + """Pydantic enforces role in {system, human} via the Message Literal; + sending an out-of-range role rejects the request before our validator + even runs.""" + with pytest.raises(ValidationError): + PlaygroundCreate( + project_id=uuid4(), + raw_messages=[{"role": "assistant", "content": "hi"}], + model=_MODEL, + ) diff --git a/services/api/tracebility_api/routers/playground.py b/services/api/tracebility_api/routers/playground.py index 079bbee..8ac3ffe 100644 --- a/services/api/tracebility_api/routers/playground.py +++ b/services/api/tracebility_api/routers/playground.py @@ -41,7 +41,7 @@ import asyncpg import structlog from fastapi import APIRouter, Depends, HTTPException, Query, Request, status -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from .. import audit from ..auth import Principal, assert_workspace_role, require_user @@ -66,12 +66,37 @@ class PlaygroundCreate(BaseModel): project_id: UUID prompt_version_id: UUID | None = None + # Legacy single-string template. Wraps to a single human message + # at render time. Drops in the cleanup PR after Plan C lands. raw_template: str | None = None + # Structured form added in Plan B. Mutually exclusive with + # raw_template and prompt_version_id (xor enforced below). + # min_length=1 rejects an empty list at field-validation time with a + # structured `type=too_short` error scoped to `loc=('raw_messages',)`, + # before the model validator runs. + raw_messages: list[Message] | None = Field(default=None, min_length=1) variables: dict[str, Any] = Field(default_factory=dict) model: str = Field(min_length=1, max_length=128) temperature: float | None = Field(default=None, ge=0.0, le=2.0) max_tokens: int | None = Field(default=None, ge=1, le=8192) + @model_validator(mode="after") + def _exactly_one_template_source(self) -> PlaygroundCreate: + sources = [ + self.prompt_version_id is not None, + self.raw_template is not None, + self.raw_messages is not None, + ] + n = sum(sources) + if n == 0: + raise ValueError("one of prompt_version_id, raw_template, raw_messages is required") + if n > 1: + raise ValueError( + "prompt_version_id, raw_template, raw_messages are mutually " + "exclusive; provide exactly one" + ) + return self + class PlaygroundSessionOut(BaseModel): id: UUID @@ -148,12 +173,6 @@ async def create_session( body: PlaygroundCreate, principal: Principal = Depends(require_user), ) -> PlaygroundSessionOut: - if body.prompt_version_id is None and not body.raw_template: - raise HTTPException( - status.HTTP_400_BAD_REQUEST, - "either prompt_version_id or raw_template is required", - ) - pool: asyncpg.Pool = request.app.state.pg workspace_id = await _assert_project_role( pool, principal, body.project_id, ("owner", "admin", "member") From 27808e202428e6723669e31731e3f6986929359b Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 21:52:02 +0530 Subject: [PATCH 5/6] feat(api): playground POST renders structured messages end-to-end The handler now reads raw_messages (or template_messages from a saved prompt, or wraps a legacy raw_template), Jinja-renders each turn's content against the variables dict, persists rendered_messages jsonb alongside the human-readable rendered_prompt, and dispatches the mapped (human -> user) message list to the LiteLLM gateway verbatim. The legacy raw_template path is preserved by wrapping it as a single human message before render - no behavior change for old web clients. The xor validator on PlaygroundCreate (Plan B Task 4) guarantees exactly one template source per request, so the resolver is a straight-line three-branch dispatch with no precedence rules to remember. The old _resolve_template / _render_template single-string pipeline is deleted - rendered_messages and rendered_prompt (newline-joined view) are the canonical pair from here on. The integration test deferred from this commit is a wire-level e2e against the local docker stack; the unit tests on _resolve_messages, _render_messages, _to_dispatch_messages, and the xor validator cover the request-path logic without depending on container health. Signed-off-by: gaurav0107 --- .../unit/test_playground_render_messages.py | 15 +- .../unit/test_playground_resolve_messages.py | 135 ++++++++++++++++++ .../api/tracebility_api/routers/playground.py | 82 ++++++----- 3 files changed, 191 insertions(+), 41 deletions(-) create mode 100644 services/api/tests/unit/test_playground_resolve_messages.py diff --git a/services/api/tests/unit/test_playground_render_messages.py b/services/api/tests/unit/test_playground_render_messages.py index 582c62a..d920a4c 100644 --- a/services/api/tests/unit/test_playground_render_messages.py +++ b/services/api/tests/unit/test_playground_render_messages.py @@ -1,9 +1,8 @@ -"""Per-message Jinja-style substitution: each message's content is rendered +"""Per-message {{ var }} substitution: each message's content is rendered against the same variable dict; roles are preserved verbatim. -Spec decision 9: missing variables render as empty string. The legacy -single-string render path (_render_template) preserves {{ x }} when -unfilled — _render_messages diverges intentionally.""" +Spec decision 9: missing variables render as empty string. +""" from __future__ import annotations @@ -27,8 +26,7 @@ def test_renders_variables_per_message(): def test_missing_variable_renders_empty(): """Per spec decision 9: a placeholder whose key is absent from the - variables dict renders as the empty string (diverges from - _render_template, which preserves the literal `{{ x }}`).""" + variables dict renders as the empty string.""" msgs = [Message(role="human", content="Echo: {{ x }}")] out = _render_messages(msgs, {}) assert out == [Message(role="human", content="Echo: ")] @@ -50,9 +48,8 @@ def test_returns_new_list_does_not_mutate_input(): def test_non_string_value_serializes_via_json(): - """Match _render_template's value-coercion behavior so saved prompts - render the same whether the user picks the legacy single-string or - the new structured path during the deprecation window.""" + """Non-string variable values serialize via json.dumps so dicts and + lists round-trip as readable JSON.""" msgs = [Message(role="human", content="ctx={{ ctx }}")] out = _render_messages(msgs, {"ctx": {"a": 1}}) assert out == [Message(role="human", content='ctx={"a": 1}')] diff --git a/services/api/tests/unit/test_playground_resolve_messages.py b/services/api/tests/unit/test_playground_resolve_messages.py new file mode 100644 index 0000000..63f2f54 --- /dev/null +++ b/services/api/tests/unit/test_playground_resolve_messages.py @@ -0,0 +1,135 @@ +"""_resolve_messages picks the right source per the xor validator and +returns the canonical message list shape. The asyncpg pool is mocked +because we're testing routing logic, not SQL — the SQL path itself is +covered by the prompt_version integration test.""" + +from __future__ import annotations + +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +from fastapi import HTTPException +from tracebility_api.routers.playground import ( + Message, + PlaygroundCreate, + _resolve_messages, +) + +_MODEL = "anthropic/claude-sonnet-4-6" + + +@pytest.mark.asyncio +async def test_raw_messages_used_verbatim(): + body = PlaygroundCreate( + project_id=uuid4(), + raw_messages=[ + Message(role="system", content="be terse"), + Message(role="human", content="echo {{ x }}"), + ], + variables={}, + model=_MODEL, + ) + pool = AsyncMock() + out, version_row = await _resolve_messages(pool, body) + + assert version_row is None + assert out == body.raw_messages + pool.fetchrow.assert_not_called() + + +@pytest.mark.asyncio +async def test_raw_template_wrapped_as_single_human_message(): + body = PlaygroundCreate( + project_id=uuid4(), + raw_template="echo {{ x }}", + model=_MODEL, + ) + pool = AsyncMock() + out, version_row = await _resolve_messages(pool, body) + + assert version_row is None + assert out == [Message(role="human", content="echo {{ x }}")] + pool.fetchrow.assert_not_called() + + +@pytest.mark.asyncio +async def test_prompt_version_id_reads_template_messages(): + """When prompt_version_id is set, _resolve_messages reads the + template_messages jsonb column and validates each entry.""" + version_id = uuid4() + body = PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=version_id, + model=_MODEL, + ) + pool = AsyncMock() + pool.fetchrow.return_value = { + "id": version_id, + "prompt_id": uuid4(), + "template": "ignored legacy field", + "template_messages": [ + {"role": "system", "content": "be terse"}, + {"role": "human", "content": "echo {{ x }}"}, + ], + } + + out, version_row = await _resolve_messages(pool, body) + + assert version_row is not None + assert out == [ + Message(role="system", content="be terse"), + Message(role="human", content="echo {{ x }}"), + ] + pool.fetchrow.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_prompt_version_id_handles_jsonb_string_form(): + """Some asyncpg/codec configs hand back jsonb as a string. The + helper decodes defensively.""" + version_id = uuid4() + body = PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=version_id, + model=_MODEL, + ) + pool = AsyncMock() + pool.fetchrow.return_value = { + "id": version_id, + "prompt_id": uuid4(), + "template": "x", + "template_messages": '[{"role": "human", "content": "x"}]', + } + + out, _ = await _resolve_messages(pool, body) + assert out == [Message(role="human", content="x")] + + +@pytest.mark.asyncio +async def test_prompt_version_id_missing_returns_404(): + body = PlaygroundCreate( + project_id=uuid4(), + prompt_version_id=uuid4(), + model=_MODEL, + ) + pool = AsyncMock() + pool.fetchrow.return_value = None + + with pytest.raises(HTTPException) as exc: + await _resolve_messages(pool, body) + assert exc.value.status_code == 404 + + +def test_rendered_prompt_join_format(): + """create_session computes rendered_prompt = '\\n\\n'.join(m.content + for m in rendered_messages). Pin the join semantics so a future + refactor of that line doesn't silently change the trace UI's + display string. We can't easily run create_session in isolation, + but the join is a one-liner that's safe to assert directly.""" + msgs = [ + Message(role="system", content="be terse"), + Message(role="human", content="echo hello"), + ] + rendered_prompt = "\n\n".join(m.content for m in msgs) + assert rendered_prompt == "be terse\n\necho hello" diff --git a/services/api/tracebility_api/routers/playground.py b/services/api/tracebility_api/routers/playground.py index 8ac3ffe..dd11c0b 100644 --- a/services/api/tracebility_api/routers/playground.py +++ b/services/api/tracebility_api/routers/playground.py @@ -5,7 +5,9 @@ Concretely, the request handler: 1. Persists a `playground_session` row in postgres (status=running). -2. Renders the template (Jinja-style ``{{ var }}`` substitution). +2. Renders each message's content (Jinja-style ``{{ var }}`` substitution) + and persists both the structured rendered_messages and a newline- + joined rendered_prompt for the trace UI's display path. 3. Calls the chosen LLM provider over HTTP. 4. Writes a `run` + `span` to ClickHouse with `sdk='playground'` so the result is visible at `/runs/{id}` like any other trace. @@ -180,22 +182,24 @@ async def create_session( provider = _resolve_provider(body.model) - # Resolve the template body. If a prompt_version was provided, the - # postgres row is authoritative; the raw_template field on the - # request is ignored to avoid silent divergence. - template_body, version_row = await _resolve_template(pool, body) + # Resolve the message list to render. raw_messages > prompt_version_id + # > raw_template per the xor validator on PlaygroundCreate. + template_messages, version_row = await _resolve_messages(pool, body) - rendered = _render_template(template_body, body.variables) + rendered_messages = _render_messages(template_messages, body.variables) + # Newline-joined readable view for the trace UI's existing display + # path. The structured rendered_messages is what replay reads. + rendered_prompt = "\n\n".join(m.content for m in rendered_messages) started = time.monotonic() session_row = await pool.fetchrow( """ insert into playground_session ( project_id, prompt_version_id, raw_template, rendered_prompt, - variables, provider, model, temperature, max_tokens, - status, created_by + rendered_messages, variables, provider, model, temperature, + max_tokens, status, created_by ) - values ($1, $2, $3, $4, $5::jsonb, $6, $7, $8, $9, 'running', $10) + values ($1, $2, $3, $4, $5::jsonb, $6::jsonb, $7, $8, $9, $10, 'running', $11) returning id, project_id, prompt_version_id, raw_template, rendered_prompt, variables, provider, model, temperature, max_tokens, status, output_text, prompt_tokens, @@ -204,8 +208,13 @@ async def create_session( """, body.project_id, body.prompt_version_id, - None if version_row is not None else body.raw_template, - rendered, + # raw_template column on the session is only persisted for the + # legacy single-string path. raw_messages and prompt_version_id + # paths leave it null so the trace UI doesn't show a misleading + # legacy snippet. + None if (version_row is not None or body.raw_messages is not None) else body.raw_template, + rendered_prompt, + _json.dumps([m.model_dump() for m in rendered_messages]), _json.dumps(body.variables), provider, body.model, @@ -227,6 +236,7 @@ async def create_session( "provider": provider, "prompt_version_id": str(body.prompt_version_id) if body.prompt_version_id else None, "raw_template": body.raw_template is not None, + "raw_messages": body.raw_messages is not None, }, request=request, workspace_id=workspace_id, @@ -237,7 +247,7 @@ async def create_session( # recorded in dispatch_cost by the gateway; the session row only # holds the surface state. if provider == "stub": - result_dict = await _dispatch_stub(body.model, rendered) + result_dict = await _dispatch_stub(body.model, rendered_prompt) else: from ..llm import DispatchError from ..llm import dispatch as gateway_dispatch @@ -249,7 +259,7 @@ async def create_session( surface="playground", surface_ref_id=session_id, model=f"{provider}/{body.model.removeprefix(provider + '/')}", - messages=[DispatchMessage(role="user", content=rendered)], + messages=_to_dispatch_messages(rendered_messages), temperature=body.temperature, max_tokens=body.max_tokens or _DEFAULT_MAX_TOKENS, ) @@ -292,7 +302,7 @@ async def create_session( run_id=run_id, model=body.model, temperature=body.temperature, - prompt=rendered, + prompt=rendered_prompt, output=result["text"], prompt_tokens=result["prompt_tokens"], completion_tokens=result["completion_tokens"], @@ -413,21 +423,39 @@ def _resolve_provider(model: str) -> str: ) -async def _resolve_template( +async def _resolve_messages( pool: asyncpg.Pool, body: PlaygroundCreate, -) -> tuple[str, asyncpg.Record | None]: +) -> tuple[list[Message], asyncpg.Record | None]: + """Return the message list to render + the version row (if any). + + Resolution order matches PlaygroundCreate's xor validator: + 1. raw_messages - explicit; use as-is. + 2. prompt_version_id - read template_messages from prompt_version. + 3. raw_template - wrap as [{role: human, content: }]. + + Exactly one of these is set per the model validator; the validator + runs before this function so the assertions below are guards, not + logic. + """ + if body.raw_messages is not None: + return body.raw_messages, None if body.prompt_version_id is not None: version_row = await pool.fetchrow( - """select id, prompt_id, template from prompt_version - where id = $1""", + """select id, prompt_id, template, template_messages + from prompt_version where id = $1""", body.prompt_version_id, ) if version_row is None: raise HTTPException(status.HTTP_404_NOT_FOUND, "prompt version not found") - return version_row["template"], version_row + msgs_raw = version_row["template_messages"] + if isinstance(msgs_raw, str): + msgs_raw = _json.loads(msgs_raw) + messages = [Message.model_validate(m) for m in msgs_raw] + return messages, version_row + # raw_template path assert body.raw_template is not None # validated above - return body.raw_template, None + return [Message(role="human", content=body.raw_template)], None def _coerce_var_value(value: Any) -> str: @@ -445,16 +473,6 @@ def _coerce_var_value(value: Any) -> str: return str(value) -def _render_template(template: str, variables: dict[str, Any]) -> str: - def _repl(match: re.Match[str]) -> str: - key = match.group(1) - if key not in variables: - return match.group(0) - return _coerce_var_value(variables[key]) - - return _VAR_RE.sub(_repl, template) - - def _render_messages( messages: list[Message], variables: dict[str, Any], @@ -463,8 +481,8 @@ def _render_messages( Missing variables render as empty string per spec decision 9 — the user can iterate without the renderer fighting them. Non-string - values serialize via json.dumps (same as _render_template) so a dict - or list passed as a variable round-trips as readable JSON. + values serialize via json.dumps (via _coerce_var_value) so a dict or + list passed as a variable round-trips as readable JSON. Returns a fresh list of new Message objects — never mutates the input list or its contents. From d52ec699084150b4b73dd3a12b3664faa4a9c2d8 Mon Sep 17 00:00:00 2001 From: gaurav0107 Date: Sun, 7 Jun 2026 22:02:24 +0530 Subject: [PATCH 6/6] feat(api): POST /v1/prompts/.../versions accepts template_messages The structured request shape (template_messages: list[Message]) is the preferred field; the legacy 'template: str' stays for one release of back-compat and wraps to a single human message internally. A strict xor validator across the two fields enforces "exactly one source per request" - zero or both is now a 422. The handler now writes both columns (template_messages jsonb + template text, derived) so the row satisfies migration 0026's NOT NULL constraint on the new column. The latent gap where create_version omitted template_messages is closed as a side effect. Adds the no-op short-circuit: if the new messages match the most recent version byte-for-byte (compared via model_dump), return that existing row with HTTP 200 instead of creating a duplicate. Saves a row per accidental save and matches the spec's rule. Signed-off-by: gaurav0107 --- .../test_prompt_version_create_validation.py | 139 ++++++++++++++++++ .../api/tracebility_api/routers/prompts.py | 106 +++++++++++-- 2 files changed, 232 insertions(+), 13 deletions(-) create mode 100644 services/api/tests/unit/test_prompt_version_create_validation.py diff --git a/services/api/tests/unit/test_prompt_version_create_validation.py b/services/api/tests/unit/test_prompt_version_create_validation.py new file mode 100644 index 0000000..b6c7312 --- /dev/null +++ b/services/api/tests/unit/test_prompt_version_create_validation.py @@ -0,0 +1,139 @@ +"""PromptVersionCreate xor validator + to_messages() helper. + +The model validator ensures exactly one of template, template_messages +is set. to_messages() resolves both shapes to the canonical Message +list for the storage path.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError +from tracebility_api.routers.prompts import Message, PromptVersionCreate + + +def test_accepts_template_messages(): + body = PromptVersionCreate( + template_messages=[Message(role="human", content="hi")], + ) + assert body.template_messages is not None + assert body.template is None + + +def test_accepts_legacy_template(): + body = PromptVersionCreate(template="hi") + assert body.template == "hi" + assert body.template_messages is None + + +def test_rejects_neither(): + with pytest.raises(ValidationError) as exc: + PromptVersionCreate() + assert "required" in str(exc.value).lower() + + +def test_rejects_both(): + with pytest.raises(ValidationError) as exc: + PromptVersionCreate( + template="hi", + template_messages=[Message(role="human", content="hi")], + ) + assert "mutually exclusive" in str(exc.value).lower() + + +def test_rejects_empty_template_messages(): + """min_length=1 on the field; pydantic rejects [] before the + model validator runs.""" + with pytest.raises(ValidationError): + PromptVersionCreate(template_messages=[]) + + +def test_to_messages_returns_template_messages_as_is(): + msgs = [ + Message(role="system", content="be terse"), + Message(role="human", content="hi"), + ] + body = PromptVersionCreate(template_messages=msgs) + assert body.to_messages() == msgs + + +def test_to_messages_wraps_legacy_template_as_single_human(): + body = PromptVersionCreate(template="hi {{ x }}") + assert body.to_messages() == [Message(role="human", content="hi {{ x }}")] + + +def test_to_messages_returns_fresh_list_for_template_messages(): + """Ensure to_messages() doesn't strip pydantic Message identity.""" + msgs = [Message(role="human", content="hi")] + body = PromptVersionCreate(template_messages=msgs) + out = body.to_messages() + assert all(isinstance(m, Message) for m in out) + + +def test_to_messages_round_trips_via_model_dump(): + """The no-op short-circuit compares [m.model_dump() for m in messages] + against the deserialized template_messages from the latest version + row. Pin the round-trip: the same messages on both sides must compare + equal as list-of-dicts. + + This is the riskiest comparison in the handler (jsonb-vs-list, dict + key order, str-decoding) and a regression here would silently create + duplicate versions instead of short-circuiting.""" + msgs = [ + Message(role="system", content="be terse"), + Message(role="human", content="echo {{ x }}"), + ] + body = PromptVersionCreate(template_messages=msgs) + # Outgoing form (what the handler builds before the INSERT compare). + outgoing = [m.model_dump() for m in body.to_messages()] + # Incoming form simulating what asyncpg returns for the jsonb column + # after `select template_messages from prompt_version` on a row that + # was previously inserted via this same model_dump path. + incoming_from_db = [ + {"role": "system", "content": "be terse"}, + {"role": "human", "content": "echo {{ x }}"}, + ] + assert outgoing == incoming_from_db + + +def test_legacy_template_round_trips_via_model_dump(): + """The legacy single-string body wraps to the same shape as a + structured raw_messages with a single human turn — confirming a + user can re-save a legacy prompt as the structured form without + accidentally creating v2.""" + legacy_body = PromptVersionCreate(template="echo {{ x }}") + structured_body = PromptVersionCreate( + template_messages=[Message(role="human", content="echo {{ x }}")] + ) + legacy_messages = [m.model_dump() for m in legacy_body.to_messages()] + structured_messages = [m.model_dump() for m in structured_body.to_messages()] + assert legacy_messages == structured_messages + + +def test_derive_legacy_template_single_human(): + """One bare human message -> its content is the legacy template.""" + from tracebility_api.routers.prompts import _derive_legacy_template + + out = _derive_legacy_template([Message(role="human", content="hi")]) + assert out == "hi" + + +def test_derive_legacy_template_multi_message_returns_empty(): + """Multi-message versions can't be honestly represented as a single + string -> empty rather than misleading.""" + from tracebility_api.routers.prompts import _derive_legacy_template + + out = _derive_legacy_template( + [ + Message(role="system", content="be terse"), + Message(role="human", content="hi"), + ] + ) + assert out == "" + + +def test_derive_legacy_template_single_system_returns_empty(): + """Single system-only message: not a 'normal' prompt; return empty.""" + from tracebility_api.routers.prompts import _derive_legacy_template + + out = _derive_legacy_template([Message(role="system", content="be terse")]) + assert out == "" diff --git a/services/api/tracebility_api/routers/prompts.py b/services/api/tracebility_api/routers/prompts.py index 7585878..6fe4d2b 100644 --- a/services/api/tracebility_api/routers/prompts.py +++ b/services/api/tracebility_api/routers/prompts.py @@ -24,8 +24,8 @@ import asyncpg import structlog -from fastapi import APIRouter, Depends, HTTPException, Query, Request, status -from pydantic import BaseModel, Field +from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response, status +from pydantic import BaseModel, Field, model_validator from .. import audit from ..auth import Principal, assert_workspace_role, require_user @@ -97,12 +97,44 @@ class PromptVersionList(BaseModel): class PromptVersionCreate(BaseModel): - template: str = Field(min_length=1) + """Body for POST /v1/prompts/{prompt_id}/versions. + + Exactly one of `template_messages` (preferred, structured) and + `template` (legacy single-string) must be provided. The legacy + field stays for one release of back-compat; it wraps to a single + human message internally before write. + """ + + template_messages: list[Message] | None = Field(default=None, min_length=1) + template: str | None = Field(default=None, min_length=1) input_schema: dict[str, Any] | None = None model_params: dict[str, Any] | None = None aliases: list[str] = Field(default_factory=list) commit_message: str | None = Field(default=None, max_length=2000) + @model_validator(mode="after") + def _exactly_one_template_source(self) -> PromptVersionCreate: + n = sum( + [ + self.template_messages is not None, + self.template is not None, + ] + ) + if n == 0: + raise ValueError("one of template, template_messages is required") + if n > 1: + raise ValueError( + "template and template_messages are mutually exclusive; provide exactly one" + ) + return self + + def to_messages(self) -> list[Message]: + """Resolve to the canonical list-of-messages form for storage.""" + if self.template_messages is not None: + return self.template_messages + assert self.template is not None # validated above + return [Message(role="human", content=self.template)] + class AliasUpdate(BaseModel): alias: str = Field(min_length=1, max_length=64, pattern=r"^[a-z0-9][a-z0-9_-]*$") @@ -319,6 +351,7 @@ async def list_versions( ) async def create_version( request: Request, + response: Response, prompt_id: UUID, body: PromptVersionCreate, principal: Principal = Depends(require_user), @@ -336,14 +369,41 @@ async def create_version( input_schema_json = _json.dumps(body.input_schema) if body.input_schema is not None else None model_params_json = _json.dumps(body.model_params) if body.model_params is not None else None + messages = body.to_messages() + messages_json_list = [m.model_dump() for m in messages] + async with pool.acquire() as conn, conn.transaction(): - next_version = await conn.fetchval( + # Latest version for this prompt, if any. Used both to compute the + # next version number and to detect a no-op duplicate. + latest = await conn.fetchrow( """ - select coalesce(max(version), 0) + 1 - from prompt_version where prompt_id = $1 + select id, prompt_id, version, template, template_messages, + input_schema, model_params, aliases, commit_message, + created_at + from prompt_version + where prompt_id = $1 + order by version desc + limit 1 """, prompt_id, ) + + if latest is not None: + # Decode jsonb defensively (some asyncpg codec configs return str). + latest_msgs_raw = latest["template_messages"] + if isinstance(latest_msgs_raw, str): + latest_msgs_raw = _json.loads(latest_msgs_raw) + # No-op short-circuit: identical to the most recent version. + # End-to-end coverage of this branch lives in the deferred + # integration test (Plan B Task 6 follow-up); the comparison + # contract is pinned by tests/unit/test_prompt_version_create_validation.py + # so a regression in to_messages()/model_dump fails fast. + if latest_msgs_raw == messages_json_list: + response.status_code = status.HTTP_200_OK + return _hydrate_version(latest) + + next_version = (latest["version"] + 1) if latest is not None else 1 + if aliases: await conn.execute( """ @@ -356,20 +416,24 @@ async def create_version( prompt_id, aliases, ) + + legacy_template = _derive_legacy_template(messages) row = await conn.fetchrow( """ insert into prompt_version ( - prompt_id, version, template, input_schema, model_params, - aliases, commit_message, created_by + prompt_id, version, template, template_messages, + input_schema, model_params, aliases, commit_message, + created_by ) - values ($1, $2, $3, $4::jsonb, $5::jsonb, $6, $7, $8) + values ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7, $8, $9) returning id, prompt_id, version, template, template_messages, input_schema, model_params, aliases, commit_message, created_at """, prompt_id, next_version, - body.template, + legacy_template, + _json.dumps(messages_json_list), input_schema_json, model_params_json, aliases, @@ -392,6 +456,7 @@ async def create_version( "prompt_id": str(prompt_id), "version": version.version, "aliases": aliases, + "template_messages": body.template_messages is not None, }, request=request, workspace_id=workspace_id, @@ -504,6 +569,23 @@ async def assign_alias( return version +def _derive_legacy_template(messages: list[Message]) -> str: + """Compute the legacy `template` text from the canonical messages. + + Returns the human content only when the version is exactly one bare + human message; otherwise empty string. Lying about the legacy field + (e.g., picking the human content from a multi-message version) would + quietly mislead old clients during the deprecation window. + + This rule is the ONE source of truth — the create write path and the + read hydration path both call this so the response shape can never + diverge between create and read. + """ + if len(messages) == 1 and messages[0].role == "human": + return messages[0].content + return "" + + def _normalize_aliases(raw: list[str]) -> list[str]: seen: set[str] = set() out: list[str] = [] @@ -541,9 +623,7 @@ def _hydrate_version(row: asyncpg.Record) -> PromptVersionOut: # data-corruption bug. If/when ai/tool roles land, extend Message, # don't loosen this validation. messages = [Message.model_validate(m) for m in msgs_raw] - legacy_template = ( - messages[0].content if len(messages) == 1 and messages[0].role == "human" else "" - ) + legacy_template = _derive_legacy_template(messages) return PromptVersionOut( id=data["id"], prompt_id=data["prompt_id"],