From b6f3e4db698d68f037a15d8468fd719620eb855f Mon Sep 17 00:00:00 2001 From: Gabor Szabo Date: Tue, 26 May 2026 18:33:09 +0200 Subject: [PATCH 1/2] feat(api,ui): showcase pipeline agent ops final polish (#321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRP-41 — fourth and FINAL slice of the /showcase upgrade epic (PRP-38..41). Adds two new pipeline phases on scenario=showcase_rich plus cross-cutting UI polish that closes issue #311. Pipeline (backend / app/features/demo/pipeline.py) - step_agent_hitl_flow: HITL approval round-trip on the experiment agent. Drives POST /agents/sessions + /chat + /approve via ASGITransport; surfaces an intermediate step_complete (status=running, awaiting_approval=true) for the FE to render the Approve button; absorbs 400 "No pending action" when the FE pre-empts; 90 s hard timeout falls back to skip so a hung agent never wedges the run. - step_ops_snapshot: 3 GET calls to /ops/summary + /ops/retraining-candidates + /ops/model-health, derives a 5-key KPI payload (stale_aliases_count, retraining_candidates_count, total_runs, total_aliases, degrading_health_count). warn (never fail) on all-three-failed. - _phase_table() — design Z: unified `agents` phase id for BOTH scenarios; SHOWCASE_RICH swaps step_agent for step_agent_hitl_flow and appends an ops phase carrying ops_snapshot before cleanup. SHOWCASE_RICH = 24 rows / 10 phases; DEMO_MINIMAL = 11 rows (unchanged shape under the new agents phase id). - _Client.yield_event hook + run_pipeline event-sink drain. The orchestrator stamps step_index / total_steps / phase_index / phase_total / phase_name on every drained intermediate event. Frontend (UI) - PHASE_DEFS.ts — design Z restructure: BOTH the legacy `agent` step and the new `agent_hitl_flow` live under the unified `agents` phase id; new DEMO_MINIMAL_ONLY_STEP_NAMES set complements SHOWCASE_RICH_STEP_NAMES so the filter selects the right step per scenario (lockstep test pins 24 tuples / 10 phases). - DemoPhasePanel.tsx — adds onValueChange handler + local useState (closes issue #311 / D10): post-pipeline-complete the operator can finally expand any phase without snapping back to the fallback. - demo-step-card.tsx — HitlFlowSummary chip-line + OpsSnapshotMiniGrid + one-click ApproveButton (only renders when status=running AND awaiting_approval=true). - showcase.tsx — five new chrome additions: - ShowcaseKpiStrip — 5-tile KPI strip above the controls card. - RunHistoryStrip — localStorage FIFO 5 with Replay button. - Stop button (visible mid-run) — closes the WS so the backend's WebSocketDisconnect releases the pipeline lock. - InspectArtifactsPanel — 10 deep-link cards rendered after pipeline_complete. - resolveInspectHref switch extended with agent_hitl_flow → CHAT, ops_snapshot → OPS. - use-demo-pipeline.ts — stop() callback exposed via UseDemoPipelineResult; DemoSummary.v2RunId added (mapped from pipeline_complete event.data.v2_run_id). Docs - docs/user-guide/showcase-walkthrough.md — drops 7 "planned" markers across PRP-38/39/40/41 phases; adds concrete prose for Agents (HITL) + Ops snapshot + the 5 polish items + performance budgets table refresh + screenshot placeholders. - docs/_base/RUNBOOKS.md — 5 new failure-mode entries (23-27): agent_hitl_flow no-key / timeout / no-trigger, ops_snapshot all-failed, Stop button mid-run. Tests - Backend: 9 new tests in test_pipeline.py (HITL: happy / no-key / session-fail / no-tool / 4xx-absorb / timeout + Ops: happy / warn / empty); lockstep test rewrite 23 → 24 tuples; 5 new canned-response fixtures for /ops/* endpoints. - Frontend: 22 new vitest cases across 5 test files (DemoPhasePanel onValueChange, ShowcaseKpiStrip 5-tile derivation, InspectArtifactsPanel 10-card grid, RunHistoryStrip localStorage FIFO, demo-step-card HITL + Approve + Ops mini-grid). - E2E: test_run_demo_showcase_rich_full_epic asserts PRP-41 contract shapes hold when the steps execute; tolerates a pre-existing PRP-39/40 cascade (scenario_simulate_and_save can fail to parse the safer_promote_flow placeholder artifact_uri) documented in RUNBOOKS.md entry 18. Validation - ruff + format clean; mypy + pyright strict (only pre-existing xgboost/lightgbm stub gaps remain — documented in PRP body). - 1635 unit tests pass; 249 frontend tests pass. - Vertical-slice guard empty: zero imports from agents/ops/registry/ scenarios/rag in app/features/demo/. Out of scope (explicit) - No new backend endpoints, no new schemas, no Alembic migrations. - No widening of agent_require_approval (save_scenario already listed; HITL step consumes it). - No CRLF/LF line-ending normalisation bundled in. Contract probe report: PRPs/ai_docs/prp-41-contract-probe-report.md --- PRPs/PRP-41-showcase-agent-ops-polish.md | 32 +- PRPs/ai_docs/prp-41-contract-probe-report.md | 407 +++++++++++++++ app/features/demo/pipeline.py | 398 +++++++++++++- app/features/demo/tests/test_pipeline.py | 489 ++++++++++++++++-- docs/_base/RUNBOOKS.md | 7 +- docs/user-guide/showcase-walkthrough.md | 133 +++-- .../components/demo/DemoPhasePanel.test.tsx | 102 ++++ .../src/components/demo/DemoPhasePanel.tsx | 22 +- .../demo/InspectArtifactsPanel.test.tsx | 99 ++++ .../components/demo/InspectArtifactsPanel.tsx | 195 +++++++ .../src/components/demo/PHASE_DEFS.test.ts | 20 +- frontend/src/components/demo/PHASE_DEFS.ts | 44 +- .../components/demo/RunHistoryStrip.test.tsx | 99 ++++ .../src/components/demo/RunHistoryStrip.tsx | 139 +++++ .../components/demo/ShowcaseKpiStrip.test.tsx | 87 ++++ .../src/components/demo/ShowcaseKpiStrip.tsx | 112 ++++ .../components/demo/demo-step-card.test.tsx | 79 +++ .../src/components/demo/demo-step-card.tsx | 128 +++++ frontend/src/hooks/use-demo-pipeline.test.ts | 31 +- frontend/src/hooks/use-demo-pipeline.ts | 20 + frontend/src/pages/showcase.tsx | 34 +- tests/test_e2e_demo.py | 103 ++++ 22 files changed, 2656 insertions(+), 124 deletions(-) create mode 100644 PRPs/ai_docs/prp-41-contract-probe-report.md create mode 100644 frontend/src/components/demo/DemoPhasePanel.test.tsx create mode 100644 frontend/src/components/demo/InspectArtifactsPanel.test.tsx create mode 100644 frontend/src/components/demo/InspectArtifactsPanel.tsx create mode 100644 frontend/src/components/demo/RunHistoryStrip.test.tsx create mode 100644 frontend/src/components/demo/RunHistoryStrip.tsx create mode 100644 frontend/src/components/demo/ShowcaseKpiStrip.test.tsx create mode 100644 frontend/src/components/demo/ShowcaseKpiStrip.tsx diff --git a/PRPs/PRP-41-showcase-agent-ops-polish.md b/PRPs/PRP-41-showcase-agent-ops-polish.md index d1eaee19..ec8bfc4d 100644 --- a/PRPs/PRP-41-showcase-agent-ops-polish.md +++ b/PRPs/PRP-41-showcase-agent-ops-polish.md @@ -1532,22 +1532,34 @@ async def step_ops_snapshot(ctx: DemoContext, client: _Client) -> StepResult: # MODIFY ALL_STEPS: # FIND: # { phase: 'agent', step: 'agent', label: 'Agent chat' }, -# REPLACE with (in this order): +# REPLACE with (in this order — DESIGN Z, unified phase id "agents"): +# { phase: 'agents', step: 'agent', label: 'Agent chat (legacy)' }, # { phase: 'agents', step: 'agent_hitl_flow', label: 'Agent HITL approval' }, # { phase: 'ops', step: 'ops_snapshot', label: 'Ops snapshot' }, # PRESERVE everything before / after. -# NOTE: demo_minimal still emits the legacy step name "agent" — the -# FE's `phaseDefsForScenario('demo_minimal')` filter must keep both -# step ids in `ALL_STEPS` and select by name (Task 1 confirms the -# filter shape). -# If the lockstep test's demo_minimal assertion explicitly asserts -# `'agent'` step under `'agent'` phase, ADD a sibling row preserving -# it: -# { phase: 'agent', step: 'agent', label: 'Agent chat (legacy)' }, -# ... and exclude it from showcase_rich via SHOWCASE_RICH_STEP_NAMES. +# NOTE: BOTH 'agent' and 'agent_hitl_flow' live under the same phase id +# 'agents'. demo_minimal renders the legacy `'agent'` row; showcase_rich +# renders `'agent_hitl_flow'` + `'ops_snapshot'`. The legacy 'agent' +# step function stays in pipeline.py and is wired by `_phase_table()` +# on the non-showcase-rich branch (Task 6). +# +# MODIFY the filter — design Z requires a SECOND exclusion set, because +# `SHOWCASE_RICH_STEP_NAMES` is the "exclude from demo_minimal" set, not +# the "exclude from showcase_rich" set. Task 1 contract probe § 7 +# confirmed this filter restructure is required: +# +# const DEMO_MINIMAL_ONLY_STEP_NAMES = new Set(['agent']) // legacy +# +# export function phaseDefsForScenario(scenario: ScenarioPreset): readonly PhaseDef[] { +# if (scenario === 'showcase_rich') { +# return ALL_STEPS.filter((d) => !DEMO_MINIMAL_ONLY_STEP_NAMES.has(d.step)) +# } +# return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) +# } # # MODIFY SHOWCASE_RICH_STEP_NAMES (lines 66–82): # ADD: 'agent_hitl_flow', 'ops_snapshot'. +# (So they're excluded from demo_minimal / sparse.) # # MODIFY PHASE_LABEL (lines 94–106): # REPLACE: agent: 'Agent' → agents: 'Agents (HITL)'. diff --git a/PRPs/ai_docs/prp-41-contract-probe-report.md b/PRPs/ai_docs/prp-41-contract-probe-report.md new file mode 100644 index 00000000..ccc9a533 --- /dev/null +++ b/PRPs/ai_docs/prp-41-contract-probe-report.md @@ -0,0 +1,407 @@ +# PRP-41 — Task 1 Contract Probe Report + +> **Probed:** branch `feat/showcase-41-agent-ops-polish` (off `dev` at `58d593a`, +> which is the merge of PR #322 atop `b3ba1f4`). All citations verified +> field-for-field against current source — no live HTTP probe needed because +> every cited contract is determined by source (Pydantic models, route +> decorators, in-memory enums). Live `/ops/*` shape spot-checked against +> `OpsService.get_summary` source. +> +> **Verdict: GO — zero field-level drift. Four of five unresolved +> contract assumptions resolved by source inspection; one (#5) was +> already CONFIRMED in the PRP body. ONE wording patch required for +> PRP-41 Task 9 (filter restructure for design Z, see §7).** + +--- + +## 1. Backend — agents slice (HITL approval surface) + +### `app/features/agents/schemas.py` + +| Symbol | Line | PRP cite | Live shape | Match | +|---|---|---|---|---| +| `SessionCreateRequest` | 27 | `agent_type: Literal["experiment","rag_assistant"]`, `initial_context: dict\|None` | `agent_type: Literal["experiment","rag_assistant"]`, `initial_context: dict[str, Any] \| None` | ✅ | +| `SessionResponse` | 45 | `session_id, agent_type, status, total_tokens_used, tool_calls_count, last_activity, expires_at, created_at` | exact same 8 fields | ✅ | +| `ChatRequest` | 108 | `message: str`, `stream: bool=False` | `message: str` + `stream: bool=False` | ✅ | +| `ChatResponse` | 145 | `session_id, message, tool_calls, pending_approval: bool, pending_action: PendingAction\|None, tokens_used: int` | exact match | ✅ | +| `PendingAction` | 170 | `action_id, action_type, description, arguments, created_at, expires_at` | exact match | ✅ | +| `ApprovalRequest` | 192 | `action_id: str` (NOT `tool_call_id`), `approved: bool`, `reason: str\|None` | exact match — `action_id` confirmed at line 203 | ✅ | +| `ApprovalResponse` | 208 | `action_id, approved, result: Any\|None, status: Literal["executed","rejected","expired"]` | exact match | ✅ | + +**PRP wording drift in INITIAL-41 (already noted in PRP-41 body):** +- `tool_call_id` → `action_id` — PRP-41 body already corrected. +- `approval_required` event vs `pending_approval` field — PRP-41 body already corrected (the event only fires on `/agents/stream` WS path; the synchronous `/chat` REST response carries `pending_approval: bool` + `pending_action: PendingAction | None`). + +### `app/features/agents/routes.py` + +| Endpoint | Line | Body shape | Notes | +|---|---|---|---| +| `POST /agents/sessions` | 43 | `SessionCreateRequest` → `SessionResponse` (201 CREATED) | ✅ | +| `GET /agents/sessions/{id}` | 80 | `SessionResponse` | ✅ | +| `POST /agents/sessions/{id}/chat` | 109 | `ChatRequest` → `ChatResponse` | ✅ | +| `POST /agents/sessions/{id}/approve` | 152 | `ApprovalRequest` → `ApprovalResponse` | ✅ | +| `DELETE /agents/sessions/{id}` | 198 | `204 NO CONTENT` (already called by `step_cleanup`) | ✅ | + +### `app/features/agents/agents/experiment.py` + +- `tool_save_scenario` at **line 419** ✅ +- Gated by `requires_approval("save_scenario")` at **line 453** + short-circuit at **line 468** ✅ + +### `app/features/agents/service.py` + +- `approve_action` at **line 640** ✅ +- Raises `SessionNotFoundError` → `HTTPException(404)` when session absent. +- Raises `NoApprovalPendingError` → `HTTPException(400)` when: + - `session.pending_action is None` (already consumed) — **line 668** + - `pending.action_id != action_id` (mismatch) — **line 672** +- After approving once, `session.pending_action = None` is set (line ~685), so a + second `POST /approve` with the same `action_id` will get **400 Bad Request** + with `detail="No pending action for session: {id}"`. + +> ⚠️ **Note — RFC 7807 inconsistency** in agents/routes.py (lines 185–195): the +> raise paths use bare `HTTPException(status_code=..., detail=str(e))`, not the +> repo's `problem_details.py` envelope. `_Client.request` handles both formats +> via the `parsed if isinstance(parsed, dict)` fallback (pipeline.py line 173), +> so PRP-41 is unaffected. This is **pre-existing** and **out of PRP-41 scope.** + +### `app/core/config.py` + +- Line **184** — `agent_require_approval: list[str] = ["create_alias", "archive_run", "save_scenario"]` ✅ matches PRP-41 expectation. **`save_scenario` is in the list.** PRP-41 reads only; modifies nothing. + +--- + +## 2. Backend — ops slice + +### `app/features/ops/schemas.py` + +| Symbol | Line | PRP cite | Live shape | Match | +|---|---|---|---|---| +| `StaleReason` | 16 | StrEnum w/ `newer_success_run`, `artifact_not_verified`, `run_not_success`, `feature_frame_version_mismatch` | exact match | ✅ | +| `SystemHealth` | 36 | n/a (consumed via `summary.system`) | `api_ok`, `database_connected`, `latest_successful_job_at` | ✅ | +| `DataFreshness` | 56 | n/a (consumed via `summary.freshness`) | matches | ✅ | +| `StatusCount` | 80 | `status: str`, `count: int` | exact match | ✅ | +| `JobHealth` | 89 | n/a | `counts, completed_today, failed_total, active_total` | ✅ | +| `RunHealth` | 111 | **`counts: list[StatusCount]`** | confirmed — `counts` (NOT `histogram`); each item carries `count` (NOT `value`) | ✅ | +| `AliasHealth` | 133 | `alias_name, run_id, is_stale, stale_reason, wape, alias_feature_frame_version, comparable_run_feature_frame_version` | match + `run_status`/`model_type`/`store_id`/`product_id` extras (additive — not used by PRP-41) | ✅ | +| `OpsSummaryResponse` | 209 | `system, jobs, runs, aliases: list[AliasHealth], freshness, attention_items, generated_at` — **NO flat `stale_aliases`/`total_aliases`** | exact match — must derive from `aliases` list | ✅ | +| `RetrainingCandidate` | 234 | `store_id, product_id, priority_score, staleness_days, wape, latest_run_id, reason` | match + `latest_run_status` (additive) | ✅ | +| `RetrainingCandidatesResponse` | 267 | `candidates, total_evaluated, generated_at` | exact match | ✅ | +| `DriftDirection` | 290 | `Literal["improving","stable","degrading","unknown"]` | exact match | ✅ | +| `ModelHealthEntry` | 306 | **`drift_direction`** (NOT `drift_verdict`) | confirmed line 338 — `drift_direction: DriftDirection` | ✅ | +| `ModelHealthResponse` | 372 | field is **`entries`** (NOT `health` / `items`) | confirmed line 377 — `entries: list[ModelHealthEntry]` | ✅ | + +### `app/features/ops/routes.py` + +| Endpoint | Line | Query params | Response | Match | +|---|---|---|---|---| +| `GET /ops/summary` | 22→41 | none | `OpsSummaryResponse` | ✅ | +| `GET /ops/retraining-candidates` | 55→70 | `limit=1..100` (default 20) | `RetrainingCandidatesResponse` | ✅ | +| `GET /ops/model-health` | 91→110 | `limit=1..100` (default 20) — **NO `grain` param** | `ModelHealthResponse` | ✅ | + +### `app/features/ops/tests/test_routes_integration.py` + +- `test_summary_resilient_structural` at **line 68** ✅ proves `GET /ops/summary` returns 200 (never 500) on an empty DB. `test_model_health_resilient_structural` at line 175 confirms the same for `/ops/model-health`. PRP-41's `step_ops_snapshot` can safely assume 200 with zero-filled fields. + +--- + +## 3. Backend — demo slice (anchors for the new step fns) + +### `app/features/demo/pipeline.py` + +| Symbol | Line | Notes | +|---|---|---| +| `_HTTP_TIMEOUT` | 96 | `httpx.Timeout(120.0, connect=5.0)` — 120 s budget the new steps share. | +| `_StepError` | 104 | Has `.step`, `.status_code`, `.problem` attributes. **`status_code` confirmed** for the absorb-4xx logic. | +| `_Client` | 125 | Constructor `__init__(self, app: FastAPI)` — current signature has NO `event_sink` param. PRP-41 Task 3 extends it. | +| `_Client.request` | 152 | Returns `dict[str, Any]`; wraps non-dict 2xx bodies as `{"_raw": body}` (line 178); raises `_StepError(step, response.status_code, problem)` on any non-2xx. | +| `DemoContext` | 187 | Has `session_id`, `v2_run_id`, `scenario_artifact_key`, `price_cut_scenario_id`, `holiday_scenario_id`, `embedding_unreachable`. **NO `approval_action_id` / `agent_approval_decision`** — PRP-41 Task 2 adds them. | +| `_llm_key_present` | 253 | Bool helper; already used by `step_agent` line 1443. Mirror exactly. | +| `step_agent` (legacy) | 1436 | One-turn chat with `experiment` agent; skips on missing key. Replacement template for `step_agent_hitl_flow`. | +| `step_register` | 984 | Multi-call multi-PATCH precedent for `step_agent_hitl_flow`'s sequential pattern. | +| `step_cleanup` | 1897 | Already closes `ctx.session_id` via DELETE — PRP-41 changes nothing here. | +| `step_batch_preset` → `step.data.completed_items` | ~1820 | confirmed source of `completed_items` KPI tile. | +| `step_scenario_simulate_and_save` → `step.data.scenario_id` | ~1170 | confirmed (also sets `ctx.price_cut_scenario_id`). | +| `step_multi_plan_compare` → `step.data.{winner_scenario_id, ranked_by, ranked}` | ~1265 | confirmed; `ranked` is `list[dict]`. | +| `step_rag_index_subset` → `step.data.{total_chunks, curated_hits}` | ~1340 | confirmed. | +| `_phase_table` | 1999 | Current `SHOWCASE_RICH` branch returns **23 rows** (verified by `test_phase_table_showcase_rich_…` line 571). PRP-41 flips to 24 (swap `("agent","agent")` row + insert `("ops","ops_snapshot")`). | +| `PHASE_AGENT` constant | 1995 | `"agent"` — PRP-41 design Z **REPLACES** this with `PHASE_AGENTS = "agents"`. | +| `PHASE_CLEANUP` constant | 1996 | `"cleanup"` — unchanged. | +| `run_pipeline` | 2076 | Already computes `index`, `phase_index_by_phase[phase_name]`, `phase_total` per row (lines 2099–2102). **The orchestrator already knows these values** — Task 3's intermediate-event drain can stamp them on each yielded event. **Design Z is viable.** | + +### `app/features/demo/routes.py` + +- WS handler `/demo/stream` at lines 57–85; `WebSocketDisconnect` caught at **line 74** and returns silently — confirmed. + +### `app/features/demo/service.py` + +- `_pipeline_lock = asyncio.Lock()` at **line 18** ✅ +- `if _pipeline_lock.locked(): raise PipelineBusyError(...)` at **line 38–39** ✅ +- `async with _pipeline_lock:` at **line 41** — lock releases on exit which includes propagation of `WebSocketDisconnect`. **Stop button will release the lock correctly.** + +--- + +## 4. Frontend — current state of every cited file + +### `frontend/src/components/demo/PHASE_DEFS.ts` + +- **`ALL_STEPS`** (lines 37–64) — 23 rows on `dev`. The legacy `{ phase: 'agent', step: 'agent', label: 'Agent chat' }` is at row 22 (1-indexed); cleanup at row 23. +- **`SHOWCASE_RICH_STEP_NAMES`** (lines 66–82) — currently 12 entries. **Semantics:** this set is the "EXCLUDE from demo_minimal" set, NOT the "include only in showcase_rich" set. The filter is: + ```ts + if (scenario === 'showcase_rich') return ALL_STEPS + return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) + ``` + So adding `'agent_hitl_flow'` + `'ops_snapshot'` to this set causes them to be excluded from `demo_minimal` (correct). +- **`PHASE_LABEL`** (lines 95–106) — has `agent: 'Agent'` and `cleanup: 'Cleanup'`. No `agents` / `ops` yet. +- **`PHASE_ORDER`** (lines 109–121) — 9 phases (data, modeling, decision, portfolio, planning, knowledge, verify, agent, cleanup). + +### `frontend/src/components/demo/PHASE_DEFS.test.ts` — the lockstep gate + +- `demo_minimal` assertion (lines 13–28): 11-tuple list ending in `['agent', 'agent']`, `['cleanup', 'cleanup']`. +- `showcase_rich` assertion (lines 30–62): 23-tuple list ending in `['agent', 'agent']`, `['cleanup', 'cleanup']`. +- `PHASE_ORDER` assertion (lines 68–80): 9 phases. +- **PRP-41 changes:** + - `demo_minimal` tuples: swap `['agent', 'agent']` → `['agents', 'agent']`. + - `showcase_rich` tuples: swap `['agent', 'agent']` → `['agents', 'agent_hitl_flow']` AND insert `['ops', 'ops_snapshot']` immediately after, before the cleanup row. Tuple count 23 → 24. + - `PHASE_ORDER`: 9 → 10 (`'agent'` → `'agents'`, then insert `'ops'`). + +### `frontend/src/components/demo/DemoPhasePanel.tsx` + +- **CONFIRMED MISSING `onValueChange`** — line 46: + ```tsx + + ``` + No handler. Issue #311 fix is precisely this hook addition. The PRP's Task 10 pattern is correct: lift `value` to `useState` seeded from the computed value via `useEffect`, expose `onValueChange={setExpandedPhase}`. + +### `frontend/src/components/demo/demo-step-card.tsx` + +- 392 lines total. +- Mini-summary helpers (`BacktestBreakdown`, `RegisterDetail`, `ChampionCompatDetail`, `StaleAliasDetail`, `SaferPromoteDetail`, `BatchPresetDetail`, `ScenarioSummary`, `CompareSummary`, `ProviderChip`, `IndexSummary`, `RetrieveSummary`) live at lines ~35–305 — PRP-41's `HitlFlowSummary` + `OpsSnapshotMiniGrid` follow the same shape. +- Conditional rendering switch at lines 356–377 (per `step.name`) — PRP-41 inserts two more cases. +- Inspect button at lines 378–387 — PRP-41 inserts the Approve button as a peer block (rendered when `step.data.awaiting_approval === true && step.status === 'running'`). + +### `frontend/src/hooks/use-demo-pipeline.ts` + +- `disconnectRef` at **line 198** ✅ +- `useWebSocket(...)` destructures `{status, send, disconnect, reconnect}` at **line 208** ✅ +- `useEffect(() => { disconnectRef.current = disconnect }, [disconnect])` at lines 213–215 ✅ +- **Return object (lines 247–263) currently exposes** `steps, phases, runningPhase, phase, summary, errorMessage, isRunning, connectionStatus, start, setScenario, scenario`. **`stop` is NOT exposed** — PRP-41 Task 13 adds it. + +### `frontend/src/hooks/use-websocket.ts` + +- `return { status, send, disconnect, reconnect }` at the bottom — `disconnect()` already cancels reconnect + closes socket. **No changes needed to this file.** ✅ + +### `frontend/src/pages/showcase.tsx` + +- `resolveInspectHref(step)` at lines 17–84 — switch covers train / v2_train / register / backtest / champion_compat_compare / stale_alias_trigger / safer_promote_flow / batch_preset / scenario_simulate_and_save / multi_plan_compare / embedding_provider_probe / rag_index_subset / rag_retrieve_probe / default → null. **PRP-41 adds two cases:** `agent_hitl_flow` → `ROUTES.CHAT`, `ops_snapshot` → `ROUTES.OPS`. +- `useDemoPipeline()` destructure at lines 86–98 — PRP-41 adds `stop`. +- Page structure starts at line 141 — PRP-41 inserts KPI strip + RunHistoryStrip above controls, Stop button inside controls card (visible when `isRunning`), InspectArtifactsPanel after the phase accordion (visible when `phase === 'done'`). + +### `frontend/src/lib/constants.ts` + +- **All 10 inspect-target routes already exist** — verified: + - `ROUTES.VISUALIZE.FORECAST`, `.BACKTEST`, `.BATCH`, `.PLANNER` + - `ROUTES.EXPLORER.RUNS`, `.RUN_COMPARE`, `.RUN_DETAIL` + - `ROUTES.OPS`, `.KNOWLEDGE`, `.CHAT` +- **Zero new routes required.** ✅ + +### `frontend/src/pages/admin.tsx` + +- Line 431 — `const SEEDER_FORM_STORAGE_KEY = 'forecastlab.seederForm.v1'` +- Line 456 — `window.localStorage.getItem(SEEDER_FORM_STORAGE_KEY)` +- Line 485 — `window.localStorage.setItem(SEEDER_FORM_STORAGE_KEY, JSON.stringify(form))` +- Pattern: `forecastlab..v` versioned key + raw JSON serialization. PRP-41 mirrors as `forecastlab.showcase.runs.v1`. + +--- + +## 5. Resolution of the 5 unresolved contract assumptions + +### Assumption #1 — `_Client.yield_event` orchestrator fill-in + +**Recommendation in PRP body:** orchestrator overwrites `step_index`, `total_steps`, `phase_index`, `phase_total` on every event drained from the sink. + +**Verified — viable.** `run_pipeline` (line 2076) computes all four values per-row at the top of each loop iteration (`index` from `enumerate(rows, start=1)`, `phase_index = phase_index_by_phase[phase_name]`, `phase_total = len(phases_in_order)`, `total = len(rows)`). The orchestrator can stamp these on each intermediate event BEFORE the terminal yield. **Design Z works without breaking the existing 22 steps** because none of them currently use `client.yield_event(...)` (the helper doesn't exist on `dev`). + +**Implementer guidance:** when draining `intermediate_events`, set +```python +ev.step_index = index +ev.total_steps = total +ev.phase_index = phase_index +ev.phase_total = phase_total +ev.phase_name = phase_name # belt-and-braces; the step fn may have set it +``` +on every event, in FIFO order, BEFORE yielding the terminal `step_complete`. + +### Assumption #2 — Approve double-fire response shape + +**Verified — 400 Bad Request.** `AgentService.approve_action` raises `NoApprovalPendingError` when: +- `session.pending_action is None` (already consumed by the first call), OR +- `pending.get("action_id") != action_id` (mismatch). + +Both map to `HTTPException(status_code=400, detail=...)` in `agents/routes.py` lines 192–195. **PRP-41's `if 400 <= exc.status_code < 500:` absorption catches this correctly.** + +**Implementer guidance:** the absorb branch should set `approval_decision = "executed"` (optimistic — visitor clicked first) per PRP pseudocode. The 200 path sets it from `approve_body["status"]` (one of `executed` / `rejected` / `expired`). + +### Assumption #3 — `SHOWCASE_RICH_STEP_NAMES` filter semantics + PHASE_DEFS.ts filter restructure + +**Filter shape verified:** the current filter (lines 87–93) keeps everything on `showcase_rich` and excludes `SHOWCASE_RICH_STEP_NAMES` on every other scenario: +```ts +if (scenario === 'showcase_rich') return ALL_STEPS +return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) +``` + +**Design Z requires a small filter restructure** beyond what PRP-41 Task 9 reads. Under design Z, BOTH `'agent'` (legacy step name, demo_minimal) AND `'agent_hitl_flow'` (showcase_rich) appear in `ALL_STEPS`. The current `if scenario === 'showcase_rich' return ALL_STEPS` would return BOTH on showcase_rich (bug — we want only `agent_hitl_flow`). + +**Recommended restructure (one of two options, pick either):** + +**(a)** Introduce a `DEMO_MINIMAL_ONLY_STEP_NAMES` set: +```ts +const DEMO_MINIMAL_ONLY_STEP_NAMES = new Set(['agent']) // legacy agent only + +export function phaseDefsForScenario(scenario: ScenarioPreset): readonly PhaseDef[] { + if (scenario === 'showcase_rich') { + return ALL_STEPS.filter((d) => !DEMO_MINIMAL_ONLY_STEP_NAMES.has(d.step)) + } + return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) +} +``` + +**(b)** Add `'agent_hitl_flow'` and `'ops_snapshot'` to `SHOWCASE_RICH_STEP_NAMES` +AND add `'agent'` to a `DEMO_MINIMAL_ONLY_STEP_NAMES` set (same shape as a). + +Both options produce the same result; (a) is the cleaner refactor. + +> **PRP wording note:** PRP-41 § Task 9 pseudocode reads partially as if option (a) is intended ("ADD a sibling row preserving it … and exclude it from showcase_rich via SHOWCASE_RICH_STEP_NAMES") — but `SHOWCASE_RICH_STEP_NAMES` is the wrong direction (it's the exclude-from-demo-minimal set). The implementer **MUST** use option (a) (a NEW set, not the existing one) OR restructure the filter conditional. + +### Assumption #4 — `OpsSummaryResponse.runs.counts` shape + +**Verified — `runs.counts` (NOT `runs.histogram`); per-item key is `count` (NOT `value`).** + +`RunHealth` (line 111) carries `counts: list[StatusCount]`; `StatusCount` (line 80) has `status: str` + `count: int`. PRP-41 pseudocode `sum(int(c.get("count", 0)) for c in counts if isinstance(c, dict))` is correct. + +`OpsService.get_summary` (line 225, `app/features/ops/service.py`) constructs each `StatusCount(status=..., count=...)` from the DB grouping — confirmed live shape. + +### Assumption #5 — `_Client.request` list-body wrapper + +**Already CONFIRMED in PRP body.** Verified at pipeline.py line 178: `body if isinstance(body, dict) else {"_raw": body}`. Every `/ops/*` and `/agents/*` endpoint PRP-41 calls returns a dict body — `_raw` does not come into play for PRP-41. + +--- + +## 6. step.data payload sources for KPI strip / Inspect-Artifacts panel + +Verified each PRP-39/40 source step's `step.data` keys against current `pipeline.py`: + +| KPI tile | Source step | Key | Confirmed location | +|---|---|---|---| +| `runs_registered` | `register` / `stale_alias_trigger` / `safer_promote_flow` / `v2_train` | `run_id` | pipeline.py line ~1100, ~1610, ~1750, ~970 | +| `aliases_live` | `ops_snapshot` (PRP-41) | `total_aliases` | new in PRP-41 | +| `batch_items_completed` | `batch_preset` | `completed_items` | pipeline.py ~1880 | +| `scenario_plans_saved` | `scenario_simulate_and_save` + `multi_plan_compare` | `scenario_id` + `winner_scenario_id` | pipeline.py ~1170 + ~1280 | +| `rag_chunks_indexed` | `rag_index_subset` | `total_chunks` | pipeline.py ~1350 | + +All sources match PRP-41 § Task 14 expectations. + +For Inspect-Artifacts deep-link dependencies, every required `step.data.*` key +already exists on `dev` and is documented in `resolveInspectHref` (showcase.tsx +lines 17–84) which PRP-41 extends. + +--- + +## 7. PRP wording patches required + +### Patch — Task 9 filter semantics + +**Location:** PRP-41 § Per task pseudocode → "Task 9 — PHASE_DEFS.ts extension" (lines ~1528–1560). + +**Current wording (incomplete):** +> NOTE: demo_minimal still emits the legacy step name "agent" — the FE's +> `phaseDefsForScenario('demo_minimal')` filter must keep both step ids in +> `ALL_STEPS` and select by name (Task 1 confirms the filter shape). +> If the lockstep test's demo_minimal assertion explicitly asserts `'agent'` +> step under `'agent'` phase, ADD a sibling row preserving it: +> `{ phase: 'agent', step: 'agent', label: 'Agent chat (legacy)' }`, +> ... and exclude it from showcase_rich via SHOWCASE_RICH_STEP_NAMES. + +**Issue:** PRP-41's recommended **design Z** unifies the phase id to `'agents'` +for BOTH demo_minimal and showcase_rich. Keeping `phase: 'agent'` on the +sibling row is **design X**, which the PRP § "demo_minimal phase rename +trade-off" recommends AGAINST. AND `SHOWCASE_RICH_STEP_NAMES` is the +"exclude from demo_minimal" set, NOT the "exclude from showcase_rich" set +— the current filter cannot exclude `'agent'` from showcase_rich. + +**Required patch — implementer follows this revised pseudocode:** + +```ts +// ALL_STEPS: keep the legacy "agent" row under the NEW phase id 'agents', +// and add a sibling row for the HITL flow + the new ops snapshot row. +// { phase: 'agents', step: 'agent', label: 'Agent chat (legacy)' }, +// { phase: 'agents', step: 'agent_hitl_flow', label: 'Agent HITL approval' }, +// { phase: 'ops', step: 'ops_snapshot', label: 'Ops snapshot' }, + +// NEW set — excludes the legacy step from showcase_rich: +const DEMO_MINIMAL_ONLY_STEP_NAMES = new Set(['agent']) + +// SHOWCASE_RICH_STEP_NAMES gets 'agent_hitl_flow' + 'ops_snapshot' added +// (so they're excluded from demo_minimal). Filter restructured: +export function phaseDefsForScenario(scenario: ScenarioPreset): readonly PhaseDef[] { + if (scenario === 'showcase_rich') { + return ALL_STEPS.filter((d) => !DEMO_MINIMAL_ONLY_STEP_NAMES.has(d.step)) + } + return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) +} +``` + +**Implementer note:** This restructure is small and additive — the existing +`SHOWCASE_RICH_STEP_NAMES` set is preserved, and `DEMO_MINIMAL_ONLY_STEP_NAMES` +is new. Both filters now use the same shape. Lockstep test fixture changes +follow naturally: +- `demo_minimal`: tuple list ends `['agents', 'agent']`, `['cleanup', 'cleanup']` (10-tuple + cleanup = 11 tuples — count unchanged). +- `showcase_rich`: tuple list ends `['agents', 'agent_hitl_flow']`, `['ops', 'ops_snapshot']`, `['cleanup', 'cleanup']` (count: 22 + 2 = 24, swap + insert one row). + +**This patch is purely a clarification — no PRP-41 task is added or removed.** + +### No other PRP body patches required. + +All other cited contracts match field-for-field. The PRP body's own notes +about `drift_direction`, `action_id`, `pending_approval` already capture the +INITIAL-41 drift correctly. + +--- + +## 8. Verdict + +✅ **GO for implementation. Proceed to Tasks 2–19.** + +- All 5 contract assumptions resolved. +- Zero field-level drift in backend or frontend contracts. +- One small filter-restructure clarification documented (§7) — does NOT add + scope; the implementer simply applies the resolved option (a) when + implementing Task 9. +- Design Z is verified viable: `run_pipeline` already computes the four + indices the orchestrator-fill-in needs. +- The Stop button design is sound: `WebSocketDisconnect` propagation + releases `_pipeline_lock` (confirmed at `service.py:41` + `routes.py:74`). +- Approve double-fire: 400 absorption is correct. + +### Implementer checklist (Task 2 onward) + +1. Implement design Z verbatim — same phase id `'agents'` for both scenarios. +2. Add `DEMO_MINIMAL_ONLY_STEP_NAMES = new Set(['agent'])` and restructure + the filter per §7. +3. Backend lockstep test updates: + - `test_phase_table_demo_minimal_matches_legacy_11_steps`: rename to + `test_phase_table_demo_minimal_matches_11_steps_with_agents_phase` and + swap the agent tuple to `("agents", "agent")`. + - `test_phase_table_showcase_rich_adds_…`: rename to include `_24_steps` + and apply the swap + insert. +4. Frontend lockstep test updates: mirror the same shape in `PHASE_DEFS.test.ts`. +5. `step_agent_hitl_flow`: NEVER raise; map every error path to `("skip", ...)`. +6. `step_ops_snapshot`: `("warn", ...)` on all-three-failed, never `("fail", ...)`. +7. Vertical-slice grep guard must remain empty: + ```bash + git grep -nE "from app\.features\.(agents|ops|registry|scenarios|rag)" \ + app/features/demo/ + ``` +8. Frontend type-check uses **project-scoped** invocation: + `cd frontend && pnpm tsc --noEmit -p tsconfig.app.json`. + +— end of report — diff --git a/app/features/demo/pipeline.py b/app/features/demo/pipeline.py index 5c96f64c..6c51bda2 100644 --- a/app/features/demo/pipeline.py +++ b/app/features/demo/pipeline.py @@ -130,7 +130,12 @@ class _Client: :class:`_StepError` with the parsed RFC 7807 body. """ - def __init__(self, app: FastAPI) -> None: + def __init__( + self, + app: FastAPI, + *, + event_sink: list[StepEvent] | None = None, + ) -> None: self._client = httpx.AsyncClient( # raise_app_exceptions=False makes the in-process transport behave # like a real network client: an unhandled error inside a driven @@ -140,6 +145,12 @@ def __init__(self, app: FastAPI) -> None: base_url="http://demo.internal", timeout=_HTTP_TIMEOUT, ) + # PRP-41 — opt-in intermediate event sink. Only the HITL step uses it; + # `run_pipeline` drains the buffer just before each terminal step_complete + # and stamps step_index / total_steps / phase_index / phase_total / + # phase_name onto every drained event. None in unit tests where the + # sink isn't wired up. + self._event_sink = event_sink async def __aenter__(self) -> _Client: return self @@ -147,6 +158,19 @@ async def __aenter__(self) -> _Client: async def __aexit__(self, *_exc: object) -> None: await self._client.aclose() + def yield_event(self, event: StepEvent) -> None: + """PRP-41 — buffer an intermediate StepEvent for the orchestrator. + + The orchestrator (``run_pipeline``) drains the sink between the step + function's return and the terminal ``step_complete`` it yields. Step + functions that do not need to surface intermediate state never call + this. If no sink is wired (e.g. in unit tests), the event is silently + dropped — callers must not rely on it for terminal payload. + """ + if self._event_sink is None: + return + self._event_sink.append(event) + async def request( self, step: str, @@ -225,6 +249,10 @@ class DemoContext: price_cut_scenario_id: str | None = None holiday_scenario_id: str | None = None embedding_unreachable: bool = False + # PRP-41 — additive HITL approval state, populated only by + # step_agent_hitl_flow on SHOWCASE_RICH. Remain None on every other path. + approval_action_id: str | None = None + agent_approval_decision: str | None = None # "executed"|"rejected"|"expired"|"timed_out" # ============================================================================= @@ -269,6 +297,17 @@ def _llm_key_present() -> bool: return False +# PRP-41 — HITL approval flow constants. Display delay gives the visitor a +# window to click Approve on the FE before the backend auto-fires; the hard +# timeout is the load-bearing fallback so a hung agent never stops the demo. +_APPROVAL_DISPLAY_DELAY_S = 3.0 +_APPROVAL_HARD_TIMEOUT_S = 90.0 +_HITL_PROMPT = ( + "Save a 10% price-cut scenario plan for the demo-production model " + "as 'showcase-agent-savedplan'." +) + + # PRP-40 — artifact-key parser for /scenarios/* run_id resolution. Two ID # spaces: model_run.run_id (32-char UUID-hex) vs scenarios.run_id (12-char # artifact key parsed from `model_{KEY}.joblib`). Memory anchor: @@ -1972,6 +2011,318 @@ async def step_cleanup(ctx: DemoContext, client: _Client) -> StepResult: ) +# ============================================================================= +# PRP-41 — Agents (HITL) + Ops snapshot phases (showcase_rich only) +# ============================================================================= + + +async def step_agent_hitl_flow(ctx: DemoContext, client: _Client) -> StepResult: + """PRP-41 — HITL approval round-trip on the experiment agent. + + Flow: + 1. ``_llm_key_present()`` -> skip when no key. + 2. ``POST /agents/sessions`` (agent_type=experiment) -> session_id. + 3. ``POST /agents/sessions/{id}/chat`` with the HITL prompt; the + experiment agent calls ``tool_save_scenario`` which short-circuits + on the ``save_scenario`` entry in ``agent_require_approval``. The + chat response carries ``pending_approval=true`` + + ``pending_action: PendingAction``. + 4. ``client.yield_event(...)`` an intermediate step_complete with + ``status='running'`` + ``awaiting_approval=true`` so the FE can + render the Approve button. + 5. Sleep ``_APPROVAL_DISPLAY_DELAY_S`` -- a one-click FE Approve may + pre-empt the auto-approve in this window. + 6. ``POST /agents/sessions/{id}/approve`` with ``{action_id, + approved: true}``. Absorb 4xx (the FE pre-empted; the action was + already consumed). + 7. Terminal: ``pass`` with the approval decision in step.data. + + Skip-gracefully on every error path (session-create / chat / approve + failure, or the agent never triggers ``save_scenario``). Never raises. + + Hard timeout: if the elapsed time exceeds ``_APPROVAL_HARD_TIMEOUT_S`` + before step (6) completes, returns ``skip`` with + ``approval_decision='timed_out'``. + """ + key_present = _llm_key_present() + logger.info("demo.agent_hitl_flow.key_present", present=key_present) + if not key_present: + return ( + "skip", + "no API key matching agent_default_model provider", + {}, + ) + + started_at = time.monotonic() + + # (1+2) -- session. + try: + create_body = await client.request( + "agent_hitl_flow[session]", + "POST", + "/agents/sessions", + json_body={"agent_type": "experiment", "initial_context": None}, + ) + except _StepError as exc: + return ("skip", f"session-create failed: {exc}", {}) + session_id_raw = create_body.get("session_id") + if not isinstance(session_id_raw, str): + return ("skip", "no session_id returned", {}) + session_id: str = session_id_raw + ctx.session_id = session_id + + # (3) -- chat that triggers the gated tool. + try: + chat_body = await client.request( + "agent_hitl_flow[chat]", + "POST", + f"/agents/sessions/{session_id}/chat", + json_body={"message": _HITL_PROMPT, "stream": False}, + ) + except _StepError as exc: + return ( + "skip", + f"chat round-trip failed: {exc}", + {"session_id": session_id}, + ) + + pending_approval = bool(chat_body.get("pending_approval", False)) + raw_action = chat_body.get("pending_action") or {} + pending_action: dict[str, Any] = raw_action if isinstance(raw_action, dict) else {} + tokens_used = int(chat_body.get("tokens_used", 0)) + raw_tool_calls = chat_body.get("tool_calls", []) + tool_count = len(raw_tool_calls) if isinstance(raw_tool_calls, list) else 0 + + if not pending_approval or not pending_action: + # The agent didn't trigger save_scenario (e.g. answered directly or + # picked a different tool). Skip-by-design: not a failure. + return ( + "skip", + ( + f"agent did not trigger save_scenario " + f"(tokens={tokens_used}, tool_calls={tool_count})" + ), + { + "session_id": session_id, + "tokens_used": tokens_used, + "tool_calls_count": tool_count, + }, + ) + + action_id_raw = pending_action.get("action_id") + if not isinstance(action_id_raw, str): + return ( + "skip", + "pending_action.action_id missing", + {"session_id": session_id}, + ) + action_id: str = action_id_raw + ctx.approval_action_id = action_id + + # (4) -- intermediate event so the FE renders Approve. step_index / + # total_steps / phase_index / phase_total are stamped by the orchestrator + # when it drains the sink (see run_pipeline). + elapsed_ms = (time.monotonic() - started_at) * 1000.0 + client.yield_event( + StepEvent( + event_type="step_complete", + step_name="agent_hitl_flow", + step_index=0, + total_steps=0, + status="running", + detail="awaiting approval (auto-approve in 3 s)", + duration_ms=elapsed_ms, + data={ + "awaiting_approval": True, + "approval_url": f"/agents/sessions/{session_id}/approve", + "action_id": action_id, + "session_id": session_id, + "tokens_used": tokens_used, + "tool_calls_count": tool_count, + }, + phase_name=PHASE_AGENTS, + ) + ) + + # (5) -- display delay. + elapsed_after_intermediate = time.monotonic() - started_at + delay = max(0.0, _APPROVAL_DISPLAY_DELAY_S - elapsed_after_intermediate) + if delay > 0: + await asyncio.sleep(delay) + + # (5b) -- hard-timeout check BEFORE the approve POST. + elapsed_before_approve = time.monotonic() - started_at + if elapsed_before_approve > _APPROVAL_HARD_TIMEOUT_S: + ctx.agent_approval_decision = "timed_out" + return ( + "skip", + "approval timed out -- pipeline continued", + { + "session_id": session_id, + "action_id": action_id, + "approval_decision": "timed_out", + "tokens_used": tokens_used, + "tool_calls_count": tool_count, + "timed_out": True, + }, + ) + + # (6) -- POST /approve. Absorb 4xx (FE pre-empted) per Task 1 §5 #2: + # AgentService.approve_action returns 400 ("No pending action") when the + # action was already consumed by the FE's optimistic Approve click. + approval_decision = "executed" + try: + approve_body = await client.request( + "agent_hitl_flow[approve]", + "POST", + f"/agents/sessions/{session_id}/approve", + json_body={"action_id": action_id, "approved": True}, + ) + raw_status = approve_body.get("status", "executed") + if isinstance(raw_status, str): + approval_decision = raw_status + except _StepError as exc: + if 400 <= exc.status_code < 500: + # FE pre-empted -- the approval already landed. Optimistic default. + logger.info( + "demo.agent_hitl_flow.approve_pre_empted", + session_id=session_id, + action_id=action_id, + status_code=exc.status_code, + ) + approval_decision = "executed" + else: + return ( + "skip", + f"approve failed: {exc}", + { + "session_id": session_id, + "action_id": action_id, + "tokens_used": tokens_used, + "tool_calls_count": tool_count, + }, + ) + + ctx.agent_approval_decision = approval_decision + + return ( + "pass", + ( + f"session={session_id[:8]}... tokens={tokens_used} " + f"tool_calls={tool_count} approved={approval_decision}" + ), + { + "session_id": session_id, + "action_id": action_id, + "approval_decision": approval_decision, + "tokens_used": tokens_used, + "tool_calls_count": tool_count, + }, + ) + + +async def step_ops_snapshot(_ctx: DemoContext, client: _Client) -> StepResult: + """PRP-41 — fetch /ops/* endpoints and embed a 5-key KPI payload. + + Three GETs: + - GET /ops/summary + - GET /ops/retraining-candidates?limit=5 + - GET /ops/model-health?limit=5 + + All endpoints are 200-safe on an empty DB (verified by + ``test_summary_resilient_structural`` + ``test_model_health_resilient_structural``). + + Returns ``("pass", ...)`` when at least one of the three returned a body. + Returns ``("warn", ...)`` only when all three failed -- never ``fail`` + (ops is observability, not a hard pipeline dependency). + """ + summary: dict[str, Any] = {} + candidates_body: dict[str, Any] = {} + health_body: dict[str, Any] = {} + + try: + summary = await client.request( + "ops_snapshot[summary]", + "GET", + "/ops/summary", + ) + except _StepError as exc: + logger.warning("demo.ops_snapshot.summary_failed", status_code=exc.status_code) + + try: + candidates_body = await client.request( + "ops_snapshot[retraining]", + "GET", + "/ops/retraining-candidates?limit=5", + ) + except _StepError as exc: + logger.warning("demo.ops_snapshot.retraining_failed", status_code=exc.status_code) + + try: + health_body = await client.request( + "ops_snapshot[health]", + "GET", + "/ops/model-health?limit=5", + ) + except _StepError as exc: + logger.warning("demo.ops_snapshot.health_failed", status_code=exc.status_code) + + raw_aliases = summary.get("aliases") or [] + aliases: list[dict[str, Any]] = ( + [a for a in raw_aliases if isinstance(a, dict)] if isinstance(raw_aliases, list) else [] + ) + stale_count = sum(1 for a in aliases if a.get("is_stale")) + total_aliases = len(aliases) + + raw_runs = summary.get("runs") or {} + runs: dict[str, Any] = raw_runs if isinstance(raw_runs, dict) else {} + raw_counts = runs.get("counts") or [] + # Task 1 confirmed: RunHealth.counts is list[StatusCount] where + # StatusCount = {status: str, count: int}. + total_runs = ( + sum(int(c.get("count", 0)) for c in raw_counts if isinstance(c, dict)) + if isinstance(raw_counts, list) + else 0 + ) + + raw_candidates = candidates_body.get("candidates") or [] + retraining_count = len(raw_candidates) if isinstance(raw_candidates, list) else 0 + + raw_entries = health_body.get("entries") or [] + degrading_count = ( + sum( + 1 + for e in raw_entries + if isinstance(e, dict) and e.get("drift_direction") == "degrading" + ) + if isinstance(raw_entries, list) + else 0 + ) + + data: dict[str, Any] = { + "stale_aliases_count": stale_count, + "retraining_candidates_count": retraining_count, + "total_runs": total_runs, + "total_aliases": total_aliases, + "degrading_health_count": degrading_count, + } + + if summary or candidates_body or health_body: + detail = ( + f"stale_aliases={stale_count} retraining={retraining_count} " + f"runs={total_runs} aliases={total_aliases} " + f"degrading={degrading_count}" + ) + return ("pass", detail, data) + + # All three endpoints failed -- warn (pipeline still goes green). + return ( + "warn", + "/ops/* all 4xx/5xx -- ops snapshot unavailable", + data, + ) + + # ============================================================================= # Orchestration # ============================================================================= @@ -1992,7 +2343,12 @@ async def step_cleanup(ctx: DemoContext, client: _Client) -> StepResult: PHASE_PLANNING = "planning" PHASE_KNOWLEDGE = "knowledge" PHASE_VERIFY = "verify" -PHASE_AGENT = "agent" +# PRP-41 — design Z: unified "agents" phase id used by BOTH demo_minimal/sparse +# (legacy step_agent) AND showcase_rich (step_agent_hitl_flow). The PRP-38 +# PHASE_AGENT constant is replaced; no other code referenced it by name. +PHASE_AGENTS = "agents" +# PRP-41 — new ops phase, populated only on SHOWCASE_RICH. +PHASE_OPS = "ops" PHASE_CLEANUP = "cleanup" @@ -2026,7 +2382,17 @@ def _phase_table(scenario: ScenarioPreset) -> list[PhaseStep]: planning_steps: list[tuple[str, StepFn]] = [] knowledge_steps: list[tuple[str, StepFn]] = [] verify_steps: list[tuple[str, StepFn]] = [("verify", step_verify)] - agent_steps: list[tuple[str, StepFn]] = [("agent", step_agent)] + # PRP-41 — design Z: same phase id "agents" for both branches; SHOWCASE_RICH + # swaps the legacy single-turn `step_agent` for the HITL flow. + agent_steps: list[tuple[str, StepFn]] = ( + [("agent_hitl_flow", step_agent_hitl_flow)] + if scenario is ScenarioPreset.SHOWCASE_RICH + else [("agent", step_agent)] + ) + # PRP-41 — new ops phase. Empty on demo_minimal / sparse (no row emitted). + ops_steps: list[tuple[str, StepFn]] = ( + [("ops_snapshot", step_ops_snapshot)] if scenario is ScenarioPreset.SHOWCASE_RICH else [] + ) cleanup_steps: list[tuple[str, StepFn]] = [("cleanup", step_cleanup)] if scenario is ScenarioPreset.SHOWCASE_RICH: data_steps += [ @@ -2063,7 +2429,10 @@ def _phase_table(scenario: ScenarioPreset) -> list[PhaseStep]: rows += [(PHASE_PLANNING, name, fn) for name, fn in planning_steps] rows += [(PHASE_KNOWLEDGE, name, fn) for name, fn in knowledge_steps] rows += [(PHASE_VERIFY, name, fn) for name, fn in verify_steps] - rows += [(PHASE_AGENT, name, fn) for name, fn in agent_steps] + # PRP-41 — both branches use PHASE_AGENTS; SHOWCASE_RICH ALSO appends an + # ops_snapshot row under the new PHASE_OPS, BEFORE cleanup. + rows += [(PHASE_AGENTS, name, fn) for name, fn in agent_steps] + rows += [(PHASE_OPS, name, fn) for name, fn in ops_steps] rows += [(PHASE_CLEANUP, name, fn) for name, fn in cleanup_steps] return rows @@ -2109,8 +2478,12 @@ async def run_pipeline(app: FastAPI, req: DemoRunRequest) -> AsyncIterator[StepE ) wall_start = time.monotonic() any_fail = False + # PRP-41 — buffer for intermediate events the HITL step emits via + # ``client.yield_event(...)``. Drained + stamped with the row's + # index/phase fields immediately BEFORE each terminal step_complete. + intermediate_events: list[StepEvent] = [] - async with _Client(app) as client: + async with _Client(app, event_sink=intermediate_events) as client: for index, (phase_name, name, fn) in enumerate(rows, start=1): phase_index = phase_index_by_phase[phase_name] yield StepEvent( @@ -2146,6 +2519,21 @@ async def run_pipeline(app: FastAPI, req: DemoRunRequest) -> AsyncIterator[StepE {}, ) duration_ms = (time.monotonic() - t0) * 1000 + # PRP-41 — drain any intermediate events the step buffered BEFORE + # the terminal step_complete. Stamp the row's index/phase fields + # so the FE state machine processes them as if they were emitted + # by the orchestrator. Order matters: intermediate events must + # land before the terminal so "awaiting_approval" precedes + # "approved" in the WS stream. + for ev in intermediate_events: + ev.step_index = index + ev.total_steps = total + ev.phase_index = phase_index + ev.phase_total = phase_total + # phase_name is set by the step fn already, but mirror in case. + ev.phase_name = phase_name + yield ev + intermediate_events.clear() yield StepEvent( event_type="step_complete", step_name=name, diff --git a/app/features/demo/tests/test_pipeline.py b/app/features/demo/tests/test_pipeline.py index 93e697a8..75b33130 100644 --- a/app/features/demo/tests/test_pipeline.py +++ b/app/features/demo/tests/test_pipeline.py @@ -266,15 +266,46 @@ def _canned_response( } if path == "/ops/summary": # PRP-39 — stale_alias_trigger GETs after registering a V=3 run. + # PRP-41 — step_ops_snapshot also consumes this; the additive runs.counts + # block + extra is_stale flag on the alias drive the KPI tiles. return { "aliases": [ { "alias_name": "demo-production", + "is_stale": True, "stale_reason": "feature_frame_version_mismatch", "alias_feature_frame_version": 2, "comparable_run_feature_frame_version": 3, } - ] + ], + "runs": { + "counts": [ + {"status": "success", "count": 5}, + {"status": "failed", "count": 1}, + ], + }, + } + if path.startswith("/ops/retraining-candidates"): + # PRP-41 — canned 2 retraining candidates so step_ops_snapshot's + # retraining KPI tile renders > 0. + return { + "candidates": [ + {"store_id": 7, "product_id": 3, "priority_score": 0.8}, + {"store_id": 7, "product_id": 4, "priority_score": 0.6}, + ], + "total_evaluated": 2, + "generated_at": "2026-05-26T10:00:00Z", + } + if path.startswith("/ops/model-health"): + # PRP-41 — 3 health entries; 1 degrading so degrading_count == 1. + return { + "entries": [ + {"store_id": 7, "product_id": 3, "drift_direction": "stable"}, + {"store_id": 7, "product_id": 4, "drift_direction": "degrading"}, + {"store_id": 8, "product_id": 3, "drift_direction": "improving"}, + ], + "total_evaluated": 3, + "generated_at": "2026-05-26T10:00:00Z", } if path == "/batch/forecasting": # PRP-39 — batch_preset POSTs the preset expansion. Return terminal @@ -302,8 +333,16 @@ def _build_fake_client(artifact_path: str, wapes: dict[str, float]) -> type: """Build a canned-response stand-in class for ``pipeline._Client``.""" class _FakeClient: - def __init__(self, _app: Any) -> None: + def __init__( + self, + _app: Any, + *, + event_sink: list[Any] | None = None, + ) -> None: + # PRP-41 — accept the optional event_sink the orchestrator passes + # in; remember it so ``yield_event`` can feed intermediate frames. self.calls: list[tuple[str, str]] = [] + self._event_sink = event_sink async def __aenter__(self) -> _FakeClient: return self @@ -311,6 +350,12 @@ async def __aenter__(self) -> _FakeClient: async def __aexit__(self, *_exc: object) -> None: return None + def yield_event(self, event: Any) -> None: + # PRP-41 — mirror pipeline._Client.yield_event semantics. + if self._event_sink is None: + return + self._event_sink.append(event) + async def request( self, step: str, @@ -474,8 +519,8 @@ async def test_run_pipeline_with_reset_and_seed(monkeypatch, tmp_path): async def test_run_pipeline_stops_on_failed_step(monkeypatch): class _FailingClient: - def __init__(self, _app: Any) -> None: - pass + def __init__(self, _app: Any, *, event_sink: list[Any] | None = None) -> None: + self._event_sink = event_sink async def __aenter__(self) -> _FailingClient: return self @@ -483,6 +528,11 @@ async def __aenter__(self) -> _FailingClient: async def __aexit__(self, *_exc: object) -> None: return None + def yield_event(self, event: Any) -> None: + if self._event_sink is None: + return + self._event_sink.append(event) + async def request( self, step: str, @@ -518,8 +568,8 @@ async def test_run_pipeline_transport_error_becomes_fail(monkeypatch): import httpx class _BrokenClient: - def __init__(self, _app: Any) -> None: - pass + def __init__(self, _app: Any, *, event_sink: list[Any] | None = None) -> None: + self._event_sink = event_sink async def __aenter__(self) -> _BrokenClient: return self @@ -527,6 +577,11 @@ async def __aenter__(self) -> _BrokenClient: async def __aexit__(self, *_exc: object) -> None: return None + def yield_event(self, event: Any) -> None: + if self._event_sink is None: + return + self._event_sink.append(event) + async def request(self, *_a: object, **_k: object) -> dict[str, Any]: raise httpx.ConnectError("connection refused") @@ -544,12 +599,13 @@ async def request(self, *_a: object, **_k: object) -> dict[str, Any]: # ============================================================================= -def test_phase_table_demo_minimal_matches_legacy_11_steps(): - """PRP-38 — phase_table for DEMO_MINIMAL drops to the legacy 11-step flow. +def test_phase_table_demo_minimal_matches_legacy_11_steps_under_agents_phase(): + """PRP-38 / PRP-41 — DEMO_MINIMAL keeps the legacy 11-step flow. - Test gates the (phase_name, step_name) lockstep contract with the frontend - PHASE_DEFS.ts. If a phase or step is added in either tier without the - matching change here, this test fails. + PRP-41 — design Z: the legacy ``step_agent`` row now lives under the + unified ``agents`` phase id (was ``agent``). The step name stays + ``agent`` so the wire payload + the frontend's legacy step rendering + keep working. Step count unchanged. """ rows = pipeline._phase_table(ScenarioPreset.DEMO_MINIMAL) by_phase_step = [(p, s) for p, s, _fn in rows] @@ -563,22 +619,27 @@ def test_phase_table_demo_minimal_matches_legacy_11_steps(): ("decision", "backtest"), ("decision", "register"), ("verify", "verify"), - ("agent", "agent"), + ("agents", "agent"), ("cleanup", "cleanup"), ] -def test_phase_table_showcase_rich_adds_v2_decision_portfolio_planning_knowledge_steps(): - """PRP-38 + PRP-39 + PRP-40 — phase_table for SHOWCASE_RICH is the canonical 23 rows. +def test_phase_table_showcase_rich_emits_24_steps_with_agents_hitl_and_ops_snapshot(): + """PRP-38 + PRP-39 + PRP-40 + PRP-41 — SHOWCASE_RICH is the canonical 24 rows. PRP-38 shipped 3 (phase2_enrichment, historical_backfill, v2_train). PRP-39 added 4 (champion_compat_compare, stale_alias_trigger, safer_promote_flow, batch_preset) plus a new ``portfolio`` phase between ``decision`` and ``verify``. - PRP-40 adds 5 (scenario_simulate_and_save, multi_plan_compare, + PRP-40 added 5 (scenario_simulate_and_save, multi_plan_compare, embedding_provider_probe, rag_index_subset, rag_retrieve_probe) under two new ``planning`` + ``knowledge`` phases, AFTER portfolio and BEFORE verify - via a relative anchor. Total: 23 rows across 9 phases. + via a relative anchor. + PRP-41 — design Z: SHOWCASE_RICH swaps the legacy ``agent`` step for + ``agent_hitl_flow`` (HITL approval round-trip) under the unified + ``agents`` phase id, AND appends a new ``ops`` phase carrying + ``ops_snapshot`` IMMEDIATELY AFTER ``agents``, BEFORE ``cleanup``. + Total: 24 rows across 10 phases. """ rows = pipeline._phase_table(ScenarioPreset.SHOWCASE_RICH) by_phase_step = [(p, s) for p, s, _fn in rows] @@ -607,7 +668,9 @@ def test_phase_table_showcase_rich_adds_v2_decision_portfolio_planning_knowledge ("knowledge", "rag_index_subset"), ("knowledge", "rag_retrieve_probe"), ("verify", "verify"), - ("agent", "agent"), + # PRP-41 — agents (HITL) + ops snapshot, both under new phase ids. + ("agents", "agent_hitl_flow"), + ("ops", "ops_snapshot"), ("cleanup", "cleanup"), ] @@ -650,8 +713,11 @@ async def test_run_pipeline_emits_phase_fields(monkeypatch, tmp_path): events = [e async for e in pipeline.run_pipeline(app=_FAKE_APP, req=DemoRunRequest())] step_events = [e for e in events if e.event_type in {"step_start", "step_complete"}] assert step_events + # PRP-41 — design Z renames the legacy "agent" phase to "agents" for ALL + # scenarios. Demo_minimal still emits 6 phases (data/modeling/decision/ + # verify/agents/cleanup); ops is showcase_rich-only. for ev in step_events: - assert ev.phase_name in {"data", "modeling", "decision", "verify", "agent", "cleanup"} + assert ev.phase_name in {"data", "modeling", "decision", "verify", "agents", "cleanup"} assert ev.phase_index is not None and ev.phase_index >= 1 assert ev.phase_total == 6 # Verify phases appear in canonical order. @@ -659,7 +725,7 @@ async def test_run_pipeline_emits_phase_fields(monkeypatch, tmp_path): for ev in step_events: if ev.phase_name and ev.phase_name not in phases_seen: phases_seen.append(ev.phase_name) - assert phases_seen == ["data", "modeling", "decision", "verify", "agent", "cleanup"] + assert phases_seen == ["data", "modeling", "decision", "verify", "agents", "cleanup"] async def test_run_pipeline_showcase_rich_runs_v2_and_buckets(monkeypatch, tmp_path): @@ -714,14 +780,12 @@ async def test_run_pipeline_showcase_rich_runs_v2_and_buckets(monkeypatch, tmp_p assert final.data["v2_run_id"] == "demo-run-abc123def456" -async def test_run_pipeline_showcase_rich_emits_23_steps(monkeypatch, tmp_path): - """PRP-38 + PRP-39 + PRP-40 — SHOWCASE_RICH = 11 base + 3 PRP-38 + 4 PRP-39 + 5 PRP-40 = 23 total steps. +async def test_run_pipeline_showcase_rich_emits_24_steps(monkeypatch, tmp_path): + """PRP-38 + PRP-39 + PRP-40 + PRP-41 — SHOWCASE_RICH = 24 total steps. - PRP-38 shipped 3 (phase2_enrichment + historical_backfill + v2_train). - PRP-39 added 4 (champion_compat_compare + stale_alias_trigger + - safer_promote_flow + batch_preset). - PRP-40 adds 5 (scenario_simulate_and_save + multi_plan_compare + - embedding_provider_probe + rag_index_subset + rag_retrieve_probe). + 11 base + 3 PRP-38 + 4 PRP-39 + 5 PRP-40 + 1 PRP-41 (ops_snapshot — the + legacy `agent` step is swapped for `agent_hitl_flow` not added, hence + +1 net for PRP-41). """ artifact = tmp_path / "artifacts" / "models" / "model_x.joblib" artifact.parent.mkdir(parents=True, exist_ok=True) @@ -733,10 +797,10 @@ async def test_run_pipeline_showcase_rich_emits_23_steps(monkeypatch, tmp_path): req = DemoRunRequest(scenario=ScenarioPreset.SHOWCASE_RICH) events = [e async for e in pipeline.run_pipeline(app=_FAKE_APP, req=req)] completes = [e for e in events if e.event_type == "step_complete"] - assert len(completes) == 23 - # Every event reports total_steps=23 + assert len(completes) == 24 + # Every event reports total_steps=24 for ev in completes: - assert ev.total_steps == 23 + assert ev.total_steps == 24 # ============================================================================= @@ -1431,3 +1495,370 @@ async def test_run_pipeline_showcase_rich_skips_knowledge_when_provider_unreacha final = events[-1] assert final.event_type == "pipeline_complete" assert final.status == "pass" + + +# ============================================================================= +# PRP-41 — agents (HITL) + ops snapshot per-step unit tests +# ============================================================================= + + +def _make_hitl_client( + *, + chat_pending: bool = True, + chat_action_id: str = "action-abc-123", + approve_status: int = 200, + approve_body: dict[str, Any] | None = None, + session_id: str = "sess-test-0001", + capture_intermediate: bool = True, +) -> tuple[Any, list[Any]]: + """Build a fake client that replays the HITL chat+approve round-trip. + + Returns ``(client, intermediate_events)`` so a test can assert what the + HITL step buffered into the event sink. + """ + intermediate: list[Any] = [] + + class _HitlClient: + def __init__( + self, + _app: Any = None, + *, + event_sink: list[Any] | None = None, + ) -> None: + self.calls: list[tuple[str, str]] = [] + self._event_sink = event_sink if event_sink is not None else intermediate + + async def __aenter__(self) -> _HitlClient: + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + def yield_event(self, event: Any) -> None: + if not capture_intermediate or self._event_sink is None: + return + self._event_sink.append(event) + + async def request( + self, + step: str, + method: str, + path: str, + *, + json_body: dict[str, Any] | None = None, + ) -> dict[str, Any]: + self.calls.append((method, path)) + if path == "/agents/sessions": + return {"session_id": session_id, "agent_type": "experiment"} + if path.endswith("/chat"): + if chat_pending: + return { + "session_id": session_id, + "message": "I'll save that scenario.", + "tool_calls": [{"tool_name": "tool_save_scenario", "tool_call_id": "tc-1"}], + "pending_approval": True, + "pending_action": { + "action_id": chat_action_id, + "action_type": "save_scenario", + }, + "tokens_used": 240, + } + return { + "session_id": session_id, + "message": "Done.", + "tool_calls": [], + "pending_approval": False, + "pending_action": None, + "tokens_used": 80, + } + if path.endswith("/approve"): + if approve_status >= 400: + raise pipeline._StepError( + step, + approve_status, + {"title": "Bad Request", "detail": "No pending action"}, + ) + return approve_body or { + "action_id": chat_action_id, + "approved": True, + "status": "executed", + } + raise AssertionError(f"unexpected request: {method} {path}") + + return _HitlClient(event_sink=intermediate), intermediate + + +async def test_agent_hitl_flow_happy_path(monkeypatch, tmp_path): + """PRP-41 — full HITL round-trip: chat -> intermediate -> approve -> pass.""" + monkeypatch.setattr( + pipeline, + "get_settings", + lambda: _fake_settings(str(tmp_path / "reg"), openai_api_key="sk-test"), + ) + # Pick a provider whose key the fake settings sets. + monkeypatch.setattr( + pipeline, + "_llm_key_present", + lambda: True, + ) + # Short-circuit the 3s display delay so the test stays fast. + monkeypatch.setattr(pipeline, "_APPROVAL_DISPLAY_DELAY_S", 0.0) + + client, intermediate = _make_hitl_client() + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_agent_hitl_flow(ctx, client) + + assert status == "pass" + assert "approved=executed" in detail + assert data["approval_decision"] == "executed" + assert data["action_id"] == "action-abc-123" + assert data["session_id"] == "sess-test-0001" + assert data["tokens_used"] == 240 + # The HITL step buffered exactly one intermediate event for the FE. + assert len(intermediate) == 1 + inter = intermediate[0] + assert inter.status == "running" + assert inter.data["awaiting_approval"] is True + assert inter.data["action_id"] == "action-abc-123" + assert inter.phase_name == pipeline.PHASE_AGENTS + # Ctx threaded for downstream cleanup + KPI consumers. + assert ctx.approval_action_id == "action-abc-123" + assert ctx.agent_approval_decision == "executed" + assert ctx.session_id == "sess-test-0001" + + +async def test_agent_hitl_flow_skips_without_key(monkeypatch, tmp_path): + """PRP-41 — no LLM key -> skip-gracefully; no session created.""" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + monkeypatch.setattr(pipeline, "_llm_key_present", lambda: False) + + client, intermediate = _make_hitl_client() + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_agent_hitl_flow(ctx, client) + + assert status == "skip" + assert "no API key" in detail + assert data == {} + assert intermediate == [] + assert ctx.session_id is None + + +async def test_agent_hitl_flow_skips_on_session_failure(monkeypatch, tmp_path): + """PRP-41 — session-create error -> skip, never raise.""" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + monkeypatch.setattr(pipeline, "_llm_key_present", lambda: True) + + class _NoSessionClient: + def __init__(self, _app: Any = None, *, event_sink: list[Any] | None = None) -> None: + self._event_sink = event_sink + + async def __aenter__(self) -> _NoSessionClient: + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + def yield_event(self, event: Any) -> None: + pass + + async def request(self, step: str, method: str, path: str, **_kw: Any) -> dict[str, Any]: + raise pipeline._StepError(step, 500, {"title": "boom", "detail": "agents down"}) + + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, _ = await pipeline.step_agent_hitl_flow( + ctx, cast(pipeline._Client, _NoSessionClient()) + ) + assert status == "skip" + assert "session-create failed" in detail + + +async def test_agent_hitl_flow_skips_when_agent_did_not_trigger_tool(monkeypatch, tmp_path): + """PRP-41 — agent answered directly (no pending_action) -> skip with detail.""" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + monkeypatch.setattr(pipeline, "_llm_key_present", lambda: True) + monkeypatch.setattr(pipeline, "_APPROVAL_DISPLAY_DELAY_S", 0.0) + + client, intermediate = _make_hitl_client(chat_pending=False) + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_agent_hitl_flow(ctx, client) + assert status == "skip" + assert "did not trigger save_scenario" in detail + assert data["session_id"] == "sess-test-0001" + # No intermediate event because no pending action surfaced. + assert intermediate == [] + + +async def test_agent_hitl_flow_absorbs_double_approve_400(monkeypatch, tmp_path): + """PRP-41 — FE pre-empted Approve -> backend approve returns 400; absorb.""" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + monkeypatch.setattr(pipeline, "_llm_key_present", lambda: True) + monkeypatch.setattr(pipeline, "_APPROVAL_DISPLAY_DELAY_S", 0.0) + + client, intermediate = _make_hitl_client(approve_status=400) + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_agent_hitl_flow(ctx, client) + + # 4xx absorbed: step still passes with optimistic "executed" decision. + assert status == "pass" + assert data["approval_decision"] == "executed" + assert "approved=executed" in detail + # The intermediate event was still buffered before the absorb branch. + assert len(intermediate) == 1 + + +async def test_agent_hitl_flow_skips_on_hard_timeout(monkeypatch, tmp_path): + """PRP-41 — elapsed > _APPROVAL_HARD_TIMEOUT_S -> skip with timed_out.""" + monkeypatch.setattr(pipeline, "get_settings", lambda: _fake_settings(str(tmp_path / "reg"))) + monkeypatch.setattr(pipeline, "_llm_key_present", lambda: True) + monkeypatch.setattr(pipeline, "_APPROVAL_DISPLAY_DELAY_S", 0.0) + # Force the elapsed-time check to fire: set the hard cap below the + # display delay so any positive elapsed exceeds it. + monkeypatch.setattr(pipeline, "_APPROVAL_HARD_TIMEOUT_S", -1.0) + + client, intermediate = _make_hitl_client() + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_agent_hitl_flow(ctx, client) + + assert status == "skip" + assert "approval timed out" in detail + assert data["timed_out"] is True + assert data["approval_decision"] == "timed_out" + assert ctx.agent_approval_decision == "timed_out" + # Intermediate event was emitted; approve POST never fired. + assert len(intermediate) == 1 + assert all(call[1] != f"/agents/sessions/{data['session_id']}/approve" for call in client.calls) + + +async def test_ops_snapshot_happy_path(tmp_path): + """PRP-41 — three /ops/* GETs feed the 5-key KPI payload.""" + + class _OpsClient: + def __init__(self, _app: Any = None, *, event_sink: list[Any] | None = None) -> None: + self._event_sink = event_sink + self.calls: list[str] = [] + + async def __aenter__(self) -> _OpsClient: + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + def yield_event(self, event: Any) -> None: + pass + + async def request(self, step: str, method: str, path: str, **_kw: Any) -> dict[str, Any]: + self.calls.append(path) + if path == "/ops/summary": + return { + "aliases": [ + {"alias_name": "demo-production", "is_stale": True}, + {"alias_name": "challenger", "is_stale": False}, + ], + "runs": { + "counts": [ + {"status": "success", "count": 4}, + {"status": "failed", "count": 1}, + ] + }, + } + if path.startswith("/ops/retraining-candidates"): + return {"candidates": [{"store_id": 1}, {"store_id": 2}], "total_evaluated": 2} + if path.startswith("/ops/model-health"): + return { + "entries": [ + {"drift_direction": "degrading"}, + {"drift_direction": "stable"}, + ], + "total_evaluated": 2, + } + raise AssertionError(f"unexpected: {path}") + + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_ops_snapshot( + ctx, cast(pipeline._Client, _OpsClient()) + ) + + assert status == "pass" + assert data == { + "stale_aliases_count": 1, + "retraining_candidates_count": 2, + "total_runs": 5, + "total_aliases": 2, + "degrading_health_count": 1, + } + assert "stale_aliases=1" in detail + assert "degrading=1" in detail + + +async def test_ops_snapshot_warns_when_all_three_endpoints_fail(tmp_path): + """PRP-41 — every /ops/* returns 5xx -> warn (not fail), zero-filled payload.""" + + class _OpsBrokenClient: + def __init__(self, _app: Any = None, *, event_sink: list[Any] | None = None) -> None: + pass + + async def __aenter__(self) -> _OpsBrokenClient: + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + def yield_event(self, event: Any) -> None: + pass + + async def request(self, step: str, method: str, path: str, **_kw: Any) -> dict[str, Any]: + raise pipeline._StepError(step, 500, {"title": "DB down", "detail": "unreachable"}) + + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, detail, data = await pipeline.step_ops_snapshot( + ctx, cast(pipeline._Client, _OpsBrokenClient()) + ) + + assert status == "warn" + assert "/ops/*" in detail and "unavailable" in detail + assert data == { + "stale_aliases_count": 0, + "retraining_candidates_count": 0, + "total_runs": 0, + "total_aliases": 0, + "degrading_health_count": 0, + } + + +async def test_ops_snapshot_passes_on_empty_db(tmp_path): + """PRP-41 — 200 + empty bodies -> pass with zero-filled payload.""" + + class _OpsEmptyClient: + def __init__(self, _app: Any = None, *, event_sink: list[Any] | None = None) -> None: + pass + + async def __aenter__(self) -> _OpsEmptyClient: + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + def yield_event(self, event: Any) -> None: + pass + + async def request(self, step: str, method: str, path: str, **_kw: Any) -> dict[str, Any]: + if path == "/ops/summary": + return {"aliases": [], "runs": {"counts": []}} + if path.startswith("/ops/retraining-candidates"): + return {"candidates": [], "total_evaluated": 0} + if path.startswith("/ops/model-health"): + return {"entries": [], "total_evaluated": 0} + raise AssertionError(path) + + ctx = pipeline.DemoContext(seed=42, skip_seed=True, reset=False) + status, _, data = await pipeline.step_ops_snapshot( + ctx, cast(pipeline._Client, _OpsEmptyClient()) + ) + assert status == "pass" + assert data == { + "stale_aliases_count": 0, + "retraining_candidates_count": 0, + "total_runs": 0, + "total_aliases": 0, + "degrading_health_count": 0, + } diff --git a/docs/_base/RUNBOOKS.md b/docs/_base/RUNBOOKS.md index 990fbf76..a514c3e3 100644 --- a/docs/_base/RUNBOOKS.md +++ b/docs/_base/RUNBOOKS.md @@ -128,10 +128,15 @@ uv run python scripts/run_demo.py --seed 42 --quiet 2>&1 | tee demo.log 20. **`embedding_provider_probe` step shows ✅ but `reachable=False` (PRP-40, `showcase_rich` only)** — expected when no embedding provider is configured. The probe always emits PASS so the pipeline still greens; downstream `rag_index_subset` and `rag_retrieve_probe` will emit ⏭️ skip with `detail="embedding provider unreachable"`. Fix only if you want the knowledge phase to run: set `OPENAI_API_KEY` (when `RAG_EMBEDDING_PROVIDER=openai`) or start Ollama on `OLLAMA_BASE_URL` (when `RAG_EMBEDDING_PROVIDER=ollama`), then re-run. 21. **`rag_index_subset` step fails with `path_prefix escapes the project root` (PRP-40, `showcase_rich` only)** — the demo step hard-codes `path_prefix="docs/user-guide"`, so a real-world hit means `RAGService._base_dir` no longer points at the repo root (e.g. a misconfigured container start). Fix: confirm the backend was started from the repo root (or that `RAGService(base_dir=...)` was constructed with the right path); rerun the showcase. The path-traversal guard is load-bearing security — never relax it. 22. **`rag_retrieve_probe` step shows ⚠️ with `no hits — corpus indexed but query did not match` (PRP-40, `showcase_rich` only)** — the 5-file corpus was indexed (the prior step PASSed) but the canned query `"How do I run the demo pipeline?"` returned zero hits. Common cause: the embedding-provider was switched mid-showcase and indexed chunks are now orphaned (memory anchor: `[[rag-runtime-config-and-corpus-state]]`); the pgvector column has one fixed dimension per provider. Fix: stick to one provider, or clear the RAG corpus (`DELETE /rag/sources/{id}` per source) and re-run. +23. **`agent_hitl_flow` step shows ⏭️ with `no API key matching agent_default_model provider` (PRP-41, `showcase_rich` only)** — expected when no LLM key is set for the configured `agent_default_model` provider. Pipeline still goes green. Fix only if you want the HITL phase to run: set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` / `GOOGLE_API_KEY` to match the provider prefix in `agent_default_model` (e.g. `anthropic:claude-...` → `ANTHROPIC_API_KEY`). +24. **`agent_hitl_flow` step shows ⏭️ with `approval timed out -- pipeline continued` (PRP-41, `showcase_rich` only)** — the 90 s hard timeout fired before `POST /agents/sessions/{id}/approve` completed. Causes: agent retry / provider 5xx / network hang. Pipeline still greens; `cleanup` still closes the session via `DELETE /agents/sessions/{id}`. Fix: check uvicorn logs for the `session_id` echoed in the step's `data.session_id`. +25. **`agent_hitl_flow` step shows ⏭️ with `agent did not trigger save_scenario` (PRP-41, `showcase_rich` only)** — the agent answered the prompt directly (no `tool_save_scenario` call) so `pending_approval=false` came back on the chat response. Cause: model picked a different tool / answered in chat. Pipeline still greens. Fix: re-run; the model's response is non-deterministic. If the model ALWAYS skips the tool, raise the temperature in `agent_default_model` or re-prompt. +26. **`ops_snapshot` step shows ⚠️ with `/ops/* all 4xx/5xx -- ops snapshot unavailable` (PRP-41, `showcase_rich` only)** — all three of `GET /ops/summary`, `/ops/retraining-candidates`, `/ops/model-health` returned non-2xx. Cause: DB unreachable, alembic migration drift, OpsService change broke the schema. Pipeline still warn (NEVER fail). Fix: `docker compose ps`; `uv run alembic upgrade head`; re-run. +27. **Stop button used mid-run** — the Stop button on `/showcase` closes the WebSocket; the backend's `WebSocketDisconnect` handler at `app/features/demo/routes.py:74` releases `_pipeline_lock`. Page returns to `idle` within ~5 s with banner "Pipeline cancelled by user.". To resume, click Run again. Half-finished registry rows / scenario plans persist (the backend doesn't roll them back — they're operator-visible artefacts of a partial run). > ⚠️ **RAG embedding-dim mismatch can orphan chunks (R4).** PRP-40 indexes a curated 5-file subset; if the operator switches the embedding provider mid-showcase, indexed chunks orphan (pgvector assumes one fixed dimension per column). PRP-40 does NOT ship a `clear_rag` UI toggle — that's a future PRP. Stick to one provider for the showcase run. -**Notes:** the `POST /demo/run` body and `WS /demo/stream` events are documented in `docs/_base/API_CONTRACTS.md`. The pipeline mirrors `scripts/run_demo.py`; the per-step diagnosis for `make demo` above applies to the same steps. PRP-38 added the `scenario` field on `DemoRunRequest` (defaults to `demo_minimal`) and the additive `phase_name` / `phase_index` / `phase_total` fields on every `StepEvent`. PRP-39 added four new steps (`champion_compat_compare`, `stale_alias_trigger`, `safer_promote_flow`, `batch_preset`) and a new `portfolio` phase between `decision` and `verify`. PRP-40 adds the `planning` + `knowledge` phases (5 steps inserted after `portfolio`, before `verify`) and the additive `IndexProjectDocsRequest.path_prefix` field on the RAG slice. +**Notes:** the `POST /demo/run` body and `WS /demo/stream` events are documented in `docs/_base/API_CONTRACTS.md`. The pipeline mirrors `scripts/run_demo.py`; the per-step diagnosis for `make demo` above applies to the same steps. PRP-38 added the `scenario` field on `DemoRunRequest` (defaults to `demo_minimal`) and the additive `phase_name` / `phase_index` / `phase_total` fields on every `StepEvent`. PRP-39 added four new steps (`champion_compat_compare`, `stale_alias_trigger`, `safer_promote_flow`, `batch_preset`) and a new `portfolio` phase between `decision` and `verify`. PRP-40 added the `planning` + `knowledge` phases (5 steps inserted after `portfolio`, before `verify`) and the additive `IndexProjectDocsRequest.path_prefix` field on the RAG slice. PRP-41 — design Z renames the legacy `agent` phase to `agents`, swaps the legacy `step_agent` for `agent_hitl_flow` (HITL approval round-trip), and appends a new `ops` phase carrying `ops_snapshot` immediately before `cleanup`. Total: 24 rows / 10 phases on `showcase_rich`; demo_minimal / sparse keep the 11-row layout under the unified `agents` phase id. The frontend's `DemoPhasePanel.tsx` now carries `onValueChange` (issue #311) and the Showcase page adds a KPI strip + Run-history strip + Stop button + Inspect-Artifacts panel + one-click Approve button on the HITL step card. ### release-please skipped the bump after a dev → main merge **Symptoms:** `dev → main` PR is merged, `CD Release` workflow on `main` completes in ~10s, **no Release PR** is opened. release-please log shows `No user facing commits found since - skipping`. diff --git a/docs/user-guide/showcase-walkthrough.md b/docs/user-guide/showcase-walkthrough.md index 4e00898f..62ea3912 100644 --- a/docs/user-guide/showcase-walkthrough.md +++ b/docs/user-guide/showcase-walkthrough.md @@ -69,9 +69,9 @@ ForecastLab lifecycle in one live run, with every result deep-linkable into the existing dashboard pages. The phases below land incrementally — each PRP is an independent PATCH release. -### Phase: Data — planned (PRP-38) +### Phase: Data -> **Planned (PRP-38):** A new **scenario picker** lets the visitor choose +> A new **scenario picker** lets the visitor choose > `demo_minimal`, `showcase-rich` (5 stores × 15 products × 180 days), or > `sparse` before running. The Data phase calls the existing `/seeder/*` > endpoints plus two new ones — `POST /seeder/phase2-enrichment` and @@ -81,9 +81,9 @@ PRP is an independent PATCH release. > exogenous, returns) get populated too. The Inspect button on the Data card > deep-links to `/explorer/sales`. -### Phase: Modeling — planned (PRP-38) +### Phase: Modeling -> **Planned (PRP-38):** Three V1 baselines train in parallel (today's +> Three V1 baselines train in parallel (today's > behavior, kept). A new `v2_train` step then trains a **V2 `prophet_like`** > run with `feature_frame_version=2`, registers it with the full > `artifacts/models/...` `artifact_uri`, and writes @@ -93,18 +93,18 @@ PRP is an independent PATCH release. > [Advanced Forecasting Guide](./advanced-forecasting-guide.md) light up > after a single pipeline run. -### Phase: Backtesting — planned (PRP-38) +### Phase: Backtesting -> **Planned (PRP-38):** The backtest step posts with `include_baselines=true` +> The backtest step posts with `include_baselines=true` > and `feature_frame_version=2` so PRP-36 per-horizon-bucket metrics > (`h_1_7`, `h_8_14`, `h_15_28`, `h_29_plus`) populate. The step card renders > a per-bucket mini table inline; the Inspect button deep-links to > `/visualize/backtest?store_id=...&product_id=...` for the full > baseline-vs-feature-aware comparison table. -### Phase: Registry decisions — planned (PRP-39) +### Phase: Registry decisions -> **Planned (PRP-39):** Three new steps walk the visitor through a real +> Three new steps walk the visitor through a real > operator decision: `champion_compat_compare` calls > `GET /registry/compare/{v1}/{v2}` and shows the "Not comparable" badge > (V mismatch); `stale_alias_trigger` registers a second V2 run on the same @@ -115,26 +115,26 @@ PRP is an independent PATCH release. > verify, worse-WAPE acknowledgement, V-mismatch acknowledgement). Inspect > buttons deep-link to `/explorer/runs/compare?a=&b=` and `/ops`. -### Phase: Portfolio batch — planned (PRP-39) +### Phase: Portfolio batch -> **Planned (PRP-39):** A `batch_preset` step posts to `/batch/forecasting` +> A `batch_preset` step posts to `/batch/forecasting` > with the `quick_baseline_sweep` preset over a 3 × 2 × 3 matrix and polls > `/batch/{batch_id}` until it completes (90 s cap). The Inspect button > deep-links to `/visualize/batch/{batch_id}` so the just-created sweep > shows up populated in the Batch Runner page. -### Phase: Planning (scenarios) — planned (PRP-40) +### Phase: Planning (scenarios) -> **Planned (PRP-40):** A `scenario_simulate` step calls +> A `scenario_simulate` step calls > `POST /scenarios/simulate` with a 10% price-cut assumption against the > registered champion; `scenario_save` persists it as a named plan; a > `scenario_compare` step ranks two saved plans via `POST /scenarios/compare`. > The Inspect button deep-links to `/visualize/planner`, where the saved > plan and the multi-plan comparison row are visible. -### Phase: Knowledge (RAG) — planned (PRP-40) +### Phase: Knowledge (RAG) -> **Planned (PRP-40):** A `providers_health` step probes +> A `providers_health` step probes > `GET /config/providers/health`; `rag_index_subset` calls > `POST /rag/index/project-docs` against a curated five-file subset of > `docs/user-guide/`; `rag_retrieve_probe` runs a semantic search and @@ -142,50 +142,73 @@ PRP is an independent PATCH release. > [Agents and RAG Guide](./agents-and-rag-guide.md) for the RAG model. The > Inspect button deep-links to `/knowledge`. -### Phase: Agents (HITL) — planned (PRP-41) - -> **Planned (PRP-41):** An `agent_hitl_flow` step opens an experiment-agent -> session and asks it to `save_scenario`. The pipeline pauses on the -> `approval_required` event and surfaces a one-click **Approve** button on -> the step card; on approval the tool completes and the step card resolves -> pass. A 90 s timeout falls back to Skip so a forgotten approval cannot -> wedge the run. The Inspect button deep-links to `/chat` where the -> approved tool call is visible in the transcript. See -> [Agents and RAG Guide](./agents-and-rag-guide.md) for the approval gate. - -### Phase: Ops snapshot — planned (PRP-41) - -> **Planned (PRP-41):** A final `ops_snapshot` step calls -> `GET /ops/summary`, `GET /ops/retraining-candidates`, and -> `GET /ops/model-health/{grain}`, rendering the results as a compact KPI -> grid (stale aliases, retraining queue depth, per-grain health). The -> Inspect button deep-links to `/ops`. - -### Cross-cutting polish — planned (PRP-41) - -> **Planned (PRP-41):** Four chrome-level additions wrap the page: -> -> - **KPI strip** at the top of `/showcase` — live counts of registered runs, -> active aliases, indexed RAG sources, recent ops health. -> - **Inspect-Artifacts panel** rendered after `pipeline_complete` — a grid -> of deep-link cards into every dashboard page that should now have -> populated state (`/visualize/forecast`, `/visualize/backtest`, -> `/visualize/batch`, `/visualize/planner`, `/explorer/runs`, `/ops`, -> `/knowledge`, `/chat`). -> - **Run history strip** showing the last five runs, persisted in the -> browser's `localStorage` (no new tables — the demo slice stays -> stateless), with a one-click replay of parameters. -> - **Stop button** that cancels an in-flight run by releasing the -> server-side pipeline lock. -> - **Scenario picker** wired through (introduced in PRP-38; polished here -> with descriptions and estimated wall-clock per choice). - -## Performance budgets (planned) +### Phase: Agents (HITL) + +An `agent_hitl_flow` step opens an experiment-agent session and asks it +to `save_scenario`. The chat response carries `pending_approval: true` + +`pending_action: PendingAction`; the pipeline emits an intermediate +`step_complete` (status=`running`, `awaiting_approval=true`) so the FE +renders a one-click **Approve** button on the step card. After a 3 s +display delay the backend auto-fires `POST /agents/sessions/{id}/approve` +(absorbing the 400 "No pending action" if the FE pre-empted). A 90 s +hard timeout falls back to Skip so a forgotten approval cannot wedge the +run. The Inspect button deep-links to `/chat` where the approved tool +call is visible in the transcript. See +[Agents and RAG Guide](./agents-and-rag-guide.md) for the approval gate. + + + +### Phase: Ops snapshot + +A final `ops_snapshot` step calls `GET /ops/summary`, +`GET /ops/retraining-candidates?limit=5`, and `GET /ops/model-health?limit=5`, +rendering the results as a compact KPI grid (stale aliases, retraining queue +depth, total runs, total aliases, degrading-health grains). All three +endpoints are 200-safe on an empty DB, so the step always reports `pass` +unless every endpoint fails (then `warn`). The Inspect button deep-links +to `/ops`. + + + +### Cross-cutting polish + +Five chrome-level additions wrap the page: + +- **KPI strip** at the top of `/showcase` — five tiles populated once the + first step reaches a terminal status: runs registered, aliases live, batch + items completed, scenario plans saved, RAG chunks indexed. +- **Inspect-Artifacts panel** rendered after `pipeline_complete` — a grid + of 10 deep-link cards into every dashboard page that should now have + populated state (`/visualize/forecast`, `/visualize/backtest`, + `/visualize/batch`, `/visualize/planner`, `/explorer/runs`, + `/explorer/runs/{v2_run_id}`, `/explorer/runs/compare?a=&b=`, + `/ops`, `/knowledge`, `/chat`). Cards greyed out when the required + step.data ids are missing (e.g. `agent_hitl_flow` skipped → Agent + transcript card disabled). +- **Run history strip** showing the last 5 runs, persisted in the browser's + `localStorage` (key `forecastlab.showcase.runs.v1`, FIFO cap 5; no new + tables — the demo slice stays stateless). Each row carries timestamp, + scenario, status, wall-clock seconds, and a one-click **Replay** button + that re-issues the same start frame. +- **Stop button** that cancels an in-flight run by closing the WebSocket; + the server-side `WebSocketDisconnect` releases `_pipeline_lock` so the + next click on Run starts fresh. +- **Phase accordion #311 fix** — the `` now carries + `onValueChange` so a click after `pipeline_complete` correctly opens any + phase. Prior behavior pinned the open panel to the running/fallback + phase. + + + + + +## Performance budgets | Scenario | Target wall-clock | Notes | | ---------------------------- | ----------------- | -------------------------------------------------- | | `demo_minimal` (default) | ≤ 90 s | Backwards-compatible with today's behavior | -| `showcase-rich` (new — PRP-38)| ≤ 240 s | Full lifecycle coverage across all phases | +| `showcase-rich` | ≤ 240 s | Full lifecycle coverage across all phases | +| HITL approval round-trip | ≤ 90 s | Hard timeout falls back to Skip (never wedges) | | Per-step timeout | 120 s | Unchanged from today | ## Troubleshooting @@ -197,7 +220,7 @@ PRP is an independent PATCH release. from a localhost browser. Fix: edit `frontend/.env`, restart Vite. - **`Pipeline could not start` error banner** — another pipeline is already running. Only one run is allowed at a time across the whole backend. Wait - for it to finish, or (planned PRP-41) use the **Stop** button. + for it to finish, or use the **Stop** button to release the lock. - **A step shows Skip with "no API key matching agent_default_model provider"** — expected without `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` / `GOOGLE_API_KEY` in `.env`. The pipeline still goes green; the agent diff --git a/frontend/src/components/demo/DemoPhasePanel.test.tsx b/frontend/src/components/demo/DemoPhasePanel.test.tsx new file mode 100644 index 00000000..3e2c6199 --- /dev/null +++ b/frontend/src/components/demo/DemoPhasePanel.test.tsx @@ -0,0 +1,102 @@ +/** + * PRP-41 / issue #311 — DemoPhasePanel onValueChange test. + * + * Verifies the controlled Accordion's open panel: + * 1. Initially derives from `runningPhase` (or the first phase). + * 2. Tracks `runningPhase` updates while the pipeline is in flight. + * 3. Honours user clicks AFTER pipeline_complete (runningPhase=null) + * without snapping back to the running/fallback fallback chain. + */ + +import { cleanup, fireEvent, render } from '@testing-library/react' +import { afterEach, describe, expect, it } from 'vitest' +import { useState } from 'react' +import { DemoPhasePanel } from './DemoPhasePanel' +import type { DemoStep } from '@/hooks/use-demo-pipeline' + +afterEach(() => { + cleanup() +}) + +const makeStep = ( + name: string, + status: DemoStep['status'] = 'idle', + overrides: Partial = {}, +): DemoStep => ({ + name, + label: name, + status, + detail: '', + durationMs: 0, + data: {}, + phaseName: 'data', + ...overrides, +}) + +describe('DemoPhasePanel (#311 onValueChange fix)', () => { + // Radix Accordion renders each AccordionItem with a `data-state` attribute. + // To avoid ambiguity in accessible-name matching (header text includes the + // numeric index "01.", "02.", ...), we resolve the open trigger via the + // aria-expanded attribute and read the inner label span text. + const openPhaseLabel = (container: HTMLElement): string | null => { + const trigger = container.querySelector('button[aria-expanded="true"]') + if (!trigger) return null + // The label span is the .font-semibold child; its first sub-span carries + // the index prefix; the trailing text node is the phase label. + const labelSpan = trigger.querySelector('span.font-semibold') + if (!labelSpan) return null + // Extract direct text-node content (skips the inner index span). + let label = '' + labelSpan.childNodes.forEach((node) => { + if (node.nodeType === Node.TEXT_NODE) { + label += node.textContent ?? '' + } + }) + return label.trim() || null + } + + it('opens the running phase initially', () => { + const phases = [ + { id: 'data', label: 'Data', steps: [makeStep('precheck', 'pass')] }, + { id: 'modeling', label: 'Modeling', steps: [makeStep('train', 'running')] }, + { id: 'verify', label: 'Verify', steps: [makeStep('verify')] }, + ] + const { container } = render() + expect(openPhaseLabel(container)).toBe('Modeling') + }) + + it('lets the user expand any phase after pipeline_complete without snapping back', () => { + function Harness() { + const [running] = useState(null) + const phases = [ + { id: 'data', label: 'Data', steps: [makeStep('precheck', 'pass')] }, + { id: 'verify', label: 'Verify', steps: [makeStep('verify', 'pass')] }, + ] + return + } + + const { container } = render() + expect(openPhaseLabel(container)).toBe('Data') + // Click the Verify trigger; without the #311 fix the parent's snap-back + // would reset to the fallback (`data`). + const verifyTrigger = Array.from(container.querySelectorAll('button')).find((b) => + (b.textContent ?? '').includes('Verify'), + ) + expect(verifyTrigger).toBeDefined() + fireEvent.click(verifyTrigger!) + expect(openPhaseLabel(container)).toBe('Verify') + }) + + it('re-syncs the open panel when runningPhase changes', () => { + const phases = [ + { id: 'data', label: 'Data', steps: [makeStep('precheck', 'pass')] }, + { id: 'modeling', label: 'Modeling', steps: [makeStep('train', 'idle')] }, + ] + const { container, rerender } = render( + , + ) + expect(openPhaseLabel(container)).toBe('Data') + rerender() + expect(openPhaseLabel(container)).toBe('Modeling') + }) +}) diff --git a/frontend/src/components/demo/DemoPhasePanel.tsx b/frontend/src/components/demo/DemoPhasePanel.tsx index d96a906d..8e22d96c 100644 --- a/frontend/src/components/demo/DemoPhasePanel.tsx +++ b/frontend/src/components/demo/DemoPhasePanel.tsx @@ -1,3 +1,4 @@ +import { useEffect, useState } from 'react' import { Accordion, AccordionContent, @@ -36,14 +37,25 @@ export function DemoPhasePanel({ runningPhase, getInspectHref, }: DemoPhasePanelProps) { - // Controlled value — defaults to the running phase, falls back to the - // first phase that has at least one running step (resilient when the - // parent doesn't pass runningPhase). + // PRP-41 / issue #311 — controlled accordion needs an onValueChange handler. + // Without it, the parent's recomputed `value` overrides every user click, + // pinning the open panel to the running/fallback phase. Lift to local + // state and let `useEffect` re-sync only when the parent's hint moves. const fallback = phases.find((p) => p.steps.some((s) => s.status === 'running'))?.id - const value = runningPhase ?? fallback ?? phases[0]?.id ?? '' + const computedValue = runningPhase ?? fallback ?? phases[0]?.id ?? '' + const [expandedPhase, setExpandedPhase] = useState(computedValue) + useEffect(() => { + setExpandedPhase(computedValue) + }, [computedValue]) return ( - + {phases.map((phase, phaseIndex) => { const completed = phase.steps.filter((s) => TERMINAL_STATUSES.has(s.status)).length return ( diff --git a/frontend/src/components/demo/InspectArtifactsPanel.test.tsx b/frontend/src/components/demo/InspectArtifactsPanel.test.tsx new file mode 100644 index 00000000..4a692dfb --- /dev/null +++ b/frontend/src/components/demo/InspectArtifactsPanel.test.tsx @@ -0,0 +1,99 @@ +import { cleanup, render } from '@testing-library/react' +import { afterEach, describe, expect, it } from 'vitest' +import { MemoryRouter } from 'react-router-dom' +import { InspectArtifactsPanel } from './InspectArtifactsPanel' +import type { DemoStep, DemoSummary } from '@/hooks/use-demo-pipeline' + +afterEach(() => cleanup()) + +const makeStep = (overrides: Partial): DemoStep => ({ + name: 'unset', + label: 'unset', + status: 'pass', + detail: '', + durationMs: 0, + data: {}, + phaseName: 'data', + ...overrides, +}) + +const baseSummary: DemoSummary = { + overallStatus: 'pass', + winnerModelType: 'prophet_like', + winnerWape: 0.08, + winningRunId: 'r-123', + alias: 'demo-production', + wallClockS: 180, + v2RunId: 'v2-456', +} + +describe('InspectArtifactsPanel', () => { + it('renders all 10 cards', () => { + const { container } = render( + + + , + ) + const headings = container.querySelectorAll('.text-sm.font-semibold') + expect(headings.length).toBe(10) + }) + + it('greys out cards whose source data is missing', () => { + const { container } = render( + + + , + ) + // Steps array is empty -> Forecast/Backtest/Batch/Compare/HITL/RAG all + // missing -> at least 6 cards greyed (opacity-50). + const greyed = container.querySelectorAll('.opacity-50') + expect(greyed.length).toBeGreaterThanOrEqual(5) + }) + + it('Forecast card becomes active when the status step exposes the grain', () => { + const steps = [ + makeStep({ name: 'status', status: 'pass', data: { store_id: 7, product_id: 3 } }), + makeStep({ name: 'train', status: 'pass', data: {} }), + ] + const { container } = render( + + + , + ) + const links = Array.from(container.querySelectorAll('a[href]')).map((a) => + a.getAttribute('href'), + ) + expect(links.some((h) => h?.startsWith('/visualize/forecast?store_id=7&product_id=3'))).toBe(true) + }) + + it('V2 Feature Frame card uses summary.v2RunId when present', () => { + const { container } = render( + + + , + ) + const links = Array.from(container.querySelectorAll('a[href]')).map((a) => + a.getAttribute('href'), + ) + expect(links).toContain('/explorer/runs/v2-456') + }) + + it('Agent transcript card disables when HITL session_id missing', () => { + const steps = [ + makeStep({ name: 'agent_hitl_flow', status: 'skip', data: {} }), + ] + const { container } = render( + + + , + ) + // The agent card should appear greyed (opacity-50) when session_id is missing. + const agentCard = Array.from(container.querySelectorAll('.text-sm.font-semibold')).find( + (h) => h.textContent === 'Agent transcript', + ) + expect(agentCard).toBeDefined() + // Walk up to the wrapper div with class opacity-50. + const wrapper = agentCard?.closest('.opacity-50') + expect(wrapper).not.toBeNull() + }) +}) diff --git a/frontend/src/components/demo/InspectArtifactsPanel.tsx b/frontend/src/components/demo/InspectArtifactsPanel.tsx new file mode 100644 index 00000000..37be2fdf --- /dev/null +++ b/frontend/src/components/demo/InspectArtifactsPanel.tsx @@ -0,0 +1,195 @@ +/** + * PRP-41 — post-run "Inspect Artifacts" panel for the Showcase page. + * + * Renders a grid of 10 deep-link cards covering every surface the demo + * touches. Each card is disabled (with a tooltip-style hint) when the + * required step.data ids are missing. + */ + +import { Link } from 'react-router-dom' +import { ArrowUpRight } from 'lucide-react' +import { Card, CardContent } from '@/components/ui/card' +import { ROUTES } from '@/lib/constants' +import type { DemoStep, DemoSummary } from '@/hooks/use-demo-pipeline' + +interface InspectCard { + label: string + blurb: string + href: string | null + disabledReason?: string +} + +interface InspectArtifactsPanelProps { + steps: DemoStep[] + summary: DemoSummary +} + +function readGrain(steps: DemoStep[]): { store_id: number | null; product_id: number | null } { + const status = steps.find((s) => s.name === 'status') + return { + store_id: typeof status?.data.store_id === 'number' ? status.data.store_id : null, + product_id: typeof status?.data.product_id === 'number' ? status.data.product_id : null, + } +} + +function buildCards(steps: DemoStep[], summary: DemoSummary): InspectCard[] { + const byName = new Map() + for (const s of steps) byName.set(s.name, s) + const { store_id, product_id } = readGrain(steps) + + const train = byName.get('train') + const backtest = byName.get('backtest') + const v2 = byName.get('v2_train') + const batch = byName.get('batch_preset') + const scenario = byName.get('scenario_simulate_and_save') + const compat = byName.get('champion_compat_compare') + const ragIndex = byName.get('rag_index_subset') + const hitl = byName.get('agent_hitl_flow') + + const cards: InspectCard[] = [] + + // 1. Forecast deep link. + cards.push({ + label: 'Forecast (V1+V2 ready)', + blurb: 'Visualize the trained model on the showcase grain.', + href: + store_id !== null && product_id !== null && train?.status === 'pass' + ? `${ROUTES.VISUALIZE.FORECAST}?store_id=${store_id}&product_id=${product_id}` + : null, + disabledReason: 'Train step did not surface a grain.', + }) + // 2. Backtest deep link. + cards.push({ + label: 'Backtest with horizon buckets', + blurb: 'RMSE + per-bucket WAPE for the winning model.', + href: + store_id !== null && product_id !== null && backtest?.status === 'pass' + ? `${ROUTES.VISUALIZE.BACKTEST}?store_id=${store_id}&product_id=${product_id}` + : null, + disabledReason: 'Backtest step did not surface a grain.', + }) + // 3. Portfolio batch. + { + const batchId = typeof batch?.data.batch_id === 'string' ? batch.data.batch_id : null + cards.push({ + label: 'Portfolio sweep', + blurb: 'Run-by-run results for the batch preset.', + href: batchId ? `${ROUTES.VISUALIZE.BATCH}/${batchId}` : null, + disabledReason: 'Batch preset step skipped or failed.', + }) + } + // 4. Saved scenario plans. + { + const sid = typeof scenario?.data.scenario_id === 'string' ? scenario.data.scenario_id : null + cards.push({ + label: 'Saved scenario plans', + blurb: 'The price-cut plan saved during the planning phase.', + href: sid ? `${ROUTES.VISUALIZE.PLANNER}?scenario_id=${sid}` : ROUTES.VISUALIZE.PLANNER, + }) + } + // 5. Multi-run registry. + cards.push({ + label: 'Multi-run registry', + blurb: 'Every run registered across this pipeline run.', + href: ROUTES.EXPLORER.RUNS, + }) + // 6. V2 feature frame. + { + const v2Run = summary.v2RunId ?? (typeof v2?.data.v2_run_id === 'string' ? v2.data.v2_run_id : null) + cards.push({ + label: 'V2 Feature Frame panel', + blurb: 'Inspect feature groups + safety classes for the V2 winner.', + href: v2Run ? `${ROUTES.EXPLORER.RUNS}/${v2Run}` : null, + disabledReason: 'V2 train step skipped or failed.', + }) + } + // 7. Champion-compat compare. + { + const v1 = typeof compat?.data.v1_run_id === 'string' ? compat.data.v1_run_id : null + const v2id = typeof compat?.data.v2_run_id === 'string' ? compat.data.v2_run_id : null + cards.push({ + label: '"Not comparable" diff', + blurb: 'Side-by-side V1 vs V2 with the comparability verdict.', + href: + v1 && v2id ? `${ROUTES.EXPLORER.RUN_COMPARE}?a=${v1}&b=${v2id}` : null, + disabledReason: 'Champion-compat compare step skipped.', + }) + } + // 8. Ops — stale alias + Model Health. + cards.push({ + label: 'Stale-alias + Model Health', + blurb: 'Operator-side view of staleness and drift.', + href: ROUTES.OPS, + }) + // 9. Indexed corpus. + { + const chunks = + ragIndex && typeof ragIndex.data.total_chunks === 'number' ? ragIndex.data.total_chunks : 0 + cards.push({ + label: 'Indexed corpus + search probe', + blurb: 'The 5 user-guide docs indexed by the knowledge phase.', + href: chunks > 0 ? ROUTES.KNOWLEDGE : null, + disabledReason: 'RAG index skipped (embedding provider unreachable).', + }) + } + // 10. Agent transcript. + { + const sid = + typeof hitl?.data.session_id === 'string' && hitl.data.session_id + ? hitl.data.session_id + : null + cards.push({ + label: 'Agent transcript', + blurb: 'The HITL approval round-trip the agent ran.', + href: sid ? ROUTES.CHAT : null, + disabledReason: 'Agent HITL skipped (no LLM key).', + }) + } + + return cards +} + +export function InspectArtifactsPanel({ steps, summary }: InspectArtifactsPanelProps) { + const cards = buildCards(steps, summary) + return ( + + +

Inspect what just happened

+

+ Deep-link into every artifact this run produced. Cards greyed out when + the matching step skipped or failed. +

+
+ {cards.map((card) => { + const isActive = typeof card.href === 'string' && card.href.length > 0 + return ( +
+ {isActive ? ( + +
+ {card.label} + +
+

{card.blurb}

+ + ) : ( +
+
{card.label}
+

{card.blurb}

+
+ )} +
+ ) + })} +
+
+
+ ) +} diff --git a/frontend/src/components/demo/PHASE_DEFS.test.ts b/frontend/src/components/demo/PHASE_DEFS.test.ts index ab486487..74f0665c 100644 --- a/frontend/src/components/demo/PHASE_DEFS.test.ts +++ b/frontend/src/components/demo/PHASE_DEFS.test.ts @@ -4,13 +4,17 @@ * * The backend test `app/features/demo/tests/test_pipeline.py::test_phase_table_*` * pins the same tuple list; if either tier drifts the matching test fails. + * + * PRP-41 — design Z: legacy `agent` row moved under unified `agents` phase id; + * showcase_rich swaps it for `agent_hitl_flow` and appends `ops_snapshot` + * under a new `ops` phase id (24 tuples / 10 phases). */ import { describe, expect, it } from 'vitest' import { PHASE_LABEL, PHASE_ORDER, phaseDefsForScenario } from './PHASE_DEFS' describe('PHASE_DEFS lockstep with backend _phase_table', () => { - it('demo_minimal -> the legacy 11-step (phase, step) sequence', () => { + it('demo_minimal -> the legacy 11-step sequence (agent under unified `agents` phase)', () => { const tuples = phaseDefsForScenario('demo_minimal').map((d) => [d.phase, d.step]) expect(tuples).toEqual([ ['data', 'precheck'], @@ -22,12 +26,13 @@ describe('PHASE_DEFS lockstep with backend _phase_table', () => { ['decision', 'backtest'], ['decision', 'register'], ['verify', 'verify'], - ['agent', 'agent'], + // PRP-41 — legacy `agent` step now under unified `agents` phase id. + ['agents', 'agent'], ['cleanup', 'cleanup'], ]) }) - it('showcase_rich -> the 23-step sequence with PRP-38 V2 + PRP-39 decision/portfolio + PRP-40 planning/knowledge rows', () => { + it('showcase_rich -> 24 steps with PRP-38 V2 + PRP-39 portfolio + PRP-40 planning/knowledge + PRP-41 HITL + ops', () => { const tuples = phaseDefsForScenario('showcase_rich').map((d) => [d.phase, d.step]) expect(tuples).toEqual([ ['data', 'precheck'], @@ -54,7 +59,9 @@ describe('PHASE_DEFS lockstep with backend _phase_table', () => { ['knowledge', 'rag_index_subset'], ['knowledge', 'rag_retrieve_probe'], ['verify', 'verify'], - ['agent', 'agent'], + // PRP-41 — HITL flow + ops snapshot, both under new phase ids. + ['agents', 'agent_hitl_flow'], + ['ops', 'ops_snapshot'], ['cleanup', 'cleanup'], ]) }) @@ -65,7 +72,7 @@ describe('PHASE_DEFS lockstep with backend _phase_table', () => { expect(sparse).toEqual(minimal) }) - it('PHASE_ORDER contains exactly the nine canonical phases (PRP-39 adds portfolio, PRP-40 adds planning + knowledge)', () => { + it('PHASE_ORDER contains exactly the ten canonical phases (PRP-41 swaps agent->agents and adds ops)', () => { expect(PHASE_ORDER).toEqual([ 'data', 'modeling', @@ -74,7 +81,8 @@ describe('PHASE_DEFS lockstep with backend _phase_table', () => { 'planning', 'knowledge', 'verify', - 'agent', + 'agents', + 'ops', 'cleanup', ]) }) diff --git a/frontend/src/components/demo/PHASE_DEFS.ts b/frontend/src/components/demo/PHASE_DEFS.ts index b91c95b3..441ec559 100644 --- a/frontend/src/components/demo/PHASE_DEFS.ts +++ b/frontend/src/components/demo/PHASE_DEFS.ts @@ -20,7 +20,8 @@ export interface PhaseDef { /** * The complete set of step definitions used by either DEMO_MINIMAL (legacy - * 11 steps) or SHOWCASE_RICH (11 + 3 PRP-38 + 4 PRP-39 + 5 PRP-40 = 23 steps). + * 11 steps) or SHOWCASE_RICH (11 base + 3 PRP-38 + 4 PRP-39 + 5 PRP-40 + * + 1 PRP-41 = 24 steps). * * PRP-39 adds four steps (champion_compat_compare, stale_alias_trigger, * safer_promote_flow under the existing decision phase, plus batch_preset @@ -30,6 +31,13 @@ export interface PhaseDef { * "knowledge"), inserted after portfolio and BEFORE verify via relative * anchors. * + * PRP-41 — design Z: BOTH the legacy `agent` step and the new + * `agent_hitl_flow` step live under the unified `agents` phase id. The two + * exclusion sets below (`SHOWCASE_RICH_STEP_NAMES` excludes from + * demo_minimal; `DEMO_MINIMAL_ONLY_STEP_NAMES` excludes from showcase_rich) + * select one or the other per scenario. PRP-41 also adds the new + * `ops_snapshot` step under the new `ops` phase id (showcase_rich only). + * * Order matters: each row's (phase, step) tuple list is what the lockstep * test asserts equals the backend's `_phase_table(scenario)` output for * the matching scenario. @@ -59,10 +67,18 @@ const ALL_STEPS: ReadonlyArray = [ { phase: 'knowledge', step: 'rag_index_subset', label: 'Index user-guide corpus' }, { phase: 'knowledge', step: 'rag_retrieve_probe', label: 'Semantic-retrieve probe' }, { phase: 'verify', step: 'verify', label: 'Verify artifact' }, - { phase: 'agent', step: 'agent', label: 'Agent chat' }, + // PRP-41 — design Z: both agent rows under unified `agents` phase id. + { phase: 'agents', step: 'agent', label: 'Agent chat (legacy)' }, + { phase: 'agents', step: 'agent_hitl_flow', label: 'Agent HITL approval' }, + // PRP-41 — new ops phase (showcase_rich only via SHOWCASE_RICH_STEP_NAMES). + { phase: 'ops', step: 'ops_snapshot', label: 'Ops snapshot' }, { phase: 'cleanup', step: 'cleanup', label: 'Cleanup' }, ] as const +/** + * Steps that should NOT appear on the `demo_minimal` / `sparse` scenarios. + * Excludes the PRP-38/39/40/41 showcase-rich extensions when filtering. + */ const SHOWCASE_RICH_STEP_NAMES = new Set([ // PRP-38 'phase2_enrichment', @@ -79,14 +95,26 @@ const SHOWCASE_RICH_STEP_NAMES = new Set([ 'embedding_provider_probe', 'rag_index_subset', 'rag_retrieve_probe', + // PRP-41 + 'agent_hitl_flow', + 'ops_snapshot', ]) +/** + * PRP-41 — steps that should NOT appear on `showcase_rich`. Today this set + * carries only the legacy `agent` step (replaced by `agent_hitl_flow` on + * showcase_rich). Resolved by Task 1 contract probe § 7 — design Z requires + * a second exclusion set because SHOWCASE_RICH_STEP_NAMES already serves + * the opposite filter direction. + */ +const DEMO_MINIMAL_ONLY_STEP_NAMES = new Set(['agent']) + /** Return the PhaseDef list for one scenario (lockstep with backend). */ export function phaseDefsForScenario(scenario: ScenarioPreset): readonly PhaseDef[] { if (scenario === 'showcase_rich') { - return ALL_STEPS + return ALL_STEPS.filter((d) => !DEMO_MINIMAL_ONLY_STEP_NAMES.has(d.step)) } - // demo_minimal / sparse / others — legacy 11-step flow (no V2 enrichment). + // demo_minimal / sparse / others — legacy 11-step flow. return ALL_STEPS.filter((d) => !SHOWCASE_RICH_STEP_NAMES.has(d.step)) } @@ -101,7 +129,9 @@ export const PHASE_LABEL: Record = { planning: 'Planning', knowledge: 'Knowledge', verify: 'Verify', - agent: 'Agent', + // PRP-41 — unified `agents` phase id replaces `agent`; new `ops` phase. + agents: 'Agents (HITL)', + ops: 'Ops snapshot', cleanup: 'Cleanup', } @@ -116,6 +146,8 @@ export const PHASE_ORDER: readonly string[] = [ 'planning', 'knowledge', 'verify', - 'agent', + // PRP-41 — `agent` renamed to `agents`; `ops` inserted between agents and cleanup. + 'agents', + 'ops', 'cleanup', ] diff --git a/frontend/src/components/demo/RunHistoryStrip.test.tsx b/frontend/src/components/demo/RunHistoryStrip.test.tsx new file mode 100644 index 00000000..ab5d41c0 --- /dev/null +++ b/frontend/src/components/demo/RunHistoryStrip.test.tsx @@ -0,0 +1,99 @@ +import { cleanup, fireEvent, render } from '@testing-library/react' +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' +import { RunHistoryStrip } from './RunHistoryStrip' +import type { DemoSummary } from '@/hooks/use-demo-pipeline' + +const STORAGE_KEY = 'forecastlab.showcase.runs.v1' + +afterEach(() => { + cleanup() + window.localStorage.clear() +}) + +beforeEach(() => { + window.localStorage.clear() +}) + +const summary: DemoSummary = { + overallStatus: 'pass', + winnerModelType: 'prophet_like', + winnerWape: 0.08, + winningRunId: 'run-abc', + alias: 'demo-production', + wallClockS: 174.5, + v2RunId: 'v2-456', +} + +describe('RunHistoryStrip', () => { + it('renders nothing when no history + no summary yet', () => { + const { container } = render( + {}} summary={null} scenario="demo_minimal" />, + ) + expect(container.firstChild).toBeNull() + }) + + it('persists a new history entry on pipeline_complete summary', () => { + const { container } = render( + {}} summary={summary} scenario="showcase_rich" />, + ) + // The entry persists after the first render. + const stored = window.localStorage.getItem(STORAGE_KEY) + expect(stored).not.toBeNull() + const items = JSON.parse(stored!) + expect(items).toHaveLength(1) + expect(items[0].scenario).toBe('showcase_rich') + expect(items[0].status).toBe('pass') + // Rendered list shows the entry. + expect(container.textContent).toContain('showcase_rich') + expect(container.textContent).toContain('PASS') + }) + + it('caps history at 5 entries (FIFO eviction)', () => { + const existing = Array.from({ length: 5 }).map((_, i) => ({ + id: `id-${i}`, + runId: `run-${i}`, + timestamp: new Date(2026, 4, 26, 10, i).toISOString(), + scenario: 'demo_minimal' as const, + status: 'pass' as const, + wallClockS: 60 + i, + })) + window.localStorage.setItem(STORAGE_KEY, JSON.stringify(existing)) + render( + {}} summary={summary} scenario="showcase_rich" />, + ) + const stored = JSON.parse(window.localStorage.getItem(STORAGE_KEY)!) + expect(stored).toHaveLength(5) + // Newest is first. + expect(stored[0].scenario).toBe('showcase_rich') + // Oldest (id-4) was evicted. + expect(stored.find((it: { id: string }) => it.id === 'id-4')).toBeUndefined() + }) + + it('invokes onReplay with the entry scenario when Replay is clicked', () => { + const onReplay = vi.fn() + const { container } = render( + , + ) + const replayBtn = Array.from(container.querySelectorAll('button')).find( + (b) => (b.textContent ?? '').trim() === 'Replay', + ) + expect(replayBtn).toBeDefined() + fireEvent.click(replayBtn!) + expect(onReplay).toHaveBeenCalledWith( + expect.objectContaining({ scenario: 'showcase_rich', skip_seed: true, reset: false }), + ) + }) + + it('Clear button empties history + localStorage', () => { + const { container } = render( + {}} summary={summary} scenario="demo_minimal" />, + ) + expect(window.localStorage.getItem(STORAGE_KEY)).not.toBeNull() + const clearBtn = Array.from(container.querySelectorAll('button')).find( + (b) => (b.textContent ?? '').trim() === 'Clear', + ) + fireEvent.click(clearBtn!) + const stored = window.localStorage.getItem(STORAGE_KEY) + expect(stored).toBe('[]') + }) +}) diff --git a/frontend/src/components/demo/RunHistoryStrip.tsx b/frontend/src/components/demo/RunHistoryStrip.tsx new file mode 100644 index 00000000..5605879c --- /dev/null +++ b/frontend/src/components/demo/RunHistoryStrip.tsx @@ -0,0 +1,139 @@ +/** + * PRP-41 — localStorage-backed FIFO run history (5 entries max). + * + * Storage: + * key : forecastlab.showcase.runs.v1 (versioned per R18; future schema + * changes pick a new key, never collide) + * cap : 5 entries (FIFO eviction) + * shape: RunHistoryItem (id, runId, timestamp, scenario, status, wallClockS) + * + * SSR-guarded: every read/write checks `typeof window === 'undefined'` and + * swallows JSON parse / quota-exceeded errors. + */ + +import { useCallback, useEffect, useState } from 'react' +import { Button } from '@/components/ui/button' +import { Card, CardContent } from '@/components/ui/card' +import type { DemoRunRequest, ScenarioPreset } from '@/types/api' +import type { DemoSummary } from '@/hooks/use-demo-pipeline' + +const STORAGE_KEY = 'forecastlab.showcase.runs.v1' +const HISTORY_CAP = 5 + +export interface RunHistoryItem { + id: string + runId: string | null + timestamp: string // ISO8601 + scenario: ScenarioPreset + status: 'pass' | 'fail' + wallClockS: number +} + +function loadHistory(): RunHistoryItem[] { + if (typeof window === 'undefined') return [] + try { + const raw = window.localStorage.getItem(STORAGE_KEY) + if (!raw) return [] + const parsed = JSON.parse(raw) + return Array.isArray(parsed) ? (parsed as RunHistoryItem[]) : [] + } catch { + return [] + } +} + +function saveHistory(items: RunHistoryItem[]): void { + if (typeof window === 'undefined') return + try { + window.localStorage.setItem(STORAGE_KEY, JSON.stringify(items)) + } catch { + // quota exceeded / private mode -- silently drop. + } +} + +interface RunHistoryStripProps { + /** Called when the operator clicks Replay on a historical entry. */ + onReplay: (req: DemoRunRequest) => void + /** Latest pipeline_complete summary. When non-null, append to history once. */ + summary: DemoSummary | null + /** Current scenario the picker is on. */ + scenario: ScenarioPreset +} + +export function RunHistoryStrip({ onReplay, summary, scenario }: RunHistoryStripProps) { + const [items, setItems] = useState(() => loadHistory()) + const [lastSummary, setLastSummary] = useState(null) + + useEffect(() => { + if (!summary || summary === lastSummary) return + // Persist exactly once per pipeline_complete summary (R18). + const entry: RunHistoryItem = { + id: crypto.randomUUID(), + runId: summary.winningRunId, + timestamp: new Date().toISOString(), + scenario, + status: summary.overallStatus, + wallClockS: summary.wallClockS, + } + const next = [entry, ...items].slice(0, HISTORY_CAP) + setItems(next) + saveHistory(next) + setLastSummary(summary) + }, [summary, lastSummary, items, scenario]) + + const clear = useCallback(() => { + setItems([]) + saveHistory([]) + }, []) + + if (items.length === 0) return null + + return ( + + +
+

Recent runs

+ +
+
    + {items.map((item) => ( +
  • +
    + {new Date(item.timestamp).toLocaleString()} + {item.scenario} + + {item.status.toUpperCase()} + + {item.wallClockS.toFixed(0)}s +
    + +
  • + ))} +
+
+
+ ) +} diff --git a/frontend/src/components/demo/ShowcaseKpiStrip.test.tsx b/frontend/src/components/demo/ShowcaseKpiStrip.test.tsx new file mode 100644 index 00000000..816cc84e --- /dev/null +++ b/frontend/src/components/demo/ShowcaseKpiStrip.test.tsx @@ -0,0 +1,87 @@ +import { cleanup, render } from '@testing-library/react' +import { afterEach, describe, expect, it } from 'vitest' +import { ShowcaseKpiStrip } from './ShowcaseKpiStrip' +import type { DemoStep } from '@/hooks/use-demo-pipeline' + +afterEach(() => cleanup()) + +const makeStep = (overrides: Partial): DemoStep => ({ + name: 'unset', + label: 'unset', + status: 'pass', + detail: '', + durationMs: 0, + data: {}, + phaseName: 'data', + ...overrides, +}) + +describe('ShowcaseKpiStrip', () => { + it('renders nothing until at least one step reaches a terminal status', () => { + const steps = [makeStep({ name: 'precheck', status: 'idle' })] + const { container } = render() + expect(container.firstChild).toBeNull() + }) + + it('counts runs_registered from register / v2_train / stale_alias_trigger / safer_promote_flow', () => { + const steps = [ + makeStep({ name: 'register', status: 'pass', data: { run_id: 'r1' } }), + makeStep({ name: 'v2_train', status: 'pass', data: { run_id: 'r2' } }), + makeStep({ name: 'stale_alias_trigger', status: 'pass', data: { run_id: 'r3' } }), + makeStep({ name: 'safer_promote_flow', status: 'pass', data: { run_id: 'r4' } }), + // Not a counter: no run_id. + makeStep({ name: 'register', status: 'pass', data: {} }), + ] + const { container } = render() + const tile = Array.from(container.querySelectorAll('div.font-mono.text-2xl')).find( + (d) => (d.previousElementSibling?.textContent ?? '') === 'Runs registered', + ) + expect(tile?.textContent).toBe('4') + }) + + it('prefers ops_snapshot.total_aliases for the aliases_live tile', () => { + const steps = [ + makeStep({ + name: 'ops_snapshot', + status: 'pass', + data: { total_aliases: 7 }, + }), + ] + const { container } = render() + const tile = Array.from(container.querySelectorAll('div.font-mono.text-2xl')).find( + (d) => (d.previousElementSibling?.textContent ?? '') === 'Aliases live', + ) + expect(tile?.textContent).toBe('7') + }) + + it('counts plans_saved across scenario_simulate_and_save + multi_plan_compare', () => { + const steps = [ + makeStep({ + name: 'scenario_simulate_and_save', + status: 'pass', + data: { scenario_id: 'scn-1' }, + }), + makeStep({ + name: 'multi_plan_compare', + status: 'pass', + data: { winner_scenario_id: 'scn-2', ranked: [{ name: 'a' }, { name: 'b' }] }, + }), + ] + const { container } = render() + const tile = Array.from(container.querySelectorAll('div.font-mono.text-2xl')).find( + (d) => (d.previousElementSibling?.textContent ?? '') === 'Plans saved', + ) + expect(tile?.textContent).toBe('2') + }) + + it('renders em-dash for missing data', () => { + const steps = [makeStep({ name: 'register', status: 'pass', data: {} })] + const { container } = render() + // Batch items + RAG chunks have no source; rendered as em-dash. + const tiles = Array.from(container.querySelectorAll('div.font-mono.text-2xl')) + const batch = tiles.find( + (d) => (d.previousElementSibling?.textContent ?? '') === 'Batch items', + ) + expect(batch?.textContent).toBe('—') + }) +}) diff --git a/frontend/src/components/demo/ShowcaseKpiStrip.tsx b/frontend/src/components/demo/ShowcaseKpiStrip.tsx new file mode 100644 index 00000000..2edd4a67 --- /dev/null +++ b/frontend/src/components/demo/ShowcaseKpiStrip.tsx @@ -0,0 +1,112 @@ +/** + * PRP-41 — top-of-page KPI strip for the Showcase page. + * + * Renders 5 tiles derived from the cumulative step.data emitted across the + * pipeline. Hidden until at least one step has reached a terminal status + * (anything other than `idle`). + * + * Sources: + * runs_registered -- count steps in {register, stale_alias_trigger, + * safer_promote_flow, v2_train} where step.data.run_id is set + * aliases_live -- ops_snapshot.step.data.total_aliases (preferred); + * fallback to counting steps with .data.alias + * batch_items_completed -- batch_preset.step.data.completed_items + * scenario_plans_saved -- scenario_simulate_and_save.step.data.scenario_id + + * multi_plan_compare.step.data.winner_scenario_id + * rag_chunks_indexed -- rag_index_subset.step.data.total_chunks + */ + +import type { DemoStep } from '@/hooks/use-demo-pipeline' +import { Card, CardContent } from '@/components/ui/card' + +const TERMINAL_STATUSES = new Set(['pass', 'fail', 'skip', 'warn']) +const REGISTER_STEP_NAMES = new Set([ + 'register', + 'stale_alias_trigger', + 'safer_promote_flow', + 'v2_train', +]) + +interface KpiStripProps { + steps: DemoStep[] +} + +interface Tile { + label: string + value: number | string | null +} + +function tilesFromSteps(steps: DemoStep[]): Tile[] { + const byName = new Map() + for (const s of steps) byName.set(s.name, s) + + const runsRegistered = steps.filter( + (s) => REGISTER_STEP_NAMES.has(s.name) && typeof s.data.run_id === 'string', + ).length + + // Prefer ops_snapshot total_aliases; fallback to per-step alias count. + const ops = byName.get('ops_snapshot') + const aliasesFromOps = + ops && typeof ops.data.total_aliases === 'number' ? ops.data.total_aliases : null + const aliasesFallback = steps.filter( + (s) => typeof s.data.alias === 'string' && s.data.alias.length > 0, + ).length + const aliasesLive = aliasesFromOps ?? aliasesFallback + + const batch = byName.get('batch_preset') + const batchCompleted = + batch && typeof batch.data.completed_items === 'number' + ? batch.data.completed_items + : null + + // scenario plans saved = (scenario_simulate_and_save with scenario_id) + + // (multi_plan_compare with winner_scenario_id AND ranked.length>=2). + const ssave = byName.get('scenario_simulate_and_save') + const mcompare = byName.get('multi_plan_compare') + let plansSaved = 0 + if (ssave && typeof ssave.data.scenario_id === 'string' && ssave.data.scenario_id) plansSaved += 1 + const ranked = mcompare?.data.ranked + if ( + mcompare && + typeof mcompare.data.winner_scenario_id === 'string' && + Array.isArray(ranked) && + ranked.length >= 2 + ) { + plansSaved += 1 + } + + const ragIndex = byName.get('rag_index_subset') + const chunks = + ragIndex && typeof ragIndex.data.total_chunks === 'number' + ? ragIndex.data.total_chunks + : null + + return [ + { label: 'Runs registered', value: runsRegistered }, + { label: 'Aliases live', value: aliasesLive }, + { label: 'Batch items', value: batchCompleted }, + { label: 'Plans saved', value: plansSaved }, + { label: 'RAG chunks', value: chunks }, + ] +} + +export function ShowcaseKpiStrip({ steps }: KpiStripProps) { + const hasAnyTerminal = steps.some((s) => TERMINAL_STATUSES.has(s.status)) + if (!hasAnyTerminal) return null + + const tiles = tilesFromSteps(steps) + return ( +
+ {tiles.map((tile) => ( + + +
{tile.label}
+
+ {tile.value === null || tile.value === undefined ? '—' : String(tile.value)} +
+
+
+ ))} +
+ ) +} diff --git a/frontend/src/components/demo/demo-step-card.test.tsx b/frontend/src/components/demo/demo-step-card.test.tsx index 5776a730..eac4112d 100644 --- a/frontend/src/components/demo/demo-step-card.test.tsx +++ b/frontend/src/components/demo/demo-step-card.test.tsx @@ -123,4 +123,83 @@ describe('DemoStepCard PRP-39 mini-summaries', () => { const links = screen.queryAllByRole('link', { name: /Inspect/i }) expect(links.length).toBe(0) }) + + // ============================================================ + // PRP-41 — HitlFlowSummary, ApproveButton, OpsSnapshotMiniGrid + // ============================================================ + + it('agent_hitl_flow — terminal pass renders HitlFlowSummary with the approval decision', () => { + const step = makeStep('agent_hitl_flow', 'pass', { + session_id: 'sess-abcdef0123456', + tokens_used: 240, + tool_calls_count: 1, + action_id: 'act-x', + approval_decision: 'executed', + }) + const { container } = renderCard(step, null) + const text = container.textContent ?? '' + expect(text).toContain('session=sess-abc') + expect(text).toContain('tokens=240') + expect(text).toContain('tool_calls=1') + expect(text).toContain('approval=executed') + }) + + it('agent_hitl_flow — running + awaiting_approval=true surfaces the Approve button', () => { + const step = makeStep('agent_hitl_flow', 'running', { + session_id: 'sess-x', + awaiting_approval: true, + action_id: 'act-y', + approval_url: '/agents/sessions/sess-x/approve', + }) + const { container } = renderCard(step, null) + const buttons = Array.from(container.querySelectorAll('button')).map((b) => b.textContent) + expect(buttons).toContain('Approve') + }) + + it('agent_hitl_flow — terminal status hides the Approve button', () => { + const step = makeStep('agent_hitl_flow', 'pass', { + session_id: 'sess-x', + awaiting_approval: true, // stale flag from intermediate event + action_id: 'act-y', + approval_url: '/agents/sessions/sess-x/approve', + }) + const { container } = renderCard(step, null) + const buttons = Array.from(container.querySelectorAll('button')).map((b) => b.textContent) + expect(buttons).not.toContain('Approve') + }) + + it('ops_snapshot — renders the 5-tile mini grid with values', () => { + const step = makeStep('ops_snapshot', 'pass', { + stale_aliases_count: 1, + retraining_candidates_count: 2, + total_runs: 5, + total_aliases: 2, + degrading_health_count: 1, + }) + const { container } = renderCard(step, null) + // 5 tiles in a grid-cols-5; each tile has a label + value. + const tileLabels = Array.from( + container.querySelectorAll('.grid-cols-5 .text-muted-foreground'), + ).map((d) => d.textContent) + expect(tileLabels).toEqual([ + 'stale_aliases', + 'retraining', + 'runs', + 'aliases', + 'degrading', + ]) + }) + + it('ops_snapshot — renders em-dash for missing keys', () => { + const step = makeStep('ops_snapshot', 'pass', { + stale_aliases_count: 3, + // others missing + }) + const { container } = renderCard(step, null) + const values = Array.from( + container.querySelectorAll('.grid-cols-5 .font-mono.font-semibold'), + ).map((d) => d.textContent) + expect(values[0]).toBe('3') + expect(values.slice(1)).toEqual(['—', '—', '—', '—']) + }) }) diff --git a/frontend/src/components/demo/demo-step-card.tsx b/frontend/src/components/demo/demo-step-card.tsx index eddf2f09..b2b1379b 100644 --- a/frontend/src/components/demo/demo-step-card.tsx +++ b/frontend/src/components/demo/demo-step-card.tsx @@ -1,4 +1,5 @@ import { ArrowUpRight } from 'lucide-react' +import { useEffect, useState } from 'react' import { Link } from 'react-router-dom' import type { DemoStep, DemoStepUiStatus } from '@/hooks/use-demo-pipeline' import { Button } from '@/components/ui/button' @@ -305,6 +306,120 @@ function RetrieveSummary({ data }: { data: Record }) { ) } +/** PRP-41 — HITL flow mini-summary (session/tokens/tool_calls/approval chips). */ +function HitlFlowSummary({ data }: { data: Record }) { + const sessionId = typeof data.session_id === 'string' ? data.session_id : '' + const tokens = + typeof data.tokens_used === 'number' ? data.tokens_used : Number(data.tokens_used ?? 0) + const toolCalls = + typeof data.tool_calls_count === 'number' + ? data.tool_calls_count + : Number(data.tool_calls_count ?? 0) + const decision = + typeof data.approval_decision === 'string' ? data.approval_decision : '' + return ( +
+ {sessionId && ( + + session={sessionId.slice(0, 8)}... + + )} + tokens={tokens} + + tool_calls={toolCalls} + + {decision && ( + + approval={decision} + + )} +
+ ) +} + +/** PRP-41 — ops snapshot 5-tile KPI grid. */ +function OpsSnapshotMiniGrid({ data }: { data: Record }) { + const tiles: ReadonlyArray = [ + ['stale_aliases', data.stale_aliases_count], + ['retraining', data.retraining_candidates_count], + ['runs', data.total_runs], + ['aliases', data.total_aliases], + ['degrading', data.degrading_health_count], + ] + return ( +
+ {tiles.map(([label, value]) => ( +
+
{label}
+
+ {value !== undefined && value !== null ? String(value) : '—'} +
+
+ ))} +
+ ) +} + +/** + * PRP-41 — one-click Approve button rendered on the HITL step card when + * the backend has emitted `awaiting_approval=true` + `status='running'`. + * + * Clicking POSTs `{action_id, approved: true}` to the captured approval_url. + * Optimistic disable on click; the backend's auto-approve absorbs a 400 + * "No pending action" if the auto-approve fires first (Task 1 contract probe). + */ +function ApproveButton({ + approvalUrl, + actionId, +}: { + approvalUrl: string + actionId: string +}) { + const [clicked, setClicked] = useState(false) + const [error, setError] = useState(null) + const [waitingMs, setWaitingMs] = useState(0) + + useEffect(() => { + if (clicked) return + const startedAt = Date.now() + const id = setInterval(() => setWaitingMs(Date.now() - startedAt), 1000) + return () => clearInterval(id) + }, [clicked]) + + const onClick = async () => { + if (clicked || !approvalUrl || !actionId) return + setClicked(true) + try { + const res = await fetch(approvalUrl, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ action_id: actionId, approved: true }), + }) + // Absorb 4xx absorptions silently — the auto-approve already landed + // and the next StepEvent will surface the terminal status. + if (!res.ok && res.status >= 500) { + setError(`approve failed (${res.status})`) + } + } catch (err) { + setError(err instanceof Error ? err.message : 'approve failed') + } + } + + return ( +
+ + {!clicked && waitingMs > 30_000 && ( + + Still waiting — auto-approve in {Math.max(0, Math.ceil((90_000 - waitingMs) / 1000))}s + + )} + {error && {error}} +
+ ) +} + interface DemoStepCardProps { step: DemoStep index: number @@ -375,6 +490,19 @@ export function DemoStepCard({ step, index, inspectHref }: DemoStepCardProps) { )} {step.name === 'rag_index_subset' && } {step.name === 'rag_retrieve_probe' && } + {/* PRP-41 — agents (HITL) + ops snapshot mini-summaries. */} + {step.name === 'agent_hitl_flow' && } + {step.name === 'ops_snapshot' && } + {/* PRP-41 — one-click Approve only while awaiting (status==running). */} + {step.data.awaiting_approval === true && + step.status === 'running' && + typeof step.data.approval_url === 'string' && + typeof step.data.action_id === 'string' && ( + + )} {showInspect && (
+ {/* PRP-41 — KPI strip at the top, hidden until at least one step completes. */} + + + {/* PRP-41 — Replayable run history (localStorage FIFO 5). */} + start(req)} + summary={phase === 'done' ? summary : null} + scenario={scenario} + /> + {/* Controls */} @@ -175,6 +194,14 @@ export default function ShowcasePage() { {isRunning ? 'Running…' : 'Run pipeline'} + {/* PRP-41 — Stop button: cancel an in-flight pipeline. */} + {isRunning && ( + + )} +