From 59148cc10a510349e8cd73d94eaacc664ea10b3d Mon Sep 17 00:00:00 2001
From: timcsy <messenger@tew.tw>
Date: Fri, 12 Jun 2026 14:53:01 +0800
Subject: [PATCH 1/4] feat(realtime): spec/plan/tasks for /v1/realtime +
 foundation (deps, minute unit, model_kind)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 32 realtime transcription endpoint — full speckit spec package (spec, plan,
research, data-model, WS event contract, quickstart, tasks) plus the self-contained
foundational layer:

- websockets declared as a direct dependency (was transitive via uvicorn) — needed
  to relay /v1/realtime to the provider's realtime WS (Constitution Deviation noted).
- model_kind: add `realtime` kind (mode→kind) so the catalog labels realtime models
  honestly; full suite re-run green (715 passed) per the model_kind lesson.
- minute billing unit verified through the existing unit-billing path (calculate_unit_cost
  is unit-agnostic; `minute` is a new string value, no schema change) + test.

Foundational logic (T001/T003/T006) done & green. The WS core — upstream WS client,
mock provider WS server, bidirectional relay (US1), per-minute metering (US2), in-flight
revocation (US3) — is the next focused block; T027 real Azure WS smoke needs credentials.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                                     |   6 +-
 pyproject.toml                                |   5 +
 .../checklists/requirements.md                |  37 ++++
 .../contracts/realtime-transcription.md       |  65 +++++++
 .../043-realtime-transcription/data-model.md  |  54 ++++++
 specs/043-realtime-transcription/plan.md      |  86 +++++++++
 .../043-realtime-transcription/quickstart.md  |  57 ++++++
 specs/043-realtime-transcription/research.md  |  68 +++++++
 specs/043-realtime-transcription/spec.md      | 109 +++++++++++
 specs/043-realtime-transcription/tasks.md     | 182 ++++++++++++++++++
 src/ai_api/services/model_kind.py             |   5 +-
 tests/contract/test_pricing_units.py          |  22 +++
 12 files changed, 693 insertions(+), 3 deletions(-)
 create mode 100644 specs/043-realtime-transcription/checklists/requirements.md
 create mode 100644 specs/043-realtime-transcription/contracts/realtime-transcription.md
 create mode 100644 specs/043-realtime-transcription/data-model.md
 create mode 100644 specs/043-realtime-transcription/plan.md
 create mode 100644 specs/043-realtime-transcription/quickstart.md
 create mode 100644 specs/043-realtime-transcription/research.md
 create mode 100644 specs/043-realtime-transcription/spec.md
 create mode 100644 specs/043-realtime-transcription/tasks.md

diff --git a/CLAUDE.md b/CLAUDE.md
index 8f6a8cf..1393877 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,6 +1,6 @@
 # ai-api Development Guidelines
 
-Auto-generated from all feature plans. Last updated: 2026-06-11
+Auto-generated from all feature plans. Last updated: 2026-06-12
 
 ## Active Technologies
 - Python 3.11+（同 Phase 1） (002-auth-membership)
@@ -66,6 +66,8 @@ Auto-generated from all feature plans. Last updated: 2026-06-11
 - PostgreSQL（生產）/ SQLite（dev、CI）；**不新增表、不新增 migration**——沿用增量②（0019）的 `call_records.quantity/unit` 與 `price_list.price_unit/price_per_unit_usd`，新單位（query / character）為字串值 (041-multi-endpoint-complete)
 - Python 3.11+（後端）/ TypeScript strict + React 19 + Vite 6（前端少量範例） + FastAPI（含 `UploadFile` multipart，既有）、SQLAlchemy 2.x async、Pydantic v2、`litellm`（`amoderation`/`asearch`/`aimage_edit` 既有函式）；TanStack Query、shadcn/ui（前端）——**皆既有，不新增套件** (042-endpoint-registry)
 - PostgreSQL（生產）/ SQLite（dev、CI）；**不新增表/欄/migration**——沿用 0019 的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`，新單位 `image`/`query` 為字串值 (042-endpoint-registry)
+- Python 3.11+（後端為主）/ TypeScript strict + React 19（前端僅目錄顯示 realtime 類型 + 連線範例，極少量） + FastAPI（WebSocket — starlette 內建，**專案首次使用**）、SQLAlchemy 2.x async、Pydantic v2（皆既有）；**`websockets`（直連 Azure realtime WS 的 async client，提為直接依賴——已隨 uvicorn/litellm 在 image，現宣告為直接依賴）**；既有 `proxy/preflight.py`、計費（`services/pricing.py` 的 `calculate_unit_cost`）、audit。**realtime 不經 litellm**（其 realtime 是 Proxy form / client 直連，違原則；借其 `RealTimeStreaming` 結構自寫薄 relay）。 (043-realtime-transcription)
+- PostgreSQL（生產）/ SQLite（dev、CI）；**不新增表、不新增 migration**——沿用增量②（0019）的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`，新單位 `minute` 為字串值。 (043-realtime-transcription)
 
 - Python 3.11+ + LiteLLM（proxy core）、FastAPI（admin API）、 (001-gateway-core)
 
@@ -86,9 +88,9 @@ cd src [ONLY COMMANDS FOR ACTIVE TECHNOLOGIES][ONLY COMMANDS FOR ACTIVE TECHNOLO
 Python 3.11+: Follow standard conventions
 
 ## Recent Changes
+- 043-realtime-transcription: Added Python 3.11+（後端為主）/ TypeScript strict + React 19（前端僅目錄顯示 realtime 類型 + 連線範例，極少量） + FastAPI（WebSocket — starlette 內建，**專案首次使用**）、SQLAlchemy 2.x async、Pydantic v2（皆既有）；**`websockets`（直連 Azure realtime WS 的 async client，提為直接依賴——已隨 uvicorn/litellm 在 image，現宣告為直接依賴）**；既有 `proxy/preflight.py`、計費（`services/pricing.py` 的 `calculate_unit_cost`）、audit。**realtime 不經 litellm**（其 realtime 是 Proxy form / client 直連，違原則；借其 `RealTimeStreaming` 結構自寫薄 relay）。
 - 042-endpoint-registry: Added Python 3.11+（後端）/ TypeScript strict + React 19 + Vite 6（前端少量範例） + FastAPI（含 `UploadFile` multipart，既有）、SQLAlchemy 2.x async、Pydantic v2、`litellm`（`amoderation`/`asearch`/`aimage_edit` 既有函式）；TanStack Query、shadcn/ui（前端）——**皆既有，不新增套件**
 - 041-multi-endpoint-complete: Added Python 3.11+（後端）/ TypeScript strict + React 19 + Vite 6（前端） + FastAPI（含 `UploadFile` multipart）、SQLAlchemy 2.x async、Pydantic v2、`litellm`（`aimage_generation`/`arerank`/`aspeech`/`atranscription` library form）；TanStack Query、shadcn/ui（前端）——**皆既有，不新增套件**
-- 040-ocr-billing-units: Added Python 3.11+（後端）/ TypeScript strict + React 19 + Vite 6（前端） + FastAPI、SQLAlchemy 2.x async、Alembic、Pydantic v2、`litellm`（library：`aocr` 既有函式）；TanStack Query、shadcn/ui（前端）——**皆既有，不新增套件**
 
 
 <!-- MANUAL ADDITIONS START -->
diff --git a/pyproject.toml b/pyproject.toml
index a84011d..bfcae39 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,11 @@ dependencies = [
     # multipart form parsing — required by FastAPI for /v1/audio/transcriptions
     # (STT) audio file upload. FastAPI's official optional dependency.
     "python-multipart>=0.0.18",
+    # async WebSocket client — required to relay /v1/realtime (live transcription)
+    # directly to the upstream provider's realtime WS. Already present transitively
+    # via uvicorn[standard]; declared directly so it can't vanish on an upstream
+    # change (Constitution Deviation: justified — direct provider WS needs a client).
+    "websockets>=13.0",
 ]
 
 [project.optional-dependencies]
diff --git a/specs/043-realtime-transcription/checklists/requirements.md b/specs/043-realtime-transcription/checklists/requirements.md
new file mode 100644
index 0000000..2366d64
--- /dev/null
+++ b/specs/043-realtime-transcription/checklists/requirements.md
@@ -0,0 +1,37 @@
+# Specification Quality Checklist: realtime 即時字幕端點
+
+**Purpose**: Validate specification completeness and quality before proceeding to planning
+**Created**: 2026-06-12
+**Feature**: [spec.md](../spec.md)
+
+## Content Quality
+
+- [x] No implementation details (languages, frameworks, APIs)
+- [x] Focused on user value and business needs
+- [x] Written for non-technical stakeholders
+- [x] All mandatory sections completed
+
+## Requirement Completeness
+
+- [x] No [NEEDS CLARIFICATION] markers remain
+- [x] Requirements are testable and unambiguous
+- [x] Success criteria are measurable
+- [x] Success criteria are technology-agnostic (no implementation details)
+- [x] All acceptance scenarios are defined
+- [x] Edge cases are identified
+- [x] Scope is clearly bounded
+- [x] Dependencies and assumptions identified
+
+## Feature Readiness
+
+- [x] All functional requirements have clear acceptance criteria
+- [x] User scenarios cover primary flows
+- [x] Feature meets measurable outcomes defined in Success Criteria
+- [x] No implementation details leak into specification
+
+## Notes
+
+- 0 個 [NEEDS CLARIFICATION]：所有未定細節都有「對齊既有專案慣例」的合理預設，記入 Assumptions（計量單位以供應商回報為先/否則估串流時長、撤回 SLO 對齊既有、配額建立時檢查、額度綁分配不限連線數）。
+- 三個技術未知（直連供應商 realtime 連線協定、連線結束的計量來源、持續連線的轉送與連線中撤回機制）刻意**不**放進 spec——它們是規劃階段（research/plan）要先釘死的能力邊界，不是需求層的模糊。
+- SC-004「約定上限時間」未填具體秒數為刻意：撤回 SLO 的具體值對齊既有分配撤回機制、由規劃階段定，spec 層不硬編。
+- Input 行保留 user 原述（含 WebSocket / gpt-realtime-whisper / litellm Proxy 等字眼）為 speckit 慣例（記錄原始描述）；正文以業務語言（持續連線/串流/相容端點）表述，不洩漏實作。
diff --git a/specs/043-realtime-transcription/contracts/realtime-transcription.md b/specs/043-realtime-transcription/contracts/realtime-transcription.md
new file mode 100644
index 0000000..456468d
--- /dev/null
+++ b/specs/043-realtime-transcription/contracts/realtime-transcription.md
@@ -0,0 +1,65 @@
+# Contract: realtime 即時字幕 WebSocket 端點
+
+**端點**：`GET /v1/realtime`（WebSocket upgrade）— OpenAI 相容 realtime transcription
+**認證**：`Authorization: Bearer <應用金鑰>`（連線 header，沿用既有金鑰）或 OpenAI realtime 慣例的 subprotocol header（tasks 階段對齊 OpenAI 客戶端慣例）
+**形態**：雙向 WebSocket。客戶端上行音訊、平台下行文字事件。
+
+## 連線生命週期
+
+```
+client → (WS upgrade + Bearer key)
+         platform: run_preflight(key → allocation → access → quota → model)
+   ├─ 不通過 → close(code, reason)  ；不開始串流（FR-002/005/007）
+   └─ 通過   → accept；開一條 platform↔Azure WS；進入雙向轉送
+client → session.update {type:"transcription", model, audio.format}
+client → input_audio_buffer.append {audio: <base64 PCM>}   （重複，串流）
+platform→ conversation.item.input_audio_transcription.delta {delta}      （即時，SC-001 <1s）
+platform→ conversation.item.input_audio_transcription.completed {transcript}
+...
+（任一端關閉 / 撤回 re-check 觸發）→ platform: 落帳 CallRecord(unit=minute) → close
+```
+
+## Client → Server 事件（平台接受並轉送上游）
+
+| 事件 | 必要欄位 | 平台行為 |
+|---|---|---|
+| `session.update` | `type:"transcription"`, `model`, `audio.input.format{type,rate}` | 校驗 model 為 realtime 類型（否則 close，FR-007）；記下 sample_rate/format 供計量；轉送上游 |
+| `input_audio_buffer.append` | `audio`（base64 PCM）| **以 base64 解碼後的 PCM 位元組數累計 audio_bytes（計量來源，R2）**；轉送上游 |
+| `input_audio_buffer.commit` | — | 轉送上游（manual turn detection）|
+
+## Server → Client 事件（平台從上游轉回）
+
+| 事件 | 內容 | 備註 |
+|---|---|---|
+| `conversation.item.input_audio_transcription.delta` | `delta`（增量文字）| 即時字幕主要輸出；SC-001 首段 <1s |
+| `conversation.item.input_audio_transcription.completed` | `transcript`（完整）| 一段話完成；平台在此路徑可記觀測 |
+| `error` | `error{code,message}` | 上游錯誤透明轉回；不洩漏上游金鑰（FR-006）|
+
+## 連線關閉碼（平台主動關閉時）
+
+| 情境 | 關閉碼/原因 | 對應 |
+|---|---|---|
+| 金鑰無效/撤回、無有效分配、配額已滿 | policy violation + 可理解 reason | FR-002, SC-005 |
+| 模型非 realtime 類型 | unsupported + reason | FR-007 |
+| 連線中分配被撤回/暫停/隔離 | revoked + reason | FR-005, SC-004 |
+| 上游斷線/失敗 | upstream_error + 透明原因 | FR-009 |
+
+## 計量契約
+
+- 計量單位：`minute`；數量 = `ceil(Σ append PCM bytes / (rate × bytes_per_sample × channels) / 60)`（精確 rounding tasks 定）。
+- 落帳時機：**連線關閉（任何原因，含異常）**——`audio_bytes` 即時累計確保不漏記（FR-004/SC-003）。
+- 歸戶：preflight 解出的 allocation；費用 = `calculate_unit_cost`（既有）。
+
+## 不洩漏契約（FR-006）
+
+任何下行事件、錯誤、關閉原因 MUST NOT 含上游 endpoint / key / 內部部署名；上游錯誤轉譯為對使用者可理解的訊息。
+
+## 契約測試（合併前必過）
+
+1. 無效/撤回金鑰連線 → 被 close、未開始串流。
+2. 非 realtime 模型 → close(unsupported)。
+3. 有效連線 + 送 append → 收到 delta（mock provider WS 回預錄 delta）。
+4. 連線關閉 → 寫一筆 `CallRecord(unit="minute")`、quantity 對得上送出的音訊時長。
+5. 連線中 mock 撤回分配 → 平台在 N 秒內主動 close(revoked) + 已累計時長落帳。
+6. 異常中止（client 直接斷）→ 仍落帳已累計時長（不漏記）。
+7. 任何錯誤/關閉訊息不含上游 key/endpoint。
diff --git a/specs/043-realtime-transcription/data-model.md b/specs/043-realtime-transcription/data-model.md
new file mode 100644
index 0000000..af306c6
--- /dev/null
+++ b/specs/043-realtime-transcription/data-model.md
@@ -0,0 +1,54 @@
+# Phase 1 Data Model: realtime 即時字幕端點
+
+**核心結論：不新增表、不新增 migration。** realtime 連線本身是 in-memory 的生命週期物件（不落表）；用量沿用既有 `call_records`（增量② 0019 的 `quantity`/`unit`）+ `price_list`（`price_unit`/`price_per_unit_usd`），新單位 `minute` 為字串值。
+
+## 1. RealtimeSession（in-memory，非持久化）
+
+一次 WS 連線的執行期狀態，**不寫表**——只活在連線存活期間，斷線時把累計結果落成一筆 `CallRecord`。
+
+| 欄位 | 型別 | 說明 |
+|---|---|---|
+| `allocation_id` | str | preflight 解出的歸戶分配（計量落帳對象）|
+| `credential_id` | str | 建立連線的應用金鑰（審計用）|
+| `member_id` | str | 擁有者（審計用）|
+| `resource_model` | str | 請求的 realtime 模型 slug |
+| `upstream_model` | str | 對映到上游的模型字串 |
+| `started_at` | datetime（tz-aware）| 連線建立時間 |
+| `audio_bytes` | int | 累計收到的 PCM 音訊 bytes（計量來源，R2）|
+| `sample_rate` / `bytes_per_sample` / `channels` | int | 由 `session.update` 的 format 決定，換算時長用 |
+| `close_reason` | enum | `normal` / `client_abort` / `upstream_error` / `revoked` |
+
+**衍生**：`duration_seconds = audio_bytes / (sample_rate × bytes_per_sample × channels)`；`quantity_minutes = ceil(duration_seconds / 60)` 或精確分鐘（tasks 階段定 rounding，對齊計費慣例）。
+
+**狀態轉移**：`connecting`（preflight 中）→ `streaming`（轉送中、累計 audio_bytes、週期 re-check）→ `closing`（任一端關閉或撤回觸發）→ 落帳 `CallRecord` → `closed`。
+
+## 2. CallRecord（既有，沿用）
+
+斷線時寫**一筆**，與其他非 token 端點同機制：
+
+| 欄位 | 值 |
+|---|---|
+| `allocation_id` | RealtimeSession.allocation_id（歸戶；異常中止仍寫）|
+| `quantity` | 累計分鐘數（R2 自算）|
+| `unit` | `"minute"`（新字串值，**非新欄位**，0019 已有 unit 欄）|
+| `cost_usd` | `calculate_unit_cost(quantity, price_per_unit)`（既有函式）|
+| `outcome` | 對映 close_reason（`success` / `upstream_error` …，沿用既有 enum）|
+| token 欄 | NULL（非 token 端點，沿用 0019 的 NULL⇒非 token 語意）|
+
+**FR-004 不漏記**：`audio_bytes` 在 relay 迴圈即時累計，故任何斷線路徑（正常/異常/撤回）落帳時都有值。
+
+## 3. PriceList（既有，沿用）
+
+realtime 模型的價以 `price_unit="minute"` + `price_per_unit_usd`（如 gpt-realtime-whisper $0.017）存一筆 point-in-time 版本（append-only）。admin 在既有 `/prices` 設定（單位下拉加 `minute`，沿用階段 29 unit billing 的單位感知 UI）。**LiteLLM 僅建議、PriceList 是計費真理**（不變）。
+
+## 4. Allocation（既有，沿用）
+
+歸戶對象 + 配額載體 + 連線中 re-check 的狀態來源（active / revoked / paused / quarantined）。**不改 schema**。
+
+## 5. model_kind：realtime 類型
+
+`services/model_kind.py` 的 mode→kind 對映加 `realtime`（litellm `mode` 為 realtime/realtime-transcription 時）。對應目錄誠實（FR-008）：realtime 模型顯正確類型、不假裝 chat。**改 model_kind 對映後須重跑全套件**（experience 教訓：有「未知 mode 反例」整合測試會撞）。
+
+---
+
+**Migration 結論**：**無**。沿用 0019 的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`；`minute` 是資料值非 schema 變更。RealtimeSession 不落表。
diff --git a/specs/043-realtime-transcription/plan.md b/specs/043-realtime-transcription/plan.md
new file mode 100644
index 0000000..da5c220
--- /dev/null
+++ b/specs/043-realtime-transcription/plan.md
@@ -0,0 +1,86 @@
+# Implementation Plan: realtime 即時字幕端點
+
+**Branch**: `043-realtime-transcription` | **Date**: 2026-06-12 | **Spec**: [spec.md](spec.md)
+**Input**: Feature specification from `specs/043-realtime-transcription/spec.md`
+
+## Summary
+
+對成員開放一個 **WebSocket 即時字幕端點**：客戶端以分配到的金鑰建立持續連線、串流 PCM 音訊，平台**自寫薄 relay**（借鏡 litellm `RealTimeStreaming` 結構、但接我們的「分配」計費）直連 Azure Foundry 的 gpt-realtime-whisper，即時把 `conversation.item.input_audio_transcription.delta/.completed` 事件轉回客戶端。連線建立跑既有 preflight；連線期間**自行從 append 的音訊 bytes 累計時長**（不依賴 provider usage 事件，天然滿足異常中止不漏記），斷線時記一筆 `unit="minute"` 的 CallRecord 歸戶分配；連線期間定期 re-check 分配狀態，被撤回/暫停/隔離即主動斷線。這是專案**第一個長連線 / WebSocket 端點**，刻意獨立於階段 31 的非串流 registry（比照 `responses.py` 的 SSE 獨立 handler）。
+
+## Technical Context
+
+**Language/Version**: Python 3.11+（後端為主）/ TypeScript strict + React 19（前端僅目錄顯示 realtime 類型 + 連線範例，極少量）  
+**Primary Dependencies**: FastAPI（WebSocket — starlette 內建，**專案首次使用**）、SQLAlchemy 2.x async、Pydantic v2（皆既有）；**`websockets`（直連 Azure realtime WS 的 async client，提為直接依賴——已隨 uvicorn/litellm 在 image，現宣告為直接依賴）**；既有 `proxy/preflight.py`、計費（`services/pricing.py` 的 `calculate_unit_cost`）、audit。**realtime 不經 litellm**（其 realtime 是 Proxy form / client 直連，違原則；借其 `RealTimeStreaming` 結構自寫薄 relay）。  
+**Storage**: PostgreSQL（生產）/ SQLite（dev、CI）；**不新增表、不新增 migration**——沿用增量②（0019）的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`，新單位 `minute` 為字串值。  
+**Testing**: pytest——契約/單元用 starlette `TestClient.websocket_connect` 測「client ↔ 我們」這段；整合測試起一個 **mock provider realtime WS server**（送預錄事件流）驗 relay 轉送 / 時長累計 / 連線中撤回斷線；**真連 Azure realtime WS = 部署後手動煙霧**（見 Constitution Deviation）。  
+**Target Platform**: Linux server（k3s-tew / ns ai-ccsh / helm release ai-api）  
+**Project Type**: web service（後端為主，前端極少量）  
+**Performance Goals**: 首段文字 < 1 秒（SC-001）；per-minute 計量精度到秒換算分鐘。  
+**Constraints**: 長連線 WebSocket（與既有 HTTP-only pipeline 形態不同）；連線中撤回 re-check（SLO 對齊既有分配撤回）；nginx 需 WS upgrade proxy；pod egress 需可達 `wss://*.services.ai.azure.com:443`（既有 443 egress 已開，需煙霧實證）。  
+**Scale/Scope**: org-internal 課堂/會議並發連線（小規模）；單一新端點 + 薄 relay + 計量 + 前端目錄微調。
+
+## Constitution Check
+
+*GATE: 評估每條核心原則。*
+
+- **I. Test-First（NON-NEGOTIABLE）**：✅ 可遵守。starlette `websocket_connect` 可在測試中建立 client 連線、mock 一個 provider WS server，先寫失敗測試（連線拒絕、轉送、時長累計、撤回斷線）再實作。TDD 流程不受 WS 形態阻礙。
+- **II. API 契約優先**：✅ 遵守。realtime 的契約是 **WS 事件流**——`contracts/realtime-transcription.md` 先定 client→server（`session.update`/`input_audio_buffer.append`）與 server→client（`...transcription.delta/.completed`、錯誤/關閉碼）事件，契約測試合併前必過。
+- **III. 整合測試覆蓋外部依賴 + CI 可重現**：⚠️ **部分偏離（見 Complexity Tracking / Deviation）**。憲法要求「不得僅以 mock 取代真實邊界、整合測試 CI 可重現」；但真連 Azure realtime WS 是**長連線 + 需憑證 + 即時音訊串流**，無法在 CI 可重現執行。補救：整合測試用 **mock provider WS server** 驗我們這側全部行為；真實邊界以**部署後手動煙霧**驗（比照既有 chat/responses 上游——本專案既有端點的上游真打本就走 mock + 部署煙霧，非 CI 真打）。
+- **IV. 可觀測性**：✅ 遵守。連線建立/結束/被撤回斷線、累計時長、計量結果、上游失敗原因皆結構化記錄（沿用既有 audit + CallRecord 透明度，FR-009）；不洩漏上游金鑰（FR-006）。
+- **V. 簡潔優先（YAGNI）**：✅ 大致遵守，一個 justified 新依賴。薄 relay 只做單向 transcription、不做雙向對話/工具；**`websockets` 提為直接依賴**是直連 provider WS 的必需（已在 image），於 Deviation 明列。不為未來雙向對話預留抽象。
+
+**結論**：可進 Phase 0。一個 Deviation（CI 無法真打 realtime 上游）+ 一個 justified 依賴（websockets），均於下方明列。
+
+## Project Structure
+
+### Documentation (this feature)
+
+```text
+specs/043-realtime-transcription/
+├── plan.md              # 本檔
+├── research.md          # Phase 0：三個技術未知的決策
+├── data-model.md        # Phase 1：realtime session + CallRecord(minute) 計量
+├── quickstart.md        # Phase 1：客戶端怎麼連 + 開發者驗證步驟
+├── contracts/
+│   └── realtime-transcription.md   # WS 事件契約（client↔server）
+└── tasks.md             # Phase 2（/speckit.tasks，非本指令產出）
+```
+
+### Source Code (repository root)
+
+```text
+src/ai_api/
+├── proxy/
+│   ├── realtime.py        # 新增：WS 端點 handler + 薄 relay（類比 responses.py，獨立於 registry）
+│   ├── upstream.py        # 加：開一條到 Azure realtime 的 async WS client（websockets）
+│   ├── preflight.py       # 沿用：連線建立時跑（既有）
+│   └── registry.py        # 不動（registry 專收非串流同步端點，realtime 不進）
+├── services/
+│   ├── pricing.py         # 沿用 calculate_unit_cost（unit="minute"）
+│   ├── model_kind.py      # 加：realtime kind 判定（mode → realtime）
+│   └── model_test.py      # 不動（realtime 不適用 recipe 表的一次性測試；目錄誠實由 model_kind 涵蓋）
+└── api/
+    └── （realtime WS route 掛載，nginx 既有 /v1 之下加 WS upgrade）
+
+deploy/helm/ai-api/        # nginx WS upgrade（Upgrade/Connection header）config
+
+frontend/src/
+├── routes/admin/model-detail.tsx   # realtime kind 顯示（沿用 KIND_LABEL）
+└── components/api-usage-example.tsx # realtime 連線範例（WS）
+
+tests/
+├── contract/test_realtime_transcription.py   # WS 事件契約
+├── integration/test_realtime_relay.py        # mock provider WS：轉送/計量/撤回斷線
+└── unit/test_realtime_metering.py            # 音訊 bytes → 時長換算
+```
+
+**Structure Decision**：realtime 為**獨立 WS handler**（`proxy/realtime.py`），不納入階段 31 的 `engine/registry`——後者的三軸（IOShape × Meter × call）建在「一請求一回應一筆帳」的同步假設上，realtime 是長連線、破壞該假設（同 `responses.py` 的 SSE 也獨立於 registry）。計量沿用既有 unit billing（`minute` 為新字串單位），**零 migration**。
+
+## Complexity Tracking
+
+> 僅列 Constitution Check 的偏離與須說明的複雜度。
+
+| Violation | Why Needed | Simpler Alternative Rejected Because |
+|-----------|------------|--------------------------------------|
+| **原則 III**：realtime 上游真打無法進 CI（整合測試以 mock provider WS server 取代真實邊界） | realtime 是長連線 WS + 需 Azure 憑證 + 即時音訊串流，CI 無法可重現執行；本專案既有上游端點（chat/responses）本就走 mock + 部署煙霧 | 在 CI 真連 Azure realtime WS → 需在 CI 注入生產憑證（安全面）、起即時音訊源、維持長連線，flaky 且昂貴，違反「CI 可重現」初衷 |
+| **原則 V / 新依賴**：`websockets` 提為直接依賴 | 直連 Azure realtime WS 需 async WebSocket client；litellm 的 realtime 是 Proxy form（不採），故自寫 relay 必需一個 WS client | 靠 transitive（uvicorn/litellm 帶入）→ 上游一移除即斷，違反「依賴要顯式」；改用 aiohttp → 更重且 image 未必有，websockets 已在 image |
diff --git a/specs/043-realtime-transcription/quickstart.md b/specs/043-realtime-transcription/quickstart.md
new file mode 100644
index 0000000..86d64e3
--- /dev/null
+++ b/specs/043-realtime-transcription/quickstart.md
@@ -0,0 +1,57 @@
+# Quickstart: realtime 即時字幕端點
+
+## 給接平台的開發者（客戶端怎麼用）
+
+平台暴露 OpenAI 相容的 realtime transcription WebSocket 端點。用你分配到的**應用金鑰**連線、串流麥克風音訊（PCM），即時收文字事件，自己渲染字幕。
+
+```python
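+# 概念範例（實際以 OpenAI realtime 客戶端慣例為準；需置於 async 函式內執行；`additional_headers` 為 websockets ≥ 14 的參數名，13.x 的 `websockets.connect` 用 `extra_headers`）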
+import websockets, json, base64
+
+async with websockets.connect(
+    "wss://<平台網域>/v1/realtime",
+    additional_headers={"Authorization": "Bearer <你的應用金鑰>"},
+) as ws:
+    await ws.send(json.dumps({
+        "type": "session.update",
+        "session": {"type": "transcription", "model": "azure/gpt-realtime-whisper",
+                    "audio": {"input": {"format": {"type": "audio/pcm", "rate": 24000}}}},
+    }))
+    # 串流音訊
+    await ws.send(json.dumps({"type": "input_audio_buffer.append",
+                              "audio": base64.b64encode(pcm_chunk).decode()}))
+    # 收即時字幕
+    async for msg in ws:
+        ev = json.loads(msg)
+        if ev["type"] == "conversation.item.input_audio_transcription.delta":
+            print(ev["delta"], end="", flush=True)
+```
+
+- 你拿不到、也不需要底層供應商金鑰——只用平台金鑰連平台端點。
+- 用量按**分鐘**計，歸戶到你的分配、計入配額，可在「用量」頁看到。
+- 金鑰被撤回 / 分配被暫停時，進行中的連線會被平台主動中止。
+
+## 給維護者（implement 階段的真打驗證步驟）
+
+CI 不真連 Azure realtime WS（Constitution Deviation）；真實邊界以**部署後手動煙霧**驗。建議順序：
+
+1. **協定真打**（research R1/R2 校驗）：用 Azure Foundry 的 gpt-realtime-whisper endpoint+key，跑一支最小腳本連 WS、送一段已知秒數的 PCM、確認：
+   - 收到 `...transcription.delta`（接得通、首段 <1s）
+   - 我們自算的時長 vs（若有）provider usage / Azure 帳單對得上（R2 校驗，必要時加校正）
+2. **relay 整合**（不需真 Azure）：起一個 **mock provider WS server** 送預錄事件流，跑契約測試 1–7（contracts/）。
+3. **連線中撤回**：建立連線 → 後台撤回該分配 → 確認 N 秒內被 close(revoked) + 已累計時長落帳。
+4. **部署煙霧**（rev 上線後）：
+   - pod egress 實證 `wss://<foundry>.services.ai.azure.com:443` 可達。
+   - nginx WS upgrade 生效（壞金鑰連線被 close 而非 200/SPA fallback）。
+   - 真打一次完整字幕 → 用量頁看到一筆 `unit=minute` 歸戶分配。
+
+## 驗收對照（spec Success Criteria）
+
+| SC | 驗證 |
+|---|---|
+| SC-001 首段 <1s | 步驟 1 真打計時 |
+| SC-002 100% 歸戶 | 步驟 4 用量頁查 CallRecord |
+| SC-003 異常不漏記 | 契約測試 6（client 直接斷）|
+| SC-004 撤回上限內斷線 | 步驟 3 / 契約測試 5 |
+| SC-005 無效金鑰 100% 拒絕 | 契約測試 1 |
+| SC-006 既有端點零回歸 | 全套件 + 既有 contract 測試 git diff 為空 |
diff --git a/specs/043-realtime-transcription/research.md b/specs/043-realtime-transcription/research.md
new file mode 100644
index 0000000..83ff96a
--- /dev/null
+++ b/specs/043-realtime-transcription/research.md
@@ -0,0 +1,68 @@
+# Phase 0 Research: realtime 即時字幕端點
+
+本檔釘死 spec 刻意延後到規劃階段的三個技術未知。研究方式：inspect 本地 litellm realtime 模組（藍本）+ OpenAI/Azure realtime transcription 官方協定 + 既有專案設施盤點。**端到端真連 Azure realtime WS 安排在 implement 階段於有憑證環境**（用戶已有 Azure Foundry 部署 + key）——本階段把「協定 / relay 結構 / 計量方法 / 基礎建設方案」釘到可實作的程度。
+
+---
+
+## R1：直連 provider realtime WS 的協定與 relay 結構
+
+**Decision**：自寫薄 relay，借鏡 litellm `RealTimeStreaming` 的雙向轉送結構，但**不經 litellm**、改接我們的分配計費。協定走 OpenAI 相容 realtime transcription：
+
+- 客戶端 ↔ 我們（FastAPI `@app.websocket`）：客戶端送 `session.update`(`type:"transcription"`, `model`, `format`)、`input_audio_buffer.append`(base64 PCM)、`input_audio_buffer.commit`；我們回 `conversation.item.input_audio_transcription.delta`（增量）/`.completed`（完整）。
+- 我們 ↔ Azure（`websockets` async client）：以 Azure Foundry realtime endpoint + key 開 WS，雙向轉送事件。
+- relay 骨架（借自 litellm `realtime_streaming.py:RealTimeStreaming`）：`bidirectional_forward()` = 同時跑 `client→backend` 與 `backend→client` 兩個轉送協程；在 `backend→client` 路徑上**攔截** `conversation.item.input_audio_transcription.completed` 做我們的記帳/觀測。
+
+**Rationale**：litellm 的 realtime 是 Proxy form（client 直連 provider、音訊不經 gateway），用它會失去原則 2 可追蹤性與原則 3 即時撤回（experience 第 40 條）。但它的**轉送結構**是成熟藍本，借結構、自接計費＝站在肩膀上又守原則。OpenAI 相容協定讓任何會講 realtime 的客戶端（會議/字幕工具）能直接接（願景「主流工具開箱即用」）。
+
+**Alternatives considered**：
+- litellm Proxy form realtime relay → 否決：client 直連、不認得「分配」（experience 第 40 條、principles 原則 5）。
+- litellm `_arealtime`（library 低階入口）→ 否決：內部 API、不穩定、且仍偏 Proxy 取向；自寫薄 relay 控制權更清楚（原則 7 適配層）。
+- 從零摸 realtime 協定 → 否決：litellm `RealTimeStreaming` 已把 beta↔GA 事件 remap、轉送骨架做過，借鏡省大量試錯。
+
+---
+
+## R2：per-minute 計量的來源
+
+**Decision**：**我們自己從客戶端 `input_audio_buffer.append` 的 PCM bytes 累計音訊時長**，斷線時換算分鐘記一筆 `CallRecord(quantity=分鐘, unit="minute")`，不依賴 provider 回 usage 事件。時長 = Σ(append PCM bytes) / (sample_rate × bytes_per_sample × channels)。
+
+**Rationale**：
+- OpenAI realtime transcription 官方文件**未保證 usage / 計量事件**（WebFetch 實證：transcription guide 無 usage 欄位）；gpt-realtime-whisper 按**音訊分鐘**計費（$0.017/min）。
+- 自己從 append bytes 算時長＝**自包含、不受 provider 是否回 usage 影響**，且**天然滿足 FR-004「異常中止不漏記」**——已 append 的音訊就算數，連線怎麼斷都已累計。
+- 對應 experience「STT per-second 計量沒 duration 來源就降級」的延伸：這次 duration 來源是「我們轉送的音訊量」，可控可算，不必賭 provider 回什麼。
+- 沿用增量②（0019）的 `call_records.quantity/unit` + `calculate_unit_cost`，`minute` 為新字串單位——**零 migration**。
+
+**Alternatives considered**：
+- 信 provider 的 usage 事件 → 否決：文件不保證有；若有則作為**校驗**而非主來源（implement 階段真打時對照）。
+- 連線 wall-clock 時間（含靜音）→ 否決：可能與 provider 按「音訊時長」計費不一致，傾向高估；以實際 append 的音訊量為準較貼近計費基礎。
+- 按 transcript 字元/token → 否決：gpt-realtime-whisper 按分鐘非按 token，單位不符。
+
+**Implement 階段待校驗**：真打一次 Azure，比對「我們算的分鐘」vs「Azure 帳單/若有的 usage 事件」，必要時加校正係數（admin 可覆寫價，沿用 PriceList 是計費真理）。
+
+---
+
+## R3：FastAPI WS relay + nginx WS upgrade + egress + 連線中撤回
+
+**Decision**：
+- **端點**：FastAPI `@app.websocket("/v1/realtime")`（或對齊 OpenAI 路徑），starlette 內建、`websockets` 15.0.1 已在 image。
+- **連線建立 preflight**：WS accept 前（或 accept 後第一個 `session.update`）跑既有 `run_preflight`（金鑰→分配→存取→配額→model binding）；不符即關閉連線回相容錯誤碼。
+- **連線中撤回**：在 relay 迴圈旁跑一個**週期性協程**，每 N 秒 re-check 該分配狀態（沿用既有撤回查詢），狀態非 active（撤回/暫停/隔離）即主動 close WS。N 對齊既有撤回 SLO（具體值 tasks 階段定，預設與既有一致）。
+- **nginx**：在既有 `location /v1`（或新 `location /v1/realtime`）加 `proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; proxy_http_version 1.1;`——標準 WS upgrade。
+- **egress**：pod 需可達 `wss://<foundry>.services.ai.azure.com:443`；既有 443 egress 已開（OCR 那次驗過 `raw.githubusercontent.com:443`），WS over 443 同通——**部署煙霧實證**。
+
+**Rationale**：FastAPI/starlette 原生 WS + websockets client 都已在 image，無基礎建設缺口；nginx WS upgrade 是標準配置；撤回用「旁路週期協程」而非阻塞 relay，乾淨且符合原則 3（長連線不能只在建立時檢查一次）。對應 experience「串流端點事後記帳在 client 還連著時做」——計量綁在連線存活期累計、斷線點落帳。
+
+**Alternatives considered**：
+- 撤回檢查塞進每個 audio 事件 → 否決：耦合轉送熱路徑、頻率不可控；旁路週期協程更清楚。
+- 不做連線中撤回（只建立時檢查）→ 否決：違反原則 3 即時撤回（長連線的核心風險）。
+- 走 SSE 而非 WS → 否決：realtime transcription 是雙向（音訊上行 + 文字下行），SSE 只能單向下行。
+
+---
+
+## 研究結論彙整（給 Phase 1 / tasks）
+
+| 未知 | 結論 | 落地 |
+|---|---|---|
+| 協定 + relay 結構 | OpenAI 相容 transcription 事件流；借 litellm `RealTimeStreaming` 雙向轉送骨架自寫 | `proxy/realtime.py` + `upstream` WS client |
+| 計量來源 | 自算 append 音訊時長 → `unit="minute"`，不賭 provider usage | `services` 計量 + `CallRecord(quantity,unit)`（0019，零 migration）|
+| 基礎建設 + 撤回 | FastAPI WS + websockets（已在 image，提直接依賴）；nginx WS upgrade；旁路週期協程 re-check 撤回 | `realtime.py` + helm nginx config |
+| 真打驗證 | 安排 implement 階段於有憑證環境（Constitution Deviation：不進 CI，部署煙霧）| quickstart 驗證腳本 |
diff --git a/specs/043-realtime-transcription/spec.md b/specs/043-realtime-transcription/spec.md
new file mode 100644
index 0000000..16769fa
--- /dev/null
+++ b/specs/043-realtime-transcription/spec.md
@@ -0,0 +1,109 @@
+# Feature Specification: realtime 即時字幕端點
+
+**Feature Branch**: `043-realtime-transcription`  
+**Created**: 2026-06-12  
+**Status**: Draft  
+**Input**: User description: "realtime 即時字幕端點：暴露 OpenAI 相容的 realtime transcription WebSocket 端點，走 gpt-realtime-whisper（Azure Foundry），連線時跑既有 preflight、連線期間 per-minute 計費歸戶到分配、連線期間定期 re-check 分配狀態被撤回即主動斷線；只做端點不做字幕 UI；build 直連 Azure realtime WS、不用 litellm Proxy form"
+
+## User Scenarios & Testing *(mandatory)*
+
+本功能對外的「使用者」是**接平台的客戶端應用 / 開發者**（用分配到的金鑰串流音訊取即時字幕），以及**管理員**（撤回分配時連線應隨之中止）與**成員**（在目錄看得到 realtime 模型與連線方式）。平台**不提供字幕畫面**——交付的是一個相容端點，讓會議軟體、字幕工具、課堂應用等自行接入。
+
+### User Story 1 - 開發者用平台金鑰取得即時字幕 (Priority: P1)
+
+一個課堂/會議應用的開發者，拿著分配到的應用金鑰，把麥克風音訊串流到平台的 realtime 字幕端點，邊說邊收到一段段文字，用來顯示即時字幕——全程不需要、也拿不到底層 AI 供應商的金鑰。
+
+**Why this priority**: 這是整個功能的核心價值與最小可用切片；沒有它，其餘（計費、撤回）都無對象。單獨完成即構成 MVP——「能用平台金鑰取得即時字幕」本身就交付價值。
+
+**Independent Test**: 用一把有效、且對某 realtime 模型有有效分配的金鑰建立連線，串入一段音訊，驗證能即時收到對應文字；用無效/撤回的金鑰則無法建立連線。
+
+**Acceptance Scenarios**:
+
+1. **Given** 一把有效金鑰、且其分配含某個 realtime 字幕模型，**When** 客戶端建立連線並串流音訊，**Then** 客戶端持續收到該段音訊的即時文字結果。
+2. **Given** 一把無效或已撤回的金鑰，**When** 客戶端嘗試建立連線，**Then** 連線被拒絕、不開始串流，且不洩漏任何底層供應商資訊。
+3. **Given** 一把有效金鑰，但請求的模型不是 realtime 字幕類型，**When** 客戶端嘗試建立連線，**Then** 連線被拒絕並回可理解的錯誤（此模型不支援即時字幕）。
+
+---
+
+### User Story 2 - 即時字幕用量按時間計費並歸戶到分配 (Priority: P2)
+
+成員/管理員需要 realtime 字幕的用量跟其他端點一樣**可盤點、可計費、可追蹤**——每一次連線消耗多少（以時間計）、算在哪一筆分配、計入配額，都看得到。
+
+**Why this priority**: 對應可追蹤性的核心承諾；沒有它，realtime 會成為「用得到但帳目斷裂」的影子用量，違反平台不變式。但需先有 P1 才有用量可計。
+
+**Independent Test**: 完成一次連線後，驗證該次用量（串流時長）以時間單位記錄、歸戶到對應分配、計入該分配配額，並出現在用量總覽中；連線即使異常中止，已串流的時間仍被記錄。
+
+**Acceptance Scenarios**:
+
+1. **Given** 一次正常結束的字幕連線，**When** 連線關閉，**Then** 該次用量以時間單位記到對應分配、計入配額、可在用量視圖查到。
+2. **Given** 一次因網路中斷而異常結束的連線，**When** 連線中止，**Then** 已串流的時間仍被計費、不漏記。
+3. **Given** 同一成員以多把金鑰使用同一筆分配的 realtime 模型，**When** 各自連線，**Then** 用量都歸戶到同一分配、共用其配額。
+
+---
+
+### User Story 3 - 分配被撤回時進行中的連線隨即中止 (Priority: P3)
+
+管理員撤回（或暫停/隔離）某筆分配後，即使該分配當下有正在進行的字幕連線，也必須在限定時間內被切斷，不能靠連線自然結束或金鑰自然過期。
+
+**Why this priority**: 對應即時撤回原則；長連線特有的風險——一條已建立的連線若不主動檢查狀態，會在撤回後繼續消耗資源。但屬於 P1 之上的治理保護，非首個可用切片。
+
+**Independent Test**: 在一條進行中的字幕連線期間，由管理員撤回該分配，驗證連線在約定時間內被主動切斷，且切斷前已串流的時間正確計費。
+
+**Acceptance Scenarios**:
+
+1. **Given** 一條進行中的字幕連線，**When** 管理員撤回其分配，**Then** 連線在約定上限時間內被主動終止。
+2. **Given** 一條進行中的字幕連線，**When** 其分配被暫停或自動隔離，**Then** 連線同樣在約定時間內被終止。
+3. **Given** 連線被中止，**When** 結算，**Then** 中止前已串流的時間被正確計費。
+
+---
+
+### Edge Cases
+
+- **連線建立時配額已滿**：拒絕建立連線，給可理解的錯誤（與其他端點的配額不足行為一致）。
+- **連線期間累計用量超過配額**：與既有非 token 端點（OCR/圖片）相同的已知限制——配額在連線建立時檢查，連線進行中的超額不即時中斷（每分鐘級硬上限為後續）。本次以「建立時擋、進行中記帳」為準。
+- **上游供應商在連線中斷線或回報失敗**：平台對客戶端結束連線並回報可理解的失敗原因，已串流的時間仍計費。
+- **客戶端直接中斷（沒有正常關閉握手）**：仍須結算已串流的時間，不漏記。
+- **同一金鑰/分配同時多條連線**：允許（用量綁分配、各自計時累計到同一分配，沿用「額度綁分配、不對連線數設限」的既有立場）。
+- **送出非 realtime 字幕類型的模型**：拒絕，回可理解錯誤。
+
+## Requirements *(mandatory)*
+
+### Functional Requirements
+
+- **FR-001**: 系統 MUST 對外提供一個 realtime 即時字幕端點，讓客戶端以分配到的金鑰建立**持續連線**、串流音訊、並在說話過程中**即時**收到對應文字結果。
+- **FR-002**: 建立連線 MUST 通過既有的存取前置檢查——有效金鑰、對所請求的 realtime 模型有有效分配、且該分配可用（未撤回/暫停/隔離、配額未滿）；任一不符即拒絕建立連線。
+- **FR-003**: 系統 MUST 將每一次 realtime 連線的用量以**時間（分鐘）為單位**計量，歸戶到對應分配，並計入該分配的配額與計費，與其他端點一致地出現在用量盤點中。
+- **FR-004**: 連線結束（正常關閉或異常中止）時，系統 MUST 記錄該次已串流的時間用量，**不得因異常中止而漏記**。
+- **FR-005**: 連線**期間**系統 MUST 定期檢查其分配的當前狀態；當分配被撤回、暫停或自動隔離時，MUST 在約定上限時間內**主動終止連線**，不依賴連線自然結束或金鑰過期。
+- **FR-006**: 底層 AI 供應商的金鑰與內部細節 MUST NOT 出現在連線、回應或錯誤訊息中；客戶端只以平台金鑰連平台端點。
+- **FR-007**: 當請求的模型**非 realtime 字幕類型**時，系統 MUST 拒絕連線並回可理解的錯誤。
+- **FR-008**: 成員目錄 MUST 正確標示 realtime 字幕類型模型（不假裝成其他類型），並提供如何連線取用的範例，與其他端點的範例呈現一致。
+- **FR-009**: 系統 MUST 對 realtime 字幕端點維持與其他端點一致的審計與錯誤透明度——可在維運視圖看到連線建立/結束、計量結果、與上游失敗原因。
+
+### Key Entities *(include if feature involves data)*
+
+- **Realtime 字幕連線（session）**：一次持續連線的代表，綁定到一筆分配；具開始時間、結束時間、已串流時間、結束原因（正常/異常/被撤回）。用量計量的來源。
+- **呼叫紀錄（既有）**：沿用既有用量紀錄，數量以時間（分鐘）、單位為時間維度，歸戶到分配——與既有非 token 端點同一機制。
+- **分配（既有）**：realtime 用量計入的歸戶對象與配額載體；其狀態（有效/撤回/暫停/隔離）即連線期間 re-check 的依據。
+
+## Success Criteria *(mandatory)*
+
+### Measurable Outcomes
+
+- **SC-001**: 開發者用平台金鑰連上端點並串流音訊後，**1 秒內**開始收到第一段文字結果（即時感成立）。
+- **SC-002**: **100%** 的 realtime 字幕連線用量都歸戶到正確分配並計入配額——無任何一次成為無歸屬的影子用量。
+- **SC-003**: 連線因網路或客戶端異常中止時，已串流時間的計費**零漏記**。
+- **SC-004**: 管理員撤回分配後，該分配進行中的字幕連線在**約定上限時間內**被主動中止（可驗證的最大延遲）。
+- **SC-005**: 無效或已撤回的金鑰**無法**建立任何字幕連線（拒絕率 100%）。
+- **SC-006**: 既有所有端點（chat/embedding/ocr/stt/… 與計費）在本功能上線後**零回歸**。
+
+## Assumptions
+
+- **沿用「分配」為計量歸戶與配額的第一公民**：realtime 用量綁分配、不對單一金鑰/連線數設上限（沿用既有原則：額度綁分配、token/連線數不另設限）。
+- **計量單位以時間（分鐘）為準**：以供應商回報的計量為優先；若回應未帶可用計量，則以連線實際串流時間估算（最終計量來源於規劃階段的能力驗證中釘死——規劃結論見 research.md R2：以平台自行累計的音訊串流時長為主，供應商計量僅作校驗）。
+- **撤回 SLO 對齊既有撤回機制**：連線期間 re-check 的頻率使「撤回 → 斷線」落在與既有分配撤回一致的時間上限內（具體秒數於規劃階段定）。
+- **配額為「建立時檢查」**：連線進行中的累計超額不即時中斷——與既有非 token 端點（OCR/圖片）相同的已知限制，每分鐘級硬上限列為後續。
+- **只交付端點、不交付字幕 UI**：字幕畫面由接入的客戶端應用負責；平台提供相容端點 + 連線範例。
+- **採直連上游供應商的 realtime 連線**：不引入第三方代理閘道形態；沿用平台既有的「自製 gateway + 上游抽象層」邊界（避免引入不認得「分配」模型的並行計費權威）。
+- **目標供應商模型已就緒**：用於即時字幕的供應商模型已在供應端部署且平台已有可連憑證（規劃/實作階段可真打驗證）。
+- **既有存取、計費、審計機制重用**：preflight、用量紀錄、配額、審計事件沿用既有設施，realtime 為其新增的一種端點形態。
diff --git a/specs/043-realtime-transcription/tasks.md b/specs/043-realtime-transcription/tasks.md
new file mode 100644
index 0000000..fa9a6a2
--- /dev/null
+++ b/specs/043-realtime-transcription/tasks.md
@@ -0,0 +1,182 @@
+# Tasks: realtime 即時字幕端點
+
+**Input**: Design documents from `specs/043-realtime-transcription/`
+**Prerequisites**: plan.md, spec.md, research.md, data-model.md, contracts/realtime-transcription.md, quickstart.md
+
+**Tests**: 包含（Constitution 原則 I Test-First 非協商）——契約/整合/單元測試先寫且先失敗，再實作。
+
+**Organization**: 按 user story（P1/P2/P3）分階段，每個 story 以 mock provider realtime WS server 獨立可測。
+
+## Format: `[ID] [P?] [Story] Description`
+
+- **[P]**: 不同檔案、無未完成依賴，可並行
+- **[Story]**: US1 / US2 / US3（對映 spec 的 P1/P2/P3）
+
+---
+
+## Phase 1: Setup (Shared Infrastructure)
+
+**Purpose**: 依賴與測試基礎建設
+
+- [X] T001 將 `websockets` 提為直接依賴（`pyproject.toml`，已隨 image，宣告版本下限；PR 以 Constitution Deviation 說明）並確認 lockfile 更新
+- [ ] T002 [P] 建立 mock provider realtime WS server test fixture（`tests/conftest.py` 或 `tests/support/realtime_mock.py`）：一個可在測試內啟動的假 realtime WS，依輸入送預錄 `...transcription.delta/.completed` 事件流，供所有整合/契約測試共用
+- [X] T003 [P] 在計量層登記 `minute` 單位：確認 `services/pricing.py` 的 `calculate_unit_cost` 對 `unit="minute"` 無礙（純資料值、無 schema 變更），補單元測試於 `tests/unit/test_pricing.py`
+
+---
+
+## Phase 2: Foundational (Blocking Prerequisites)
+
+**Purpose**: 讓 WS 連線能建立並轉送的最小骨架——所有 user story 的前置
+
+**⚠️ CRITICAL**: 本階段未完成前，US1–US3 無法開工
+
+- [ ] T004 實作上游 realtime WS client helper（`src/ai_api/proxy/upstream.py`）：以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證（api_key/api_base），回傳可雙向收送的連線物件；金鑰不外洩
+- [ ] T005 建立 WS 端點 scaffold（`src/ai_api/proxy/realtime.py`）：FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router（暫不含 preflight/relay 完整邏輯，先讓連線可建立可關閉）
+- [X] T006 `services/model_kind.py` 的 mode→kind 對映加 `realtime`（litellm realtime/transcription mode → `realtime` kind）；**改完重跑完整 `pytest tests/` 確認零回歸**（experience：「未知 mode 反例」整合測試會撞）
+
+**Checkpoint**: WS 連線可建立、可開上游連線、目錄能辨識 realtime 類型——可開始 US1
+
+---
+
+## Phase 3: User Story 1 - 開發者用平台金鑰取得即時字幕 (Priority: P1) 🎯 MVP
+
+**Goal**: 有效金鑰 → 建立 WS 連線 → 串流音訊 → 即時收文字 delta；無效/撤回/非 realtime 模型被拒。
+
+**Independent Test**: 用 mock provider WS，有效金鑰連線送 append 收到 delta；無效金鑰被 close。
+
+### Tests for User Story 1 ⚠️（先寫、先失敗）
+
+- [ ] T007 [P] [US1] 契約測試：無效/撤回金鑰連線被 close、未開始串流（`tests/contract/test_realtime_transcription.py`）
+- [ ] T008 [P] [US1] 契約測試：請求非 realtime 類型模型 → close(unsupported)（同檔）
+- [ ] T009 [P] [US1] 整合測試：有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`（mock provider WS，`tests/integration/test_realtime_relay.py`）
+
+### Implementation for User Story 1
+
+- [ ] T010 [US1] 連線建立時跑既有 `run_preflight`（`src/ai_api/proxy/realtime.py`）：金鑰→分配→存取→配額→model binding；不通過則 close 並回相容錯誤碼（不洩漏上游）
+- [ ] T011 [US1] 雙向 relay 迴圈（`src/ai_api/proxy/realtime.py`）：`client→backend` 與 `backend→client` 兩協程轉送（借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構），delta/completed 即時轉回客戶端
+- [ ] T012 [US1] 模型類型校驗 + 錯誤轉譯（`src/ai_api/proxy/realtime.py`）：非 realtime kind → close(unsupported)；上游錯誤透明轉回但不含 key/endpoint（FR-006/007）
+- [ ] T013 [US1] 連線生命週期結構化日誌（`src/ai_api/proxy/realtime.py`）：建立/關閉/原因，沿用既有 audit + 觀測（原則 IV）
+
+**Checkpoint**: 客戶端能用平台金鑰即時取得字幕；MVP 成立（計量/撤回尚未接）
+
+---
+
+## Phase 4: User Story 2 - 即時字幕用量按時間計費並歸戶到分配 (Priority: P2)
+
+**Goal**: 每次連線的用量以分鐘計、歸戶分配、計入配額，異常中止不漏記。
+
+**Independent Test**: 連線送已知時長音訊後關閉 → 寫一筆 `CallRecord(unit="minute")`、quantity 對得上；client 直接斷也落帳。
+
+### Tests for User Story 2 ⚠️（先寫、先失敗）
+
+- [ ] T014 [P] [US2] 單元測試：PCM bytes → 秒 → 分鐘換算（含 rounding）（`tests/unit/test_realtime_metering.py`）
+- [ ] T015 [P] [US2] 整合測試：連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配（`tests/integration/test_realtime_relay.py`）
+- [ ] T016 [P] [US2] 整合測試：client 直接中斷（無正常握手）→ 已累計時長仍落帳（FR-004/SC-003）
+
+### Implementation for User Story 2
+
+- [ ] T017 [US2] RealtimeSession 計量狀態（`src/ai_api/proxy/realtime.py`）：解析 `session.update` 的 format（sample_rate/bytes_per_sample/channels）、在 relay 即時累計 `audio_bytes`
+- [ ] T018 [US2] 斷線落帳（`src/ai_api/proxy/realtime.py`）：duration→minute→`CallRecord`（`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason）；**任何 close 路徑都落帳**
+- [ ] T019 [P] [US2] 前端：admin `/prices` 單位下拉加 `minute`（`frontend/src/routes/admin/prices.tsx`，沿用階段 29 單位感知 UI），realtime 模型可設每分鐘價
+
+**Checkpoint**: US1 + US2——即時字幕可用且用量可計費歸戶
+
+---
+
+## Phase 5: User Story 3 - 分配被撤回時進行中的連線隨即中止 (Priority: P3)
+
+**Goal**: 連線期間分配被撤回/暫停/隔離 → 約定時間內主動斷線，已累計時長落帳。
+
+**Independent Test**: mock 連線進行中撤回分配 → N 秒內 close(revoked) + 落帳。
+
+### Tests for User Story 3 ⚠️（先寫、先失敗）
+
+- [ ] T020 [P] [US3] 整合測試：連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳（`tests/integration/test_realtime_relay.py`）
+- [ ] T021 [P] [US3] 整合測試：分配被暫停/隔離 → 同樣主動斷線（同檔）
+
+### Implementation for User Story 3
+
+- [ ] T022 [US3] 旁路週期 re-check 協程（`src/ai_api/proxy/realtime.py`）：每 N 秒查分配當前狀態，非 active → 主動 close(revoked)；N 對齊既有撤回 SLO（常數集中、可調）
+- [ ] T023 [US3] 與 US2 落帳整合（`src/ai_api/proxy/realtime.py`）：撤回觸發的 close 同樣走斷線落帳（已累計時長不漏）
+
+**Checkpoint**: 三個 user story 全部獨立可用
+
+---
+
+## Phase 6: Polish & Cross-Cutting Concerns
+
+- [ ] T024 [P] 前端：`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型（KIND_LABEL）+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例（FR-008）
+- [ ] T025 [P] nginx WS upgrade config（`deploy/helm/ai-api/`）：`/v1/realtime`（或 `/v1`）加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1`
+- [ ] T026 全綠關卡：`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸（既有 contract 測試 git diff 為空，SC-006）
+- [ ] T027 部署後手動煙霧（quickstart.md，**需憑證環境**）：pod egress `wss:443` 實證、壞金鑰連線被 close、真打一次完整字幕（首字 <1s）→ 用量頁見一筆 `unit=minute` 歸戶分配；R2 計量對照 Azure 帳單校驗
+
+---
+
+## Dependencies & Execution Order
+
+### Phase Dependencies
+
+- **Setup (Phase 1)**: 無依賴，可立即開始
+- **Foundational (Phase 2)**: 依賴 Setup；**BLOCKS 所有 user story**
+- **User Stories (Phase 3–5)**: 皆依賴 Foundational
+  - US1（MVP）建議先做；US2/US3 在 US1 的 relay 骨架上疊（同檔 `realtime.py`，故 US2/US3 內部多為順序、跨檔的測試/前端可 [P]）
+- **Polish (Phase 6)**: 依賴所需 user story 完成
+
+### User Story Dependencies
+
+- **US1 (P1)**: Foundational 後即可——核心連線+轉送，MVP
+- **US2 (P2)**: 邏輯上疊在 US1 的 relay（累計 audio_bytes 在轉送迴圈內）；測試/前端可獨立
+- **US3 (P3)**: 旁路協程，與 US1 relay 並行；落帳與 US2 共用
+
+### Within Each User Story
+
+- 測試先寫且先失敗 → 實作 → 重構
+- relay/計量/撤回多在同一檔 `proxy/realtime.py`，故同 story 內實作任務多為順序；不同檔（前端、測試）標 [P]
+
+### Parallel Opportunities
+
+- T002/T003（Setup）可並行
+- 各 story 的測試任務（T007–T009、T014–T016、T020–T021）標 [P] 可並行先寫
+- 前端任務（T019、T024）與後端不同檔，可並行
+- T025 nginx config 與後端邏輯不同檔，可並行
+
+---
+
+## Parallel Example: User Story 1
+
+```bash
+# 先並行寫 US1 全部測試（先失敗）：
+Task: "契約測試 無效金鑰被 close — tests/contract/test_realtime_transcription.py"
+Task: "契約測試 非 realtime 模型 close — tests/contract/test_realtime_transcription.py"
+Task: "整合測試 有效連線收 delta（mock provider WS）— tests/integration/test_realtime_relay.py"
+```
+
+---
+
+## Implementation Strategy
+
+### MVP First (User Story 1)
+
+1. Phase 1 Setup → 2. Phase 2 Foundational（CRITICAL）→ 3. Phase 3 US1 → **STOP & VALIDATE**（mock provider WS 跑綠 = MVP）→ 視情況先以 mock 驗收，真打留 T027。
+
+### Incremental Delivery
+
+1. Setup + Foundational → 基礎
+2. US1 → 即時字幕可用（mock 驗）→ MVP
+3. US2 → 計費歸戶 → 可上線（計費完整）
+4. US3 → 連線中撤回 → 治理完整
+5. Polish（前端目錄/範例 + nginx + 全綠 + 部署煙霧）
+
+### 真打限制（誠實標記）
+
+- T009/T015/T020 等整合測試**全用 mock provider WS**（CI 可重現，Constitution Deviation 的補救）。
+- **T027 真連 Azure realtime WS 需憑證環境**（維護者實機跑 quickstart）——R1/R2 的協定接通 + 計量對照在此校驗，非 CI。
+
+---
+
+## Notes
+
+- [P] = 不同檔、無依賴；relay/計量/撤回集中於 `proxy/realtime.py`，同 story 實作多順序。
+- 每個 task 或邏輯群組後 commit；測試先失敗再實作。
+- 改 `model_kind`（T006）後務必跑完整 `pytest tests/`（experience 教訓）。
+- 既有端點零回歸鐵證：既有 contract 測試檔 git diff 為空（SC-006）。
diff --git a/src/ai_api/services/model_kind.py b/src/ai_api/services/model_kind.py
index a3f02a7..7b3959b 100644
--- a/src/ai_api/services/model_kind.py
+++ b/src/ai_api/services/model_kind.py
@@ -16,7 +16,7 @@
 
 Kind = Literal[
     "chat", "embedding", "tts", "image", "stt", "ocr", "rerank",
-    "moderation", "search", "image_edit", "unknown",
+    "moderation", "search", "image_edit", "realtime", "unknown",
 ]
 
 # litellm mode → our kind
@@ -32,6 +32,9 @@
     "moderation": "moderation",
     "search": "search",
     "image_edit": "image_edit",
+    # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Not a
+    # recipe-table "test model" kind — billed per-minute via the /v1/realtime relay.
+    "realtime": "realtime",
 }
 
 
diff --git a/tests/contract/test_pricing_units.py b/tests/contract/test_pricing_units.py
index 5a61de8..129770a 100644
--- a/tests/contract/test_pricing_units.py
+++ b/tests/contract/test_pricing_units.py
@@ -81,3 +81,25 @@ async def test_current_price_map_surfaces_per_unit(app_client: AsyncClient) -> N
     entry = pm[("azure_ai", "doc-ocr")]
     assert entry["price_unit"] == "page"
     assert Decimal(entry["price_per_unit"]) == Decimal("0.003")
+
+
+@pytest.mark.asyncio
+async def test_minute_unit_for_realtime(app_client: AsyncClient) -> None:
+    """Phase 32: realtime transcription bills per-minute — same unit-billing path,
+    `minute` is just a new string unit value (no schema change)."""
+    now = datetime.now(UTC)
+    sm = get_sessionmaker()
+    async with sm() as s:
+        s.add(PriceList(
+            id=str(ULID()), provider="azure", model="gpt-realtime-whisper",
+            input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0),
+            price_unit="minute", price_per_unit_usd=Decimal("0.017"),
+            effective_from=now - timedelta(days=1), created_at=datetime.now(UTC), created_by="test",
+        ))
+        await s.commit()
+        price = await lookup_price_for_call(
+            s, provider="azure", model="gpt-realtime-whisper", call_time=now
+        )
+    assert price is not None and price.price_unit == "minute"
+    # 5 minutes x $0.017 = $0.085 (per-minute billing through the existing path)
+    assert calculate_unit_cost(5, price.price_per_unit) == Decimal("0.085")

From d718fae2de78ad5f8177a3be4b39945483260876 Mon Sep 17 00:00:00 2001
From: timcsy <messenger@tew.tw>
Date: Fri, 12 Jun 2026 16:46:33 +0800
Subject: [PATCH 2/4] =?UTF-8?q?feat(realtime):=20/v1/realtime=20WS=20relay?=
 =?UTF-8?q?=20=E2=80=94=20bidirectional=20transcription,=20per-minute=20bi?=
 =?UTF-8?q?lling,=20in-flight=20revocation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WS core for the realtime transcription endpoint (US1/US2/US3):
- proxy/realtime.py: thin bidirectional relay (borrowing litellm RealTimeStreaming
  structure, not its Proxy-form realtime) + side-channel revocation watcher +
  self-counted per-minute metering from input_audio_buffer.append PCM bytes (R2).
  Any close path bills one CallRecord(unit="minute") attributed to the allocation
  (FR-004: abnormal abort never loses usage). Never leaks upstream key/endpoint.
- upstream.open_realtime_ws: websockets client to the provider realtime WS,
  injecting the credential as api-key/Bearer (exact Azure URL validated in T027).
- handle_realtime takes an injectable open_upstream/check_active so CI exercises
  the full preflight→relay→metering→revocation path against a fake provider WS
  in-loop (engine is bound to the test loop; a TestClient portal would break the DB).
- Frontend: realtime KIND_LABEL, /v1/realtime WS usage example, prices 'minute' unit.
- nginx: /v1/realtime WS upgrade (HTTP/1.1 Upgrade + no buffering + long timeout).

Tests: contract 1-7 (invalid/revoked key, non-realtime model, delta relay, clean-
close billing, abnormal-abort billing, in-flight revoke, no-leak) + pure metering
unit tests. Full suite 731 passed (715→731), zero regression; ruff+mypy clean;
frontend tsc + 164 vitest + build green. SC-006: existing contract tests untouched.

T027 (real Azure realtime WS smoke) remains for a credentialed environment.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 deploy/nginx/default.conf.template            |  12 +
 frontend/src/components/api-usage-example.tsx |  22 +
 frontend/src/routes/admin/model-detail.tsx    |   1 +
 frontend/src/routes/admin/prices.tsx          |   5 +-
 specs/043-realtime-transcription/tasks.md     |  49 +-
 src/ai_api/main.py                            |   2 +
 src/ai_api/proxy/realtime.py                  | 519 ++++++++++++++++++
 src/ai_api/proxy/upstream.py                  |  42 ++
 tests/contract/test_realtime_transcription.py | 309 +++++++++++
 tests/support/__init__.py                     |   0
 tests/support/realtime_mock.py                | 108 ++++
 tests/unit/test_realtime_metering.py          |  51 ++
 12 files changed, 1094 insertions(+), 26 deletions(-)
 create mode 100644 src/ai_api/proxy/realtime.py
 create mode 100644 tests/contract/test_realtime_transcription.py
 create mode 100644 tests/support/__init__.py
 create mode 100644 tests/support/realtime_mock.py
 create mode 100644 tests/unit/test_realtime_metering.py

diff --git a/deploy/nginx/default.conf.template b/deploy/nginx/default.conf.template
index 29c1b06..9fb569b 100644
--- a/deploy/nginx/default.conf.template
+++ b/deploy/nginx/default.conf.template
@@ -42,6 +42,18 @@ server {
         proxy_set_header Connection "";
         proxy_http_version 1.1;
     }
+    # Realtime transcription is a bidirectional WebSocket (/v1/realtime). It needs
+    # the HTTP/1.1 Upgrade dance + no buffering + a long read timeout so the relay
+    # stays open while audio streams. Prefix locations are chosen by longest match, not by order, so this takes precedence over the generic /v1 location.
+    location /v1/realtime {
+        proxy_pass http://${BACKEND_UPSTREAM};
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_buffering off;
+        proxy_cache off;
+        proxy_read_timeout 3600s;
+    }
     location /v1          { proxy_pass http://${BACKEND_UPSTREAM}; }
     location /docs        { proxy_pass http://${BACKEND_UPSTREAM}; }
     location /openapi.json { proxy_pass http://${BACKEND_UPSTREAM}; }
diff --git a/frontend/src/components/api-usage-example.tsx b/frontend/src/components/api-usage-example.tsx
index 598cfcd..a358c31 100644
--- a/frontend/src/components/api-usage-example.tsx
+++ b/frontend/src/components/api-usage-example.tsx
@@ -81,6 +81,28 @@ export function ApiUsageExample({
   -H "Authorization: Bearer $TOKEN" \\
   -F "model=${m}" -F "image=@input.png" -F "prompt=make it red"`,
     },
+    realtime: {
+      path: "/realtime",
+      desc: "即時字幕（realtime）模型，用 WebSocket 串流音訊、即時收文字（OpenAI realtime transcription 相容）。用量按分鐘計",
+      // WebSocket — not curl. Replace https:// with wss:// in the endpoint URL.
+      curl: `# pip install websockets — 串麥克風 PCM、即時收字幕（把 https 換成 wss）
+import asyncio, base64, json, websockets
+
+async def main():
+    url = "${base}/realtime".replace("https://", "wss://").replace("http://", "ws://")
+    async with websockets.connect(url, additional_headers={"Authorization": "Bearer $TOKEN"}) as ws:
+        await ws.send(json.dumps({"type": "session.update", "session": {
+            "type": "transcription", "model": "${m}",
+            "audio": {"input": {"format": {"type": "audio/pcm", "rate": 24000}}}}}))
+        await ws.send(json.dumps({"type": "input_audio_buffer.append",
+                                  "audio": base64.b64encode(pcm_chunk).decode()}))
+        async for msg in ws:
+            ev = json.loads(msg)
+            if ev.get("type") == "conversation.item.input_audio_transcription.delta":
+                print(ev["delta"], end="", flush=True)
+
+asyncio.run(main())`,
+    },
   };
   if (kind && endpointInfo[kind]) {
     const info = endpointInfo[kind]!;
diff --git a/frontend/src/routes/admin/model-detail.tsx b/frontend/src/routes/admin/model-detail.tsx
index aaa62e6..c396ace 100644
--- a/frontend/src/routes/admin/model-detail.tsx
+++ b/frontend/src/routes/admin/model-detail.tsx
@@ -109,6 +109,7 @@ const KIND_LABEL: Record<string, string> = {
   moderation: "內容審核（moderation）",
   search: "網路搜尋（search）",
   image_edit: "圖片編輯（image edit）",
+  realtime: "即時字幕（realtime）",
   unknown: "未知",
 };
 
diff --git a/frontend/src/routes/admin/prices.tsx b/frontend/src/routes/admin/prices.tsx
index 613b448..6551a1a 100644
--- a/frontend/src/routes/admin/prices.tsx
+++ b/frontend/src/routes/admin/prices.tsx
@@ -73,7 +73,7 @@ const fmtDate = (iso: string) => new Date(iso).toLocaleString("zh-TW");
 
 // Phase 31: non-token billing unit labels.
 const UNIT_ZH: Record<string, string> = {
-  page: "頁", query: "查詢", character: "字元", image: "張", second: "秒",
+  page: "頁", query: "查詢", character: "字元", image: "張", second: "秒", minute: "分鐘",
 };
 
 /** Local "now" formatted for a <input type="datetime-local"> (YYYY-MM-DDTHH:mm). */
@@ -448,13 +448,14 @@ function AddPriceDialog({
                   <SelectItem value="character">每字元</SelectItem>
                   <SelectItem value="image">每張</SelectItem>
                   <SelectItem value="second">每秒</SelectItem>
+                  <SelectItem value="minute">每分鐘</SelectItem>
                 </SelectContent>
               </Select>
               <Input id="p-perpage" className="font-mono flex-1" placeholder="0.003"
                 value={perPage} onChange={(e) => setPerPage(e.target.value)} />
             </div>
             <p className="text-xs text-muted-foreground mt-1">
-              非 token 模型（OCR=頁、rerank/search=查詢、TTS=字元、圖片編輯=張）依該單位計費，填此欄；token 欄可填 0。一筆價格只用一種單位。可按上方「從 LiteLLM 帶入建議價」自動填。
+              非 token 模型（OCR=頁、rerank/search=查詢、TTS=字元、圖片編輯=張、即時字幕=分鐘）依該單位計費，填此欄；token 欄可填 0。一筆價格只用一種單位。可按上方「從 LiteLLM 帶入建議價」自動填。
             </p>
           </div>
 
diff --git a/specs/043-realtime-transcription/tasks.md b/specs/043-realtime-transcription/tasks.md
index fa9a6a2..a6ff94c 100644
--- a/specs/043-realtime-transcription/tasks.md
+++ b/specs/043-realtime-transcription/tasks.md
@@ -19,7 +19,7 @@
 **Purpose**: 依賴與測試基礎建設
 
 - [X] T001 將 `websockets` 提為直接依賴（`pyproject.toml`，已隨 image，宣告版本下限；PR 以 Constitution Deviation 說明）並確認 lockfile 更新
-- [ ] T002 [P] 建立 mock provider realtime WS server test fixture（`tests/conftest.py` 或 `tests/support/realtime_mock.py`）：一個可在測試內啟動的假 realtime WS，依輸入送預錄 `...transcription.delta/.completed` 事件流，供所有整合/契約測試共用
+- [X] T002 [P] 建立 mock provider realtime WS server test fixture（`tests/conftest.py` 或 `tests/support/realtime_mock.py`）：一個可在測試內啟動的假 realtime WS，依輸入送預錄 `...transcription.delta/.completed` 事件流，供所有整合/契約測試共用
 - [X] T003 [P] 在計量層登記 `minute` 單位：確認 `services/pricing.py` 的 `calculate_unit_cost` 對 `unit="minute"` 無礙（純資料值、無 schema 變更），補單元測試於 `tests/unit/test_pricing.py`
 
 ---
@@ -30,8 +30,8 @@
 
 **⚠️ CRITICAL**: 本階段未完成前，US1–US3 無法開工
 
-- [ ] T004 實作上游 realtime WS client helper（`src/ai_api/proxy/upstream.py`）：以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證（api_key/api_base），回傳可雙向收送的連線物件；金鑰不外洩
-- [ ] T005 建立 WS 端點 scaffold（`src/ai_api/proxy/realtime.py`）：FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router（暫不含 preflight/relay 完整邏輯，先讓連線可建立可關閉）
+- [X] T004 實作上游 realtime WS client helper（`src/ai_api/proxy/upstream.py`）：以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證（api_key/api_base），回傳可雙向收送的連線物件；金鑰不外洩
+- [X] T005 建立 WS 端點 scaffold（`src/ai_api/proxy/realtime.py`）：FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router（暫不含 preflight/relay 完整邏輯，先讓連線可建立可關閉）
 - [X] T006 `services/model_kind.py` 的 mode→kind 對映加 `realtime`（litellm realtime/transcription mode → `realtime` kind）；**改完重跑完整 `pytest tests/` 確認零回歸**（experience：「未知 mode 反例」整合測試會撞）
 
 **Checkpoint**: WS 連線可建立、可開上游連線、目錄能辨識 realtime 類型——可開始 US1
@@ -46,16 +46,16 @@
 
 ### Tests for User Story 1 ⚠️（先寫、先失敗）
 
-- [ ] T007 [P] [US1] 契約測試：無效/撤回金鑰連線被 close、未開始串流（`tests/contract/test_realtime_transcription.py`）
-- [ ] T008 [P] [US1] 契約測試：請求非 realtime 類型模型 → close(unsupported)（同檔）
-- [ ] T009 [P] [US1] 整合測試：有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`（mock provider WS，`tests/integration/test_realtime_relay.py`）
+- [X] T007 [P] [US1] 契約測試：無效/撤回金鑰連線被 close、未開始串流（`tests/contract/test_realtime_transcription.py`）
+- [X] T008 [P] [US1] 契約測試：請求非 realtime 類型模型 → close(unsupported)（同檔）
+- [X] T009 [P] [US1] 整合測試：有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`（mock provider WS，`tests/integration/test_realtime_relay.py`）
 
 ### Implementation for User Story 1
 
-- [ ] T010 [US1] 連線建立時跑既有 `run_preflight`（`src/ai_api/proxy/realtime.py`）：金鑰→分配→存取→配額→model binding；不通過則 close 並回相容錯誤碼（不洩漏上游）
-- [ ] T011 [US1] 雙向 relay 迴圈（`src/ai_api/proxy/realtime.py`）：`client→backend` 與 `backend→client` 兩協程轉送（借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構），delta/completed 即時轉回客戶端
-- [ ] T012 [US1] 模型類型校驗 + 錯誤轉譯（`src/ai_api/proxy/realtime.py`）：非 realtime kind → close(unsupported)；上游錯誤透明轉回但不含 key/endpoint（FR-006/007）
-- [ ] T013 [US1] 連線生命週期結構化日誌（`src/ai_api/proxy/realtime.py`）：建立/關閉/原因，沿用既有 audit + 觀測（原則 IV）
+- [X] T010 [US1] 連線建立時跑既有 `run_preflight`（`src/ai_api/proxy/realtime.py`）：金鑰→分配→存取→配額→model binding；不通過則 close 並回相容錯誤碼（不洩漏上游）
+- [X] T011 [US1] 雙向 relay 迴圈（`src/ai_api/proxy/realtime.py`）：`client→backend` 與 `backend→client` 兩協程轉送（借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構），delta/completed 即時轉回客戶端
+- [X] T012 [US1] 模型類型校驗 + 錯誤轉譯（`src/ai_api/proxy/realtime.py`）：非 realtime kind → close(unsupported)；上游錯誤透明轉回但不含 key/endpoint（FR-006/007）
+- [X] T013 [US1] 連線生命週期結構化日誌（`src/ai_api/proxy/realtime.py`）：建立/關閉/原因，沿用既有 audit + 觀測（原則 IV）
 
 **Checkpoint**: 客戶端能用平台金鑰即時取得字幕；MVP 成立（計量/撤回尚未接）
 
@@ -69,15 +69,15 @@
 
 ### Tests for User Story 2 ⚠️（先寫、先失敗）
 
-- [ ] T014 [P] [US2] 單元測試：PCM bytes → 秒 → 分鐘換算（含 rounding）（`tests/unit/test_realtime_metering.py`）
-- [ ] T015 [P] [US2] 整合測試：連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配（`tests/integration/test_realtime_relay.py`）
-- [ ] T016 [P] [US2] 整合測試：client 直接中斷（無正常握手）→ 已累計時長仍落帳（FR-004/SC-003）
+- [X] T014 [P] [US2] 單元測試：PCM bytes → 秒 → 分鐘換算（含 rounding）（`tests/unit/test_realtime_metering.py`）
+- [X] T015 [P] [US2] 整合測試：連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配（`tests/integration/test_realtime_relay.py`）
+- [X] T016 [P] [US2] 整合測試：client 直接中斷（無正常握手）→ 已累計時長仍落帳（FR-004/SC-003）
 
 ### Implementation for User Story 2
 
-- [ ] T017 [US2] RealtimeSession 計量狀態（`src/ai_api/proxy/realtime.py`）：解析 `session.update` 的 format（sample_rate/bytes_per_sample/channels）、在 relay 即時累計 `audio_bytes`
-- [ ] T018 [US2] 斷線落帳（`src/ai_api/proxy/realtime.py`）：duration→minute→`CallRecord`（`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason）；**任何 close 路徑都落帳**
-- [ ] T019 [P] [US2] 前端：admin `/prices` 單位下拉加 `minute`（`frontend/src/routes/admin/prices.tsx`，沿用階段 29 單位感知 UI），realtime 模型可設每分鐘價
+- [X] T017 [US2] RealtimeSession 計量狀態（`src/ai_api/proxy/realtime.py`）：解析 `session.update` 的 format（sample_rate/bytes_per_sample/channels）、在 relay 即時累計 `audio_bytes`
+- [X] T018 [US2] 斷線落帳（`src/ai_api/proxy/realtime.py`）：duration→minute→`CallRecord`（`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason）；**任何 close 路徑都落帳**
+- [X] T019 [P] [US2] 前端：admin `/prices` 單位下拉加 `minute`（`frontend/src/routes/admin/prices.tsx`，沿用階段 29 單位感知 UI），realtime 模型可設每分鐘價
 
 **Checkpoint**: US1 + US2——即時字幕可用且用量可計費歸戶
 
@@ -91,13 +91,13 @@
 
 ### Tests for User Story 3 ⚠️（先寫、先失敗）
 
-- [ ] T020 [P] [US3] 整合測試：連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳（`tests/integration/test_realtime_relay.py`）
-- [ ] T021 [P] [US3] 整合測試：分配被暫停/隔離 → 同樣主動斷線（同檔）
+- [X] T020 [P] [US3] 整合測試：連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳（`tests/integration/test_realtime_relay.py`）
+- [X] T021 [P] [US3] 整合測試：分配被暫停/隔離 → 同樣主動斷線（同檔）
 
 ### Implementation for User Story 3
 
-- [ ] T022 [US3] 旁路週期 re-check 協程（`src/ai_api/proxy/realtime.py`）：每 N 秒查分配當前狀態，非 active → 主動 close(revoked)；N 對齊既有撤回 SLO（常數集中、可調）
-- [ ] T023 [US3] 與 US2 落帳整合（`src/ai_api/proxy/realtime.py`）：撤回觸發的 close 同樣走斷線落帳（已累計時長不漏）
+- [X] T022 [US3] 旁路週期 re-check 協程（`src/ai_api/proxy/realtime.py`）：每 N 秒查分配當前狀態，非 active → 主動 close(revoked)；N 對齊既有撤回 SLO（常數集中、可調）
+- [X] T023 [US3] 與 US2 落帳整合（`src/ai_api/proxy/realtime.py`）：撤回觸發的 close 同樣走斷線落帳（已累計時長不漏）
 
 **Checkpoint**: 三個 user story 全部獨立可用
 
@@ -105,9 +105,9 @@
 
 ## Phase 6: Polish & Cross-Cutting Concerns
 
-- [ ] T024 [P] 前端：`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型（KIND_LABEL）+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例（FR-008）
-- [ ] T025 [P] nginx WS upgrade config（`deploy/helm/ai-api/`）：`/v1/realtime`（或 `/v1`）加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1`
-- [ ] T026 全綠關卡：`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸（既有 contract 測試 git diff 為空，SC-006）
+- [X] T024 [P] 前端：`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型（KIND_LABEL）+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例（FR-008）
+- [X] T025 [P] nginx WS upgrade config（`deploy/nginx/default.conf.template`）：`/v1/realtime`（或 `/v1`）加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1`
+- [X] T026 全綠關卡：`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸（既有 contract 測試 git diff 為空，SC-006）
 - [ ] T027 部署後手動煙霧（quickstart.md，**需憑證環境**）：pod egress `wss:443` 實證、壞金鑰連線被 close、真打一次完整字幕（首字 <1s）→ 用量頁見一筆 `unit=minute` 歸戶分配；R2 計量對照 Azure 帳單校驗
 
 ---
@@ -170,7 +170,8 @@ Task: "整合測試 有效連線收 delta（mock provider WS）— tests/integra
 ### 真打限制（誠實標記）
 
 - T009/T015/T020 等整合測試**全用 mock provider WS**（CI 可重現，Constitution Deviation 的補救）。
-- **T027 真連 Azure realtime WS 需憑證環境**（維護者實機跑 quickstart）——R1/R2 的協定接通 + 計量對照在此校驗，非 CI。
+- **落地位置**：engine 綁 pytest event loop（module global），另起 TestClient portal 會與 asyncpg/aiosqlite 衝突；因此 relay/計量/撤回/落帳/no-leak 全部以 `tests/contract/test_realtime_transcription.py`（sqlite，CI 必跑）**直接呼叫 `handle_realtime`**（注入 fake client/provider WS，`tests/support/realtime_mock.py`）驗證——契約測試 1–7 全綠。純計量函式另在 `tests/unit/test_realtime_metering.py`。
+- **T027 真連 Azure realtime WS 需憑證環境**（維護者實機跑 quickstart）——R1/R2 的協定接通 + 計量對照在此校驗，非 CI；`upstream._build_realtime_url` 的確切 Azure URL 形態也在此校驗（CI 用 fake upstream，未碰真 URL）。
 
 ---
 
diff --git a/src/ai_api/main.py b/src/ai_api/main.py
index e116eee..c24af9f 100644
--- a/src/ai_api/main.py
+++ b/src/ai_api/main.py
@@ -38,6 +38,7 @@
 from ai_api.observability.logging import setup_logging
 from ai_api.observability.request_id import RequestIdMiddleware
 from ai_api.proxy.registry import build_router as build_proxy_registry_router
+from ai_api.proxy.realtime import router as realtime_router
 from ai_api.proxy.responses import router as responses_router
 from ai_api.proxy.router import router as proxy_router
 
@@ -99,6 +100,7 @@ def create_app() -> FastAPI:
     app.include_router(catalog.router, prefix="/catalog", tags=["catalog"])
     app.include_router(proxy_router, prefix="/v1", tags=["proxy"])  # chat (streaming)
     app.include_router(responses_router, prefix="/v1", tags=["proxy"])  # responses (streaming)
+    app.include_router(realtime_router, prefix="/v1", tags=["proxy"])  # realtime (live transcription WS)
     # Phase 31: all non-streaming inference endpoints come from the data-driven
     # registry (embeddings/ocr/images/rerank/audio + moderation/search/image_edit).
     app.include_router(build_proxy_registry_router(), prefix="/v1", tags=["proxy"])
diff --git a/src/ai_api/proxy/realtime.py b/src/ai_api/proxy/realtime.py
new file mode 100644
index 0000000..01501a8
--- /dev/null
+++ b/src/ai_api/proxy/realtime.py
@@ -0,0 +1,519 @@
+"""Phase 32 (043): /v1/realtime — OpenAI-compatible live transcription relay.
+
+A thin bidirectional WebSocket relay between an app client and the upstream
+provider's realtime WS. We do NOT go through litellm's realtime (it is Proxy form
+/ client-direct, which bypasses the gateway and loses per-allocation attribution +
+in-flight revocation — see experience lesson 40). Instead we borrow the
+*structure* of litellm's `RealTimeStreaming.bidirectional_forward`: two forwarding coroutines,
+plus a side-channel revocation watcher, plus per-minute metering self-counted from
+the client's `input_audio_buffer.append` PCM bytes (research R2 — no reliance on a
+provider usage event, so an abnormal abort never loses billing).
+
+Testability: `handle_realtime` takes an injectable `open_upstream`, so CI exercises
+the full relay/metering/revocation against a fake provider WS in-loop (the engine is
+bound to the test event loop; a separate TestClient portal would break asyncpg).
+Real Azure realtime WS is validated by the maintainer in quickstart (T027).
+"""
+from __future__ import annotations
+
+import asyncio
+import base64
+import contextlib
+import json
+import logging
+import math
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from typing import Any, Protocol
+
+from fastapi import APIRouter, WebSocket
+from sqlalchemy import select
+
+logger = logging.getLogger(__name__)
+router = APIRouter()
+
+# --- WebSocket close codes (RFC 6455) used when the platform closes ---------
+WS_NORMAL = 1000            # normal closure (also used for client_abort)
+WS_POLICY_VIOLATION = 1008  # auth / quota / revoked
+WS_UNSUPPORTED = 1003       # model is not a realtime kind
+WS_INTERNAL = 1011          # upstream error / unexpected
+
+# Default revocation re-check interval (seconds). Long-lived connections MUST be
+# re-checked, not only at connect (principle 3). Centralized + overridable.
+REVOKE_RECHECK_SECONDS = 5
+
+# PCM defaults when `session.update` omits them: 16-bit mono.
+_DEFAULT_BYTES_PER_SAMPLE = 2  # 16-bit samples
+_DEFAULT_CHANNELS = 1  # mono
+_DEFAULT_SAMPLE_RATE = 24000  # Hz
+
+
+# --- Uniform WS interfaces (FastAPI WebSocket and the websockets client both
+# satisfy these; test fakes mirror them) ------------------------------------
+class ClientWS(Protocol):  # structural type for the downstream (app-facing) socket
+    @property
+    def headers(self) -> Any: ...  # only `.get(name)` is used, for the bearer-token lookup
+
+    async def accept(self) -> None: ...  # called once, before any auth check
+    async def receive_text(self) -> str: ...  # one client frame; raising is treated as a disconnect
+    async def send_text(self, data: str) -> None: ...  # one upstream frame, forwarded verbatim
+    async def close(self, code: int = WS_NORMAL, reason: str | None = None) -> None: ...
+
+
+class UpstreamWS(Protocol):  # structural type for the provider-facing socket
+    async def send(self, data: str) -> None: ...  # forward one client frame upstream
+    async def recv(self) -> str: ...  # next provider frame; raising is treated as upstream_error
+    async def close(self) -> None: ...  # invoked best-effort during teardown
+
+
+# open_upstream(*, provider, model, api_key, api_base, api_version) -> UpstreamWS (called with keywords)
+OpenUpstream = Callable[..., Awaitable[UpstreamWS]]
+
+
+@dataclass
+class RealtimeSession:
+    """Mutable per-connection state for one realtime relay (kept in memory only).
+
+    The relay accumulates `audio_bytes` while the connection is up; at teardown the
+    total is converted to minutes and written as one CallRecord(unit="minute").
+    """
+
+    allocation_id: str  # allocation the usage is attributed to
+    subject: str | None  # subject snapshot taken from the allocation at connect
+    resource_model: str  # canonical model slug; stored on the CallRecord
+    upstream_model: str  # provider-side model id; used for price lookup
+    provider: str
+    request_id: str
+    started_at: datetime  # taken right after accept(); also the price-lookup time
+    audio_bytes: int = 0  # decoded PCM bytes seen in input_audio_buffer.append frames
+    sample_rate: int = _DEFAULT_SAMPLE_RATE  # may be overridden by a session.update
+    bytes_per_sample: int = _DEFAULT_BYTES_PER_SAMPLE
+    channels: int = _DEFAULT_CHANNELS
+    # normal | client_abort | upstream_error | revoked
+    close_reason: str = "normal"
+
+
+# --- Pure metering helpers (T014/T017) --------------------------------------
+def duration_seconds(audio_bytes: int, sample_rate: int, bytes_per_sample: int, channels: int) -> float:
+    """Seconds of audio in `audio_bytes` of raw PCM; 0.0 when the geometry is non-positive."""
+    denom = sample_rate * bytes_per_sample * channels  # bytes per second of audio
+    if denom <= 0:  # guards the division below
+        return 0.0
+    return audio_bytes / denom
+
+
+def pcm_bytes_to_minutes(
+    audio_bytes: int,
+    *,
+    sample_rate: int = _DEFAULT_SAMPLE_RATE,
+    bytes_per_sample: int = _DEFAULT_BYTES_PER_SAMPLE,
+    channels: int = _DEFAULT_CHANNELS,
+) -> int:
+    """Billable minutes for a PCM byte count: the duration is rounded UP to a whole
+    minute, so any started minute is billed. Zero (or negative) duration → 0."""
+    secs = duration_seconds(audio_bytes, sample_rate, bytes_per_sample, channels)
+    if secs <= 0:  # nothing streamed (or unknown geometry) → nothing to bill
+        return 0
+    return math.ceil(secs / 60)  # e.g. 45 s → 1, 90 s → 2
+
+
+def session_minutes(sess: RealtimeSession) -> int:  # billable minutes accrued by `sess` so far
+    return pcm_bytes_to_minutes(
+        sess.audio_bytes,
+        sample_rate=sess.sample_rate,  # geometry as last set on the session
+        bytes_per_sample=sess.bytes_per_sample,
+        channels=sess.channels,
+    )
+
+
+def _apply_format(sess: RealtimeSession, ev: dict[str, Any]) -> None:
+    """Copy PCM geometry from a `session.update` event onto `sess` for metering.
+    The format dict is looked up at session.audio.input.format, then
+    session.audio.format, then session.input_audio_format; only dict values are
+    used. Missing or non-positive-int fields leave the current value untouched."""
+    session = ev.get("session")
+    if not isinstance(session, dict):
+        return
+    fmt: dict[str, Any] = {}  # stays empty when no recognised shape is present
+    audio = session.get("audio")
+    if isinstance(audio, dict):
+        inp = audio.get("input")
+        if isinstance(inp, dict) and isinstance(inp.get("format"), dict):
+            fmt = inp["format"]
+        elif isinstance(audio.get("format"), dict):
+            fmt = audio["format"]
+    if not fmt and isinstance(session.get("input_audio_format"), dict):  # string values are ignored
+        fmt = session["input_audio_format"]
+    rate = fmt.get("rate") or fmt.get("sample_rate")  # either key name is accepted
+    if isinstance(rate, int) and rate > 0:  # NOTE(review): bool passes isinstance(int) — confirm acceptable
+        sess.sample_rate = rate
+    channels = fmt.get("channels")
+    if isinstance(channels, int) and channels > 0:
+        sess.channels = channels
+    bps = fmt.get("bytes_per_sample")
+    if isinstance(bps, int) and bps > 0:  # NOTE(review): client-supplied geometry scales billing; consider bounds
+        sess.bytes_per_sample = bps
+
+
+def _meter_client_event(sess: RealtimeSession, raw: str) -> None:
+    """Inspect one client frame and update `sess` metering state; unparsable input is ignored."""
+    try:
+        ev = json.loads(raw)
+    except (ValueError, TypeError):  # not JSON → nothing to meter
+        return
+    if not isinstance(ev, dict):
+        return
+    etype = ev.get("type")
+    if etype == "session.update":  # may change the PCM geometry used for metering
+        _apply_format(sess, ev)
+    elif etype == "input_audio_buffer.append":  # carries base64 audio in `audio`
+        audio = ev.get("audio")
+        if isinstance(audio, str) and audio:
+            # Malformed base64 → skip metering this frame (never crash the relay).
+            with contextlib.suppress(ValueError, TypeError):
+                sess.audio_bytes += len(base64.b64decode(audio, validate=False))  # count decoded bytes
+
+
+# --- Bidirectional relay (T011) ---------------------------------------------
+async def _client_to_upstream(client: ClientWS, upstream: UpstreamWS, sess: RealtimeSession) -> None:
+    while True:  # pump client frames upstream until either side fails
+        try:
+            raw = await client.receive_text()
+        except Exception:
+            # Client closed / aborted. Accrued audio_bytes is already counted, so
+            # billing on disconnect never loses usage (FR-004/SC-003).
+            if sess.close_reason == "normal":  # keep an earlier reason (e.g. revoked) intact
+                sess.close_reason = "client_abort"
+            return
+        _meter_client_event(sess, raw)  # metered before forwarding, so a failed send is still counted
+        try:
+            await upstream.send(raw)
+        except Exception:
+            if sess.close_reason == "normal":
+                sess.close_reason = "upstream_error"
+            return
+
+
+async def _upstream_to_client(client: ClientWS, upstream: UpstreamWS, sess: RealtimeSession) -> None:
+    while True:  # pump provider frames to the client, unmodified, until either side fails
+        try:
+            raw = await upstream.recv()
+        except Exception:
+            if sess.close_reason == "normal":  # first recorded reason wins
+                sess.close_reason = "upstream_error"
+            return
+        try:
+            await client.send_text(raw)
+        except Exception:
+            if sess.close_reason == "normal":
+                sess.close_reason = "client_abort"
+            return
+
+
+# check_active(allocation_id) -> bool; False makes the watcher close the connection as "revoked"
+CheckActive = Callable[[str], Awaitable[bool]]
+
+
+async def _revocation_watch(
+    sess: RealtimeSession,
+    *,
+    stop: asyncio.Event,
+    check_active: CheckActive,
+    interval: float,
+) -> None:
+    """Poll `check_active(sess.allocation_id)` every `interval` seconds until `stop`
+    is set. On a False result, mark the session "revoked" and set `stop` so the
+    relay tears down (FR-005). A failing check is logged and retried next tick."""
+    while not stop.is_set():
+        try:
+            await asyncio.wait_for(stop.wait(), timeout=interval)  # sleep, but wake early on stop
+            return  # relay ended first
+        except TimeoutError:  # interval elapsed with the relay still running → re-check now
+            pass
+        try:
+            active = await check_active(sess.allocation_id)
+        except Exception:
+            logger.exception("realtime revocation re-check failed; leaving connection up")
+            continue
+        if not active:
+            sess.close_reason = "revoked"  # unconditional: overrides any earlier reason
+            stop.set()
+            return
+
+
+async def run_relay(
+    client: ClientWS,
+    upstream: UpstreamWS,
+    sess: RealtimeSession,
+    *,
+    check_active: CheckActive,
+    interval: float = REVOKE_RECHECK_SECONDS,
+) -> None:
+    """Run the two forwarders and the revocation watcher as tasks; when the first of
+    them finishes, close both sockets, cancel the remaining tasks and await them all."""
+    stop = asyncio.Event()  # set by whichever task finishes first
+
+    async def _forward_then_stop(coro: Awaitable[None]) -> None:
+        try:
+            await coro
+        finally:
+            stop.set()  # runs on normal return, error, and cancellation alike
+
+    t_up = asyncio.create_task(_forward_then_stop(_client_to_upstream(client, upstream, sess)))
+    t_down = asyncio.create_task(_forward_then_stop(_upstream_to_client(client, upstream, sess)))
+    t_watch = asyncio.create_task(
+        _revocation_watch(sess, stop=stop, check_active=check_active, interval=interval)
+    )
+
+    await stop.wait()
+    # Closing both ends unblocks any coroutine parked in recv/receive.
+    await _safe_close_upstream(upstream)
+    await _safe_close_client(
+        client,
+        *_close_code_for(sess.close_reason),  # (code, reason) chosen from the recorded close_reason
+    )
+    for task in (t_up, t_down, t_watch):
+        task.cancel()
+    await asyncio.gather(t_up, t_down, t_watch, return_exceptions=True)  # absorb CancelledError results
+
+
+def _close_code_for(close_reason: str) -> tuple[int, str]:  # (WS close code, reason text) for the client
+    if close_reason == "revoked":
+        return WS_POLICY_VIOLATION, "allocation revoked"
+    if close_reason == "upstream_error":
+        return WS_INTERNAL, "upstream connection closed"
+    return WS_NORMAL, "connection closed"  # "normal", "client_abort" and anything unrecognised
+
+
+async def _safe_close_client(client: ClientWS, code: int, reason: str) -> None:
+    # Best-effort close: any exception from close() is suppressed (peer may already be gone).
+    with contextlib.suppress(Exception):
+        await client.close(code=code, reason=reason)
+
+
+async def _safe_close_upstream(upstream: UpstreamWS) -> None:  # best-effort; safe to call repeatedly
+    with contextlib.suppress(Exception):  # a failing close() must not break teardown
+        await upstream.close()
+
+
+# --- Outcome mapping + billing (T018) ---------------------------------------
+def _outcome_for_close(close_reason: str) -> Any:  # returns a CallOutcome member
+    from ai_api.models import CallOutcome  # imported lazily inside the function
+
+    if close_reason == "upstream_error":
+        return CallOutcome.upstream_error
+    # normal / client_abort / revoked all delivered service for the accrued
+    # minutes → success (usage is real). revoked just terminated it early.
+    return CallOutcome.success
+
+
+async def _bill_session(sess: RealtimeSession) -> None:
+    """Persist ONE CallRecord(unit="minute") for the session's accrued audio, using
+    a freshly opened DB session. The cost is minutes × the looked-up unit price, or
+    None when no price row matches. DB failures are logged, not propagated."""
+    from ai_api.db import get_sessionmaker
+    from ai_api.services.pricing import calculate_unit_cost, lookup_price_for_call
+    from ai_api.services.records import RecordsService
+
+    minutes = session_minutes(sess)
+    outcome = _outcome_for_close(sess.close_reason)
+    try:
+        async with get_sessionmaker()() as s:
+            price = await lookup_price_for_call(
+                s,
+                provider=sess.provider,
+                model=sess.upstream_model.split("/", 1)[-1],  # drop the "provider/" prefix if present
+                call_time=sess.started_at,
+            )
+            cost = (
+                calculate_unit_cost(minutes, price.price_per_unit)
+                if price is not None
+                else None
+            )
+            await RecordsService(s).record_call(
+                request_id=sess.request_id,
+                allocation_id=sess.allocation_id,
+                subject=sess.subject,
+                model=sess.resource_model,
+                started_at=sess.started_at,
+                status_code=200,  # NOTE(review): 200 is recorded even when outcome is upstream_error — confirm
+                outcome=outcome,
+                quantity=minutes,
+                unit="minute",
+                cost_usd=cost,
+                error_message=(
+                    "allocation revoked mid-connection" if sess.close_reason == "revoked" else None
+                ),
+            )
+            await s.commit()
+    except BaseException:  # incl. CancelledError (logged, not re-raised); never lose billing silently
+        logger.exception("failed to record realtime call (allocation=%s)", sess.allocation_id)
+
+
+# --- Allocation status re-check (used as check_active) ----------------------
+async def _allocation_is_active(allocation_id: str) -> bool:  # default `check_active` implementation
+    from ai_api.db import get_sessionmaker
+    from ai_api.models import Allocation, AllocationStatus
+
+    try:
+        async with get_sessionmaker()() as s:  # fresh session per check
+            alloc = await s.get(Allocation, allocation_id)
+            return alloc is not None and alloc.status == AllocationStatus.active  # missing row → inactive
+    except Exception:
+        logger.exception("realtime allocation re-check query failed")
+        # Fail-open on a transient DB error: do NOT kill a live connection on a
+        # blip; the next tick re-checks.
+        return True
+
+
+# --- Connection entrypoint (T010/T012/T013) ---------------------------------
+def _extract_token(headers: Any) -> str | None:
+    """Return the bearer token from an Authorization header, or None if absent/blank/not Bearer."""
+    auth = None
+    if hasattr(headers, "get"):  # tolerate header objects without a mapping interface
+        auth = headers.get("authorization") or headers.get("Authorization")  # try both spellings
+    if not auth or not auth.lower().startswith("bearer "):  # scheme match is case-insensitive
+        return None
+    token = auth.split(" ", 1)[1].strip()
+    return token or None  # "Bearer   " (whitespace only) counts as missing
+
+
+async def handle_realtime(
+    client: ClientWS,
+    *,
+    open_upstream: OpenUpstream,
+    check_active: CheckActive = _allocation_is_active,
+    revoke_interval: float = REVOKE_RECHECK_SECONDS,
+) -> None:
+    """Drive one realtime connection end-to-end (accept, auth, preflight, relay, bill).
+    Injectable `open_upstream` / `check_active` make the path CI-testable with fakes."""
+    from ai_api.config import get_settings
+    from ai_api.db import get_sessionmaker
+    from ai_api.models import ModelCatalog
+    from ai_api.observability.request_id import current_request_id
+    from ai_api.proxy.preflight import PreflightRejection, run_preflight
+    from ai_api.services.model_kind import model_kind
+
+    await client.accept()  # accept first so rejections can carry a WS close code + reason
+    started_at = datetime.now(UTC)
+    request_id = current_request_id() or "realtime"
+
+    token = _extract_token(client.headers)
+    if token is None:
+        logger.info("realtime connection rejected: missing bearer token")
+        await _safe_close_client(client, WS_POLICY_VIOLATION, "missing bearer token")
+        return
+
+    # First frame carries the model (session.update). Need it for preflight.
+    try:
+        first_raw = await client.receive_text()
+    except Exception:
+        await _safe_close_client(client, WS_NORMAL, "no session.update received")
+        return
+    requested_model = _model_from_session_update(first_raw)
+    if requested_model is None:
+        logger.info("realtime connection rejected: first frame is not a session.update with model")
+        await _safe_close_client(client, WS_POLICY_VIOLATION, "first frame must be session.update with model")
+        return
+
+    settings = get_settings()
+    async with get_sessionmaker()() as s:
+        result = await run_preflight(
+            s, settings=settings, token=token, requested_model=requested_model
+        )
+        if isinstance(result, PreflightRejection):
+            logger.info(
+                "realtime preflight rejected model=%s code=%s", requested_model, result.code
+            )
+            await _safe_close_client(client, WS_POLICY_VIOLATION, result.code)
+            return
+        # Model must be a realtime kind (FR-007) — catalog honesty (FR-008).
+        row = (
+            await s.execute(select(ModelCatalog).where(ModelCatalog.slug == result.canonical_model))
+        ).scalar_one_or_none()
+        kind = model_kind(row) if row is not None else "chat"  # no catalog row → treated as non-realtime
+        if kind != "realtime":
+            logger.info(
+                "realtime connection rejected: model=%s kind=%s (not realtime)",
+                result.canonical_model, kind,
+            )
+            await _safe_close_client(client, WS_UNSUPPORTED, "model does not support realtime")
+            return
+        allocation_id = result.allocation.id
+        subject = result.allocation.subject_snapshot
+
+    resolved = result.resolved
+    sess = RealtimeSession(
+        allocation_id=allocation_id,
+        subject=subject,
+        resource_model=result.canonical_model,
+        upstream_model=result.upstream_model,
+        provider=result.provider,
+        request_id=request_id,
+        started_at=started_at,
+    )
+    # Meter the first frame too (it may already be an append in some clients).
+    _meter_client_event(sess, first_raw)
+
+    # Open the upstream provider WS; never leak key/endpoint to the client (FR-006).
+    try:
+        upstream = await open_upstream(
+            provider=result.provider,
+            model=result.upstream_model,
+            api_key=resolved.api_key,
+            api_base=resolved.base_url,
+            api_version=(resolved.extra_config or {}).get("api_version"),
+        )
+    except Exception:
+        logger.exception("realtime upstream connect failed model=%s", result.upstream_model)
+        sess.close_reason = "upstream_error"
+        await _safe_close_client(client, WS_INTERNAL, "upstream unavailable")
+        await _bill_session(sess)
+        return
+
+    logger.info(
+        "realtime connection open allocation=%s model=%s request_id=%s",
+        allocation_id, result.canonical_model, request_id,
+    )
+    try:
+        # Replay the first session.update to upstream so it configures correctly.
+        try:
+            await upstream.send(first_raw)
+        except Exception:
+            logger.exception("realtime first-frame replay failed model=%s", result.upstream_model)
+            sess.close_reason = "upstream_error"
+            await _safe_close_client(client, WS_INTERNAL, "upstream unavailable")
+        else:
+            await run_relay(client, upstream, sess, check_active=check_active, interval=revoke_interval)
+    finally:
+        await _safe_close_upstream(upstream)
+        await _bill_session(sess)
+        logger.info(
+            "realtime connection closed allocation=%s reason=%s minutes=%s",
+            allocation_id, sess.close_reason, session_minutes(sess),
+        )
+
+
+def _model_from_session_update(raw: str) -> str | None:  # session.model of a session.update frame, else None
+    try:
+        ev = json.loads(raw)
+    except (ValueError, TypeError):  # not JSON
+        return None
+    if not isinstance(ev, dict) or ev.get("type") != "session.update":  # any other frame type is rejected
+        return None
+    session = ev.get("session")
+    if not isinstance(session, dict):
+        return None
+    model = session.get("model")
+    return model if isinstance(model, str) and model else None  # must be a non-empty string
+
+
+@router.websocket("/realtime")
+async def realtime_endpoint(websocket: WebSocket) -> None:
+    """WebSocket route for realtime transcription. Thin adapter: passes FastAPI's
+    WebSocket to `handle_realtime` as the client socket and wires in the real
+    upstream opener (`upstream.open_realtime_ws`); tests call `handle_realtime` directly."""
+    from ai_api.proxy import upstream  # imported lazily inside the handler
+
+    await handle_realtime(websocket, open_upstream=upstream.open_realtime_ws)
diff --git a/src/ai_api/proxy/upstream.py b/src/ai_api/proxy/upstream.py
index 3890d64..658cc39 100644
--- a/src/ai_api/proxy/upstream.py
+++ b/src/ai_api/proxy/upstream.py
@@ -202,6 +202,48 @@ async def asearch(
     )
 
 
+def _build_realtime_url(api_base: str | None, model: str, api_version: str | None) -> str:
+    """Build the Azure realtime WS URL: <base>/openai/realtime?api-version=..&deployment=..
+
+    The https/http scheme of `api_base` becomes wss/ws, the "provider/" prefix is
+    dropped from `model` to get the deployment name, and both query values are
+    percent-encoded so a stray `&`, `#` or space cannot alter the query string.
+    Exact Azure URL shape is validated in quickstart (T027); CI uses a fake upstream.
+    """
+    from urllib.parse import quote
+
+    base = (api_base or "").rstrip("/")
+    if base.startswith(("https://", "http://")):
+        base = "ws" + base[len("http"):]  # https:// -> wss://, http:// -> ws://
+    deployment = quote(model.split("/", 1)[-1], safe="")
+    version = quote(api_version or "2024-10-01-preview", safe="")
+    return f"{base}/openai/realtime?api-version={version}&deployment={deployment}"
+
+
+async def open_realtime_ws(
+    *,
+    provider: str,
+    model: str,
+    api_key: str,
+    api_base: str | None = None,
+    api_version: str | None = None,
+) -> Any:
+    """Connect to the upstream realtime WebSocket and return the open connection.
+    The credential is sent as a request header: `Authorization: Bearer` when
+    `provider == "openai"`, otherwise `api-key` (Azure). It is never forwarded to
+    the downstream client (FR-006). Phase 32 (043): /v1/realtime relay.
+    """
+    import websockets  # imported lazily inside the function
+
+    url = _build_realtime_url(api_base, model, api_version)  # NOTE(review): Azure-shaped URL even for "openai" — confirm
+    # Azure uses the `api-key` header; OpenAI-style uses Authorization: Bearer.
+    if provider == "openai":
+        headers = {"Authorization": f"Bearer {api_key}"}
+    else:
+        headers = {"api-key": api_key}
+    return await websockets.connect(url, additional_headers=headers)
+
+
 async def aimage_edit(
     *,
     model: str,
diff --git a/tests/contract/test_realtime_transcription.py b/tests/contract/test_realtime_transcription.py
new file mode 100644
index 0000000..867d141
--- /dev/null
+++ b/tests/contract/test_realtime_transcription.py
@@ -0,0 +1,309 @@
+"""Phase 32 (043): /v1/realtime contract tests (contracts/realtime-transcription.md 1-7).
+
+Drives `handle_realtime` in-loop with a fake client WS + fake provider WS (the
+engine is bound to the test loop, so a TestClient portal would break the DB). This
+is the Constitution-Deviation remedy: CI exercises the full preflight → relay →
+metering → revocation path against a mock provider WS; real Azure WS is the
+maintainer's T027 smoke.
+
+Covers: T007 (invalid/revoked key → close, no stream), T008 (non-realtime →
+unsupported), T009 (valid → delta), T015 (clean close → CallRecord minute), T016
+(abnormal abort → billed), T020/T021 (in-flight revoke/pause → close + billed),
+plus the no-leak contract (#7).
+"""
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+from datetime import UTC, datetime, timedelta
+from decimal import Decimal
+
+import pytest
+from httpx import AsyncClient
+from sqlalchemy import select
+from ulid import ULID
+
+from ai_api.db import get_sessionmaker
+from ai_api.models import CallOutcome, CallRecord, ModelCatalog, PriceList
+from ai_api.proxy.realtime import handle_realtime
+from tests.support.realtime_mock import FakeClientWS, FakeUpstreamWS, fake_opener
+
+RT_MODEL = "azure/gpt-realtime-whisper"  # default catalog slug / allocation model in these tests
+# The resolved upstream credential — must never reach the downstream client (FR-006).
+SECRET_KEY = "az-secret-DO-NOT-LEAK-9999"
+SECRET_BASE = "https://secret-foundry.services.ai.azure.com"
+
+# 24 kHz pcm16 mono → 48000 bytes/sec.
+_BYTES_PER_SEC = 24000 * 2 * 1  # sample rate * bytes per sample * channels
+
+
+def _session_update(model: str = RT_MODEL, rate: int = 24000) -> str:  # JSON session.update frame
+    return json.dumps({
+        "type": "session.update",
+        "session": {
+            "type": "transcription",
+            "model": model,
+            "audio": {"input": {"format": {"type": "audio/pcm", "rate": rate}}},  # nested format shape
+        },
+    })
+
+
+def _append(seconds: float, rate: int = 24000) -> str:  # append frame holding `seconds` of silent PCM
+    pcm = b"\x00" * int(_BYTES_PER_SEC * seconds * (rate / 24000))  # byte count scaled to `rate`
+    return json.dumps({
+        "type": "input_audio_buffer.append",
+        "audio": base64.b64encode(pcm).decode(),
+    })
+
+
+async def _seed_catalog(slug: str, *, mode: str) -> None:
+    """Insert an active ModelCatalog row for `slug`, storing `mode` under litellm_sync.raw.mode."""
+    now = datetime.now(UTC)
+    sm = get_sessionmaker()
+    async with sm() as s:
+        s.add(ModelCatalog(
+            slug=slug, provider="azure", display_name=slug, family="x",
+            description="", modality_input=["audio"], modality_output=["text"],
+            capabilities=[], context_window=1024, cost_tier="low",
+            recommended_for=[], tags=[], example_request={}, official_doc_url=None,
+            status="active", deprecation_note=None, created_at=now, updated_at=now,
+            default_access="open", allowed_tags=[], denied_tags=[],
+            self_service_enabled=False, self_service_default_quota=None,
+            litellm_sync={"raw": {"mode": mode}},  # presumably what model_kind reads — confirm
+        ))
+        await s.commit()
+
+
+async def _seed_price(per_minute: str) -> None:  # seed a per-minute PriceList row for the realtime model
+    sm = get_sessionmaker()
+    async with sm() as s:
+        s.add(PriceList(
+            id=str(ULID()), provider="azure", model="gpt-realtime-whisper",  # bare name, no "azure/" prefix
+            input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0),
+            price_unit="minute", price_per_unit_usd=Decimal(per_minute),
+            effective_from=datetime.now(UTC) - timedelta(days=1),  # already in effect
+            created_at=datetime.now(UTC), created_by="test",
+        ))
+        await s.commit()
+
+
+async def _seed_provider(client: AsyncClient, admin: dict) -> None:
+    """Register an azure provider credential (SECRET_KEY / SECRET_BASE) through the
+    admin API; preflight's model-access check needs an active provider to pass."""
+    r = await client.post("/admin/providers", headers=admin, json={
+        "provider": "azure", "label": "t", "api_key": SECRET_KEY, "base_url": SECRET_BASE,
+    })
+    assert r.status_code in (200, 201), r.text
+
+
+async def _alloc(client: AsyncClient, admin: dict, model: str = RT_MODEL) -> dict:  # create an allocation via admin API
+    r = await client.post("/admin/allocations", headers=admin,
+                          json={"subject": "alice@example.com", "resource_model": model})
+    assert r.status_code == 201, r.text
+    return r.json()  # callers read "id" and "token" from this payload
+
+
+async def _last(outcome: CallOutcome) -> CallRecord | None:  # newest CallRecord with `outcome`, or None
+    sm = get_sessionmaker()
+    async with sm() as s:
+        rows = (await s.execute(
+            select(CallRecord).where(CallRecord.outcome == outcome)
+            .order_by(CallRecord.started_at.desc())  # most recent first
+        )).scalars().all()
+        return rows[0] if rows else None
+
+
+def _bearer(token: str) -> dict[str, str]:  # header mapping handed to FakeClientWS
+    return {"authorization": f"Bearer {token}"}
+
+
+# --- T007: invalid / revoked key → close, no stream -------------------------
+@pytest.mark.asyncio
+async def test_invalid_key_closed_no_stream(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    client = FakeClientWS(_bearer("totally-invalid-token"), [_session_update()])  # token matches no allocation
+    upstream = FakeUpstreamWS()
+    opener = fake_opener(upstream)
+    await handle_realtime(client, open_upstream=opener)
+    assert client.closed is not None and client.closed[0] == 1008  # policy violation
+    assert opener.calls == []           # upstream never opened
+    assert upstream.sent == []          # no stream started
+    assert await _last(CallOutcome.success) is None  # rejected before billing → no record
+
+
+@pytest.mark.asyncio
+async def test_missing_bearer_closed(app_client: AsyncClient, admin_headers):
+    client = FakeClientWS({}, [_session_update()])  # no Authorization header at all
+    opener = fake_opener(FakeUpstreamWS())
+    await handle_realtime(client, open_upstream=opener)
+    assert client.closed is not None and client.closed[0] == 1008  # policy violation
+    assert opener.calls == []  # upstream never opened
+
+
+@pytest.mark.asyncio
+async def test_revoked_allocation_closed(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    alloc = await _alloc(app_client, admin_headers)
+    # Revoke it before connecting.
+    r = await app_client.delete(f"/admin/allocations/{alloc['id']}", headers=admin_headers)
+    assert r.status_code in (200, 204), r.text
+    client = FakeClientWS(_bearer(alloc["token"]), [_session_update()])  # token of the revoked allocation
+    opener = fake_opener(FakeUpstreamWS())
+    await handle_realtime(client, open_upstream=opener)
+    assert client.closed is not None and client.closed[0] == 1008  # rejected at preflight
+    assert opener.calls == []  # upstream never opened
+
+
+# --- T008: non-realtime model → close(unsupported) --------------------------
+@pytest.mark.asyncio
+async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_headers):
+    chat_model = "azure/gpt-4o-mini"
+    await _seed_catalog(chat_model, mode="chat")  # catalog says chat, not realtime
+    await _seed_provider(app_client, admin_headers)
+    alloc = await _alloc(app_client, admin_headers, model=chat_model)
+    client = FakeClientWS(_bearer(alloc["token"]), [_session_update(model=chat_model)])
+    opener = fake_opener(FakeUpstreamWS())
+    await handle_realtime(client, open_upstream=opener)
+    assert client.closed is not None and client.closed[0] == 1003  # unsupported
+    assert opener.calls == []  # kind check happens before the upstream is opened
+
+
+# --- T009: valid connection + append → delta reaches client -----------------
+@pytest.mark.asyncio
+async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    delta = json.dumps({
+        "type": "conversation.item.input_audio_transcription.delta", "delta": "hello",
+    })
+    completed = json.dumps({
+        "type": "conversation.item.input_audio_transcription.completed",
+        "transcript": "hello world",
+    })
+    # Upstream drives the end: emit delta+completed then hang up.
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(1.0)], hold_open=True)
+    upstream = FakeUpstreamWS([delta, completed], close_after=True)
+    opener = fake_opener(upstream)
+    await handle_realtime(client, open_upstream=opener)
+    assert any("transcription.delta" in m for m in client.sent), client.sent  # delta reached the client
+    # The session.update + append were forwarded upstream (key/endpoint injected
+    # on the upstream side, never to the client).
+    assert opener.calls and opener.calls[0]["model"] == RT_MODEL
+    assert any("input_audio_buffer.append" in m for m in upstream.sent)
+
+
+# --- T015: clean close → one CallRecord(unit=minute), quantity matches ------
+@pytest.mark.asyncio
+async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_price("0.017")  # USD per minute
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    # 90 seconds of audio → ceil(90/60) = 2 minutes. Client ends (disconnect).
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(90.0)], hold_open=False)
+    upstream = FakeUpstreamWS(close_after=False)
+    await handle_realtime(client, open_upstream=fake_opener(upstream))
+    rec = await _last(CallOutcome.success)
+    assert rec is not None
+    assert rec.unit == "minute" and rec.quantity == 2
+    assert rec.allocation_id == alloc["id"]  # usage attributed to the connecting allocation
+    assert rec.cost_usd == Decimal("0.034")  # 2 x 0.017
+    assert rec.prompt_tokens is None and rec.total_tokens is None  # non-token call
+
+
+@pytest.mark.asyncio
+async def test_unpriced_realtime_zero_cost(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(30.0)], hold_open=False)
+    await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False)))
+    rec = await _last(CallOutcome.success)
+    assert rec is not None and rec.unit == "minute" and rec.quantity == 1
+    assert rec.cost_usd is None  # no PriceList → unpriced (NULL), not a crash
+
+
+# --- T016: abnormal abort (client hangs up mid-stream) → accrued bytes billed
+@pytest.mark.asyncio
+async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_price("0.017")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    # Sends 45s then the client connection drops with no graceful close.
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(45.0)], hold_open=False)
+    await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False)))
+    rec = await _last(CallOutcome.success)
+    assert rec is not None and rec.unit == "minute" and rec.quantity == 1  # ceil(45/60)
+
+
+# --- T020/T021: in-flight revoke / pause → close(revoked) within N + billed -
+@pytest.mark.asyncio
+async def test_inflight_revoke_closes_and_bills(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_price("0.017")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(30.0)], hold_open=True)
+    upstream = FakeUpstreamWS(close_after=False)
+
+    # Liveness stub passed as `check_active`. It reports "inactive" on every call,
+    # so the very first re-check already sees the allocation as revoked; the relay
+    # is then expected to close with 1008 and still bill the audio accrued so far.
+    async def revoke_after_first_tick(allocation_id: str) -> bool:
+        return False
+
+    await asyncio.wait_for(
+        handle_realtime(
+            client, open_upstream=fake_opener(upstream),
+            check_active=revoke_after_first_tick, revoke_interval=0.05,
+        ),
+        timeout=5,
+    )
+    assert client.closed is not None and client.closed[0] == 1008
+    assert client.closed[1] == "allocation revoked"
+    rec = await _last(CallOutcome.success)
+    assert rec is not None and rec.unit == "minute" and rec.quantity == 1
+    assert rec.error_message == "allocation revoked mid-connection"
+
+
+# --- Contract #7: no upstream key / endpoint ever reaches the client --------
+@pytest.mark.asyncio
+async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    err = json.dumps({"type": "error", "error": {"code": "bad", "message": "upstream boom"}})
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(1.0)], hold_open=True)
+    upstream = FakeUpstreamWS([err], close_after=True)
+    await handle_realtime(client, open_upstream=fake_opener(upstream))
+    blob = " ".join(client.sent) + " " + json.dumps(client.closed)
+    assert SECRET_KEY not in blob
+    assert "secret-foundry.services.ai.azure.com" not in blob
+
+
+@pytest.mark.asyncio
+async def test_upstream_connect_failure_no_leak_and_bills_zero(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL, mode="realtime")
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+
+    async def failing_opener(**kwargs):
+        raise RuntimeError(f"connect to {kwargs.get('api_base')} with {kwargs.get('api_key')} failed")
+
+    client = FakeClientWS(_bearer(alloc["token"]), [_session_update(), _append(1.0)])
+    await handle_realtime(client, open_upstream=failing_opener)
+    assert client.closed is not None and client.closed[0] == 1011  # internal
+    blob = json.dumps(client.closed) + " ".join(client.sent)
+    assert SECRET_KEY not in blob and "secret-foundry.services.ai.azure.com" not in blob
+    # Connect failed before any audio relayed → 0 minutes, still a record.
+    rec = await _last(CallOutcome.upstream_error)
+    assert rec is not None and rec.unit == "minute" and rec.quantity == 0
diff --git a/tests/support/__init__.py b/tests/support/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/support/realtime_mock.py b/tests/support/realtime_mock.py
new file mode 100644
index 0000000..2f64dce
--- /dev/null
+++ b/tests/support/realtime_mock.py
@@ -0,0 +1,108 @@
+"""Phase 32 (043) T002: reusable fake realtime WS pair for CI.
+
+The engine is bound to the test event loop, so a separate TestClient portal would
+break asyncpg/aiosqlite. Instead the relay is driven in-loop by calling
+`handle_realtime`/`run_relay` directly with these fakes — a mock *provider* realtime
+WS plus a mock client WS — exactly the Constitution-Deviation remedy in the plan
+(CI never touches a real Azure realtime WS; that is the maintainer's T027 smoke).
+"""
+from __future__ import annotations
+
+import asyncio
+from collections import deque
+from collections.abc import Iterable
+from typing import Any
+
+
+class FakeDisconnect(Exception):
+    """Mimics starlette WebSocketDisconnect from the client side."""
+
+
+class FakeClosed(Exception):
+    """Mimics websockets ConnectionClosed from the upstream side."""
+
+
+class FakeClientWS:
+    """Stands in for a FastAPI WebSocket (ClientWS interface).
+
+    `inbound` is the scripted sequence of client→platform frames. After they are
+    drained, `receive_text` either blocks (hold_open=True, simulating a still-open
+    client until the platform closes) or raises FakeDisconnect (client ended/aborted).
+    """
+
+    def __init__(
+        self,
+        headers: dict[str, str],
+        inbound: Iterable[str],
+        *,
+        hold_open: bool = False,
+    ) -> None:
+        self.headers = dict(headers)
+        self._inbound: deque[str] = deque(inbound)
+        self._hold = hold_open
+        self.sent: list[str] = []          # platform→client frames (e.g. deltas)
+        self.closed: tuple[int, str | None] | None = None
+        self.accepted = False
+        self._released = asyncio.Event()
+
+    async def accept(self) -> None:
+        self.accepted = True
+
+    async def receive_text(self) -> str:
+        if self._inbound:
+            return self._inbound.popleft()
+        if self._hold:
+            await self._released.wait()
+        raise FakeDisconnect()
+
+    async def send_text(self, data: str) -> None:
+        self.sent.append(data)
+
+    async def close(self, code: int = 1000, reason: str | None = None) -> None:
+        if self.closed is None:
+            self.closed = (code, reason)
+        self._released.set()
+
+
+class FakeUpstreamWS:
+    """Stands in for the upstream provider realtime WS (UpstreamWS interface).
+
+    `events` is the scripted sequence of provider→platform frames (delta/completed).
+    After they drain, `recv` either raises FakeClosed at once (close_after=True, the
+    provider hung up) or blocks until the platform calls `close()`, then raises FakeClosed.
+    """
+
+    def __init__(self, events: Iterable[str] | None = None, *, close_after: bool = False) -> None:
+        self._events: deque[str] = deque(events or [])
+        self._close_after = close_after
+        self.sent: list[str] = []          # client→upstream forwarded frames
+        self.closed = False
+        self._released = asyncio.Event()
+
+    async def send(self, data: str) -> None:
+        self.sent.append(data)
+
+    async def recv(self) -> str:
+        if self._events:
+            return self._events.popleft()
+        if self._close_after:
+            raise FakeClosed()
+        await self._released.wait()
+        raise FakeClosed()
+
+    async def close(self) -> None:
+        self.closed = True
+        self._released.set()
+
+
+def fake_opener(upstream: FakeUpstreamWS) -> Any:
+    """Return an `open_upstream` callable that yields the given fake upstream and
+    records the credential kwargs it was called with (to assert no-leak / routing)."""
+    calls: list[dict[str, Any]] = []
+
+    async def _open(**kwargs: Any) -> FakeUpstreamWS:
+        calls.append(kwargs)
+        return upstream
+
+    _open.calls = calls  # type: ignore[attr-defined]
+    return _open
diff --git a/tests/unit/test_realtime_metering.py b/tests/unit/test_realtime_metering.py
new file mode 100644
index 0000000..77bf68e
--- /dev/null
+++ b/tests/unit/test_realtime_metering.py
@@ -0,0 +1,51 @@
+"""Phase 32 (043) T014: realtime per-minute metering — PCM bytes → minutes.
+
+Pure functions; no DB, no WS. The duration source is the audio WE relayed (Σ append
+PCM bytes), not a provider usage event (research R2), so an abnormal abort still
+yields a correct billable quantity.
+"""
+from __future__ import annotations
+
+from ai_api.proxy.realtime import (
+    RealtimeSession,
+    duration_seconds,
+    pcm_bytes_to_minutes,
+    session_minutes,
+)
+
+
+def test_duration_seconds_pcm16_mono() -> None:
+    # 24000 Hz x 2 bytes x 1 ch = 48000 bytes/sec -> 1 second.
+    assert duration_seconds(48000, 24000, 2, 1) == 1.0
+    # half a second
+    assert duration_seconds(24000, 24000, 2, 1) == 0.5
+    # unknown geometry → 0 (never divide by zero)
+    assert duration_seconds(48000, 0, 2, 1) == 0.0
+
+
+def test_minutes_round_up_started_minute_is_billed() -> None:
+    rate = 24000  # pcm16 mono → 48000 bytes/sec
+    per_sec = rate * 2 * 1
+    assert pcm_bytes_to_minutes(0) == 0                       # nothing → 0
+    assert pcm_bytes_to_minutes(per_sec) == 1                 # 1s → 1 min (round up)
+    assert pcm_bytes_to_minutes(per_sec * 59) == 1            # 59s → 1 min
+    assert pcm_bytes_to_minutes(per_sec * 60) == 1            # exactly 60s → 1 min
+    assert pcm_bytes_to_minutes(per_sec * 61) == 2            # 61s → 2 min
+    assert pcm_bytes_to_minutes(per_sec * 300) == 5           # 5 min exact
+
+
+def test_minutes_respects_session_geometry() -> None:
+    # 16 kHz pcm16 mono = 32000 bytes/sec; 96000 bytes = 3s → 1 min
+    assert pcm_bytes_to_minutes(96000, sample_rate=16000) == 1
+
+
+def test_session_minutes_uses_session_state() -> None:
+    from datetime import UTC, datetime
+
+    sess = RealtimeSession(
+        allocation_id="a", subject="s", resource_model="azure/gpt-realtime-whisper",
+        upstream_model="azure/gpt-realtime-whisper", provider="azure",
+        request_id="r", started_at=datetime.now(UTC), sample_rate=16000,
+    )
+    sess.audio_bytes = 32000 * 90  # 90 seconds at 16 kHz pcm16 mono
+    assert session_minutes(sess) == 2  # 90s → 2 min

From 657d33dc82af392025c1b6edbf3431b30eeb9fd7 Mon Sep 17 00:00:00 2001
From: timcsy <messenger@tew.tw>
Date: Fri, 12 Jun 2026 17:14:13 +0800
Subject: [PATCH 3/4] feat(realtime): make gpt-realtime-whisper testable from
 the admin "test model" button
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

realtime was excluded from the recipe table (it's a bidirectional WS, not a one-shot
call), so the UI test button was disabled. Add a WS-smoke recipe instead:
- upstream.realtime_smoke: opens the upstream realtime WS, runs the session
  handshake + a tiny silent-audio append, awaits the first server event. A non-error
  event proves egress(wss:443)+key+deployment+protocol — i.e. the T027 reachability
  check, now runnable straight from the deployed UI. Raises on error/timeout.
- RECIPES["realtime"] = WS smoke, billable=True (gated by the existing confirm
  dialog; admin test writes only an audit event, never a member CallRecord).

Now the model-detail page shows kind "即時字幕（realtime）" with an enabled (billable-
confirmed) test button. Full suite 735 passed (731→735); ruff+mypy clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai_api/proxy/upstream.py               | 60 ++++++++++++++++++++++
 src/ai_api/services/model_kind.py          |  5 +-
 src/ai_api/services/model_test.py          |  5 ++
 tests/integration/test_admin_model_test.py | 49 +++++++++++++++++-
 tests/unit/test_model_kind.py              |  2 +-
 tests/unit/test_upstream_wrappers.py       | 52 +++++++++++++++++++
 6 files changed, 168 insertions(+), 5 deletions(-)

diff --git a/src/ai_api/proxy/upstream.py b/src/ai_api/proxy/upstream.py
index 658cc39..1ab2965 100644
--- a/src/ai_api/proxy/upstream.py
+++ b/src/ai_api/proxy/upstream.py
@@ -5,6 +5,10 @@
 """
 from __future__ import annotations
 
+import asyncio
+import base64
+import contextlib
+import json
 from typing import Any
 
 import litellm
@@ -244,6 +248,62 @@ async def open_realtime_ws(
     return await websockets.connect(url, additional_headers=headers)
 
 
+async def realtime_smoke(
+    *,
+    model: str,
+    api_key: str,
+    api_base: str | None = None,
+    api_version: str | None = None,
+    timeout: float = 15.0,
+) -> dict[str, Any]:
+    """Phase 32 (043): minimal realtime WS smoke for the admin "test model" button.
+
+    Opens the upstream realtime WS, sends ``session.update`` plus 0.2 s of silent
+    pcm16 audio, then waits up to ``timeout`` seconds for the first typed JSON server
+    event. Binary frames and JSON without a ``type`` are skipped, never counted as a
+    pass. Returns ``{"ok": True, "first_event": <type>}``; raises RuntimeError on an
+    ``error`` event or on timeout, and lets a connect failure from
+    ``open_realtime_ws`` propagate. The WS is always closed. Billable.
+    """
+    provider = model.split("/", 1)[0] if "/" in model else "azure"
+    deployment = model.split("/", 1)[-1]
+    ws = await open_realtime_ws(
+        provider=provider, model=model, api_key=api_key,
+        api_base=api_base, api_version=api_version,
+    )
+    try:
+        await ws.send(json.dumps({
+            "type": "session.update",
+            "session": {
+                "type": "transcription", "model": deployment,
+                "audio": {"input": {"format": {"type": "audio/pcm", "rate": 16000}}},
+            },
+        }))
+        pcm = b"\x00\x00" * int(16000 * 0.2)  # 0.2s silence, pcm16 mono 16 kHz
+        audio = base64.b64encode(pcm).decode()
+        await ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": audio}))
+        try:
+            async with asyncio.timeout(timeout):
+                while True:
+                    raw = await ws.recv()
+                    ev = json.loads(raw) if isinstance(raw, str) else None
+                    etype = ev.get("type") if isinstance(ev, dict) else None
+                    if etype is None:
+                        continue  # not a typed server event: proves nothing, keep waiting
+                    if etype == "error":
+                        err = ev.get("error")
+                        msg = (err.get("message") if isinstance(err, dict) else err) or "(no message)"
+                        raise RuntimeError(f"realtime upstream error: {msg}")
+                    return {"ok": True, "first_event": etype}
+        except TimeoutError as e:
+            raise RuntimeError(
+                f"realtime smoke timed out after {timeout}s with no server event"
+            ) from e
+    finally:
+        with contextlib.suppress(Exception):
+            await ws.close()
+
+
 async def aimage_edit(
     *,
     model: str,
diff --git a/src/ai_api/services/model_kind.py b/src/ai_api/services/model_kind.py
index 7b3959b..3dd3270 100644
--- a/src/ai_api/services/model_kind.py
+++ b/src/ai_api/services/model_kind.py
@@ -32,8 +32,9 @@
     "moderation": "moderation",
     "search": "search",
     "image_edit": "image_edit",
-    # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Not a
-    # recipe-table "test model" kind — billed per-minute via the /v1/realtime relay.
+    # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Billed
+    # per-minute via the /v1/realtime relay; the admin "test model" recipe is a
+    # minimal WS smoke (handshake + tiny audio) that doubles as the T027 check.
     "realtime": "realtime",
 }
 
diff --git a/src/ai_api/services/model_test.py b/src/ai_api/services/model_test.py
index a5a9322..7f0eaf5 100644
--- a/src/ai_api/services/model_test.py
+++ b/src/ai_api/services/model_test.py
@@ -98,6 +98,11 @@ class TestRecipe:
         ),
         billable=True,
     ),
+    # realtime is a bidirectional WS, not a one-shot call — the recipe is a minimal
+    # WS smoke (handshake + tiny silent append + await first server event). Passing
+    # proves egress/key/deployment/protocol; it IS the T027 reachability check from
+    # the UI. Billable (the smoke sends 0.2 s of silent audio upstream).
+    "realtime": TestRecipe(lambda c: upstream.realtime_smoke(**c), billable=True),
 }
 
 
diff --git a/tests/integration/test_admin_model_test.py b/tests/integration/test_admin_model_test.py
index 82475c7..5a829cb 100644
--- a/tests/integration/test_admin_model_test.py
+++ b/tests/integration/test_admin_model_test.py
@@ -208,12 +208,57 @@ async def test_search_confirmed_calls(app_client: AsyncClient, admin_headers: di
     assert m.call_args.kwargs.get("search_provider") == "azure/web-search"
 
 
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers: dict[str, str]) -> None:
+    """Phase 32: realtime is testable via a WS smoke recipe (billable → needs ack)."""
+    await _seed("azure/gpt-realtime-whisper", mode="realtime")
+    await _provider(app_client, admin_headers)
+    with patch(
+        "ai_api.proxy.upstream.realtime_smoke",
+        new=AsyncMock(return_value={"ok": True, "first_event": "session.created"}),
+    ) as m:
+        # billable → first call asks for confirmation, no upstream touched
+        r0 = await app_client.post(
+            "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers
+        )
+        assert r0.json().get("needs_confirmation") is True and not m.await_count
+        r = await app_client.post(
+            "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers,
+            json={"acknowledge_billable": True},
+        )
+    assert r.status_code == 200, r.text
+    assert r.json()["ok"] is True and r.json()["kind"] == "realtime"
+    m.assert_awaited_once()
+    assert m.call_args.kwargs.get("model") == "azure/gpt-realtime-whisper"
+
+
+@pytest.mark.integration
+@pytest.mark.asyncio
+async def test_realtime_upstream_error_reported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None:
+    """A failing WS smoke (e.g. bad deployment) surfaces as a test failure, not 5xx."""
+    await _seed("azure/gpt-realtime-whisper", mode="realtime")
+    await _provider(app_client, admin_headers)
+    with patch(
+        "ai_api.proxy.upstream.realtime_smoke",
+        new=AsyncMock(side_effect=RuntimeError("realtime upstream error: deployment not found")),
+    ):
+        r = await app_client.post(
+            "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers,
+            json={"acknowledge_billable": True},
+        )
+    assert r.status_code == 200, r.text
+    body = r.json()
+    assert body["ok"] is False and body["error_type"] == "upstream_error"
+    assert "deployment not found" in body["message"]
+
+
 @pytest.mark.integration
 @pytest.mark.asyncio
 async def test_unknown_mode_unsupported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None:
     # 'video_generation' is a genuinely-unknown mode for the admin test button
-    # (moderation/rerank/etc. became known kinds in Phase 29③/31; only the not-yet-
-    # supported modes — video/realtime/vector_store — remain 'unknown').
+    # (moderation/rerank/etc. became known kinds in Phase 29③/31; realtime became
+    # testable in Phase 32; only modes like video/vector_store remain 'unknown').
     await _seed("azure/video-x", mode="video_generation")
     await _provider(app_client, admin_headers)
     r = await app_client.post("/admin/catalog/models/azure/video-x/test", headers=admin_headers)
diff --git a/tests/unit/test_model_kind.py b/tests/unit/test_model_kind.py
index 8835ae1..13f42e2 100644
--- a/tests/unit/test_model_kind.py
+++ b/tests/unit/test_model_kind.py
@@ -88,7 +88,7 @@ def test_is_supported():
     # auto-testable IFF a recipe exists (model_test.RECIPES). Every inference kind
     # now has a real recipe (ocr/stt/image_edit/search send a minimal fixture).
     for k in ("chat", "embedding", "tts", "image", "moderation", "rerank",
-              "ocr", "stt", "search", "image_edit"):
+              "ocr", "stt", "search", "image_edit", "realtime"):
         assert is_supported(k)
     # only 'unknown' has no recipe → honestly not auto-testable (never a fake pass)
     assert not is_supported("unknown")
diff --git a/tests/unit/test_upstream_wrappers.py b/tests/unit/test_upstream_wrappers.py
index 4abc4d5..7b0cc11 100644
--- a/tests/unit/test_upstream_wrappers.py
+++ b/tests/unit/test_upstream_wrappers.py
@@ -58,3 +58,55 @@ async def test_aocr_leaves_non_azure_provider_untouched():
     with patch("litellm.aocr", new=AsyncMock(return_value="ok")) as m:
         await upstream.aocr(model="mistral/mistral-ocr-latest", document={"x": 1}, api_key="k")
     assert m.call_args.kwargs["model"] == "mistral/mistral-ocr-latest"
+
+
+# --- Phase 32 (043): realtime WS smoke (admin "test model" recipe) -----------
+class _FakeSmokeWS:
+    """A scripted upstream realtime WS for the smoke test (sent frames + recv queue)."""
+
+    def __init__(self, events):
+        self.events = list(events)
+        self.sent = []
+        self.closed = False
+
+    async def send(self, data):
+        self.sent.append(data)
+
+    async def recv(self):
+        if self.events:
+            return self.events.pop(0)
+        raise RuntimeError("no more events")
+
+    async def close(self):
+        self.closed = True
+
+
+@pytest.mark.asyncio
+async def test_realtime_smoke_ok_on_first_server_event():
+    import json
+
+    ws = _FakeSmokeWS([json.dumps({"type": "transcription_session.created"})])
+    with patch("ai_api.proxy.upstream.open_realtime_ws", new=AsyncMock(return_value=ws)) as opener:
+        out = await upstream.realtime_smoke(
+            model="azure/gpt-realtime-whisper", api_key="k",
+            api_base="https://x", api_version="2024-10-01-preview",
+        )
+    assert out["ok"] is True and out["first_event"] == "transcription_session.created"
+    # provider derived from the slug prefix; handshake + audio append were sent.
+    assert opener.call_args.kwargs["provider"] == "azure"
+    assert any("session.update" in s for s in ws.sent)
+    assert any("input_audio_buffer.append" in s for s in ws.sent)
+    assert ws.closed is True  # always closes the upstream WS
+
+
+@pytest.mark.asyncio
+async def test_realtime_smoke_raises_on_error_event():
+    import json
+
+    ws = _FakeSmokeWS([json.dumps({"type": "error", "error": {"message": "deployment not found"}})])
+    with (
+        patch("ai_api.proxy.upstream.open_realtime_ws", new=AsyncMock(return_value=ws)),
+        pytest.raises(RuntimeError, match="deployment not found"),
+    ):
+        await upstream.realtime_smoke(model="azure/gpt-realtime-whisper", api_key="k")
+    assert ws.closed is True

From 963e73875a6b734d6aba601bfa96e9fc67d72392 Mon Sep 17 00:00:00 2001
From: timcsy <messenger@tew.tw>
Date: Fri, 12 Jun 2026 22:49:52 +0800
Subject: [PATCH 4/4] refactor(realtime): detect realtime as a capability
 (supported_endpoints), not a litellm mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

litellm PR #29775 (gpt-realtime-whisper, merged 2026-06-11) ships the model as
mode=audio_transcription and signals realtime via supported_endpoints containing
/v1/realtime — i.e. realtime is a capability axis, not a mode (same shape as
responses_support). The earlier mode==realtime gate would never match any Azure
model. Fix:

- model_kind: realtime is capability-derived — raw.supported_endpoints lists
  /v1/realtime OR an admin `realtime` capability marker (`realtime:blocked` force-
  disables, manual wins). gpt-realtime-whisper (audio_transcription) → realtime;
  whisper-1 (no /v1/realtime) stays stt. Everything keyed on model_kind (endpoint
  gate, test recipe, catalog label) now works for the real model.
- billing: bill in the PriceList's unit — litellm prices realtime transcription per
  SECOND (input_cost_per_second), so default to `second` when unpriced; `minute`
  still honoured. Adds pcm_bytes_to_seconds + session_quantity.
- model-detail: hint that adding the `realtime` capability marks a manually-added
  model as realtime (needed until litellm's price-map entry — currently clobbered by
  a json regen on main — is restored, after which import auto-detects it).

Full suite 742 passed; ruff+mypy clean; frontend tsc + 164 vitest green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 frontend/src/routes/admin/model-detail.tsx    |  4 ++
 src/ai_api/main.py                            |  2 +-
 src/ai_api/proxy/realtime.py                  | 47 ++++++++++++--
 src/ai_api/services/model_kind.py             | 36 +++++++++-
 tests/contract/test_realtime_transcription.py | 65 ++++++++++++++-----
 tests/integration/test_admin_model_test.py    | 16 +++--
 tests/unit/test_model_kind.py                 | 40 ++++++++++++
 tests/unit/test_realtime_metering.py          | 31 ++++++++-
 8 files changed, 206 insertions(+), 35 deletions(-)

diff --git a/frontend/src/routes/admin/model-detail.tsx b/frontend/src/routes/admin/model-detail.tsx
index c396ace..a490079 100644
--- a/frontend/src/routes/admin/model-detail.tsx
+++ b/frontend/src/routes/admin/model-detail.tsx
@@ -717,6 +717,10 @@ function EditBasicsDialog({
           <div>
             <Label htmlFor="b-cap">能力（逗號分隔）</Label>
             <Input id="b-cap" className="mt-1" placeholder="chat, vision, function-calling" value={capabilities} onChange={(e) => setCapabilities(e.target.value)} />
+            <p className="text-xs text-muted-foreground mt-1">
+              加 <code>realtime</code> 把模型標為「即時字幕」（走 /v1/realtime WS、可在此頁測試）；
+              手動加入、litellm 尚未帶入 supported_endpoints 時用得到。<code>realtime:blocked</code> 可強制關閉。
+            </p>
           </div>
           <div>
             <Label htmlFor="b-rec">適用情境（逗號分隔）</Label>
diff --git a/src/ai_api/main.py b/src/ai_api/main.py
index c24af9f..3e41677 100644
--- a/src/ai_api/main.py
+++ b/src/ai_api/main.py
@@ -37,8 +37,8 @@
 from ai_api.db import dispose_engine
 from ai_api.observability.logging import setup_logging
 from ai_api.observability.request_id import RequestIdMiddleware
-from ai_api.proxy.registry import build_router as build_proxy_registry_router
 from ai_api.proxy.realtime import router as realtime_router
+from ai_api.proxy.registry import build_router as build_proxy_registry_router
 from ai_api.proxy.responses import router as responses_router
 from ai_api.proxy.router import router as proxy_router
 
diff --git a/src/ai_api/proxy/realtime.py b/src/ai_api/proxy/realtime.py
index 01501a8..41c2465 100644
--- a/src/ai_api/proxy/realtime.py
+++ b/src/ai_api/proxy/realtime.py
@@ -118,6 +118,21 @@ def pcm_bytes_to_minutes(
     return math.ceil(secs / 60)
 
 
+def pcm_bytes_to_seconds(
+    audio_bytes: int,
+    *,
+    sample_rate: int = _DEFAULT_SAMPLE_RATE,
+    bytes_per_sample: int = _DEFAULT_BYTES_PER_SAMPLE,
+    channels: int = _DEFAULT_CHANNELS,
+) -> int:
+    """Billable whole seconds for ``audio_bytes`` of PCM: the duration is rounded
+    UP (a started second is charged in full); a non-positive duration bills 0."""
+    elapsed = duration_seconds(audio_bytes, sample_rate, bytes_per_sample, channels)
+    # Guard before ceil(): a zero or negative duration must never yield a charge.
+    billable = 0 if elapsed <= 0 else math.ceil(elapsed)
+    return billable
+
+
 def session_minutes(sess: RealtimeSession) -> int:
     return pcm_bytes_to_minutes(
         sess.audio_bytes,
@@ -127,6 +142,20 @@ def session_minutes(sess: RealtimeSession) -> int:
     )
 
 
+def session_quantity(sess: RealtimeSession, unit: str) -> int:
+    """Return the session's billable amount expressed in ``unit`` so that
+    cost = quantity x per-unit price. ``"minute"`` bills whole minutes; any other
+    unit is billed as whole seconds computed from the session's PCM geometry."""
+    if unit != "minute":
+        return pcm_bytes_to_seconds(
+            sess.audio_bytes,
+            sample_rate=sess.sample_rate,
+            bytes_per_sample=sess.bytes_per_sample,
+            channels=sess.channels,
+        )
+    return session_minutes(sess)
+
+
 def _apply_format(sess: RealtimeSession, ev: dict[str, Any]) -> None:
     """Read sample rate (and, if present, sample width/channels) from a
     `session.update` so metering uses the client's actual PCM geometry. Tolerant of
@@ -309,14 +338,14 @@ def _outcome_for_close(close_reason: str) -> Any:
 
 
 async def _bill_session(sess: RealtimeSession) -> None:
-    """Write ONE CallRecord(unit="minute") for the accrued audio. Any close path
-    reaches here (FR-004). Uses a fresh session — the connection has no request
-    session. Never raises (billing must not crash teardown)."""
+    """Write ONE CallRecord for the accrued audio, in the unit the PriceList carries
+    (litellm prices realtime transcription per SECOND; admins may price per minute).
+    Any close path reaches here (FR-004). Uses a fresh session — the connection has no
+    request session. Never raises (billing must not crash teardown)."""
     from ai_api.db import get_sessionmaker
     from ai_api.services.pricing import calculate_unit_cost, lookup_price_for_call
     from ai_api.services.records import RecordsService
 
-    minutes = session_minutes(sess)
     outcome = _outcome_for_close(sess.close_reason)
     try:
         async with get_sessionmaker()() as s:
@@ -326,8 +355,12 @@ async def _bill_session(sess: RealtimeSession) -> None:
                 model=sess.upstream_model.split("/", 1)[-1],
                 call_time=sess.started_at,
             )
+            # Bill in the price's unit (second from litellm, or minute); default to
+            # second (litellm's native unit) when unpriced so the quantity is honest.
+            unit = price.price_unit if (price and price.price_unit in ("second", "minute")) else "second"
+            quantity = session_quantity(sess, unit)
             cost = (
-                calculate_unit_cost(minutes, price.price_per_unit)
+                calculate_unit_cost(quantity, price.price_per_unit)
                 if price is not None
                 else None
             )
@@ -339,8 +372,8 @@ async def _bill_session(sess: RealtimeSession) -> None:
                 started_at=sess.started_at,
                 status_code=200,
                 outcome=outcome,
-                quantity=minutes,
-                unit="minute",
+                quantity=quantity,
+                unit=unit,
                 cost_usd=cost,
                 error_message=(
                     "allocation revoked mid-connection" if sess.close_reason == "revoked" else None
diff --git a/src/ai_api/services/model_kind.py b/src/ai_api/services/model_kind.py
index 3dd3270..ce53b7d 100644
--- a/src/ai_api/services/model_kind.py
+++ b/src/ai_api/services/model_kind.py
@@ -32,12 +32,36 @@
     "moderation": "moderation",
     "search": "search",
     "image_edit": "image_edit",
-    # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Billed
-    # per-minute via the /v1/realtime relay; the admin "test model" recipe is a
-    # minimal WS smoke (handshake + tiny audio) that doubles as the T027 check.
+    # Gemini live native-audio carries a genuine mode=realtime; gpt-realtime-whisper
+    # does NOT — see _is_realtime_capable below.
     "realtime": "realtime",
 }
 
+# Phase 32: realtime transcription is a CAPABILITY axis, not a litellm mode. litellm
+# (PR #29775) ships gpt-realtime-whisper as mode=audio_transcription and signals the
+# realtime ability via ``supported_endpoints`` containing ``/v1/realtime`` — exactly
+# the responses_support pattern (capability ≠ mode). We mirror that: a model is
+# realtime-capable iff its raw entry lists /v1/realtime OR an admin marked it via the
+# ``realtime`` capability marker (``realtime:blocked`` force-disables, manual wins).
+_REALTIME_MARKER = "realtime"
+_REALTIME_BLOCKED = "realtime:blocked"
+
+
+def _is_realtime_capable(model: Any) -> bool:
+    caps = list(getattr(model, "capabilities", None) or [])
+    if _REALTIME_BLOCKED in caps:
+        return False
+    if _REALTIME_MARKER in caps:
+        return True
+    sync = getattr(model, "litellm_sync", None)
+    if isinstance(sync, dict):
+        raw = sync.get("raw")
+        if isinstance(raw, dict):
+            eps = raw.get("supported_endpoints")
+            if isinstance(eps, list) and any("/v1/realtime" in str(e) for e in eps):
+                return True
+    return False
+
 
 def _mode_of(model: Any) -> str | None:
     sync = getattr(model, "litellm_sync", None)
@@ -52,6 +76,12 @@ def _mode_of(model: Any) -> str | None:
 
 def model_kind(model: Any) -> Kind:
     """Decide the testable kind of a catalog model. Never raises; always one of Kind."""
+    # Realtime is capability-derived (supported_endpoints / admin marker), NOT a
+    # litellm mode — checked first so gpt-realtime-whisper (mode=audio_transcription)
+    # is classified realtime, not stt. It can still be called on the batch STT
+    # endpoint (path-routed), so nothing is lost by the realtime label.
+    if _is_realtime_capable(model):
+        return "realtime"
     mode = _mode_of(model)
     if mode is not None:
         # known mode → mapped kind; any other litellm mode → unsupported
diff --git a/tests/contract/test_realtime_transcription.py b/tests/contract/test_realtime_transcription.py
index 867d141..974a372 100644
--- a/tests/contract/test_realtime_transcription.py
+++ b/tests/contract/test_realtime_transcription.py
@@ -57,9 +57,14 @@ def _append(seconds: float, rate: int = 24000) -> str:
     })
 
 
-async def _seed_catalog(slug: str, *, mode: str) -> None:
-    """Seed a catalog row whose litellm mode drives model_kind (realtime vs chat)."""
+async def _seed_catalog(slug: str, *, mode: str = "audio_transcription", realtime: bool = True) -> None:
+    """Seed a catalog row. Realtime models mirror litellm reality (PR #29775):
+    mode=audio_transcription + supported_endpoints listing /v1/realtime — the
+    capability axis that drives model_kind → realtime (NOT a litellm 'realtime' mode)."""
     now = datetime.now(UTC)
+    raw: dict = {"mode": mode}
+    if realtime:
+        raw["supported_endpoints"] = ["/v1/realtime", "/v1/realtime/transcription_sessions"]
     sm = get_sessionmaker()
     async with sm() as s:
         s.add(ModelCatalog(
@@ -70,7 +75,7 @@ async def _seed_catalog(slug: str, *, mode: str) -> None:
             status="active", deprecation_note=None, created_at=now, updated_at=now,
             default_access="open", allowed_tags=[], denied_tags=[],
             self_service_enabled=False, self_service_default_quota=None,
-            litellm_sync={"raw": {"mode": mode}},
+            litellm_sync={"raw": raw},
         ))
         await s.commit()
 
@@ -121,7 +126,7 @@ def _bearer(token: str) -> dict[str, str]:
 # --- T007: invalid / revoked key → close, no stream -------------------------
 @pytest.mark.asyncio
 async def test_invalid_key_closed_no_stream(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     client = FakeClientWS(_bearer("totally-invalid-token"), [_session_update()])
     upstream = FakeUpstreamWS()
     opener = fake_opener(upstream)
@@ -143,7 +148,7 @@ async def test_missing_bearer_closed(app_client: AsyncClient, admin_headers):
 
 @pytest.mark.asyncio
 async def test_revoked_allocation_closed(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     alloc = await _alloc(app_client, admin_headers)
     # Revoke it before connecting.
     r = await app_client.delete(f"/admin/allocations/{alloc['id']}", headers=admin_headers)
@@ -159,7 +164,7 @@ async def test_revoked_allocation_closed(app_client: AsyncClient, admin_headers)
 @pytest.mark.asyncio
 async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_headers):
     chat_model = "azure/gpt-4o-mini"
-    await _seed_catalog(chat_model, mode="chat")
+    await _seed_catalog(chat_model, mode="chat", realtime=False)
     await _seed_provider(app_client, admin_headers)
     alloc = await _alloc(app_client, admin_headers, model=chat_model)
     client = FakeClientWS(_bearer(alloc["token"]), [_session_update(model=chat_model)])
@@ -172,7 +177,7 @@ async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_hea
 # --- T009: valid connection + append → delta reaches client -----------------
 @pytest.mark.asyncio
 async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
     delta = json.dumps({
@@ -198,7 +203,7 @@ async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_head
 # --- T015: clean close → one CallRecord(unit=minute), quantity matches ------
 @pytest.mark.asyncio
 async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     await _seed_price("0.017")
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
@@ -216,22 +221,48 @@ async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admi
 
 
 @pytest.mark.asyncio
-async def test_unpriced_realtime_zero_cost(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+async def test_unpriced_realtime_defaults_to_seconds(app_client: AsyncClient, admin_headers):
+    await _seed_catalog(RT_MODEL)
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
     client = FakeClientWS(_bearer(alloc["token"]),
                           [_session_update(), _append(30.0)], hold_open=False)
     await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False)))
     rec = await _last(CallOutcome.success)
-    assert rec is not None and rec.unit == "minute" and rec.quantity == 1
+    # unpriced → default to litellm's native unit (second); 30s → 30
+    assert rec is not None and rec.unit == "second" and rec.quantity == 30
     assert rec.cost_usd is None  # no PriceList → unpriced (NULL), not a crash
 
 
+@pytest.mark.asyncio
+async def test_per_second_price_billed_in_seconds(app_client: AsyncClient, admin_headers):
+    """litellm prices gpt-realtime-whisper per SECOND — bill in seconds so the cost
+    lines up with the imported per-unit price (input_cost_per_second)."""
+    await _seed_catalog(RT_MODEL)
+    sm = get_sessionmaker()
+    async with sm() as s:
+        s.add(PriceList(
+            id=str(ULID()), provider="azure", model="gpt-realtime-whisper",
+            input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0),
+            price_unit="second", price_per_unit_usd=Decimal("0.0002833"),
+            effective_from=datetime.now(UTC) - timedelta(days=1),
+            created_at=datetime.now(UTC), created_by="test",
+        ))
+        await s.commit()
+    alloc = await _alloc(app_client, admin_headers)
+    await _seed_provider(app_client, admin_headers)
+    client = FakeClientWS(_bearer(alloc["token"]),
+                          [_session_update(), _append(10.0)], hold_open=False)
+    await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False)))
+    rec = await _last(CallOutcome.success)
+    assert rec is not None and rec.unit == "second" and rec.quantity == 10
+    assert rec.cost_usd == Decimal("0.002833")  # 10 x 0.0002833
+
+
 # --- T016: abnormal abort (client hangs up mid-stream) → accrued bytes billed
 @pytest.mark.asyncio
 async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     await _seed_price("0.017")
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
@@ -246,7 +277,7 @@ async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, admin
 # --- T020/T021: in-flight revoke / pause → close(revoked) within N + billed -
 @pytest.mark.asyncio
 async def test_inflight_revoke_closes_and_bills(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     await _seed_price("0.017")
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
@@ -277,7 +308,7 @@ async def revoke_after_first_tick(allocation_id: str) -> bool:
 # --- Contract #7: no upstream key / endpoint ever reaches the client --------
 @pytest.mark.asyncio
 async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
     err = json.dumps({"type": "error", "error": {"code": "bad", "message": "upstream boom"}})
@@ -292,7 +323,7 @@ async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers):
 
 @pytest.mark.asyncio
 async def test_upstream_connect_failure_no_leak_and_bills_zero(app_client: AsyncClient, admin_headers):
-    await _seed_catalog(RT_MODEL, mode="realtime")
+    await _seed_catalog(RT_MODEL)
     alloc = await _alloc(app_client, admin_headers)
     await _seed_provider(app_client, admin_headers)
 
@@ -304,6 +335,6 @@ async def failing_opener(**kwargs):
     assert client.closed is not None and client.closed[0] == 1011  # internal
     blob = json.dumps(client.closed) + " ".join(client.sent)
     assert SECRET_KEY not in blob and "secret-foundry.services.ai.azure.com" not in blob
-    # Connect failed before any audio relayed → 0 minutes, still a record.
+    # Connect failed before any audio relayed → 0 (unpriced → default unit second).
     rec = await _last(CallOutcome.upstream_error)
-    assert rec is not None and rec.unit == "minute" and rec.quantity == 0
+    assert rec is not None and rec.unit == "second" and rec.quantity == 0
diff --git a/tests/integration/test_admin_model_test.py b/tests/integration/test_admin_model_test.py
index 5a829cb..07d7479 100644
--- a/tests/integration/test_admin_model_test.py
+++ b/tests/integration/test_admin_model_test.py
@@ -13,10 +13,18 @@
 from ai_api.models import AuditEventType, ModelCatalog
 
 
-async def _seed(slug: str, *, mode: str | None = None, modality_input=None, modality_output=None) -> None:
+async def _seed(slug: str, *, mode: str | None = None, modality_input=None, modality_output=None,
+                supported_endpoints=None) -> None:
     sm = get_sessionmaker()
     now = datetime.now(UTC)
-    sync = {"raw": {"mode": mode}} if mode is not None else None
+    sync = None
+    if mode is not None or supported_endpoints is not None:
+        raw: dict = {}
+        if mode is not None:
+            raw["mode"] = mode
+        if supported_endpoints is not None:
+            raw["supported_endpoints"] = supported_endpoints
+        sync = {"raw": raw}
     async with sm() as s:
         s.add(ModelCatalog(
             slug=slug, provider=slug.split("/", 1)[0], display_name=slug, family="x",
@@ -212,7 +220,7 @@ async def test_search_confirmed_calls(app_client: AsyncClient, admin_headers: di
 @pytest.mark.asyncio
 async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers: dict[str, str]) -> None:
     """Phase 32: realtime is testable via a WS smoke recipe (billable → needs ack)."""
-    await _seed("azure/gpt-realtime-whisper", mode="realtime")
+    await _seed("azure/gpt-realtime-whisper", mode="audio_transcription", supported_endpoints=["/v1/realtime"])
     await _provider(app_client, admin_headers)
     with patch(
         "ai_api.proxy.upstream.realtime_smoke",
@@ -237,7 +245,7 @@ async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers:
 @pytest.mark.asyncio
 async def test_realtime_upstream_error_reported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None:
     """A failing WS smoke (e.g. bad deployment) surfaces as a test failure, not 5xx."""
-    await _seed("azure/gpt-realtime-whisper", mode="realtime")
+    await _seed("azure/gpt-realtime-whisper", mode="audio_transcription", supported_endpoints=["/v1/realtime"])
     await _provider(app_client, admin_headers)
     with patch(
         "ai_api.proxy.upstream.realtime_smoke",
diff --git a/tests/unit/test_model_kind.py b/tests/unit/test_model_kind.py
index 13f42e2..ebee83e 100644
--- a/tests/unit/test_model_kind.py
+++ b/tests/unit/test_model_kind.py
@@ -92,3 +92,43 @@ def test_is_supported():
         assert is_supported(k)
     # only 'unknown' has no recipe → honestly not auto-testable (never a fake pass)
     assert not is_supported("unknown")
+
+
+# --- Phase 32: realtime is a CAPABILITY (supported_endpoints / admin marker), not a mode ---
+def _cap(*, mode=None, supported_endpoints=None, capabilities=None):
+    raw = {}
+    if mode is not None:
+        raw["mode"] = mode
+    if supported_endpoints is not None:
+        raw["supported_endpoints"] = supported_endpoints
+    return SimpleNamespace(
+        litellm_sync={"raw": raw} if raw else None,
+        modality_input=["audio"], modality_output=["text"],
+        capabilities=capabilities or [],
+    )
+
+
+def test_realtime_from_supported_endpoints_overrides_stt():
+    # gpt-realtime-whisper: litellm mode=audio_transcription but /v1/realtime in
+    # supported_endpoints → realtime (capability beats the audio_transcription→stt map).
+    m = _cap(mode="audio_transcription",
+             supported_endpoints=["/v1/realtime", "/v1/realtime/transcription_sessions"])
+    assert model_kind(m) == "realtime"
+
+
+def test_realtime_from_admin_marker():
+    # Manual model (no litellm_sync): admin marks the `realtime` capability.
+    m = _cap(capabilities=["realtime"])
+    assert model_kind(m) == "realtime"
+
+
+def test_realtime_blocked_marker_forces_off():
+    m = _cap(mode="audio_transcription", supported_endpoints=["/v1/realtime"],
+             capabilities=["realtime:blocked"])
+    assert model_kind(m) == "stt"  # admin override wins → falls back to mode
+
+
+def test_plain_transcription_stays_stt():
+    # whisper-1: audio_transcription, NO /v1/realtime → batch STT, not realtime.
+    m = _cap(mode="audio_transcription")
+    assert model_kind(m) == "stt"
diff --git a/tests/unit/test_realtime_metering.py b/tests/unit/test_realtime_metering.py
index 77bf68e..2cb6b9c 100644
--- a/tests/unit/test_realtime_metering.py
+++ b/tests/unit/test_realtime_metering.py
@@ -10,7 +10,9 @@
     RealtimeSession,
     duration_seconds,
     pcm_bytes_to_minutes,
+    pcm_bytes_to_seconds,
     session_minutes,
+    session_quantity,
 )
 
 
@@ -39,13 +41,36 @@ def test_minutes_respects_session_geometry() -> None:
     assert pcm_bytes_to_minutes(96000, sample_rate=16000) == 1
 
 
-def test_session_minutes_uses_session_state() -> None:
+def test_seconds_round_up() -> None:
+    rate = 24000
+    per_sec = rate * 2 * 1  # 48000 bytes/sec
+    assert pcm_bytes_to_seconds(0) == 0
+    assert pcm_bytes_to_seconds(per_sec) == 1
+    assert pcm_bytes_to_seconds(per_sec // 2) == 1          # 0.5s -> 1s (round up)
+    assert pcm_bytes_to_seconds(per_sec * 10) == 10
+    assert pcm_bytes_to_seconds(per_sec * 10 + 1) == 11     # a started second is billed
+
+
+def _sess(rate: int = 24000) -> RealtimeSession:
     from datetime import UTC, datetime
 
-    sess = RealtimeSession(
+    return RealtimeSession(
         allocation_id="a", subject="s", resource_model="azure/gpt-realtime-whisper",
         upstream_model="azure/gpt-realtime-whisper", provider="azure",
-        request_id="r", started_at=datetime.now(UTC), sample_rate=16000,
+        request_id="r", started_at=datetime.now(UTC), sample_rate=rate,
     )
+
+
+def test_session_minutes_uses_session_state() -> None:
+    sess = _sess(16000)
     sess.audio_bytes = 32000 * 90  # 90 seconds at 16 kHz pcm16 mono
     assert session_minutes(sess) == 2  # 90s → 2 min
+
+
+def test_session_quantity_follows_price_unit() -> None:
+    sess = _sess(24000)
+    sess.audio_bytes = 24000 * 2 * 10  # 10 seconds
+    assert session_quantity(sess, "second") == 10
+    assert session_quantity(sess, "minute") == 1      # 10s → 1 min (round up)
+    # any non-minute unit (incl. unknown) bills in seconds
+    assert session_quantity(sess, "anything-else") == 10