From 59148cc10a510349e8cd73d94eaacc664ea10b3d Mon Sep 17 00:00:00 2001 From: timcsy Date: Fri, 12 Jun 2026 14:53:01 +0800 Subject: [PATCH 1/4] feat(realtime): spec/plan/tasks for /v1/realtime + foundation (deps, minute unit, model_kind) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 32 realtime transcription endpoint — full speckit spec package (spec, plan, research, data-model, WS event contract, quickstart, tasks) plus the self-contained foundational layer: - websockets declared as a direct dependency (was transitive via uvicorn) — needed to relay /v1/realtime to the provider's realtime WS (Constitution Deviation noted). - model_kind: add `realtime` kind (mode→kind) so the catalog labels realtime models honestly; full suite re-run green (715 passed) per the model_kind lesson. - minute billing unit verified through the existing unit-billing path (calculate_unit_cost is unit-agnostic; `minute` is a new string value, no schema change) + test. Foundational logic (T001/T003/T006) done & green. The WS core — upstream WS client, mock provider WS server, bidirectional relay (US1), per-minute metering (US2), in-flight revocation (US3) — is the next focused block; T027 real Azure WS smoke needs credentials. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 6 +- pyproject.toml | 5 + .../checklists/requirements.md | 37 ++++ .../contracts/realtime-transcription.md | 65 +++++++ .../043-realtime-transcription/data-model.md | 54 ++++++ specs/043-realtime-transcription/plan.md | 86 +++++++++ .../043-realtime-transcription/quickstart.md | 57 ++++++ specs/043-realtime-transcription/research.md | 68 +++++++ specs/043-realtime-transcription/spec.md | 109 +++++++++++ specs/043-realtime-transcription/tasks.md | 182 ++++++++++++++++++ src/ai_api/services/model_kind.py | 5 +- tests/contract/test_pricing_units.py | 22 +++ 12 files changed, 693 insertions(+), 3 deletions(-) create mode 100644 specs/043-realtime-transcription/checklists/requirements.md create mode 100644 specs/043-realtime-transcription/contracts/realtime-transcription.md create mode 100644 specs/043-realtime-transcription/data-model.md create mode 100644 specs/043-realtime-transcription/plan.md create mode 100644 specs/043-realtime-transcription/quickstart.md create mode 100644 specs/043-realtime-transcription/research.md create mode 100644 specs/043-realtime-transcription/spec.md create mode 100644 specs/043-realtime-transcription/tasks.md diff --git a/CLAUDE.md b/CLAUDE.md index 8f6a8cf..1393877 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ # ai-api Development Guidelines -Auto-generated from all feature plans. Last updated: 2026-06-11 +Auto-generated from all feature plans. Last updated: 2026-06-12 ## Active Technologies - Python 3.11+(同 Phase 1) (002-auth-membership) @@ -66,6 +66,8 @@ Auto-generated from all feature plans. 
Last updated: 2026-06-11 - PostgreSQL(生產)/ SQLite(dev、CI);**不新增表、不新增 migration**——沿用增量②(0019)的 `call_records.quantity/unit` 與 `price_list.price_unit/price_per_unit_usd`,新單位(query / character)為字串值 (041-multi-endpoint-complete) - Python 3.11+(後端)/ TypeScript strict + React 19 + Vite 6(前端少量範例) + FastAPI(含 `UploadFile` multipart,既有)、SQLAlchemy 2.x async、Pydantic v2、`litellm`(`amoderation`/`asearch`/`aimage_edit` 既有函式);TanStack Query、shadcn/ui(前端)——**皆既有,不新增套件** (042-endpoint-registry) - PostgreSQL(生產)/ SQLite(dev、CI);**不新增表/欄/migration**——沿用 0019 的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`,新單位 `image`/`query` 為字串值 (042-endpoint-registry) +- Python 3.11+(後端為主)/ TypeScript strict + React 19(前端僅目錄顯示 realtime 類型 + 連線範例,極少量) + FastAPI(WebSocket — starlette 內建,**專案首次使用**)、SQLAlchemy 2.x async、Pydantic v2(皆既有);**`websockets`(直連 Azure realtime WS 的 async client,提為直接依賴——已隨 uvicorn/litellm 在 image,現宣告為直接依賴)**;既有 `proxy/preflight.py`、計費(`services/pricing.py` 的 `calculate_unit_cost`)、audit。**realtime 不經 litellm**(其 realtime 是 Proxy form / client 直連,違原則;借其 `RealTimeStreaming` 結構自寫薄 relay)。 (043-realtime-transcription) +- PostgreSQL(生產)/ SQLite(dev、CI);**不新增表、不新增 migration**——沿用增量②(0019)的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`,新單位 `minute` 為字串值。 (043-realtime-transcription) - Python 3.11+ + LiteLLM(proxy core)、FastAPI(admin API)、 (001-gateway-core) @@ -86,9 +88,9 @@ cd src [ONLY COMMANDS FOR ACTIVE TECHNOLOGIES][ONLY COMMANDS FOR ACTIVE TECHNOLO Python 3.11+: Follow standard conventions ## Recent Changes +- 043-realtime-transcription: Added Python 3.11+(後端為主)/ TypeScript strict + React 19(前端僅目錄顯示 realtime 類型 + 連線範例,極少量) + FastAPI(WebSocket — starlette 內建,**專案首次使用**)、SQLAlchemy 2.x async、Pydantic v2(皆既有);**`websockets`(直連 Azure realtime WS 的 async client,提為直接依賴——已隨 uvicorn/litellm 在 image,現宣告為直接依賴)**;既有 `proxy/preflight.py`、計費(`services/pricing.py` 的 `calculate_unit_cost`)、audit。**realtime 不經 litellm**(其 
realtime 是 Proxy form / client 直連,違原則;借其 `RealTimeStreaming` 結構自寫薄 relay)。 - 042-endpoint-registry: Added Python 3.11+(後端)/ TypeScript strict + React 19 + Vite 6(前端少量範例) + FastAPI(含 `UploadFile` multipart,既有)、SQLAlchemy 2.x async、Pydantic v2、`litellm`(`amoderation`/`asearch`/`aimage_edit` 既有函式);TanStack Query、shadcn/ui(前端)——**皆既有,不新增套件** - 041-multi-endpoint-complete: Added Python 3.11+(後端)/ TypeScript strict + React 19 + Vite 6(前端) + FastAPI(含 `UploadFile` multipart)、SQLAlchemy 2.x async、Pydantic v2、`litellm`(`aimage_generation`/`arerank`/`aspeech`/`atranscription` library form);TanStack Query、shadcn/ui(前端)——**皆既有,不新增套件** -- 040-ocr-billing-units: Added Python 3.11+(後端)/ TypeScript strict + React 19 + Vite 6(前端) + FastAPI、SQLAlchemy 2.x async、Alembic、Pydantic v2、`litellm`(library:`aocr` 既有函式);TanStack Query、shadcn/ui(前端)——**皆既有,不新增套件** diff --git a/pyproject.toml b/pyproject.toml index a84011d..bfcae39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,11 @@ dependencies = [ # multipart form parsing — required by FastAPI for /v1/audio/transcriptions # (STT) audio file upload. FastAPI's official optional dependency. "python-multipart>=0.0.18", + # async WebSocket client — required to relay /v1/realtime (live transcription) + # directly to the upstream provider's realtime WS. Already present transitively + # via uvicorn[standard]; declared directly so it can't vanish on an upstream + # change (Constitution Deviation: justified — direct provider WS needs a client). 
+ "websockets>=13.0", ] [project.optional-dependencies] diff --git a/specs/043-realtime-transcription/checklists/requirements.md b/specs/043-realtime-transcription/checklists/requirements.md new file mode 100644 index 0000000..2366d64 --- /dev/null +++ b/specs/043-realtime-transcription/checklists/requirements.md @@ -0,0 +1,37 @@ +# Specification Quality Checklist: realtime 即時字幕端點 + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-06-12 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- 0 個 [NEEDS CLARIFICATION]:所有未定細節都有「對齊既有專案慣例」的合理預設,記入 Assumptions(計量單位以供應商回報為先/否則估串流時長、撤回 SLO 對齊既有、配額建立時檢查、額度綁分配不限連線數)。 +- 三個技術未知(直連供應商 realtime 連線協定、連線結束的計量來源、持續連線的轉送與連線中撤回機制)刻意**不**放進 spec——它們是規劃階段(research/plan)要先釘死的能力邊界,不是需求層的模糊。 +- SC-004「約定上限時間」未填具體秒數為刻意:撤回 SLO 的具體值對齊既有分配撤回機制、由規劃階段定,spec 層不硬編。 +- Input 行保留 user 原述(含 WebSocket / gpt-realtime-whisper / litellm Proxy 等字眼)為 speckit 慣例(記錄原始描述);正文以業務語言(持續連線/串流/相容端點)表述,不洩漏實作。 diff --git a/specs/043-realtime-transcription/contracts/realtime-transcription.md 
b/specs/043-realtime-transcription/contracts/realtime-transcription.md new file mode 100644 index 0000000..456468d --- /dev/null +++ b/specs/043-realtime-transcription/contracts/realtime-transcription.md @@ -0,0 +1,65 @@ +# Contract: realtime 即時字幕 WebSocket 端點 + +**端點**:`GET /v1/realtime`(WebSocket upgrade)— OpenAI 相容 realtime transcription +**認證**:`Authorization: Bearer <應用金鑰>`(連線 header,沿用既有金鑰)或 OpenAI realtime 慣例的 subprotocol header(tasks 階段對齊 OpenAI 客戶端慣例) +**形態**:雙向 WebSocket。客戶端上行音訊、平台下行文字事件。 + +## 連線生命週期 + +``` +client → (WS upgrade + Bearer key) + platform: run_preflight(key → allocation → access → quota → model) + ├─ 不通過 → close(code, reason) ;不開始串流(FR-002/005/007) + └─ 通過 → accept;開一條 platform↔Azure WS;進入雙向轉送 +client → session.update {type:"transcription", model, audio.format} +client → input_audio_buffer.append {audio: <base64 PCM>} (重複,串流) +platform→ conversation.item.input_audio_transcription.delta {delta} (即時,SC-001 <1s) +platform→ conversation.item.input_audio_transcription.completed {transcript} +... 
+(任一端關閉 / 撤回 re-check 觸發)→ platform: 落帳 CallRecord(unit=minute) → close +``` + +## Client → Server 事件(平台接受並轉送上游) + +| 事件 | 必要欄位 | 平台行為 | +|---|---|---| +| `session.update` | `type:"transcription"`, `model`, `audio.format{type,rate}` | 校驗 model 為 realtime 類型(否則 close,FR-007);記下 sample_rate/format 供計量;轉送上游 | +| `input_audio_buffer.append` | `audio`(base64 PCM)| **累計 audio_bytes(計量來源,R2)**;轉送上游 | +| `input_audio_buffer.commit` | — | 轉送上游(manual turn detection)| + +## Server → Client 事件(平台從上游轉回) + +| 事件 | 內容 | 備註 | +|---|---|---| +| `conversation.item.input_audio_transcription.delta` | `delta`(增量文字)| 即時字幕主要輸出;SC-001 首段 <1s | +| `conversation.item.input_audio_transcription.completed` | `transcript`(完整)| 一段話完成;平台在此路徑可記觀測 | +| `error` | `error{code,message}` | 上游錯誤透明轉回;不洩漏上游金鑰(FR-006)| + +## 連線關閉碼(平台主動關閉時) + +| 情境 | 關閉碼/原因 | 對應 | +|---|---|---| +| 金鑰無效/撤回、無有效分配、配額已滿 | policy violation + 可理解 reason | FR-002, SC-005 | +| 模型非 realtime 類型 | unsupported + reason | FR-007 | +| 連線中分配被撤回/暫停/隔離 | revoked + reason | FR-005, SC-004 | +| 上游斷線/失敗 | upstream_error + 透明原因 | FR-009 | + +## 計量契約 + +- 計量單位:`minute`;數量 = `ceil(Σ append PCM bytes / (rate × bytes_per_sample × channels) / 60)`(精確 rounding tasks 定)。 +- 落帳時機:**連線關閉(任何原因,含異常)**——`audio_bytes` 即時累計確保不漏記(FR-004/SC-003)。 +- 歸戶:preflight 解出的 allocation;費用 = `calculate_unit_cost`(既有)。 + +## 不洩漏契約(FR-006) + +任何下行事件、錯誤、關閉原因 MUST NOT 含上游 endpoint / key / 內部部署名;上游錯誤轉譯為對使用者可理解的訊息。 + +## 契約測試(合併前必過) + +1. 無效/撤回金鑰連線 → 被 close、未開始串流。 +2. 非 realtime 模型 → close(unsupported)。 +3. 有效連線 + 送 append → 收到 delta(mock provider WS 回預錄 delta)。 +4. 連線關閉 → 寫一筆 `CallRecord(unit="minute")`、quantity 對得上送出的音訊時長。 +5. 連線中 mock 撤回分配 → 平台在 N 秒內主動 close(revoked) + 已累計時長落帳。 +6. 異常中止(client 直接斷)→ 仍落帳已累計時長(不漏記)。 +7. 
任何錯誤/關閉訊息不含上游 key/endpoint。 diff --git a/specs/043-realtime-transcription/data-model.md b/specs/043-realtime-transcription/data-model.md new file mode 100644 index 0000000..af306c6 --- /dev/null +++ b/specs/043-realtime-transcription/data-model.md @@ -0,0 +1,54 @@ +# Phase 1 Data Model: realtime 即時字幕端點 + +**核心結論:不新增表、不新增 migration。** realtime 連線本身是 in-memory 的生命週期物件(不落表);用量沿用既有 `call_records`(增量② 0019 的 `quantity`/`unit`)+ `price_list`(`price_unit`/`price_per_unit_usd`),新單位 `minute` 為字串值。 + +## 1. RealtimeSession(in-memory,非持久化) + +一次 WS 連線的執行期狀態,**不寫表**——只活在連線存活期間,斷線時把累計結果落成一筆 `CallRecord`。 + +| 欄位 | 型別 | 說明 | +|---|---|---| +| `allocation_id` | str | preflight 解出的歸戶分配(計量落帳對象)| +| `credential_id` | str | 建立連線的應用金鑰(審計用)| +| `member_id` | str | 擁有者(審計用)| +| `resource_model` | str | 請求的 realtime 模型 slug | +| `upstream_model` | str | 對映到上游的模型字串 | +| `started_at` | datetime(tz-aware)| 連線建立時間 | +| `audio_bytes` | int | 累計收到的 PCM 音訊 bytes(計量來源,R2)| +| `sample_rate` / `bytes_per_sample` / `channels` | int | 由 `session.update` 的 format 決定,換算時長用 | +| `close_reason` | enum | `normal` / `client_abort` / `upstream_error` / `revoked` | + +**衍生**:`duration_seconds = audio_bytes / (sample_rate × bytes_per_sample × channels)`;`quantity_minutes = ceil(duration_seconds / 60)` 或精確分鐘(tasks 階段定 rounding,對齊計費慣例)。 + +**狀態轉移**:`connecting`(preflight 中)→ `streaming`(轉送中、累計 audio_bytes、週期 re-check)→ `closing`(任一端關閉或撤回觸發)→ 落帳 `CallRecord` → `closed`。 + +## 2. CallRecord(既有,沿用) + +斷線時寫**一筆**,與其他非 token 端點同機制: + +| 欄位 | 值 | +|---|---| +| `allocation_id` | RealtimeSession.allocation_id(歸戶;異常中止仍寫)| +| `quantity` | 累計分鐘數(R2 自算)| +| `unit` | `"minute"`(新字串值,**非新欄位**,0019 已有 unit 欄)| +| `cost_usd` | `calculate_unit_cost(quantity, price_per_unit)`(既有函式)| +| `outcome` | 對映 close_reason(`success` / `upstream_error` …,沿用既有 enum)| +| token 欄 | NULL(非 token 端點,沿用 0019 的 NULL⇒非 token 語意)| + +**FR-004 不漏記**:`audio_bytes` 在 relay 迴圈即時累計,故任何斷線路徑(正常/異常/撤回)落帳時都有值。 + +## 3. 
PriceList(既有,沿用) + +realtime 模型的價以 `price_unit="minute"` + `price_per_unit_usd`(如 gpt-realtime-whisper $0.017)存一筆 point-in-time 版本(append-only)。admin 在既有 `/prices` 設定(單位下拉加 `minute`,沿用階段 29 unit billing 的單位感知 UI)。**LiteLLM 僅建議、PriceList 是計費真理**(不變)。 + +## 4. Allocation(既有,沿用) + +歸戶對象 + 配額載體 + 連線中 re-check 的狀態來源(active / revoked / paused / quarantined)。**不改 schema**。 + +## 5. model_kind:realtime 類型 + +`services/model_kind.py` 的 mode→kind 對映加 `realtime`(litellm `mode` 為 realtime/realtime-transcription 時)。對應目錄誠實(FR-008):realtime 模型顯正確類型、不假裝 chat。**改 model_kind 對映後須重跑全套件**(experience 教訓:有「未知 mode 反例」整合測試會撞)。 + +--- + +**Migration 結論**:**無**。沿用 0019 的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`;`minute` 是資料值非 schema 變更。RealtimeSession 不落表。 diff --git a/specs/043-realtime-transcription/plan.md b/specs/043-realtime-transcription/plan.md new file mode 100644 index 0000000..da5c220 --- /dev/null +++ b/specs/043-realtime-transcription/plan.md @@ -0,0 +1,86 @@ +# Implementation Plan: realtime 即時字幕端點 + +**Branch**: `043-realtime-transcription` | **Date**: 2026-06-12 | **Spec**: [spec.md](spec.md) +**Input**: Feature specification from `specs/043-realtime-transcription/spec.md` + +## Summary + +對成員開放一個 **WebSocket 即時字幕端點**:客戶端以分配到的金鑰建立持續連線、串流 PCM 音訊,平台**自寫薄 relay**(借鏡 litellm `RealTimeStreaming` 結構、但接我們的「分配」計費)直連 Azure Foundry 的 gpt-realtime-whisper,即時把 `conversation.item.input_audio_transcription.delta/.completed` 事件轉回客戶端。連線建立跑既有 preflight;連線期間**自行從 append 的音訊 bytes 累計時長**(不依賴 provider usage 事件,天然滿足異常中止不漏記),斷線時記一筆 `unit="minute"` 的 CallRecord 歸戶分配;連線期間定期 re-check 分配狀態,被撤回/暫停/隔離即主動斷線。這是專案**第一個長連線 / WebSocket 端點**,刻意獨立於階段 31 的非串流 registry(比照 `responses.py` 的 SSE 獨立 handler)。 + +## Technical Context + +**Language/Version**: Python 3.11+(後端為主)/ TypeScript strict + React 19(前端僅目錄顯示 realtime 類型 + 連線範例,極少量) +**Primary Dependencies**: FastAPI(WebSocket — starlette 內建,**專案首次使用**)、SQLAlchemy 2.x async、Pydantic v2(皆既有);**`websockets`(直連 Azure realtime WS 
的 async client,提為直接依賴——已隨 uvicorn/litellm 在 image,現宣告為直接依賴)**;既有 `proxy/preflight.py`、計費(`services/pricing.py` 的 `calculate_unit_cost`)、audit。**realtime 不經 litellm**(其 realtime 是 Proxy form / client 直連,違原則;借其 `RealTimeStreaming` 結構自寫薄 relay)。 +**Storage**: PostgreSQL(生產)/ SQLite(dev、CI);**不新增表、不新增 migration**——沿用增量②(0019)的 `call_records.{quantity,unit}` 與 `price_list.{price_unit,price_per_unit_usd}`,新單位 `minute` 為字串值。 +**Testing**: pytest——契約/單元用 starlette `TestClient.websocket_connect` 測「client ↔ 我們」這段;整合測試起一個 **mock provider realtime WS server**(送預錄事件流)驗 relay 轉送 / 時長累計 / 連線中撤回斷線;**真連 Azure realtime WS = 部署後手動煙霧**(見 Constitution Deviation)。 +**Target Platform**: Linux server(k3s-tew / ns ai-ccsh / helm release ai-api) +**Project Type**: web service(後端為主,前端極少量) +**Performance Goals**: 首段文字 < 1 秒(SC-001);per-minute 計量精度到秒換算分鐘。 +**Constraints**: 長連線 WebSocket(與既有 HTTP-only pipeline 形態不同);連線中撤回 re-check(SLO 對齊既有分配撤回);nginx 需 WS upgrade proxy;pod egress 需可達 `wss://*.services.ai.azure.com:443`(既有 443 egress 已開,需煙霧實證)。 +**Scale/Scope**: org-internal 課堂/會議並發連線(小規模);單一新端點 + 薄 relay + 計量 + 前端目錄微調。 + +## Constitution Check + +*GATE: 評估每條核心原則。* + +- **I. Test-First(NON-NEGOTIABLE)**:✅ 可遵守。starlette `websocket_connect` 可在測試中建立 client 連線、mock 一個 provider WS server,先寫失敗測試(連線拒絕、轉送、時長累計、撤回斷線)再實作。TDD 流程不受 WS 形態阻礙。 +- **II. API 契約優先**:✅ 遵守。realtime 的契約是 **WS 事件流**——`contracts/realtime-transcription.md` 先定 client→server(`session.update`/`input_audio_buffer.append`)與 server→client(`...transcription.delta/.completed`、錯誤/關閉碼)事件,契約測試合併前必過。 +- **III. 整合測試覆蓋外部依賴 + CI 可重現**:⚠️ **部分偏離(見 Complexity Tracking / Deviation)**。憲法要求「不得僅以 mock 取代真實邊界、整合測試 CI 可重現」;但真連 Azure realtime WS 是**長連線 + 需憑證 + 即時音訊串流**,無法在 CI 可重現執行。補救:整合測試用 **mock provider WS server** 驗我們這側全部行為;真實邊界以**部署後手動煙霧**驗(比照既有 chat/responses 上游——本專案既有端點的上游真打本就走 mock + 部署煙霧,非 CI 真打)。 +- **IV. 可觀測性**:✅ 遵守。連線建立/結束/被撤回斷線、累計時長、計量結果、上游失敗原因皆結構化記錄(沿用既有 audit + CallRecord 透明度,FR-009);不洩漏上游金鑰(FR-006)。 +- **V. 
簡潔優先(YAGNI)**:✅ 大致遵守,一個 justified 新依賴。薄 relay 只做單向 transcription、不做雙向對話/工具;**`websockets` 提為直接依賴**是直連 provider WS 的必需(已在 image),於 Deviation 明列。不為未來雙向對話預留抽象。 + +**結論**:可進 Phase 0。一個 Deviation(CI 無法真打 realtime 上游)+ 一個 justified 依賴(websockets),均於下方明列。 + +## Project Structure + +### Documentation (this feature) + +```text +specs/043-realtime-transcription/ +├── plan.md # 本檔 +├── research.md # Phase 0:三個技術未知的決策 +├── data-model.md # Phase 1:realtime session + CallRecord(minute) 計量 +├── quickstart.md # Phase 1:客戶端怎麼連 + 開發者驗證步驟 +├── contracts/ +│ └── realtime-transcription.md # WS 事件契約(client↔server) +└── tasks.md # Phase 2(/speckit.tasks,非本指令產出) +``` + +### Source Code (repository root) + +```text +src/ai_api/ +├── proxy/ +│ ├── realtime.py # 新增:WS 端點 handler + 薄 relay(類比 responses.py,獨立於 registry) +│ ├── upstream.py # 加:開一條到 Azure realtime 的 async WS client(websockets) +│ ├── preflight.py # 沿用:連線建立時跑(既有) +│ └── registry.py # 不動(registry 專收非串流同步端點,realtime 不進) +├── services/ +│ ├── pricing.py # 沿用 calculate_unit_cost(unit="minute") +│ ├── model_kind.py # 加:realtime kind 判定(mode → realtime) +│ └── model_test.py # 不動(realtime 不適用 recipe 表的一次性測試;目錄誠實由 model_kind 涵蓋) +└── api/ + └── (realtime WS route 掛載,nginx 既有 /v1 之下加 WS upgrade) + +deploy/helm/ai-api/ # nginx WS upgrade(Upgrade/Connection header)config + +frontend/src/ +├── routes/admin/model-detail.tsx # realtime kind 顯示(沿用 KIND_LABEL) +└── components/api-usage-example.tsx # realtime 連線範例(WS) + +tests/ +├── contract/test_realtime_transcription.py # WS 事件契約 +├── integration/test_realtime_relay.py # mock provider WS:轉送/計量/撤回斷線 +└── unit/test_realtime_metering.py # 音訊 bytes → 時長換算 +``` + +**Structure Decision**:realtime 為**獨立 WS handler**(`proxy/realtime.py`),不納入階段 31 的 `engine/registry`——後者的三軸(IOShape × Meter × call)建在「一請求一回應一筆帳」的同步假設上,realtime 是長連線、破壞該假設(同 `responses.py` 的 SSE 也獨立於 registry)。計量沿用既有 unit billing(`minute` 為新字串單位),**零 migration**。 + +## Complexity Tracking + +> 僅列 Constitution Check 的偏離與須說明的複雜度。 + +| 
Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|--------------------------------------| +| **原則 III**:realtime 上游真打無法進 CI(整合測試以 mock provider WS server 取代真實邊界) | realtime 是長連線 WS + 需 Azure 憑證 + 即時音訊串流,CI 無法可重現執行;本專案既有上游端點(chat/responses)本就走 mock + 部署煙霧 | 在 CI 真連 Azure realtime WS → 需在 CI 注入生產憑證(安全面)、起即時音訊源、維持長連線,flaky 且昂貴,違反「CI 可重現」初衷 | +| **原則 V / 新依賴**:`websockets` 提為直接依賴 | 直連 Azure realtime WS 需 async WebSocket client;litellm 的 realtime 是 Proxy form(不採),故自寫 relay 必需一個 WS client | 靠 transitive(uvicorn/litellm 帶入)→ 上游一移除即斷,違反「依賴要顯式」;改用 aiohttp → 更重且 image 未必有,websockets 已在 image | diff --git a/specs/043-realtime-transcription/quickstart.md b/specs/043-realtime-transcription/quickstart.md new file mode 100644 index 0000000..86d64e3 --- /dev/null +++ b/specs/043-realtime-transcription/quickstart.md @@ -0,0 +1,57 @@ +# Quickstart: realtime 即時字幕端點 + +## 給接平台的開發者(客戶端怎麼用) + +平台暴露 OpenAI 相容的 realtime transcription WebSocket 端點。用你分配到的**應用金鑰**連線、串流麥克風音訊(PCM),即時收文字事件,自己渲染字幕。 + +```python +# 概念範例(實際以 OpenAI realtime 客戶端慣例為準) +import websockets, json, base64 + +async with websockets.connect( + "wss://<平台網域>/v1/realtime", + additional_headers={"Authorization": "Bearer <你的應用金鑰>"}, +) as ws: + await ws.send(json.dumps({ + "type": "session.update", + "session": {"type": "transcription", "model": "azure/gpt-realtime-whisper", + "audio": {"input": {"format": {"type": "audio/pcm", "rate": 24000}}}}, + })) + # 串流音訊 + await ws.send(json.dumps({"type": "input_audio_buffer.append", + "audio": base64.b64encode(pcm_chunk).decode()})) + # 收即時字幕 + async for msg in ws: + ev = json.loads(msg) + if ev["type"] == "conversation.item.input_audio_transcription.delta": + print(ev["delta"], end="", flush=True) +``` + +- 你拿不到、也不需要底層供應商金鑰——只用平台金鑰連平台端點。 +- 用量按**分鐘**計,歸戶到你的分配、計入配額,可在「用量」頁看到。 +- 金鑰被撤回 / 分配被暫停時,進行中的連線會被平台主動中止。 + +## 給維護者(implement 階段的真打驗證步驟) + +CI 不真連 Azure realtime WS(Constitution Deviation);真實邊界以**部署後手動煙霧**驗。建議順序: + +1. 
**協定真打**(research R1/R2 校驗):用 Azure Foundry 的 gpt-realtime-whisper endpoint+key,跑一支最小腳本連 WS、送一段已知秒數的 PCM、確認: + - 收到 `...transcription.delta`(接得通、首段 <1s) + - 我們自算的時長 vs(若有)provider usage / Azure 帳單對得上(R2 校驗,必要時加校正) +2. **relay 整合**(不需真 Azure):起一個 **mock provider WS server** 送預錄事件流,跑契約測試 1–7(contracts/)。 +3. **連線中撤回**:建立連線 → 後台撤回該分配 → 確認 N 秒內被 close(revoked) + 已累計時長落帳。 +4. **部署煙霧**(rev 上線後): + - pod egress 實證 `wss://<resource>.services.ai.azure.com:443` 可達。 + - nginx WS upgrade 生效(壞金鑰連線被 close 而非 200/SPA fallback)。 + - 真打一次完整字幕 → 用量頁看到一筆 `unit=minute` 歸戶分配。 + +## 驗收對照(spec Success Criteria) + +| SC | 驗證 | +|---|---| +| SC-001 首段 <1s | 步驟 1 真打計時 | +| SC-002 100% 歸戶 | 步驟 4 用量頁查 CallRecord | +| SC-003 異常不漏記 | 契約測試 6(client 直接斷)| +| SC-004 撤回上限內斷線 | 步驟 3 / 契約測試 5 | +| SC-005 無效金鑰 100% 拒絕 | 契約測試 1 | +| SC-006 既有端點零回歸 | 全套件 + 既有 contract 測試 git diff 為空 | diff --git a/specs/043-realtime-transcription/research.md b/specs/043-realtime-transcription/research.md new file mode 100644 index 0000000..83ff96a --- /dev/null +++ b/specs/043-realtime-transcription/research.md @@ -0,0 +1,68 @@ +# Phase 0 Research: realtime 即時字幕端點 + +本檔釘死 spec 刻意延後到規劃階段的三個技術未知。研究方式:inspect 本地 litellm realtime 模組(藍本)+ OpenAI/Azure realtime transcription 官方協定 + 既有專案設施盤點。**端到端真連 Azure realtime WS 安排在 implement 階段於有憑證環境**(用戶已有 Azure Foundry 部署 + key)——本階段把「協定 / relay 結構 / 計量方法 / 基礎建設方案」釘到可實作的程度。 + +--- + +## R1:直連 provider realtime WS 的協定與 relay 結構 + +**Decision**:自寫薄 relay,借鏡 litellm `RealTimeStreaming` 的雙向轉送結構,但**不經 litellm**、改接我們的分配計費。協定走 OpenAI 相容 realtime transcription: + +- 客戶端 ↔ 我們(FastAPI `@app.websocket`):客戶端送 `session.update`(`type:"transcription"`, `model`, `format`)、`input_audio_buffer.append`(base64 PCM)、`input_audio_buffer.commit`;我們回 `conversation.item.input_audio_transcription.delta`(增量)/`.completed`(完整)。 +- 我們 ↔ Azure(`websockets` async client):以 Azure Foundry realtime endpoint + key 開 WS,雙向轉送事件。 +- relay 骨架(借自 litellm `realtime_streaming.py:RealTimeStreaming`):`bidirectional_forward()` = 同時跑 
`client→backend` 與 `backend→client` 兩個轉送協程;在 `backend→client` 路徑上**攔截** `conversation.item.input_audio_transcription.completed` 做我們的記帳/觀測。 + +**Rationale**:litellm 的 realtime 是 Proxy form(client 直連 provider、音訊不經 gateway),用它會失去原則 2 可追蹤性與原則 3 即時撤回(experience 第 40 條)。但它的**轉送結構**是成熟藍本,借結構、自接計費=站在肩膀上又守原則。OpenAI 相容協定讓任何會講 realtime 的客戶端(會議/字幕工具)能直接接(願景「主流工具開箱即用」)。 + +**Alternatives considered**: +- litellm Proxy form realtime relay → 否決:client 直連、不認得「分配」(experience 第 40 條、principles 原則 5)。 +- litellm `_arealtime`(library 低階入口)→ 否決:內部 API、不穩定、且仍偏 Proxy 取向;自寫薄 relay 控制權更清楚(原則 7 適配層)。 +- 從零摸 realtime 協定 → 否決:litellm `RealTimeStreaming` 已把 beta↔GA 事件 remap、轉送骨架做過,借鏡省大量試錯。 + +--- + +## R2:per-minute 計量的來源 + +**Decision**:**我們自己從客戶端 `input_audio_buffer.append` 的 PCM bytes 累計音訊時長**,斷線時換算分鐘記一筆 `CallRecord(quantity=分鐘, unit="minute")`,不依賴 provider 回 usage 事件。時長 = Σ(append PCM bytes) / (sample_rate × bytes_per_sample × channels)。 + +**Rationale**: +- OpenAI realtime transcription 官方文件**未保證 usage / 計量事件**(WebFetch 實證:transcription guide 無 usage 欄位);gpt-realtime-whisper 按**音訊分鐘**計費($0.017/min)。 +- 自己從 append bytes 算時長=**自包含、不受 provider 是否回 usage 影響**,且**天然滿足 FR-004「異常中止不漏記」**——已 append 的音訊就算數,連線怎麼斷都已累計。 +- 對應 experience「STT per-second 計量沒 duration 來源就降級」的延伸:這次 duration 來源是「我們轉送的音訊量」,可控可算,不必賭 provider 回什麼。 +- 沿用增量②(0019)的 `call_records.quantity/unit` + `calculate_unit_cost`,`minute` 為新字串單位——**零 migration**。 + +**Alternatives considered**: +- 信 provider 的 usage 事件 → 否決:文件不保證有;若有則作為**校驗**而非主來源(implement 階段真打時對照)。 +- 連線 wall-clock 時間(含靜音)→ 否決:可能與 provider 按「音訊時長」計費不一致,傾向高估;以實際 append 的音訊量為準較貼近計費基礎。 +- 按 transcript 字元/token → 否決:gpt-realtime-whisper 按分鐘非按 token,單位不符。 + +**Implement 階段待校驗**:真打一次 Azure,比對「我們算的分鐘」vs「Azure 帳單/若有的 usage 事件」,必要時加校正係數(admin 可覆寫價,沿用 PriceList 是計費真理)。 + +--- + +## R3:FastAPI WS relay + nginx WS upgrade + egress + 連線中撤回 + +**Decision**: +- **端點**:FastAPI `@app.websocket("/v1/realtime")`(或對齊 OpenAI 路徑),starlette 內建、`websockets` 15.0.1 已在 image。 +- **連線建立 
preflight**:WS accept 前(或 accept 後第一個 `session.update`)跑既有 `run_preflight`(金鑰→分配→存取→配額→model binding);不符即關閉連線回相容錯誤碼。 +- **連線中撤回**:在 relay 迴圈旁跑一個**週期性協程**,每 N 秒 re-check 該分配狀態(沿用既有撤回查詢),狀態非 active(撤回/暫停/隔離)即主動 close WS。N 對齊既有撤回 SLO(具體值 tasks 階段定,預設與既有一致)。 +- **nginx**:在既有 `location /v1`(或新 `location /v1/realtime`)加 `proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; proxy_http_version 1.1;`——標準 WS upgrade。 +- **egress**:pod 需可達 `wss://<resource>.services.ai.azure.com:443`;既有 443 egress 已開(OCR 那次驗過 `raw.githubusercontent.com:443`),WS over 443 同通——**部署煙霧實證**。 + +**Rationale**:FastAPI/starlette 原生 WS + websockets client 都已在 image,無基礎建設缺口;nginx WS upgrade 是標準配置;撤回用「旁路週期協程」而非阻塞 relay,乾淨且符合原則 3(長連線不能只在建立時檢查一次)。對應 experience「串流端點事後記帳在 client 還連著時做」——計量綁在連線存活期累計、斷線點落帳。 + +**Alternatives considered**: +- 撤回檢查塞進每個 audio 事件 → 否決:耦合轉送熱路徑、頻率不可控;旁路週期協程更清楚。 +- 不做連線中撤回(只建立時檢查)→ 否決:違反原則 3 即時撤回(長連線的核心風險)。 +- 走 SSE 而非 WS → 否決:realtime transcription 是雙向(音訊上行 + 文字下行),SSE 只能單向下行。 + +--- + +## 研究結論彙整(給 Phase 1 / tasks) + +| 未知 | 結論 | 落地 | +|---|---|---| +| 協定 + relay 結構 | OpenAI 相容 transcription 事件流;借 litellm `RealTimeStreaming` 雙向轉送骨架自寫 | `proxy/realtime.py` + `upstream` WS client | +| 計量來源 | 自算 append 音訊時長 → `unit="minute"`,不賭 provider usage | `services` 計量 + `CallRecord(quantity,unit)`(0019,零 migration)| +| 基礎建設 + 撤回 | FastAPI WS + websockets(已在 image,提直接依賴);nginx WS upgrade;旁路週期協程 re-check 撤回 | `realtime.py` + helm nginx config | +| 真打驗證 | 安排 implement 階段於有憑證環境(Constitution Deviation:不進 CI,部署煙霧)| quickstart 驗證腳本 | diff --git a/specs/043-realtime-transcription/spec.md b/specs/043-realtime-transcription/spec.md new file mode 100644 index 0000000..16769fa --- /dev/null +++ b/specs/043-realtime-transcription/spec.md @@ -0,0 +1,109 @@ +# Feature Specification: realtime 即時字幕端點 + +**Feature Branch**: `043-realtime-transcription` +**Created**: 2026-06-12 +**Status**: Draft +**Input**: User description: "realtime 即時字幕端點:暴露 OpenAI 相容的 realtime transcription WebSocket 端點,走 
gpt-realtime-whisper(Azure Foundry),連線時跑既有 preflight、連線期間 per-minute 計費歸戶到分配、連線期間定期 re-check 分配狀態被撤回即主動斷線;只做端點不做字幕 UI;build 直連 Azure realtime WS、不用 litellm Proxy form" + +## User Scenarios & Testing *(mandatory)* + +本功能對外的「使用者」是**接平台的客戶端應用 / 開發者**(用分配到的金鑰串流音訊取即時字幕),以及**管理員**(撤回分配時連線應隨之中止)與**成員**(在目錄看得到 realtime 模型與連線方式)。平台**不提供字幕畫面**——交付的是一個相容端點,讓會議軟體、字幕工具、課堂應用等自行接入。 + +### User Story 1 - 開發者用平台金鑰取得即時字幕 (Priority: P1) + +一個課堂/會議應用的開發者,拿著分配到的應用金鑰,把麥克風音訊串流到平台的 realtime 字幕端點,邊說邊收到一段段文字,用來顯示即時字幕——全程不需要、也拿不到底層 AI 供應商的金鑰。 + +**Why this priority**: 這是整個功能的核心價值與最小可用切片;沒有它,其餘(計費、撤回)都無對象。單獨完成即構成 MVP——「能用平台金鑰取得即時字幕」本身就交付價值。 + +**Independent Test**: 用一把有效、且對某 realtime 模型有有效分配的金鑰建立連線,串入一段音訊,驗證能即時收到對應文字;用無效/撤回的金鑰則無法建立連線。 + +**Acceptance Scenarios**: + +1. **Given** 一把有效金鑰、且其分配含某個 realtime 字幕模型,**When** 客戶端建立連線並串流音訊,**Then** 客戶端持續收到該段音訊的即時文字結果。 +2. **Given** 一把無效或已撤回的金鑰,**When** 客戶端嘗試建立連線,**Then** 連線被拒絕、不開始串流,且不洩漏任何底層供應商資訊。 +3. **Given** 一把有效金鑰,但請求的模型不是 realtime 字幕類型,**When** 客戶端嘗試建立連線,**Then** 連線被拒絕並回可理解的錯誤(此模型不支援即時字幕)。 + +--- + +### User Story 2 - 即時字幕用量按時間計費並歸戶到分配 (Priority: P2) + +成員/管理員需要 realtime 字幕的用量跟其他端點一樣**可盤點、可計費、可追蹤**——每一次連線消耗多少(以時間計)、算在哪一筆分配、計入配額,都看得到。 + +**Why this priority**: 對應可追蹤性的核心承諾;沒有它,realtime 會成為「用得到但帳目斷裂」的影子用量,違反平台不變式。但需先有 P1 才有用量可計。 + +**Independent Test**: 完成一次連線後,驗證該次用量(串流時長)以時間單位記錄、歸戶到對應分配、計入該分配配額,並出現在用量總覽中;連線即使異常中止,已串流的時間仍被記錄。 + +**Acceptance Scenarios**: + +1. **Given** 一次正常結束的字幕連線,**When** 連線關閉,**Then** 該次用量以時間單位記到對應分配、計入配額、可在用量視圖查到。 +2. **Given** 一次因網路中斷而異常結束的連線,**When** 連線中止,**Then** 已串流的時間仍被計費、不漏記。 +3. **Given** 同一成員以多把金鑰使用同一筆分配的 realtime 模型,**When** 各自連線,**Then** 用量都歸戶到同一分配、共用其配額。 + +--- + +### User Story 3 - 分配被撤回時進行中的連線隨即中止 (Priority: P3) + +管理員撤回(或暫停/隔離)某筆分配後,即使該分配當下有正在進行的字幕連線,也必須在限定時間內被切斷,不能靠連線自然結束或金鑰自然過期。 + +**Why this priority**: 對應即時撤回原則;長連線特有的風險——一條已建立的連線若不主動檢查狀態,會在撤回後繼續消耗資源。但屬於 P1 之上的治理保護,非首個可用切片。 + +**Independent Test**: 在一條進行中的字幕連線期間,由管理員撤回該分配,驗證連線在約定時間內被主動切斷,且切斷前已串流的時間正確計費。 + +**Acceptance Scenarios**: + +1. 
**Given** 一條進行中的字幕連線,**When** 管理員撤回其分配,**Then** 連線在約定上限時間內被主動終止。 +2. **Given** 一條進行中的字幕連線,**When** 其分配被暫停或自動隔離,**Then** 連線同樣在約定時間內被終止。 +3. **Given** 連線被中止,**When** 結算,**Then** 中止前已串流的時間被正確計費。 + +--- + +### Edge Cases + +- **連線建立時配額已滿**:拒絕建立連線,給可理解的錯誤(與其他端點的配額不足行為一致)。 +- **連線期間累計用量超過配額**:與既有非 token 端點(OCR/圖片)相同的已知限制——配額在連線建立時檢查,連線進行中的超額不即時中斷(每分鐘級硬上限為後續)。本次以「建立時擋、進行中記帳」為準。 +- **上游供應商在連線中斷線或回報失敗**:平台對客戶端結束連線並回報可理解的失敗原因,已串流的時間仍計費。 +- **客戶端直接中斷(沒有正常關閉握手)**:仍須結算已串流的時間,不漏記。 +- **同一金鑰/分配同時多條連線**:允許(用量綁分配、各自計時累計到同一分配,沿用「額度綁分配、不對連線數設限」的既有立場)。 +- **送出非 realtime 字幕類型的模型**:拒絕,回可理解錯誤。 + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: 系統 MUST 對外提供一個 realtime 即時字幕端點,讓客戶端以分配到的金鑰建立**持續連線**、串流音訊、並在說話過程中**即時**收到對應文字結果。 +- **FR-002**: 建立連線 MUST 通過既有的存取前置檢查——有效金鑰、對所請求的 realtime 模型有有效分配、且該分配可用(未撤回/暫停/隔離、配額未滿);任一不符即拒絕建立連線。 +- **FR-003**: 系統 MUST 將每一次 realtime 連線的用量以**時間(分鐘)為單位**計量,歸戶到對應分配,並計入該分配的配額與計費,與其他端點一致地出現在用量盤點中。 +- **FR-004**: 連線結束(正常關閉或異常中止)時,系統 MUST 記錄該次已串流的時間用量,**不得因異常中止而漏記**。 +- **FR-005**: 連線**期間**系統 MUST 定期檢查其分配的當前狀態;當分配被撤回、暫停或自動隔離時,MUST 在約定上限時間內**主動終止連線**,不依賴連線自然結束或金鑰過期。 +- **FR-006**: 底層 AI 供應商的金鑰與內部細節 MUST NOT 出現在連線、回應或錯誤訊息中;客戶端只以平台金鑰連平台端點。 +- **FR-007**: 當請求的模型**非 realtime 字幕類型**時,系統 MUST 拒絕連線並回可理解的錯誤。 +- **FR-008**: 成員目錄 MUST 正確標示 realtime 字幕類型模型(不假裝成其他類型),並提供如何連線取用的範例,與其他端點的範例呈現一致。 +- **FR-009**: 系統 MUST 對 realtime 字幕端點維持與其他端點一致的審計與錯誤透明度——可在維運視圖看到連線建立/結束、計量結果、與上游失敗原因。 + +### Key Entities *(include if feature involves data)* + +- **Realtime 字幕連線(session)**:一次持續連線的代表,綁定到一筆分配;具開始時間、結束時間、已串流時間、結束原因(正常/異常/被撤回)。用量計量的來源。 +- **呼叫紀錄(既有)**:沿用既有用量紀錄,數量以時間(分鐘)、單位為時間維度,歸戶到分配——與既有非 token 端點同一機制。 +- **分配(既有)**:realtime 用量計入的歸戶對象與配額載體;其狀態(有效/撤回/暫停/隔離)即連線期間 re-check 的依據。 + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: 開發者用平台金鑰連上端點並串流音訊後,**1 秒內**開始收到第一段文字結果(即時感成立)。 +- **SC-002**: **100%** 的 realtime 字幕連線用量都歸戶到正確分配並計入配額——無任何一次成為無歸屬的影子用量。 +- **SC-003**: 連線因網路或客戶端異常中止時,已串流時間的計費**零漏記**。 +- **SC-004**: 
管理員撤回分配後,該分配進行中的字幕連線在**約定上限時間內**被主動中止(可驗證的最大延遲)。 +- **SC-005**: 無效或已撤回的金鑰**無法**建立任何字幕連線(拒絕率 100%)。 +- **SC-006**: 既有所有端點(chat/embedding/ocr/stt/… 與計費)在本功能上線後**零回歸**。 + +## Assumptions + +- **沿用「分配」為計量歸戶與配額的第一公民**:realtime 用量綁分配、不對單一金鑰/連線數設上限(沿用既有原則:額度綁分配、token/連線數不另設限)。 +- **計量單位以時間(分鐘)為準**:以供應商回報的計量為優先;若回應未帶可用計量,則以連線實際串流時間估算(最終計量來源於規劃階段的能力驗證中釘死)。 +- **撤回 SLO 對齊既有撤回機制**:連線期間 re-check 的頻率使「撤回 → 斷線」落在與既有分配撤回一致的時間上限內(具體秒數於規劃階段定)。 +- **配額為「建立時檢查」**:連線進行中的累計超額不即時中斷——與既有非 token 端點(OCR/圖片)相同的已知限制,每分鐘級硬上限列為後續。 +- **只交付端點、不交付字幕 UI**:字幕畫面由接入的客戶端應用負責;平台提供相容端點 + 連線範例。 +- **採直連上游供應商的 realtime 連線**:不引入第三方代理閘道形態;沿用平台既有的「自製 gateway + 上游抽象層」邊界(避免引入不認得「分配」模型的並行計費權威)。 +- **目標供應商模型已就緒**:用於即時字幕的供應商模型已在供應端部署且平台已有可連憑證(規劃/實作階段可真打驗證)。 +- **既有存取、計費、審計機制重用**:preflight、用量紀錄、配額、審計事件沿用既有設施,realtime 為其新增的一種端點形態。 diff --git a/specs/043-realtime-transcription/tasks.md b/specs/043-realtime-transcription/tasks.md new file mode 100644 index 0000000..fa9a6a2 --- /dev/null +++ b/specs/043-realtime-transcription/tasks.md @@ -0,0 +1,182 @@ +# Tasks: realtime 即時字幕端點 + +**Input**: Design documents from `specs/043-realtime-transcription/` +**Prerequisites**: plan.md, spec.md, research.md, data-model.md, contracts/realtime-transcription.md, quickstart.md + +**Tests**: 包含(Constitution 原則 I Test-First 非協商)——契約/整合/單元測試先寫且先失敗,再實作。 + +**Organization**: 按 user story(P1/P2/P3)分階段,每個 story 以 mock provider realtime WS server 獨立可測。 + +## Format: `[ID] [P?] 
[Story] Description` + +- **[P]**: 不同檔案、無未完成依賴,可並行 +- **[Story]**: US1 / US2 / US3(對映 spec 的 P1/P2/P3) + +--- + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: 依賴與測試基礎建設 + +- [X] T001 將 `websockets` 提為直接依賴(`pyproject.toml`,已隨 image,宣告版本下限;PR 以 Constitution Deviation 說明)並確認 lockfile 更新 +- [ ] T002 [P] 建立 mock provider realtime WS server test fixture(`tests/conftest.py` 或 `tests/support/realtime_mock.py`):一個可在測試內啟動的假 realtime WS,依輸入送預錄 `...transcription.delta/.completed` 事件流,供所有整合/契約測試共用 +- [X] T003 [P] 在計量層登記 `minute` 單位:確認 `services/pricing.py` 的 `calculate_unit_cost` 對 `unit="minute"` 無礙(純資料值、無 schema 變更),補單元測試於 `tests/unit/test_pricing.py` + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: 讓 WS 連線能建立並轉送的最小骨架——所有 user story 的前置 + +**⚠️ CRITICAL**: 本階段未完成前,US1–US3 無法開工 + +- [ ] T004 實作上游 realtime WS client helper(`src/ai_api/proxy/upstream.py`):以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證(api_key/api_base),回傳可雙向收送的連線物件;金鑰不外洩 +- [ ] T005 建立 WS 端點 scaffold(`src/ai_api/proxy/realtime.py`):FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router(暫不含 preflight/relay 完整邏輯,先讓連線可建立可關閉) +- [X] T006 `services/model_kind.py` 的 mode→kind 對映加 `realtime`(litellm realtime/transcription mode → `realtime` kind);**改完重跑完整 `pytest tests/` 確認零回歸**(experience:「未知 mode 反例」整合測試會撞) + +**Checkpoint**: WS 連線可建立、可開上游連線、目錄能辨識 realtime 類型——可開始 US1 + +--- + +## Phase 3: User Story 1 - 開發者用平台金鑰取得即時字幕 (Priority: P1) 🎯 MVP + +**Goal**: 有效金鑰 → 建立 WS 連線 → 串流音訊 → 即時收文字 delta;無效/撤回/非 realtime 模型被拒。 + +**Independent Test**: 用 mock provider WS,有效金鑰連線送 append 收到 delta;無效金鑰被 close。 + +### Tests for User Story 1 ⚠️(先寫、先失敗) + +- [ ] T007 [P] [US1] 契約測試:無效/撤回金鑰連線被 close、未開始串流(`tests/contract/test_realtime_transcription.py`) +- [ ] T008 [P] [US1] 契約測試:請求非 realtime 類型模型 → close(unsupported)(同檔) +- [ ] T009 [P] [US1] 整合測試:有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`(mock provider 
WS,`tests/integration/test_realtime_relay.py`) + +### Implementation for User Story 1 + +- [ ] T010 [US1] 連線建立時跑既有 `run_preflight`(`src/ai_api/proxy/realtime.py`):金鑰→分配→存取→配額→model binding;不通過則 close 並回相容錯誤碼(不洩漏上游) +- [ ] T011 [US1] 雙向 relay 迴圈(`src/ai_api/proxy/realtime.py`):`client→backend` 與 `backend→client` 兩協程轉送(借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構),delta/completed 即時轉回客戶端 +- [ ] T012 [US1] 模型類型校驗 + 錯誤轉譯(`src/ai_api/proxy/realtime.py`):非 realtime kind → close(unsupported);上游錯誤透明轉回但不含 key/endpoint(FR-006/007) +- [ ] T013 [US1] 連線生命週期結構化日誌(`src/ai_api/proxy/realtime.py`):建立/關閉/原因,沿用既有 audit + 觀測(原則 IV) + +**Checkpoint**: 客戶端能用平台金鑰即時取得字幕;MVP 成立(計量/撤回尚未接) + +--- + +## Phase 4: User Story 2 - 即時字幕用量按時間計費並歸戶到分配 (Priority: P2) + +**Goal**: 每次連線的用量以分鐘計、歸戶分配、計入配額,異常中止不漏記。 + +**Independent Test**: 連線送已知時長音訊後關閉 → 寫一筆 `CallRecord(unit="minute")`、quantity 對得上;client 直接斷也落帳。 + +### Tests for User Story 2 ⚠️(先寫、先失敗) + +- [ ] T014 [P] [US2] 單元測試:PCM bytes → 秒 → 分鐘換算(含 rounding)(`tests/unit/test_realtime_metering.py`) +- [ ] T015 [P] [US2] 整合測試:連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配(`tests/integration/test_realtime_relay.py`) +- [ ] T016 [P] [US2] 整合測試:client 直接中斷(無正常握手)→ 已累計時長仍落帳(FR-004/SC-003) + +### Implementation for User Story 2 + +- [ ] T017 [US2] RealtimeSession 計量狀態(`src/ai_api/proxy/realtime.py`):解析 `session.update` 的 format(sample_rate/bytes_per_sample/channels)、在 relay 即時累計 `audio_bytes` +- [ ] T018 [US2] 斷線落帳(`src/ai_api/proxy/realtime.py`):duration→minute→`CallRecord`(`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason);**任何 close 路徑都落帳** +- [ ] T019 [P] [US2] 前端:admin `/prices` 單位下拉加 `minute`(`frontend/src/routes/admin/prices.tsx`,沿用階段 29 單位感知 UI),realtime 模型可設每分鐘價 + +**Checkpoint**: US1 + US2——即時字幕可用且用量可計費歸戶 + +--- + +## Phase 5: User Story 3 - 分配被撤回時進行中的連線隨即中止 (Priority: P3) + +**Goal**: 連線期間分配被撤回/暫停/隔離 → 約定時間內主動斷線,已累計時長落帳。 + +**Independent Test**: mock 連線進行中撤回分配 → N 秒內 close(revoked) + 落帳。 + +### 
Tests for User Story 3 ⚠️(先寫、先失敗) + +- [ ] T020 [P] [US3] 整合測試:連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳(`tests/integration/test_realtime_relay.py`) +- [ ] T021 [P] [US3] 整合測試:分配被暫停/隔離 → 同樣主動斷線(同檔) + +### Implementation for User Story 3 + +- [ ] T022 [US3] 旁路週期 re-check 協程(`src/ai_api/proxy/realtime.py`):每 N 秒查分配當前狀態,非 active → 主動 close(revoked);N 對齊既有撤回 SLO(常數集中、可調) +- [ ] T023 [US3] 與 US2 落帳整合(`src/ai_api/proxy/realtime.py`):撤回觸發的 close 同樣走斷線落帳(已累計時長不漏) + +**Checkpoint**: 三個 user story 全部獨立可用 + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +- [ ] T024 [P] 前端:`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型(KIND_LABEL)+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例(FR-008) +- [ ] T025 [P] nginx WS upgrade config(`deploy/helm/ai-api/`):`/v1/realtime`(或 `/v1`)加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1` +- [ ] T026 全綠關卡:`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸(既有 contract 測試 git diff 為空,SC-006) +- [ ] T027 部署後手動煙霧(quickstart.md,**需憑證環境**):pod egress `wss:443` 實證、壞金鑰連線被 close、真打一次完整字幕(首字 <1s)→ 用量頁見一筆 `unit=minute` 歸戶分配;R2 計量對照 Azure 帳單校驗 + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: 無依賴,可立即開始 +- **Foundational (Phase 2)**: 依賴 Setup;**BLOCKS 所有 user story** +- **User Stories (Phase 3–5)**: 皆依賴 Foundational + - US1(MVP)建議先做;US2/US3 在 US1 的 relay 骨架上疊(同檔 `realtime.py`,故 US2/US3 內部多為順序、跨檔的測試/前端可 [P]) +- **Polish (Phase 6)**: 依賴所需 user story 完成 + +### User Story Dependencies + +- **US1 (P1)**: Foundational 後即可——核心連線+轉送,MVP +- **US2 (P2)**: 邏輯上疊在 US1 的 relay(累計 audio_bytes 在轉送迴圈內);測試/前端可獨立 +- **US3 (P3)**: 旁路協程,與 US1 relay 並行;落帳與 US2 共用 + +### Within Each User Story + +- 測試先寫且先失敗 → 實作 → 重構 +- relay/計量/撤回多在同一檔 `proxy/realtime.py`,故同 story 內實作任務多為順序;不同檔(前端、測試)標 [P] + +### Parallel Opportunities + +- T002/T003(Setup)可並行 +- 各 story 的測試任務(T007–T009、T014–T016、T020–T021)標 [P] 可並行先寫 +- 前端任務(T019、T024)與後端不同檔,可並行 +- T025 nginx config 與後端邏輯不同檔,可並行 + +--- 
+ +## Parallel Example: User Story 1 + +```bash +# 先並行寫 US1 全部測試(先失敗): +Task: "契約測試 無效金鑰被 close — tests/contract/test_realtime_transcription.py" +Task: "契約測試 非 realtime 模型 close — tests/contract/test_realtime_transcription.py" +Task: "整合測試 有效連線收 delta(mock provider WS)— tests/integration/test_realtime_relay.py" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1) + +1. Phase 1 Setup → 2. Phase 2 Foundational(CRITICAL)→ 3. Phase 3 US1 → **STOP & VALIDATE**(mock provider WS 跑綠 = MVP)→ 視情況先以 mock 驗收,真打留 T027。 + +### Incremental Delivery + +1. Setup + Foundational → 基礎 +2. US1 → 即時字幕可用(mock 驗)→ MVP +3. US2 → 計費歸戶 → 可上線(計費完整) +4. US3 → 連線中撤回 → 治理完整 +5. Polish(前端目錄/範例 + nginx + 全綠 + 部署煙霧) + +### 真打限制(誠實標記) + +- T009/T015/T020 等整合測試**全用 mock provider WS**(CI 可重現,Constitution Deviation 的補救)。 +- **T027 真連 Azure realtime WS 需憑證環境**(維護者實機跑 quickstart)——R1/R2 的協定接通 + 計量對照在此校驗,非 CI。 + +--- + +## Notes + +- [P] = 不同檔、無依賴;relay/計量/撤回集中於 `proxy/realtime.py`,同 story 實作多順序。 +- 每個 task 或邏輯群組後 commit;測試先失敗再實作。 +- 改 `model_kind`(T006)後務必跑完整 `pytest tests/`(experience 教訓)。 +- 既有端點零回歸鐵證:既有 contract 測試檔 git diff 為空(SC-006)。 diff --git a/src/ai_api/services/model_kind.py b/src/ai_api/services/model_kind.py index a3f02a7..7b3959b 100644 --- a/src/ai_api/services/model_kind.py +++ b/src/ai_api/services/model_kind.py @@ -16,7 +16,7 @@ Kind = Literal[ "chat", "embedding", "tts", "image", "stt", "ocr", "rerank", - "moderation", "search", "image_edit", "unknown", + "moderation", "search", "image_edit", "realtime", "unknown", ] # litellm mode → our kind @@ -32,6 +32,9 @@ "moderation": "moderation", "search": "search", "image_edit": "image_edit", + # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Not a + # recipe-table "test model" kind — billed per-minute via the /v1/realtime relay. 
+ "realtime": "realtime", } diff --git a/tests/contract/test_pricing_units.py b/tests/contract/test_pricing_units.py index 5a61de8..129770a 100644 --- a/tests/contract/test_pricing_units.py +++ b/tests/contract/test_pricing_units.py @@ -81,3 +81,25 @@ async def test_current_price_map_surfaces_per_unit(app_client: AsyncClient) -> N entry = pm[("azure_ai", "doc-ocr")] assert entry["price_unit"] == "page" assert Decimal(entry["price_per_unit"]) == Decimal("0.003") + + +@pytest.mark.asyncio +async def test_minute_unit_for_realtime(app_client: AsyncClient) -> None: + """Phase 32: realtime transcription bills per-minute — same unit-billing path, + `minute` is just a new string unit value (no schema change).""" + now = datetime.now(UTC) + sm = get_sessionmaker() + async with sm() as s: + s.add(PriceList( + id=str(ULID()), provider="azure", model="gpt-realtime-whisper", + input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0), + price_unit="minute", price_per_unit_usd=Decimal("0.017"), + effective_from=now - timedelta(days=1), created_at=datetime.now(UTC), created_by="test", + )) + await s.commit() + price = await lookup_price_for_call( + s, provider="azure", model="gpt-realtime-whisper", call_time=now + ) + assert price is not None and price.price_unit == "minute" + # 5 minutes x $0.017 = $0.085 (per-minute billing through the existing path) + assert calculate_unit_cost(5, price.price_per_unit) == Decimal("0.085") From d718fae2de78ad5f8177a3be4b39945483260876 Mon Sep 17 00:00:00 2001 From: timcsy Date: Fri, 12 Jun 2026 16:46:33 +0800 Subject: [PATCH 2/4] =?UTF-8?q?feat(realtime):=20/v1/realtime=20WS=20relay?= =?UTF-8?q?=20=E2=80=94=20bidirectional=20transcription,=20per-minute=20bi?= =?UTF-8?q?lling,=20in-flight=20revocation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WS core for the realtime transcription endpoint (US1/US2/US3): - proxy/realtime.py: thin bidirectional relay (borrowing litellm 
RealTimeStreaming structure, not its Proxy-form realtime) + side-channel revocation watcher + self-counted per-minute metering from input_audio_buffer.append PCM bytes (R2). Any close path bills one CallRecord(unit="minute") attributed to the allocation (FR-004: abnormal abort never loses usage). Never leaks upstream key/endpoint. - upstream.open_realtime_ws: websockets client to the provider realtime WS, injecting the credential as api-key/Bearer (exact Azure URL validated in T027). - handle_realtime takes an injectable open_upstream/check_active so CI exercises the full preflight→relay→metering→revocation path against a fake provider WS in-loop (engine is bound to the test loop; a TestClient portal would break the DB). - Frontend: realtime KIND_LABEL, /v1/realtime WS usage example, prices 'minute' unit. - nginx: /v1/realtime WS upgrade (HTTP/1.1 Upgrade + no buffering + long timeout). Tests: contract 1-7 (invalid/revoked key, non-realtime model, delta relay, clean- close billing, abnormal-abort billing, in-flight revoke, no-leak) + pure metering unit tests. Full suite 731 passed (715→731), zero regression; ruff+mypy clean; frontend tsc + 164 vitest + build green. SC-006: existing contract tests untouched. T027 (real Azure realtime WS smoke) remains for a credentialed environment. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy/nginx/default.conf.template | 12 + frontend/src/components/api-usage-example.tsx | 22 + frontend/src/routes/admin/model-detail.tsx | 1 + frontend/src/routes/admin/prices.tsx | 5 +- specs/043-realtime-transcription/tasks.md | 49 +- src/ai_api/main.py | 2 + src/ai_api/proxy/realtime.py | 519 ++++++++++++++++++ src/ai_api/proxy/upstream.py | 42 ++ tests/contract/test_realtime_transcription.py | 309 +++++++++++ tests/support/__init__.py | 0 tests/support/realtime_mock.py | 108 ++++ tests/unit/test_realtime_metering.py | 51 ++ 12 files changed, 1094 insertions(+), 26 deletions(-) create mode 100644 src/ai_api/proxy/realtime.py create mode 100644 tests/contract/test_realtime_transcription.py create mode 100644 tests/support/__init__.py create mode 100644 tests/support/realtime_mock.py create mode 100644 tests/unit/test_realtime_metering.py diff --git a/deploy/nginx/default.conf.template b/deploy/nginx/default.conf.template index 29c1b06..9fb569b 100644 --- a/deploy/nginx/default.conf.template +++ b/deploy/nginx/default.conf.template @@ -42,6 +42,18 @@ server { proxy_set_header Connection ""; proxy_http_version 1.1; } + # Realtime transcription is a bidirectional WebSocket (/v1/realtime). It needs + # the HTTP/1.1 Upgrade dance + no buffering + a long read timeout so the relay + # stays open while audio streams. Must precede the generic /v1 location. 
+ location /v1/realtime { + proxy_pass http://${BACKEND_UPSTREAM}; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 3600s; + } location /v1 { proxy_pass http://${BACKEND_UPSTREAM}; } location /docs { proxy_pass http://${BACKEND_UPSTREAM}; } location /openapi.json { proxy_pass http://${BACKEND_UPSTREAM}; } diff --git a/frontend/src/components/api-usage-example.tsx b/frontend/src/components/api-usage-example.tsx index 598cfcd..a358c31 100644 --- a/frontend/src/components/api-usage-example.tsx +++ b/frontend/src/components/api-usage-example.tsx @@ -81,6 +81,28 @@ export function ApiUsageExample({ -H "Authorization: Bearer $TOKEN" \\ -F "model=${m}" -F "image=@input.png" -F "prompt=make it red"`, }, + realtime: { + path: "/realtime", + desc: "即時字幕(realtime)模型,用 WebSocket 串流音訊、即時收文字(OpenAI realtime transcription 相容)。用量按分鐘計", + // WebSocket — not curl. Replace https:// with wss:// in the endpoint URL. 
+ curl: `# pip install websockets — 串麥克風 PCM、即時收字幕(把 https 換成 wss) +import asyncio, base64, json, websockets + +async def main(): + url = "${base}/realtime".replace("https://", "wss://").replace("http://", "ws://") + async with websockets.connect(url, additional_headers={"Authorization": "Bearer $TOKEN"}) as ws: + await ws.send(json.dumps({"type": "session.update", "session": { + "type": "transcription", "model": "${m}", + "audio": {"input": {"format": {"type": "audio/pcm", "rate": 24000}}}}})) + await ws.send(json.dumps({"type": "input_audio_buffer.append", + "audio": base64.b64encode(pcm_chunk).decode()})) + async for msg in ws: + ev = json.loads(msg) + if ev.get("type") == "conversation.item.input_audio_transcription.delta": + print(ev["delta"], end="", flush=True) + +asyncio.run(main())`, + }, }; if (kind && endpointInfo[kind]) { const info = endpointInfo[kind]!; diff --git a/frontend/src/routes/admin/model-detail.tsx b/frontend/src/routes/admin/model-detail.tsx index aaa62e6..c396ace 100644 --- a/frontend/src/routes/admin/model-detail.tsx +++ b/frontend/src/routes/admin/model-detail.tsx @@ -109,6 +109,7 @@ const KIND_LABEL: Record = { moderation: "內容審核(moderation)", search: "網路搜尋(search)", image_edit: "圖片編輯(image edit)", + realtime: "即時字幕(realtime)", unknown: "未知", }; diff --git a/frontend/src/routes/admin/prices.tsx b/frontend/src/routes/admin/prices.tsx index 613b448..6551a1a 100644 --- a/frontend/src/routes/admin/prices.tsx +++ b/frontend/src/routes/admin/prices.tsx @@ -73,7 +73,7 @@ const fmtDate = (iso: string) => new Date(iso).toLocaleString("zh-TW"); // Phase 31: non-token billing unit labels. const UNIT_ZH: Record = { - page: "頁", query: "查詢", character: "字元", image: "張", second: "秒", + page: "頁", query: "查詢", character: "字元", image: "張", second: "秒", minute: "分鐘", }; /** Local "now" formatted for a (YYYY-MM-DDTHH:mm). */ @@ -448,13 +448,14 @@ function AddPriceDialog({ 每字元 每張 每秒 + 每分鐘 setPerPage(e.target.value)} />

- 非 token 模型(OCR=頁、rerank/search=查詢、TTS=字元、圖片編輯=張)依該單位計費,填此欄;token 欄可填 0。一筆價格只用一種單位。可按上方「從 LiteLLM 帶入建議價」自動填。 + 非 token 模型(OCR=頁、rerank/search=查詢、TTS=字元、圖片編輯=張、即時字幕=分鐘)依該單位計費,填此欄;token 欄可填 0。一筆價格只用一種單位。可按上方「從 LiteLLM 帶入建議價」自動填。

diff --git a/specs/043-realtime-transcription/tasks.md b/specs/043-realtime-transcription/tasks.md index fa9a6a2..a6ff94c 100644 --- a/specs/043-realtime-transcription/tasks.md +++ b/specs/043-realtime-transcription/tasks.md @@ -19,7 +19,7 @@ **Purpose**: 依賴與測試基礎建設 - [X] T001 將 `websockets` 提為直接依賴(`pyproject.toml`,已隨 image,宣告版本下限;PR 以 Constitution Deviation 說明)並確認 lockfile 更新 -- [ ] T002 [P] 建立 mock provider realtime WS server test fixture(`tests/conftest.py` 或 `tests/support/realtime_mock.py`):一個可在測試內啟動的假 realtime WS,依輸入送預錄 `...transcription.delta/.completed` 事件流,供所有整合/契約測試共用 +- [X] T002 [P] 建立 mock provider realtime WS server test fixture(`tests/conftest.py` 或 `tests/support/realtime_mock.py`):一個可在測試內啟動的假 realtime WS,依輸入送預錄 `...transcription.delta/.completed` 事件流,供所有整合/契約測試共用 - [X] T003 [P] 在計量層登記 `minute` 單位:確認 `services/pricing.py` 的 `calculate_unit_cost` 對 `unit="minute"` 無礙(純資料值、無 schema 變更),補單元測試於 `tests/unit/test_pricing.py` --- @@ -30,8 +30,8 @@ **⚠️ CRITICAL**: 本階段未完成前,US1–US3 無法開工 -- [ ] T004 實作上游 realtime WS client helper(`src/ai_api/proxy/upstream.py`):以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證(api_key/api_base),回傳可雙向收送的連線物件;金鑰不外洩 -- [ ] T005 建立 WS 端點 scaffold(`src/ai_api/proxy/realtime.py`):FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router(暫不含 preflight/relay 完整邏輯,先讓連線可建立可關閉) +- [X] T004 實作上游 realtime WS client helper(`src/ai_api/proxy/upstream.py`):以 `websockets` 開一條到 Azure Foundry realtime endpoint 的連線、注入憑證(api_key/api_base),回傳可雙向收送的連線物件;金鑰不外洩 +- [X] T005 建立 WS 端點 scaffold(`src/ai_api/proxy/realtime.py`):FastAPI `@app.websocket("/v1/realtime")` accept/close 骨架 + 掛載到 app router(暫不含 preflight/relay 完整邏輯,先讓連線可建立可關閉) - [X] T006 `services/model_kind.py` 的 mode→kind 對映加 `realtime`(litellm realtime/transcription mode → `realtime` kind);**改完重跑完整 `pytest tests/` 確認零回歸**(experience:「未知 mode 反例」整合測試會撞) **Checkpoint**: WS 連線可建立、可開上游連線、目錄能辨識 realtime 類型——可開始 US1 @@ -46,16 +46,16 @@ ### Tests for User Story 1 ⚠️(先寫、先失敗) 
-- [ ] T007 [P] [US1] 契約測試:無效/撤回金鑰連線被 close、未開始串流(`tests/contract/test_realtime_transcription.py`) -- [ ] T008 [P] [US1] 契約測試:請求非 realtime 類型模型 → close(unsupported)(同檔) -- [ ] T009 [P] [US1] 整合測試:有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`(mock provider WS,`tests/integration/test_realtime_relay.py`) +- [X] T007 [P] [US1] 契約測試:無效/撤回金鑰連線被 close、未開始串流(`tests/contract/test_realtime_transcription.py`) +- [X] T008 [P] [US1] 契約測試:請求非 realtime 類型模型 → close(unsupported)(同檔) +- [X] T009 [P] [US1] 整合測試:有效金鑰連線 + 送 `input_audio_buffer.append` → 收到 `...transcription.delta`(mock provider WS,`tests/integration/test_realtime_relay.py`) ### Implementation for User Story 1 -- [ ] T010 [US1] 連線建立時跑既有 `run_preflight`(`src/ai_api/proxy/realtime.py`):金鑰→分配→存取→配額→model binding;不通過則 close 並回相容錯誤碼(不洩漏上游) -- [ ] T011 [US1] 雙向 relay 迴圈(`src/ai_api/proxy/realtime.py`):`client→backend` 與 `backend→client` 兩協程轉送(借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構),delta/completed 即時轉回客戶端 -- [ ] T012 [US1] 模型類型校驗 + 錯誤轉譯(`src/ai_api/proxy/realtime.py`):非 realtime kind → close(unsupported);上游錯誤透明轉回但不含 key/endpoint(FR-006/007) -- [ ] T013 [US1] 連線生命週期結構化日誌(`src/ai_api/proxy/realtime.py`):建立/關閉/原因,沿用既有 audit + 觀測(原則 IV) +- [X] T010 [US1] 連線建立時跑既有 `run_preflight`(`src/ai_api/proxy/realtime.py`):金鑰→分配→存取→配額→model binding;不通過則 close 並回相容錯誤碼(不洩漏上游) +- [X] T011 [US1] 雙向 relay 迴圈(`src/ai_api/proxy/realtime.py`):`client→backend` 與 `backend→client` 兩協程轉送(借鏡 litellm `RealTimeStreaming.bidirectional_forward` 結構),delta/completed 即時轉回客戶端 +- [X] T012 [US1] 模型類型校驗 + 錯誤轉譯(`src/ai_api/proxy/realtime.py`):非 realtime kind → close(unsupported);上游錯誤透明轉回但不含 key/endpoint(FR-006/007) +- [X] T013 [US1] 連線生命週期結構化日誌(`src/ai_api/proxy/realtime.py`):建立/關閉/原因,沿用既有 audit + 觀測(原則 IV) **Checkpoint**: 客戶端能用平台金鑰即時取得字幕;MVP 成立(計量/撤回尚未接) @@ -69,15 +69,15 @@ ### Tests for User Story 2 ⚠️(先寫、先失敗) -- [ ] T014 [P] [US2] 單元測試:PCM bytes → 秒 → 分鐘換算(含 rounding)(`tests/unit/test_realtime_metering.py`) -- [ ] T015 [P] 
[US2] 整合測試:連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配(`tests/integration/test_realtime_relay.py`) -- [ ] T016 [P] [US2] 整合測試:client 直接中斷(無正常握手)→ 已累計時長仍落帳(FR-004/SC-003) +- [X] T014 [P] [US2] 單元測試:PCM bytes → 秒 → 分鐘換算(含 rounding)(`tests/unit/test_realtime_metering.py`) +- [X] T015 [P] [US2] 整合測試:連線正常關閉 → 一筆 `CallRecord(unit="minute")`、quantity 對得上、歸戶正確分配(`tests/integration/test_realtime_relay.py`) +- [X] T016 [P] [US2] 整合測試:client 直接中斷(無正常握手)→ 已累計時長仍落帳(FR-004/SC-003) ### Implementation for User Story 2 -- [ ] T017 [US2] RealtimeSession 計量狀態(`src/ai_api/proxy/realtime.py`):解析 `session.update` 的 format(sample_rate/bytes_per_sample/channels)、在 relay 即時累計 `audio_bytes` -- [ ] T018 [US2] 斷線落帳(`src/ai_api/proxy/realtime.py`):duration→minute→`CallRecord`(`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason);**任何 close 路徑都落帳** -- [ ] T019 [P] [US2] 前端:admin `/prices` 單位下拉加 `minute`(`frontend/src/routes/admin/prices.tsx`,沿用階段 29 單位感知 UI),realtime 模型可設每分鐘價 +- [X] T017 [US2] RealtimeSession 計量狀態(`src/ai_api/proxy/realtime.py`):解析 `session.update` 的 format(sample_rate/bytes_per_sample/channels)、在 relay 即時累計 `audio_bytes` +- [X] T018 [US2] 斷線落帳(`src/ai_api/proxy/realtime.py`):duration→minute→`CallRecord`(`calculate_unit_cost`、歸戶 allocation、token 欄 NULL、outcome 對映 close_reason);**任何 close 路徑都落帳** +- [X] T019 [P] [US2] 前端:admin `/prices` 單位下拉加 `minute`(`frontend/src/routes/admin/prices.tsx`,沿用階段 29 單位感知 UI),realtime 模型可設每分鐘價 **Checkpoint**: US1 + US2——即時字幕可用且用量可計費歸戶 @@ -91,13 +91,13 @@ ### Tests for User Story 3 ⚠️(先寫、先失敗) -- [ ] T020 [P] [US3] 整合測試:連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳(`tests/integration/test_realtime_relay.py`) -- [ ] T021 [P] [US3] 整合測試:分配被暫停/隔離 → 同樣主動斷線(同檔) +- [X] T020 [P] [US3] 整合測試:連線進行中撤回分配 → N 秒內 close(revoked) + 已累計時長落帳(`tests/integration/test_realtime_relay.py`) +- [X] T021 [P] [US3] 整合測試:分配被暫停/隔離 → 同樣主動斷線(同檔) ### Implementation for User Story 3 -- [ ] T022 [US3] 旁路週期 re-check 協程(`src/ai_api/proxy/realtime.py`):每 
N 秒查分配當前狀態,非 active → 主動 close(revoked);N 對齊既有撤回 SLO(常數集中、可調) -- [ ] T023 [US3] 與 US2 落帳整合(`src/ai_api/proxy/realtime.py`):撤回觸發的 close 同樣走斷線落帳(已累計時長不漏) +- [X] T022 [US3] 旁路週期 re-check 協程(`src/ai_api/proxy/realtime.py`):每 N 秒查分配當前狀態,非 active → 主動 close(revoked);N 對齊既有撤回 SLO(常數集中、可調) +- [X] T023 [US3] 與 US2 落帳整合(`src/ai_api/proxy/realtime.py`):撤回觸發的 close 同樣走斷線落帳(已累計時長不漏) **Checkpoint**: 三個 user story 全部獨立可用 @@ -105,9 +105,9 @@ ## Phase 6: Polish & Cross-Cutting Concerns -- [ ] T024 [P] 前端:`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型(KIND_LABEL)+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例(FR-008) -- [ ] T025 [P] nginx WS upgrade config(`deploy/helm/ai-api/`):`/v1/realtime`(或 `/v1`)加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1` -- [ ] T026 全綠關卡:`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸(既有 contract 測試 git diff 為空,SC-006) +- [X] T024 [P] 前端:`frontend/src/routes/admin/model-detail.tsx` 顯示 realtime 類型(KIND_LABEL)+ `frontend/src/components/api-usage-example.tsx` 加 realtime WS 連線範例(FR-008) +- [X] T025 [P] nginx WS upgrade config(`deploy/helm/ai-api/`):`/v1/realtime`(或 `/v1`)加 `Upgrade`/`Connection: upgrade` + `proxy_http_version 1.1` +- [X] T026 全綠關卡:`ruff check .` + mypy + 前端 tsc/build/test + 完整 `pytest tests/` 零回歸(既有 contract 測試 git diff 為空,SC-006) - [ ] T027 部署後手動煙霧(quickstart.md,**需憑證環境**):pod egress `wss:443` 實證、壞金鑰連線被 close、真打一次完整字幕(首字 <1s)→ 用量頁見一筆 `unit=minute` 歸戶分配;R2 計量對照 Azure 帳單校驗 --- @@ -170,7 +170,8 @@ Task: "整合測試 有效連線收 delta(mock provider WS)— tests/integra ### 真打限制(誠實標記) - T009/T015/T020 等整合測試**全用 mock provider WS**(CI 可重現,Constitution Deviation 的補救)。 -- **T027 真連 Azure realtime WS 需憑證環境**(維護者實機跑 quickstart)——R1/R2 的協定接通 + 計量對照在此校驗,非 CI。 +- **落地位置**:engine 綁 pytest event loop(module global),另起 TestClient portal 會與 asyncpg/aiosqlite 衝突;因此 relay/計量/撤回/落帳/no-leak 全部以 `tests/contract/test_realtime_transcription.py`(sqlite,CI 必跑)**直接呼叫 `handle_realtime`**(注入 fake client/provider 
WS,`tests/support/realtime_mock.py`)驗證——契約測試 1–7 全綠。純計量函式另在 `tests/unit/test_realtime_metering.py`。 +- **T027 真連 Azure realtime WS 需憑證環境**(維護者實機跑 quickstart)——R1/R2 的協定接通 + 計量對照在此校驗,非 CI;`upstream._build_realtime_url` 的確切 Azure URL 形態也在此校驗(CI 用 fake upstream,未碰真 URL)。 --- diff --git a/src/ai_api/main.py b/src/ai_api/main.py index e116eee..c24af9f 100644 --- a/src/ai_api/main.py +++ b/src/ai_api/main.py @@ -38,6 +38,7 @@ from ai_api.observability.logging import setup_logging from ai_api.observability.request_id import RequestIdMiddleware from ai_api.proxy.registry import build_router as build_proxy_registry_router +from ai_api.proxy.realtime import router as realtime_router from ai_api.proxy.responses import router as responses_router from ai_api.proxy.router import router as proxy_router @@ -99,6 +100,7 @@ def create_app() -> FastAPI: app.include_router(catalog.router, prefix="/catalog", tags=["catalog"]) app.include_router(proxy_router, prefix="/v1", tags=["proxy"]) # chat (streaming) app.include_router(responses_router, prefix="/v1", tags=["proxy"]) # responses (streaming) + app.include_router(realtime_router, prefix="/v1", tags=["proxy"]) # realtime (live transcription WS) # Phase 31: all non-streaming inference endpoints come from the data-driven # registry (embeddings/ocr/images/rerank/audio + moderation/search/image_edit). app.include_router(build_proxy_registry_router(), prefix="/v1", tags=["proxy"]) diff --git a/src/ai_api/proxy/realtime.py b/src/ai_api/proxy/realtime.py new file mode 100644 index 0000000..01501a8 --- /dev/null +++ b/src/ai_api/proxy/realtime.py @@ -0,0 +1,519 @@ +"""Phase 32 (043): /v1/realtime — OpenAI-compatible live transcription relay. + +A thin bidirectional WebSocket relay between an app client and the upstream +provider's realtime WS. We do NOT go through litellm's realtime (it is Proxy form +/ client-direct, which bypasses the gateway and loses per-allocation attribution + +in-flight revocation — see experience lesson 40). 
Instead we borrow litellm +`RealTimeStreaming.bidirectional_forward` *structure*: two forwarding coroutines, +plus a side-channel revocation watcher, plus per-minute metering self-counted from +the client's `input_audio_buffer.append` PCM bytes (research R2 — no reliance on a +provider usage event, so an abnormal abort never loses billing). + +Testability: `handle_realtime` takes an injectable `open_upstream`, so CI exercises +the full relay/metering/revocation against a fake provider WS in-loop (the engine is +bound to the test event loop; a separate TestClient portal would break asyncpg). +Real Azure realtime WS is validated by the maintainer in quickstart (T027). +""" +from __future__ import annotations + +import asyncio +import base64 +import contextlib +import json +import logging +import math +from collections.abc import Awaitable, Callable +from dataclasses import dataclass +from datetime import UTC, datetime +from typing import Any, Protocol + +from fastapi import APIRouter, WebSocket +from sqlalchemy import select + +logger = logging.getLogger(__name__) +router = APIRouter() + +# --- WebSocket close codes (RFC 6455) used when the platform closes --------- +WS_NORMAL = 1000 +WS_POLICY_VIOLATION = 1008 # auth / quota / revoked +WS_UNSUPPORTED = 1003 # model is not a realtime kind +WS_INTERNAL = 1011 # upstream error / unexpected + +# Default revocation re-check interval (seconds). Long-lived connections MUST be +# re-checked, not only at connect (principle 3). Centralized + overridable. +REVOKE_RECHECK_SECONDS = 5 + +# PCM defaults when `session.update` omits them: 16-bit mono. +_DEFAULT_BYTES_PER_SAMPLE = 2 +_DEFAULT_CHANNELS = 1 +_DEFAULT_SAMPLE_RATE = 24000 + + +# --- Uniform WS interfaces (FastAPI WebSocket and the websockets client both +# satisfy these; test fakes mirror them) ------------------------------------ +class ClientWS(Protocol): + @property + def headers(self) -> Any: ... + + async def accept(self) -> None: ... 
def duration_seconds(audio_bytes: int, sample_rate: int, bytes_per_sample: int, channels: int) -> float:
    """Convert a raw PCM byte count into seconds of audio.

    One second of audio occupies ``sample_rate * bytes_per_sample * channels``
    bytes. If that product is zero or negative the frame geometry is unusable
    and ``0.0`` is returned instead of dividing.
    """
    bytes_per_second = channels * bytes_per_sample * sample_rate
    return audio_bytes / bytes_per_second if bytes_per_second > 0 else 0.0
0 bytes → 0 minutes.""" + secs = duration_seconds(audio_bytes, sample_rate, bytes_per_sample, channels) + if secs <= 0: + return 0 + return math.ceil(secs / 60) + + +def session_minutes(sess: RealtimeSession) -> int: + return pcm_bytes_to_minutes( + sess.audio_bytes, + sample_rate=sess.sample_rate, + bytes_per_sample=sess.bytes_per_sample, + channels=sess.channels, + ) + + +def _apply_format(sess: RealtimeSession, ev: dict[str, Any]) -> None: + """Read sample rate (and, if present, sample width/channels) from a + `session.update` so metering uses the client's actual PCM geometry. Tolerant of + the two shapes seen in the wild: session.audio.input.format.* and + session.input_audio_format / session.audio.format.*.""" + session = ev.get("session") + if not isinstance(session, dict): + return + fmt: dict[str, Any] = {} + audio = session.get("audio") + if isinstance(audio, dict): + inp = audio.get("input") + if isinstance(inp, dict) and isinstance(inp.get("format"), dict): + fmt = inp["format"] + elif isinstance(audio.get("format"), dict): + fmt = audio["format"] + if not fmt and isinstance(session.get("input_audio_format"), dict): + fmt = session["input_audio_format"] + rate = fmt.get("rate") or fmt.get("sample_rate") + if isinstance(rate, int) and rate > 0: + sess.sample_rate = rate + channels = fmt.get("channels") + if isinstance(channels, int) and channels > 0: + sess.channels = channels + bps = fmt.get("bytes_per_sample") + if isinstance(bps, int) and bps > 0: + sess.bytes_per_sample = bps + + +def _meter_client_event(sess: RealtimeSession, raw: str) -> None: + """Update metering state from a client→platform frame. 
Never raises.""" + try: + ev = json.loads(raw) + except (ValueError, TypeError): + return + if not isinstance(ev, dict): + return + etype = ev.get("type") + if etype == "session.update": + _apply_format(sess, ev) + elif etype == "input_audio_buffer.append": + audio = ev.get("audio") + if isinstance(audio, str) and audio: + # Malformed base64 → skip metering this frame (never crash the relay). + with contextlib.suppress(ValueError, TypeError): + sess.audio_bytes += len(base64.b64decode(audio, validate=False)) + + +# --- Bidirectional relay (T011) --------------------------------------------- +async def _client_to_upstream(client: ClientWS, upstream: UpstreamWS, sess: RealtimeSession) -> None: + while True: + try: + raw = await client.receive_text() + except Exception: + # Client closed / aborted. Accrued audio_bytes is already counted, so + # billing on disconnect never loses usage (FR-004/SC-003). + if sess.close_reason == "normal": + sess.close_reason = "client_abort" + return + _meter_client_event(sess, raw) + try: + await upstream.send(raw) + except Exception: + if sess.close_reason == "normal": + sess.close_reason = "upstream_error" + return + + +async def _upstream_to_client(client: ClientWS, upstream: UpstreamWS, sess: RealtimeSession) -> None: + while True: + try: + raw = await upstream.recv() + except Exception: + if sess.close_reason == "normal": + sess.close_reason = "upstream_error" + return + try: + await client.send_text(raw) + except Exception: + if sess.close_reason == "normal": + sess.close_reason = "client_abort" + return + + +# check_active(allocation_id) -> bool +CheckActive = Callable[[str], Awaitable[bool]] + + +async def _revocation_watch( + sess: RealtimeSession, + *, + stop: asyncio.Event, + check_active: CheckActive, + interval: float, +) -> None: + """Side-channel: every `interval` seconds re-check the allocation; if it is no + longer active (revoked / paused / quarantined) flip close_reason and signal the + relay to stop (FR-005). 
async def run_relay(
    client: ClientWS,
    upstream: UpstreamWS,
    sess: RealtimeSession,
    *,
    check_active: CheckActive,
    interval: float = REVOKE_RECHECK_SECONDS,
) -> None:
    """Run both forwarding coroutines + the revocation watcher until any one ends,
    then tear the others down. Returns once the connection is fully closed.

    Args:
        client: downstream (member) WebSocket.
        upstream: provider realtime WebSocket.
        sess: per-connection state; `close_reason` is read to pick the close code.
        check_active: awaited by the watcher with the allocation id.
        interval: seconds between allocation re-checks.

    Teardown runs in a `finally`, so it also happens when this coroutine is itself
    cancelled while waiting: the child tasks are never left orphaned and both ends
    are still closed on a best-effort basis.
    """
    stop = asyncio.Event()

    async def _forward_then_stop(coro: Awaitable[None]) -> None:
        # Whichever direction ends first (or fails) signals the whole relay to stop.
        try:
            await coro
        finally:
            stop.set()

    tasks = (
        asyncio.create_task(_forward_then_stop(_client_to_upstream(client, upstream, sess))),
        asyncio.create_task(_forward_then_stop(_upstream_to_client(client, upstream, sess))),
        asyncio.create_task(
            _revocation_watch(sess, stop=stop, check_active=check_active, interval=interval)
        ),
    )

    try:
        await stop.wait()
    finally:
        try:
            # Closing both ends unblocks any coroutine parked in recv/receive.
            await _safe_close_upstream(upstream)
            await _safe_close_client(client, *_close_code_for(sess.close_reason))
        finally:
            # Always reap the children, even if closing was interrupted.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
def _outcome_for_close(close_reason: str) -> Any:
    """Translate a connection close reason into the CallOutcome to record.

    Only "upstream_error" maps to a failure outcome. Every other reason (normal,
    client_abort, revoked) delivered service for the accrued minutes, so the usage
    is real and is recorded as a success; a revocation merely ended it early.
    """
    from ai_api.models import CallOutcome

    upstream_failed = close_reason == "upstream_error"
    return CallOutcome.upstream_error if upstream_failed else CallOutcome.success
def _extract_token(headers: Any) -> str | None:
    """Bearer token from the Authorization header (case-insensitive lookup).

    Args:
        headers: any object exposing `get` (a framework headers object or a plain
            mapping). Objects without `get` yield ``None``.

    Returns:
        The stripped credential that follows the ``Bearer`` scheme (scheme matched
        case-insensitively), or ``None`` when the header is missing, is not a
        string, uses another scheme, or carries an empty credential.
    """
    lookup = getattr(headers, "get", None)
    if lookup is None:
        return None
    auth = lookup("authorization") or lookup("Authorization")
    if not auth:
        # Plain dicts are case-sensitive: fall back to scanning the keys so e.g.
        # "AUTHORIZATION" is honoured as the docstring promises.
        items = getattr(headers, "items", None)
        if callable(items):
            auth = next(
                (
                    value
                    for key, value in items()
                    if isinstance(key, str) and key.lower() == "authorization"
                ),
                None,
            )
    if not isinstance(auth, str):
        # Missing, or a non-text value (e.g. raw bytes) we cannot parse safely.
        return None
    scheme, _, credentials = auth.partition(" ")
    if scheme.lower() != "bearer":
        return None
    token = credentials.strip()
    return token or None
Injectable `open_upstream` / + `check_active` make the whole path CI-testable against a fake provider WS.""" + from ai_api.config import get_settings + from ai_api.db import get_sessionmaker + from ai_api.models import ModelCatalog + from ai_api.observability.request_id import current_request_id + from ai_api.proxy.preflight import PreflightRejection, run_preflight + from ai_api.services.model_kind import model_kind + + await client.accept() + started_at = datetime.now(UTC) + request_id = current_request_id() or "realtime" + + token = _extract_token(client.headers) + if token is None: + logger.info("realtime connection rejected: missing bearer token") + await _safe_close_client(client, WS_POLICY_VIOLATION, "missing bearer token") + return + + # First frame carries the model (session.update). Need it for preflight. + try: + first_raw = await client.receive_text() + except Exception: + await _safe_close_client(client, WS_NORMAL, "no session.update received") + return + requested_model = _model_from_session_update(first_raw) + if requested_model is None: + logger.info("realtime connection rejected: first frame is not a session.update with model") + await _safe_close_client(client, WS_POLICY_VIOLATION, "first frame must be session.update with model") + return + + settings = get_settings() + async with get_sessionmaker()() as s: + result = await run_preflight( + s, settings=settings, token=token, requested_model=requested_model + ) + if isinstance(result, PreflightRejection): + logger.info( + "realtime preflight rejected model=%s code=%s", requested_model, result.code + ) + await _safe_close_client(client, WS_POLICY_VIOLATION, result.code) + return + # Model must be a realtime kind (FR-007) — catalog honesty (FR-008). 
+ row = ( + await s.execute(select(ModelCatalog).where(ModelCatalog.slug == result.canonical_model)) + ).scalar_one_or_none() + kind = model_kind(row) if row is not None else "chat" + if kind != "realtime": + logger.info( + "realtime connection rejected: model=%s kind=%s (not realtime)", + result.canonical_model, kind, + ) + await _safe_close_client(client, WS_UNSUPPORTED, "model does not support realtime") + return + allocation_id = result.allocation.id + subject = result.allocation.subject_snapshot + + resolved = result.resolved + sess = RealtimeSession( + allocation_id=allocation_id, + subject=subject, + resource_model=result.canonical_model, + upstream_model=result.upstream_model, + provider=result.provider, + request_id=request_id, + started_at=started_at, + ) + # Meter the first frame too (it may already be an append in some clients). + _meter_client_event(sess, first_raw) + + # Open the upstream provider WS; never leak key/endpoint to the client (FR-006). + try: + upstream = await open_upstream( + provider=result.provider, + model=result.upstream_model, + api_key=resolved.api_key, + api_base=resolved.base_url, + api_version=(resolved.extra_config or {}).get("api_version"), + ) + except Exception: + logger.exception("realtime upstream connect failed model=%s", result.upstream_model) + sess.close_reason = "upstream_error" + await _safe_close_client(client, WS_INTERNAL, "upstream unavailable") + await _bill_session(sess) + return + + logger.info( + "realtime connection open allocation=%s model=%s request_id=%s", + allocation_id, result.canonical_model, request_id, + ) + try: + # Replay the first session.update to upstream so it configures correctly. 
def _build_realtime_url(api_base: str | None, model: str, api_version: str | None) -> str:
    """Build the Azure Foundry realtime WS URL from the resolved credential.

    Azure OpenAI realtime: wss://<resource>.openai.azure.com/openai/realtime?
    api-version=<version>&deployment=<deployment>. The wss/ws scheme is derived
    from the https/http api_base (matched case-insensitively) and the bare model
    name (the part after the provider prefix) is sent as the deployment.

    Args:
        api_base: provider endpoint, e.g. ``https://host``; trailing ``/`` ignored.
        model: ``provider/deployment`` or a bare deployment name.
        api_version: API version; defaults to ``2024-10-01-preview`` when falsy.

    Returns:
        The realtime WebSocket URL with URL-encoded query parameters.

    Raises:
        ValueError: if `api_base` is empty — a relative URL can never connect.
            The message deliberately contains no endpoint or key material.
    """
    from urllib.parse import urlencode

    base = (api_base or "").strip().rstrip("/")
    if not base:
        raise ValueError("realtime upstream requires an api_base (provider endpoint)")

    scheme, sep, rest = base.partition("://")
    if sep:
        # https → wss, http → ws; any other scheme (e.g. already wss) is kept.
        ws_scheme = {"https": "wss", "http": "ws"}.get(scheme.lower(), scheme)
        base = f"{ws_scheme}://{rest}"

    deployment = model.split("/", 1)[-1]
    # urlencode escapes reserved characters so an unusual deployment/version value
    # cannot inject extra query parameters or truncate the URL.
    query = urlencode(
        {
            "api-version": api_version or "2024-10-01-preview",
            "deployment": deployment,
        }
    )
    return f"{base}/openai/realtime?{query}"
+ +Covers: T007 (invalid/revoked key → close, no stream), T008 (non-realtime → +unsupported), T009 (valid → delta), T015 (clean close → CallRecord minute), T016 +(abnormal abort → billed), T020/T021 (in-flight revoke/pause → close + billed), +plus the no-leak contract (#7). +""" +from __future__ import annotations + +import asyncio +import base64 +import json +from datetime import UTC, datetime, timedelta +from decimal import Decimal + +import pytest +from httpx import AsyncClient +from sqlalchemy import select +from ulid import ULID + +from ai_api.db import get_sessionmaker +from ai_api.models import CallOutcome, CallRecord, ModelCatalog, PriceList +from ai_api.proxy.realtime import handle_realtime +from tests.support.realtime_mock import FakeClientWS, FakeUpstreamWS, fake_opener + +RT_MODEL = "azure/gpt-realtime-whisper" +# The resolved upstream credential — must never reach the downstream client (FR-006). +SECRET_KEY = "az-secret-DO-NOT-LEAK-9999" +SECRET_BASE = "https://secret-foundry.services.ai.azure.com" + +# 24 kHz pcm16 mono → 48000 bytes/sec. 
+_BYTES_PER_SEC = 24000 * 2 * 1 + + +def _session_update(model: str = RT_MODEL, rate: int = 24000) -> str: + return json.dumps({ + "type": "session.update", + "session": { + "type": "transcription", + "model": model, + "audio": {"input": {"format": {"type": "audio/pcm", "rate": rate}}}, + }, + }) + + +def _append(seconds: float, rate: int = 24000) -> str: + pcm = b"\x00" * int(_BYTES_PER_SEC * seconds * (rate / 24000)) + return json.dumps({ + "type": "input_audio_buffer.append", + "audio": base64.b64encode(pcm).decode(), + }) + + +async def _seed_catalog(slug: str, *, mode: str) -> None: + """Seed a catalog row whose litellm mode drives model_kind (realtime vs chat).""" + now = datetime.now(UTC) + sm = get_sessionmaker() + async with sm() as s: + s.add(ModelCatalog( + slug=slug, provider="azure", display_name=slug, family="x", + description="", modality_input=["audio"], modality_output=["text"], + capabilities=[], context_window=1024, cost_tier="low", + recommended_for=[], tags=[], example_request={}, official_doc_url=None, + status="active", deprecation_note=None, created_at=now, updated_at=now, + default_access="open", allowed_tags=[], denied_tags=[], + self_service_enabled=False, self_service_default_quota=None, + litellm_sync={"raw": {"mode": mode}}, + )) + await s.commit() + + +async def _seed_price(per_minute: str) -> None: + sm = get_sessionmaker() + async with sm() as s: + s.add(PriceList( + id=str(ULID()), provider="azure", model="gpt-realtime-whisper", + input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0), + price_unit="minute", price_per_unit_usd=Decimal(per_minute), + effective_from=datetime.now(UTC) - timedelta(days=1), + created_at=datetime.now(UTC), created_by="test", + )) + await s.commit() + + +async def _seed_provider(client: AsyncClient, admin: dict) -> None: + """An active provider credential is required for preflight's model-access check + to pass (env fallback doesn't register as an active provider).""" + r = await 
async def _last(outcome: CallOutcome) -> CallRecord | None:
    """Return the most recently started CallRecord with *outcome*, or ``None``.

    Ordering and the single-row limit are pushed into the query, so only the one
    row the caller needs is fetched instead of the whole result set.
    """
    sm = get_sessionmaker()
    async with sm() as s:
        stmt = (
            select(CallRecord)
            .where(CallRecord.outcome == outcome)
            .order_by(CallRecord.started_at.desc())
            .limit(1)
        )
        return (await s.execute(stmt)).scalars().first()
_seed_catalog(RT_MODEL, mode="realtime") + alloc = await _alloc(app_client, admin_headers) + # Revoke it before connecting. + r = await app_client.delete(f"/admin/allocations/{alloc['id']}", headers=admin_headers) + assert r.status_code in (200, 204), r.text + client = FakeClientWS(_bearer(alloc["token"]), [_session_update()]) + opener = fake_opener(FakeUpstreamWS()) + await handle_realtime(client, open_upstream=opener) + assert client.closed is not None and client.closed[0] == 1008 + assert opener.calls == [] + + +# --- T008: non-realtime model → close(unsupported) -------------------------- +@pytest.mark.asyncio +async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_headers): + chat_model = "azure/gpt-4o-mini" + await _seed_catalog(chat_model, mode="chat") + await _seed_provider(app_client, admin_headers) + alloc = await _alloc(app_client, admin_headers, model=chat_model) + client = FakeClientWS(_bearer(alloc["token"]), [_session_update(model=chat_model)]) + opener = fake_opener(FakeUpstreamWS()) + await handle_realtime(client, open_upstream=opener) + assert client.closed is not None and client.closed[0] == 1003 # unsupported + assert opener.calls == [] + + +# --- T009: valid connection + append → delta reaches client ----------------- +@pytest.mark.asyncio +async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + delta = json.dumps({ + "type": "conversation.item.input_audio_transcription.delta", "delta": "hello", + }) + completed = json.dumps({ + "type": "conversation.item.input_audio_transcription.completed", + "transcript": "hello world", + }) + # Upstream drives the end: emit delta+completed then hang up. 
+ client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(1.0)], hold_open=True) + upstream = FakeUpstreamWS([delta, completed], close_after=True) + opener = fake_opener(upstream) + await handle_realtime(client, open_upstream=opener) + assert any("transcription.delta" in m for m in client.sent), client.sent + # The session.update + append were forwarded upstream (key/endpoint injected + # on the upstream side, never to the client). + assert opener.calls and opener.calls[0]["model"] == RT_MODEL + assert any("input_audio_buffer.append" in m for m in upstream.sent) + + +# --- T015: clean close → one CallRecord(unit=minute), quantity matches ------ +@pytest.mark.asyncio +async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_price("0.017") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + # 90 seconds of audio → ceil(90/60) = 2 minutes. Client ends (disconnect). 
+ client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(90.0)], hold_open=False) + upstream = FakeUpstreamWS(close_after=False) + await handle_realtime(client, open_upstream=fake_opener(upstream)) + rec = await _last(CallOutcome.success) + assert rec is not None + assert rec.unit == "minute" and rec.quantity == 2 + assert rec.allocation_id == alloc["id"] + assert rec.cost_usd == Decimal("0.034") # 2 x 0.017 + assert rec.prompt_tokens is None and rec.total_tokens is None # non-token call + + +@pytest.mark.asyncio +async def test_unpriced_realtime_zero_cost(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(30.0)], hold_open=False) + await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False))) + rec = await _last(CallOutcome.success) + assert rec is not None and rec.unit == "minute" and rec.quantity == 1 + assert rec.cost_usd is None # no PriceList → unpriced (NULL), not a crash + + +# --- T016: abnormal abort (client hangs up mid-stream) → accrued bytes billed +@pytest.mark.asyncio +async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_price("0.017") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + # Sends 45s then the client connection drops with no graceful close. 
+ client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(45.0)], hold_open=False) + await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False))) + rec = await _last(CallOutcome.success) + assert rec is not None and rec.unit == "minute" and rec.quantity == 1 # ceil(45/60) + + +# --- T020/T021: in-flight revoke / pause → close(revoked) within N + billed - +@pytest.mark.asyncio +async def test_inflight_revoke_closes_and_bills(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_price("0.017") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(30.0)], hold_open=True) + upstream = FakeUpstreamWS(close_after=False) + + calls = {"n": 0} + + async def revoke_after_first_tick(allocation_id: str) -> bool: + calls["n"] += 1 + return calls["n"] < 1 # first re-check already reports inactive + + await asyncio.wait_for( + handle_realtime( + client, open_upstream=fake_opener(upstream), + check_active=revoke_after_first_tick, revoke_interval=0.05, + ), + timeout=5, + ) + assert client.closed is not None and client.closed[0] == 1008 + assert client.closed[1] == "allocation revoked" + rec = await _last(CallOutcome.success) + assert rec is not None and rec.unit == "minute" and rec.quantity == 1 + assert rec.error_message == "allocation revoked mid-connection" + + +# --- Contract #7: no upstream key / endpoint ever reaches the client -------- +@pytest.mark.asyncio +async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + err = json.dumps({"type": "error", "error": {"code": "bad", "message": "upstream boom"}}) + client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), 
_append(1.0)], hold_open=True) + upstream = FakeUpstreamWS([err], close_after=True) + await handle_realtime(client, open_upstream=fake_opener(upstream)) + blob = " ".join(client.sent) + " " + json.dumps(client.closed) + assert SECRET_KEY not in blob + assert "secret-foundry.services.ai.azure.com" not in blob + + +@pytest.mark.asyncio +async def test_upstream_connect_failure_no_leak_and_bills_zero(app_client: AsyncClient, admin_headers): + await _seed_catalog(RT_MODEL, mode="realtime") + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + + async def failing_opener(**kwargs): + raise RuntimeError(f"connect to {kwargs.get('api_base')} with {kwargs.get('api_key')} failed") + + client = FakeClientWS(_bearer(alloc["token"]), [_session_update(), _append(1.0)]) + await handle_realtime(client, open_upstream=failing_opener) + assert client.closed is not None and client.closed[0] == 1011 # internal + blob = json.dumps(client.closed) + " ".join(client.sent) + assert SECRET_KEY not in blob and "secret-foundry.services.ai.azure.com" not in blob + # Connect failed before any audio relayed → 0 minutes, still a record. + rec = await _last(CallOutcome.upstream_error) + assert rec is not None and rec.unit == "minute" and rec.quantity == 0 diff --git a/tests/support/__init__.py b/tests/support/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/support/realtime_mock.py b/tests/support/realtime_mock.py new file mode 100644 index 0000000..2f64dce --- /dev/null +++ b/tests/support/realtime_mock.py @@ -0,0 +1,108 @@ +"""Phase 32 (043) T002: reusable fake realtime WS pair for CI. + +The engine is bound to the test event loop, so a separate TestClient portal would +break asyncpg/aiosqlite. 
class FakeClientWS:
    """In-memory substitute for a FastAPI WebSocket (the ClientWS interface).

    `inbound` scripts the client→platform frames in order. Once they are used up,
    `receive_text` either parks until the platform calls `close` (hold_open=True,
    a client that stays connected) or raises FakeDisconnect straight away (the
    client ended / aborted). Frames the platform sends are collected in `sent`;
    the first close code/reason is kept in `closed`.
    """

    def __init__(
        self,
        headers: dict[str, str],
        inbound: Iterable[str],
        *,
        hold_open: bool = False,
    ) -> None:
        self.headers = dict(headers)
        self._inbound: deque[str] = deque(inbound)
        self._hold = hold_open
        self.sent: list[str] = []  # platform→client frames (e.g. deltas)
        self.closed: tuple[int, str | None] | None = None
        self.accepted = False
        self._released = asyncio.Event()

    async def accept(self) -> None:
        self.accepted = True

    async def receive_text(self) -> str:
        try:
            return self._inbound.popleft()
        except IndexError:
            pass  # script exhausted: fall through to hold-open / disconnect
        if self._hold:
            # Stay "connected" until the platform closes us.
            await self._released.wait()
        raise FakeDisconnect()

    async def send_text(self, data: str) -> None:
        self.sent.append(data)

    async def close(self, code: int = 1000, reason: str | None = None) -> None:
        # Wake any receiver parked in receive_text, whatever happens below.
        self._released.set()
        if self.closed is not None:
            return  # only the first close is recorded
        self.closed = (code, reason)
def fake_opener(upstream: FakeUpstreamWS) -> Any:
    """Build an `open_upstream` stand-in that always hands back *upstream*.

    Every invocation's keyword arguments (the credential/routing kwargs) are
    appended to the list exposed as the returned callable's `calls` attribute, so
    tests can assert routing and that nothing leaked.
    """
    recorded: list[dict[str, Any]] = []

    async def _open(**kwargs: Any) -> FakeUpstreamWS:
        recorded.append(kwargs)
        return upstream

    setattr(_open, "calls", recorded)
    return _open
+""" +from __future__ import annotations + +from ai_api.proxy.realtime import ( + RealtimeSession, + duration_seconds, + pcm_bytes_to_minutes, + session_minutes, +) + + +def test_duration_seconds_pcm16_mono() -> None: + # 24000 Hz x 2 bytes x 1 ch = 48000 bytes/sec -> 1 second. + assert duration_seconds(48000, 24000, 2, 1) == 1.0 + # half a second + assert duration_seconds(24000, 24000, 2, 1) == 0.5 + # unknown geometry → 0 (never divide by zero) + assert duration_seconds(48000, 0, 2, 1) == 0.0 + + +def test_minutes_round_up_started_minute_is_billed() -> None: + rate = 24000 # pcm16 mono → 48000 bytes/sec + per_sec = rate * 2 * 1 + assert pcm_bytes_to_minutes(0) == 0 # nothing → 0 + assert pcm_bytes_to_minutes(per_sec) == 1 # 1s → 1 min (round up) + assert pcm_bytes_to_minutes(per_sec * 59) == 1 # 59s → 1 min + assert pcm_bytes_to_minutes(per_sec * 60) == 1 # exactly 60s → 1 min + assert pcm_bytes_to_minutes(per_sec * 61) == 2 # 61s → 2 min + assert pcm_bytes_to_minutes(per_sec * 300) == 5 # 5 min exact + + +def test_minutes_respects_session_geometry() -> None: + # 16 kHz pcm16 mono = 32000 bytes/sec; 96000 bytes = 3s → 1 min + assert pcm_bytes_to_minutes(96000, sample_rate=16000) == 1 + + +def test_session_minutes_uses_session_state() -> None: + from datetime import UTC, datetime + + sess = RealtimeSession( + allocation_id="a", subject="s", resource_model="azure/gpt-realtime-whisper", + upstream_model="azure/gpt-realtime-whisper", provider="azure", + request_id="r", started_at=datetime.now(UTC), sample_rate=16000, + ) + sess.audio_bytes = 32000 * 90 # 90 seconds at 16 kHz pcm16 mono + assert session_minutes(sess) == 2 # 90s → 2 min From 657d33dc82af392025c1b6edbf3431b30eeb9fd7 Mon Sep 17 00:00:00 2001 From: timcsy Date: Fri, 12 Jun 2026 17:14:13 +0800 Subject: [PATCH 3/4] feat(realtime): make gpt-realtime-whisper testable from the admin "test model" button MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit realtime was 
async def realtime_smoke(
    *,
    model: str,
    api_key: str,
    api_base: str | None = None,
    api_version: str | None = None,
    timeout: float = 15.0,
) -> dict[str, Any]:
    """Phase 32 (043): minimal realtime WS smoke for the admin "test model" button.

    Opens the upstream realtime WS, runs the session handshake + a tiny silent-audio
    append, and waits for the first structured server event (a JSON object with a
    string `type`). Such a non-error event proves egress (wss:443) + key +
    deployment + protocol are all good — i.e. the T027 protocol-reachability check,
    now runnable straight from the UI. Billable: only a fraction of a second of audio.

    Args:
        model: ``provider/deployment`` (provider defaults to ``azure`` when bare).
        api_key: upstream credential.
        api_base: upstream endpoint.
        api_version: upstream API version, if any.
        timeout: seconds to wait for the first structured server event.

    Returns:
        ``{"ok": True, "first_event": <event type>}``.

    Raises:
        RuntimeError: on an upstream `error` event, or when no structured server
            event arrives within `timeout`.
        Exception: whatever the connect/send calls raise, so the test honestly
            reports failure.
    """
    provider = model.split("/", 1)[0] if "/" in model else "azure"
    deployment = model.split("/", 1)[-1]
    ws = await open_realtime_ws(
        provider=provider, model=model, api_key=api_key,
        api_base=api_base, api_version=api_version,
    )
    try:
        await ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "type": "transcription", "model": deployment,
                "audio": {"input": {"format": {"type": "audio/pcm", "rate": 16000}}},
            },
        }))
        pcm = b"\x00\x00" * int(16000 * 0.2)  # 0.2s silence, pcm16 mono 16 kHz
        await ws.send(json.dumps({
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(pcm).decode(),
        }))
        try:
            async with asyncio.timeout(timeout):
                while True:
                    raw = await ws.recv()
                    try:
                        ev = json.loads(raw)
                    except (ValueError, TypeError):
                        # Not JSON (e.g. an opaque binary frame): proves nothing
                        # about the protocol, so keep waiting.
                        continue
                    etype = ev.get("type") if isinstance(ev, dict) else None
                    if not isinstance(etype, str):
                        # Only a typed JSON object counts as a structured event.
                        continue
                    if etype == "error":
                        err = ev.get("error")
                        detail = err.get("message") if isinstance(err, dict) else None
                        msg = detail or "(no message)"
                        raise RuntimeError(f"realtime upstream error: {msg}")
                    # Any structured server event ⇒ the handshake/protocol works.
                    return {"ok": True, "first_event": etype}
        except TimeoutError as e:
            raise RuntimeError(
                f"realtime smoke timed out after {timeout}s with no server event"
            ) from e
    finally:
        # Best-effort close: the peer may already be gone.
        with contextlib.suppress(Exception):
            await ws.close()
+ "realtime": TestRecipe(lambda c: upstream.realtime_smoke(**c), billable=True), } diff --git a/tests/integration/test_admin_model_test.py b/tests/integration/test_admin_model_test.py index 82475c7..5a829cb 100644 --- a/tests/integration/test_admin_model_test.py +++ b/tests/integration/test_admin_model_test.py @@ -208,12 +208,57 @@ async def test_search_confirmed_calls(app_client: AsyncClient, admin_headers: di assert m.call_args.kwargs.get("search_provider") == "azure/web-search" +@pytest.mark.integration +@pytest.mark.asyncio +async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers: dict[str, str]) -> None: + """Phase 32: realtime is testable via a WS smoke recipe (billable → needs ack).""" + await _seed("azure/gpt-realtime-whisper", mode="realtime") + await _provider(app_client, admin_headers) + with patch( + "ai_api.proxy.upstream.realtime_smoke", + new=AsyncMock(return_value={"ok": True, "first_event": "session.created"}), + ) as m: + # billable → first call asks for confirmation, no upstream touched + r0 = await app_client.post( + "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers + ) + assert r0.json().get("needs_confirmation") is True and not m.await_count + r = await app_client.post( + "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers, + json={"acknowledge_billable": True}, + ) + assert r.status_code == 200, r.text + assert r.json()["ok"] is True and r.json()["kind"] == "realtime" + m.assert_awaited_once() + assert m.call_args.kwargs.get("model") == "azure/gpt-realtime-whisper" + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_realtime_upstream_error_reported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None: + """A failing WS smoke (e.g. 
bad deployment) surfaces as a test failure, not 5xx.""" + await _seed("azure/gpt-realtime-whisper", mode="realtime") + await _provider(app_client, admin_headers) + with patch( + "ai_api.proxy.upstream.realtime_smoke", + new=AsyncMock(side_effect=RuntimeError("realtime upstream error: deployment not found")), + ): + r = await app_client.post( + "/admin/catalog/models/azure/gpt-realtime-whisper/test", headers=admin_headers, + json={"acknowledge_billable": True}, + ) + assert r.status_code == 200, r.text + body = r.json() + assert body["ok"] is False and body["error_type"] == "upstream_error" + assert "deployment not found" in body["message"] + + @pytest.mark.integration @pytest.mark.asyncio async def test_unknown_mode_unsupported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None: # 'video_generation' is a genuinely-unknown mode for the admin test button - # (moderation/rerank/etc. became known kinds in Phase 29③/31; only the not-yet- - # supported modes — video/realtime/vector_store — remain 'unknown'). + # (moderation/rerank/etc. became known kinds in Phase 29③/31; realtime became + # testable in Phase 32; only modes like video/vector_store remain 'unknown'). await _seed("azure/video-x", mode="video_generation") await _provider(app_client, admin_headers) r = await app_client.post("/admin/catalog/models/azure/video-x/test", headers=admin_headers) diff --git a/tests/unit/test_model_kind.py b/tests/unit/test_model_kind.py index 8835ae1..13f42e2 100644 --- a/tests/unit/test_model_kind.py +++ b/tests/unit/test_model_kind.py @@ -88,7 +88,7 @@ def test_is_supported(): # auto-testable IFF a recipe exists (model_test.RECIPES). Every inference kind # now has a real recipe (ocr/stt/image_edit/search send a minimal fixture). 
for k in ("chat", "embedding", "tts", "image", "moderation", "rerank", - "ocr", "stt", "search", "image_edit"): + "ocr", "stt", "search", "image_edit", "realtime"): assert is_supported(k) # only 'unknown' has no recipe → honestly not auto-testable (never a fake pass) assert not is_supported("unknown") diff --git a/tests/unit/test_upstream_wrappers.py b/tests/unit/test_upstream_wrappers.py index 4abc4d5..7b0cc11 100644 --- a/tests/unit/test_upstream_wrappers.py +++ b/tests/unit/test_upstream_wrappers.py @@ -58,3 +58,55 @@ async def test_aocr_leaves_non_azure_provider_untouched(): with patch("litellm.aocr", new=AsyncMock(return_value="ok")) as m: await upstream.aocr(model="mistral/mistral-ocr-latest", document={"x": 1}, api_key="k") assert m.call_args.kwargs["model"] == "mistral/mistral-ocr-latest" + + +# --- Phase 32 (043): realtime WS smoke (admin "test model" recipe) ----------- +class _FakeSmokeWS: + """A scripted upstream realtime WS for the smoke test (sent frames + recv queue).""" + + def __init__(self, events): + self.events = list(events) + self.sent = [] + self.closed = False + + async def send(self, data): + self.sent.append(data) + + async def recv(self): + if self.events: + return self.events.pop(0) + raise RuntimeError("no more events") + + async def close(self): + self.closed = True + + +@pytest.mark.asyncio +async def test_realtime_smoke_ok_on_first_server_event(): + import json + + ws = _FakeSmokeWS([json.dumps({"type": "transcription_session.created"})]) + with patch("ai_api.proxy.upstream.open_realtime_ws", new=AsyncMock(return_value=ws)) as opener: + out = await upstream.realtime_smoke( + model="azure/gpt-realtime-whisper", api_key="k", + api_base="https://x", api_version="2024-10-01-preview", + ) + assert out["ok"] is True and out["first_event"] == "transcription_session.created" + # provider derived from the slug prefix; handshake + audio append were sent. 
+ assert opener.call_args.kwargs["provider"] == "azure" + assert any("session.update" in s for s in ws.sent) + assert any("input_audio_buffer.append" in s for s in ws.sent) + assert ws.closed is True # always closes the upstream WS + + +@pytest.mark.asyncio +async def test_realtime_smoke_raises_on_error_event(): + import json + + ws = _FakeSmokeWS([json.dumps({"type": "error", "error": {"message": "deployment not found"}})]) + with ( + patch("ai_api.proxy.upstream.open_realtime_ws", new=AsyncMock(return_value=ws)), + pytest.raises(RuntimeError, match="deployment not found"), + ): + await upstream.realtime_smoke(model="azure/gpt-realtime-whisper", api_key="k") + assert ws.closed is True From 963e73875a6b734d6aba601bfa96e9fc67d72392 Mon Sep 17 00:00:00 2001 From: timcsy Date: Fri, 12 Jun 2026 22:49:52 +0800 Subject: [PATCH 4/4] refactor(realtime): detect realtime as a capability (supported_endpoints), not a litellm mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit litellm PR #29775 (gpt-realtime-whisper, merged 2026-06-11) ships the model as mode=audio_transcription and signals realtime via supported_endpoints containing /v1/realtime — i.e. realtime is a capability axis, not a mode (same shape as responses_support). The earlier mode==realtime gate would never match any Azure model. Fix: - model_kind: realtime is capability-derived — raw.supported_endpoints lists /v1/realtime OR an admin `realtime` capability marker (`realtime:blocked` force- disables, manual wins). gpt-realtime-whisper (audio_transcription) → realtime; whisper-1 (no /v1/realtime) stays stt. Everything keyed on model_kind (endpoint gate, test recipe, catalog label) now works for the real model. - billing: bill in the PriceList's unit — litellm prices realtime transcription per SECOND (input_cost_per_second), so default to `second` when unpriced; `minute` still honoured. Adds pcm_bytes_to_seconds + session_quantity. 
- model-detail: hint that adding the `realtime` capability marks a manually-added model as realtime (needed until litellm's price-map entry — currently clobbered by a json regen on main — is restored, after which import auto-detects it). Full suite 742 passed; ruff+mypy clean; frontend tsc + 164 vitest green. Co-Authored-By: Claude Opus 4.8 (1M context) --- frontend/src/routes/admin/model-detail.tsx | 4 ++ src/ai_api/main.py | 2 +- src/ai_api/proxy/realtime.py | 47 ++++++++++++-- src/ai_api/services/model_kind.py | 36 +++++++++- tests/contract/test_realtime_transcription.py | 65 ++++++++++++++----- tests/integration/test_admin_model_test.py | 16 +++-- tests/unit/test_model_kind.py | 40 ++++++++++++ tests/unit/test_realtime_metering.py | 31 ++++++++- 8 files changed, 206 insertions(+), 35 deletions(-) diff --git a/frontend/src/routes/admin/model-detail.tsx b/frontend/src/routes/admin/model-detail.tsx index c396ace..a490079 100644 --- a/frontend/src/routes/admin/model-detail.tsx +++ b/frontend/src/routes/admin/model-detail.tsx @@ -717,6 +717,10 @@ function EditBasicsDialog({
setCapabilities(e.target.value)} /> +

+ 加 realtime 把模型標為「即時字幕」(走 /v1/realtime WS、可在此頁測試); + 手動加入、litellm 尚未帶入 supported_endpoints 時用得到。realtime:blocked 可強制關閉。 +

diff --git a/src/ai_api/main.py b/src/ai_api/main.py index c24af9f..3e41677 100644 --- a/src/ai_api/main.py +++ b/src/ai_api/main.py @@ -37,8 +37,8 @@ from ai_api.db import dispose_engine from ai_api.observability.logging import setup_logging from ai_api.observability.request_id import RequestIdMiddleware -from ai_api.proxy.registry import build_router as build_proxy_registry_router from ai_api.proxy.realtime import router as realtime_router +from ai_api.proxy.registry import build_router as build_proxy_registry_router from ai_api.proxy.responses import router as responses_router from ai_api.proxy.router import router as proxy_router diff --git a/src/ai_api/proxy/realtime.py b/src/ai_api/proxy/realtime.py index 01501a8..41c2465 100644 --- a/src/ai_api/proxy/realtime.py +++ b/src/ai_api/proxy/realtime.py @@ -118,6 +118,21 @@ def pcm_bytes_to_minutes( return math.ceil(secs / 60) +def pcm_bytes_to_seconds( + audio_bytes: int, + *, + sample_rate: int = _DEFAULT_SAMPLE_RATE, + bytes_per_sample: int = _DEFAULT_BYTES_PER_SAMPLE, + channels: int = _DEFAULT_CHANNELS, +) -> int: + """Per-second billing quantity: round UP to the next whole second (litellm + prices gpt-realtime-whisper via ``input_cost_per_second``). 0 bytes → 0.""" + secs = duration_seconds(audio_bytes, sample_rate, bytes_per_sample, channels) + if secs <= 0: + return 0 + return math.ceil(secs) + + def session_minutes(sess: RealtimeSession) -> int: return pcm_bytes_to_minutes( sess.audio_bytes, @@ -127,6 +142,20 @@ def session_minutes(sess: RealtimeSession) -> int: ) +def session_quantity(sess: RealtimeSession, unit: str) -> int: + """Billable quantity in the unit the PriceList carries. 
litellm prices realtime + transcription per SECOND; admins may instead price per minute — bill in whichever + the price row uses so cost = quantity x per-unit lines up.""" + if unit == "minute": + return session_minutes(sess) + return pcm_bytes_to_seconds( + sess.audio_bytes, + sample_rate=sess.sample_rate, + bytes_per_sample=sess.bytes_per_sample, + channels=sess.channels, + ) + + def _apply_format(sess: RealtimeSession, ev: dict[str, Any]) -> None: """Read sample rate (and, if present, sample width/channels) from a `session.update` so metering uses the client's actual PCM geometry. Tolerant of @@ -309,14 +338,14 @@ def _outcome_for_close(close_reason: str) -> Any: async def _bill_session(sess: RealtimeSession) -> None: - """Write ONE CallRecord(unit="minute") for the accrued audio. Any close path - reaches here (FR-004). Uses a fresh session — the connection has no request - session. Never raises (billing must not crash teardown).""" + """Write ONE CallRecord for the accrued audio, in the unit the PriceList carries + (litellm prices realtime transcription per SECOND; admins may price per minute). + Any close path reaches here (FR-004). Uses a fresh session — the connection has no + request session. Never raises (billing must not crash teardown).""" from ai_api.db import get_sessionmaker from ai_api.services.pricing import calculate_unit_cost, lookup_price_for_call from ai_api.services.records import RecordsService - minutes = session_minutes(sess) outcome = _outcome_for_close(sess.close_reason) try: async with get_sessionmaker()() as s: @@ -326,8 +355,12 @@ async def _bill_session(sess: RealtimeSession) -> None: model=sess.upstream_model.split("/", 1)[-1], call_time=sess.started_at, ) + # Bill in the price's unit (second from litellm, or minute); default to + # second (litellm's native unit) when unpriced so the quantity is honest. 
+ unit = price.price_unit if (price and price.price_unit in ("second", "minute")) else "second" + quantity = session_quantity(sess, unit) cost = ( - calculate_unit_cost(minutes, price.price_per_unit) + calculate_unit_cost(quantity, price.price_per_unit) if price is not None else None ) @@ -339,8 +372,8 @@ async def _bill_session(sess: RealtimeSession) -> None: started_at=sess.started_at, status_code=200, outcome=outcome, - quantity=minutes, - unit="minute", + quantity=quantity, + unit=unit, cost_usd=cost, error_message=( "allocation revoked mid-connection" if sess.close_reason == "revoked" else None diff --git a/src/ai_api/services/model_kind.py b/src/ai_api/services/model_kind.py index 3dd3270..ce53b7d 100644 --- a/src/ai_api/services/model_kind.py +++ b/src/ai_api/services/model_kind.py @@ -32,12 +32,36 @@ "moderation": "moderation", "search": "search", "image_edit": "image_edit", - # Phase 32: live transcription over WebSocket (gpt-realtime-whisper). Billed - # per-minute via the /v1/realtime relay; the admin "test model" recipe is a - # minimal WS smoke (handshake + tiny audio) that doubles as the T027 check. + # Gemini live native-audio carries a genuine mode=realtime; gpt-realtime-whisper + # does NOT — see _is_realtime_capable below. "realtime": "realtime", } +# Phase 32: realtime transcription is a CAPABILITY axis, not a litellm mode. litellm +# (PR #29775) ships gpt-realtime-whisper as mode=audio_transcription and signals the +# realtime ability via ``supported_endpoints`` containing ``/v1/realtime`` — exactly +# the responses_support pattern (capability ≠ mode). We mirror that: a model is +# realtime-capable iff its raw entry lists /v1/realtime OR an admin marked it via the +# ``realtime`` capability marker (``realtime:blocked`` force-disables, manual wins). 
+_REALTIME_MARKER = "realtime" +_REALTIME_BLOCKED = "realtime:blocked" + + +def _is_realtime_capable(model: Any) -> bool: + caps = list(getattr(model, "capabilities", None) or []) + if _REALTIME_BLOCKED in caps: + return False + if _REALTIME_MARKER in caps: + return True + sync = getattr(model, "litellm_sync", None) + if isinstance(sync, dict): + raw = sync.get("raw") + if isinstance(raw, dict): + eps = raw.get("supported_endpoints") + if isinstance(eps, list) and any("/v1/realtime" in str(e) for e in eps): + return True + return False + def _mode_of(model: Any) -> str | None: sync = getattr(model, "litellm_sync", None) @@ -52,6 +76,12 @@ def _mode_of(model: Any) -> str | None: def model_kind(model: Any) -> Kind: """Decide the testable kind of a catalog model. Never raises; always one of Kind.""" + # Realtime is capability-derived (supported_endpoints / admin marker), NOT a + # litellm mode — checked first so gpt-realtime-whisper (mode=audio_transcription) + # is classified realtime, not stt. It can still be called on the batch STT + # endpoint (path-routed), so nothing is lost by the realtime label. + if _is_realtime_capable(model): + return "realtime" mode = _mode_of(model) if mode is not None: # known mode → mapped kind; any other litellm mode → unsupported diff --git a/tests/contract/test_realtime_transcription.py b/tests/contract/test_realtime_transcription.py index 867d141..974a372 100644 --- a/tests/contract/test_realtime_transcription.py +++ b/tests/contract/test_realtime_transcription.py @@ -57,9 +57,14 @@ def _append(seconds: float, rate: int = 24000) -> str: }) -async def _seed_catalog(slug: str, *, mode: str) -> None: - """Seed a catalog row whose litellm mode drives model_kind (realtime vs chat).""" +async def _seed_catalog(slug: str, *, mode: str = "audio_transcription", realtime: bool = True) -> None: + """Seed a catalog row. 
Realtime models mirror litellm reality (PR #29775): + mode=audio_transcription + supported_endpoints listing /v1/realtime — the + capability axis that drives model_kind → realtime (NOT a litellm 'realtime' mode).""" now = datetime.now(UTC) + raw: dict = {"mode": mode} + if realtime: + raw["supported_endpoints"] = ["/v1/realtime", "/v1/realtime/transcription_sessions"] sm = get_sessionmaker() async with sm() as s: s.add(ModelCatalog( @@ -70,7 +75,7 @@ async def _seed_catalog(slug: str, *, mode: str) -> None: status="active", deprecation_note=None, created_at=now, updated_at=now, default_access="open", allowed_tags=[], denied_tags=[], self_service_enabled=False, self_service_default_quota=None, - litellm_sync={"raw": {"mode": mode}}, + litellm_sync={"raw": raw}, )) await s.commit() @@ -121,7 +126,7 @@ def _bearer(token: str) -> dict[str, str]: # --- T007: invalid / revoked key → close, no stream ------------------------- @pytest.mark.asyncio async def test_invalid_key_closed_no_stream(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) client = FakeClientWS(_bearer("totally-invalid-token"), [_session_update()]) upstream = FakeUpstreamWS() opener = fake_opener(upstream) @@ -143,7 +148,7 @@ async def test_missing_bearer_closed(app_client: AsyncClient, admin_headers): @pytest.mark.asyncio async def test_revoked_allocation_closed(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) alloc = await _alloc(app_client, admin_headers) # Revoke it before connecting. 
r = await app_client.delete(f"/admin/allocations/{alloc['id']}", headers=admin_headers) @@ -159,7 +164,7 @@ async def test_revoked_allocation_closed(app_client: AsyncClient, admin_headers) @pytest.mark.asyncio async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_headers): chat_model = "azure/gpt-4o-mini" - await _seed_catalog(chat_model, mode="chat") + await _seed_catalog(chat_model, mode="chat", realtime=False) await _seed_provider(app_client, admin_headers) alloc = await _alloc(app_client, admin_headers, model=chat_model) client = FakeClientWS(_bearer(alloc["token"]), [_session_update(model=chat_model)]) @@ -172,7 +177,7 @@ async def test_non_realtime_model_unsupported(app_client: AsyncClient, admin_hea # --- T009: valid connection + append → delta reaches client ----------------- @pytest.mark.asyncio async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) delta = json.dumps({ @@ -198,7 +203,7 @@ async def test_valid_connection_relays_delta(app_client: AsyncClient, admin_head # --- T015: clean close → one CallRecord(unit=minute), quantity matches ------ @pytest.mark.asyncio async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) await _seed_price("0.017") alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) @@ -216,22 +221,48 @@ async def test_clean_close_bills_one_minute_record(app_client: AsyncClient, admi @pytest.mark.asyncio -async def test_unpriced_realtime_zero_cost(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") +async def test_unpriced_realtime_defaults_to_seconds(app_client: AsyncClient, admin_headers): + await 
_seed_catalog(RT_MODEL) alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) client = FakeClientWS(_bearer(alloc["token"]), [_session_update(), _append(30.0)], hold_open=False) await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False))) rec = await _last(CallOutcome.success) - assert rec is not None and rec.unit == "minute" and rec.quantity == 1 + # unpriced → default to litellm's native unit (second); 30s → 30 + assert rec is not None and rec.unit == "second" and rec.quantity == 30 assert rec.cost_usd is None # no PriceList → unpriced (NULL), not a crash +@pytest.mark.asyncio +async def test_per_second_price_billed_in_seconds(app_client: AsyncClient, admin_headers): + """litellm prices gpt-realtime-whisper per SECOND — bill in seconds so the cost + lines up with the imported per-unit price (input_cost_per_second).""" + await _seed_catalog(RT_MODEL) + sm = get_sessionmaker() + async with sm() as s: + s.add(PriceList( + id=str(ULID()), provider="azure", model="gpt-realtime-whisper", + input_per_1k_tokens_usd=Decimal(0), output_per_1k_tokens_usd=Decimal(0), + price_unit="second", price_per_unit_usd=Decimal("0.0002833"), + effective_from=datetime.now(UTC) - timedelta(days=1), + created_at=datetime.now(UTC), created_by="test", + )) + await s.commit() + alloc = await _alloc(app_client, admin_headers) + await _seed_provider(app_client, admin_headers) + client = FakeClientWS(_bearer(alloc["token"]), + [_session_update(), _append(10.0)], hold_open=False) + await handle_realtime(client, open_upstream=fake_opener(FakeUpstreamWS(close_after=False))) + rec = await _last(CallOutcome.success) + assert rec is not None and rec.unit == "second" and rec.quantity == 10 + assert rec.cost_usd == Decimal("0.002833") # 10 x 0.0002833 + + # --- T016: abnormal abort (client hangs up mid-stream) → accrued bytes billed @pytest.mark.asyncio async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, 
admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) await _seed_price("0.017") alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) @@ -246,7 +277,7 @@ async def test_abnormal_abort_still_bills_accrued(app_client: AsyncClient, admin # --- T020/T021: in-flight revoke / pause → close(revoked) within N + billed - @pytest.mark.asyncio async def test_inflight_revoke_closes_and_bills(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) await _seed_price("0.017") alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) @@ -277,7 +308,7 @@ async def revoke_after_first_tick(allocation_id: str) -> bool: # --- Contract #7: no upstream key / endpoint ever reaches the client -------- @pytest.mark.asyncio async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) err = json.dumps({"type": "error", "error": {"code": "bad", "message": "upstream boom"}}) @@ -292,7 +323,7 @@ async def test_no_key_or_endpoint_leak(app_client: AsyncClient, admin_headers): @pytest.mark.asyncio async def test_upstream_connect_failure_no_leak_and_bills_zero(app_client: AsyncClient, admin_headers): - await _seed_catalog(RT_MODEL, mode="realtime") + await _seed_catalog(RT_MODEL) alloc = await _alloc(app_client, admin_headers) await _seed_provider(app_client, admin_headers) @@ -304,6 +335,6 @@ async def failing_opener(**kwargs): assert client.closed is not None and client.closed[0] == 1011 # internal blob = json.dumps(client.closed) + " ".join(client.sent) assert SECRET_KEY not in blob and "secret-foundry.services.ai.azure.com" not in blob - # Connect failed before any audio relayed → 0 minutes, still a record. 
+ # Connect failed before any audio relayed → 0 (unpriced → default unit second). rec = await _last(CallOutcome.upstream_error) - assert rec is not None and rec.unit == "minute" and rec.quantity == 0 + assert rec is not None and rec.unit == "second" and rec.quantity == 0 diff --git a/tests/integration/test_admin_model_test.py b/tests/integration/test_admin_model_test.py index 5a829cb..07d7479 100644 --- a/tests/integration/test_admin_model_test.py +++ b/tests/integration/test_admin_model_test.py @@ -13,10 +13,18 @@ from ai_api.models import AuditEventType, ModelCatalog -async def _seed(slug: str, *, mode: str | None = None, modality_input=None, modality_output=None) -> None: +async def _seed(slug: str, *, mode: str | None = None, modality_input=None, modality_output=None, + supported_endpoints=None) -> None: sm = get_sessionmaker() now = datetime.now(UTC) - sync = {"raw": {"mode": mode}} if mode is not None else None + sync = None + if mode is not None or supported_endpoints is not None: + raw: dict = {} + if mode is not None: + raw["mode"] = mode + if supported_endpoints is not None: + raw["supported_endpoints"] = supported_endpoints + sync = {"raw": raw} async with sm() as s: s.add(ModelCatalog( slug=slug, provider=slug.split("/", 1)[0], display_name=slug, family="x", @@ -212,7 +220,7 @@ async def test_search_confirmed_calls(app_client: AsyncClient, admin_headers: di @pytest.mark.asyncio async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers: dict[str, str]) -> None: """Phase 32: realtime is testable via a WS smoke recipe (billable → needs ack).""" - await _seed("azure/gpt-realtime-whisper", mode="realtime") + await _seed("azure/gpt-realtime-whisper", mode="audio_transcription", supported_endpoints=["/v1/realtime"]) await _provider(app_client, admin_headers) with patch( "ai_api.proxy.upstream.realtime_smoke", @@ -237,7 +245,7 @@ async def test_realtime_confirmed_calls(app_client: AsyncClient, admin_headers: @pytest.mark.asyncio async def 
test_realtime_upstream_error_reported(app_client: AsyncClient, admin_headers: dict[str, str]) -> None: """A failing WS smoke (e.g. bad deployment) surfaces as a test failure, not 5xx.""" - await _seed("azure/gpt-realtime-whisper", mode="realtime") + await _seed("azure/gpt-realtime-whisper", mode="audio_transcription", supported_endpoints=["/v1/realtime"]) await _provider(app_client, admin_headers) with patch( "ai_api.proxy.upstream.realtime_smoke", diff --git a/tests/unit/test_model_kind.py b/tests/unit/test_model_kind.py index 13f42e2..ebee83e 100644 --- a/tests/unit/test_model_kind.py +++ b/tests/unit/test_model_kind.py @@ -92,3 +92,43 @@ def test_is_supported(): assert is_supported(k) # only 'unknown' has no recipe → honestly not auto-testable (never a fake pass) assert not is_supported("unknown") + + +# --- Phase 32: realtime is a CAPABILITY (supported_endpoints / admin marker), not a mode --- +def _cap(*, mode=None, supported_endpoints=None, capabilities=None): + raw = {} + if mode is not None: + raw["mode"] = mode + if supported_endpoints is not None: + raw["supported_endpoints"] = supported_endpoints + return SimpleNamespace( + litellm_sync={"raw": raw} if raw else None, + modality_input=["audio"], modality_output=["text"], + capabilities=capabilities or [], + ) + + +def test_realtime_from_supported_endpoints_overrides_stt(): + # gpt-realtime-whisper: litellm mode=audio_transcription but /v1/realtime in + # supported_endpoints → realtime (capability beats the audio_transcription→stt map). + m = _cap(mode="audio_transcription", + supported_endpoints=["/v1/realtime", "/v1/realtime/transcription_sessions"]) + assert model_kind(m) == "realtime" + + +def test_realtime_from_admin_marker(): + # Manual model (no litellm_sync): admin marks the `realtime` capability. 
+ m = _cap(capabilities=["realtime"]) + assert model_kind(m) == "realtime" + + +def test_realtime_blocked_marker_forces_off(): + m = _cap(mode="audio_transcription", supported_endpoints=["/v1/realtime"], + capabilities=["realtime:blocked"]) + assert model_kind(m) == "stt" # admin override wins → falls back to mode + + +def test_plain_transcription_stays_stt(): + # whisper-1: audio_transcription, NO /v1/realtime → batch STT, not realtime. + m = _cap(mode="audio_transcription") + assert model_kind(m) == "stt" diff --git a/tests/unit/test_realtime_metering.py b/tests/unit/test_realtime_metering.py index 77bf68e..2cb6b9c 100644 --- a/tests/unit/test_realtime_metering.py +++ b/tests/unit/test_realtime_metering.py @@ -10,7 +10,9 @@ RealtimeSession, duration_seconds, pcm_bytes_to_minutes, + pcm_bytes_to_seconds, session_minutes, + session_quantity, ) @@ -39,13 +41,36 @@ def test_minutes_respects_session_geometry() -> None: assert pcm_bytes_to_minutes(96000, sample_rate=16000) == 1 -def test_session_minutes_uses_session_state() -> None: +def test_seconds_round_up() -> None: + rate = 24000 + per_sec = rate * 2 * 1 # 48000 bytes/sec + assert pcm_bytes_to_seconds(0) == 0 + assert pcm_bytes_to_seconds(per_sec) == 1 + assert pcm_bytes_to_seconds(per_sec // 2) == 1 # 0.5s -> 1s (round up) + assert pcm_bytes_to_seconds(per_sec * 10) == 10 + assert pcm_bytes_to_seconds(per_sec * 10 + 1) == 11 # a started second is billed + + +def _sess(rate: int = 24000) -> RealtimeSession: from datetime import UTC, datetime - sess = RealtimeSession( + return RealtimeSession( allocation_id="a", subject="s", resource_model="azure/gpt-realtime-whisper", upstream_model="azure/gpt-realtime-whisper", provider="azure", - request_id="r", started_at=datetime.now(UTC), sample_rate=16000, + request_id="r", started_at=datetime.now(UTC), sample_rate=rate, ) + + +def test_session_minutes_uses_session_state() -> None: + sess = _sess(16000) sess.audio_bytes = 32000 * 90 # 90 seconds at 16 kHz pcm16 mono assert 
session_minutes(sess) == 2 # 90s → 2 min + + +def test_session_quantity_follows_price_unit() -> None: + sess = _sess(24000) + sess.audio_bytes = 24000 * 2 * 10 # 10 seconds + assert session_quantity(sess, "second") == 10 + assert session_quantity(sess, "minute") == 1 # 10s → 1 min (round up) + # any non-minute unit (incl. unknown) bills in seconds + assert session_quantity(sess, "anything-else") == 10