Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions testing/e2e/global-setup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ export default async function globalSetup() {
mock.mount('/v1/text-to-speech', elevenLabsTTSMount())
mock.mount('/v1/speech-to-text', elevenLabsSTTMount())

// Gemini TTS hits the standard Gemini generateContent endpoint
// (POST /v1beta/models/{model}:generateContent) with
// responseModalities: ['AUDIO']. aimock's native Gemini audio helper derives
// the mime type from the fixture's `format`/`contentType`, so it can't emit
// the raw `audio/L16;codec=pcm;rate=24000` PCM that real Gemini TTS returns.
// Mount the TTS model's generateContent path directly so we can hand back
// PCM and exercise the adapter's PCM→WAV normalization. The path is specific
// to the TTS model, so it doesn't intercept Gemini chat/summarize requests.
mock.mount(
'/v1beta/models/gemini-3.1-flash-tts-preview:generateContent',
geminiTTSMount(),
)

// Anthropic server_tool_use bug reproduction (issue #604). aimock can't
// natively synthesize `server_tool_use` / `web_fetch_tool_result` content
// blocks, so this mount hand-crafts the raw SSE Claude would emit when a
Expand Down Expand Up @@ -107,6 +120,14 @@ const FAKE_MP3_BYTES = Buffer.from([
0xff, 0xfb, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
])

/**
 * Stand-in audio payload for the Gemini TTS mock: 32 zero-filled bytes of raw
 * 16-bit little-endian PCM (all-zero samples, i.e. silence).
 *
 * Gemini TTS delivers its audio as `audio/L16;codec=pcm;rate=24000`
 * inlineData, and the adapter prepends a RIFF/WAV header before the browser
 * receives it. The actual sample values are irrelevant here because the spec
 * only checks that the `<audio>` element becomes visible.
 */
const FAKE_PCM_BYTES = Buffer.alloc(32, 0)

function grokTTSMount(): Mountable {
return {
async handleRequest(
Expand All @@ -127,6 +148,52 @@ function grokTTSMount(): Mountable {
}
}

/**
 * Creates the aimock mount that impersonates the Gemini TTS
 * `generateContent` endpoint.
 *
 * Only a `POST` whose remaining pathname is exactly `/` is handled; every
 * other request is declined by resolving to `false`. A handled request has
 * its body drained and is answered with a 200 JSON document containing one
 * candidate whose single part is base64-encoded PCM `inlineData`.
 *
 * @returns A `Mountable` whose `handleRequest` resolves to `true` when it
 *   produced the response and `false` when it did not handle the request.
 */
function geminiTTSMount(): Mountable {
  const mount: Mountable = {
    async handleRequest(
      req: http.IncomingMessage,
      res: http.ServerResponse,
      // The mount prefix is stripped by aimock, so an exact match arrives as "/".
      pathname: string,
    ): Promise<boolean> {
      const isExactPost = pathname === '/' && req.method === 'POST'
      if (!isExactPost) {
        return false
      }

      await drainBody(req)

      // Same shape as a Gemini generateContent audio response: the audio is a
      // lone `candidates[0].content.parts[0].inlineData` entry. Using the PCM
      // mime type steers the adapter into its PCM→WAV wrapping path.
      const audioPart = {
        inlineData: {
          mimeType: 'audio/L16;codec=pcm;rate=24000',
          data: FAKE_PCM_BYTES.toString('base64'),
        },
      }
      const payload = {
        candidates: [
          {
            content: {
              role: 'model',
              parts: [audioPart],
            },
            finishReason: 'STOP',
            index: 0,
          },
        ],
        usageMetadata: {
          promptTokenCount: 5,
          candidatesTokenCount: 15,
          totalTokenCount: 20,
        },
      }

      res.statusCode = 200
      res.setHeader('Content-Type', 'application/json')
      res.end(JSON.stringify(payload))
      return true
    },
  }
  return mount
}

function grokSTTMount(): Mountable {
return {
async handleRequest(
Expand Down
2 changes: 1 addition & 1 deletion testing/e2e/src/lib/feature-support.ts
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ export const matrix: Record<Feature, Set<Provider>> = {
'image-gen': new Set(['openai', 'grok']),
'audio-gen': new Set(['gemini', 'elevenlabs']),
'sound-effects': new Set(['elevenlabs']),
tts: new Set(['openai', 'grok', 'elevenlabs']),
tts: new Set(['openai', 'gemini', 'grok', 'elevenlabs']),
transcription: new Set(['openai', 'grok', 'elevenlabs']),
'video-gen': new Set(['openai']),
// Only Gemini currently surfaces a first-class stateful conversation API via
Expand Down
10 changes: 9 additions & 1 deletion testing/e2e/src/lib/media-providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ import {
createOpenaiTranscription,
createOpenaiVideo,
} from '@tanstack/ai-openai'
import { createGeminiAudio, createGeminiImage } from '@tanstack/ai-gemini'
import {
createGeminiAudio,
createGeminiImage,
createGeminiSpeech,
} from '@tanstack/ai-gemini'
import {
createGrokImage,
createGrokSpeech,
Expand Down Expand Up @@ -72,6 +76,10 @@ export function createTTSAdapter(
baseURL: openaiUrl(aimockPort),
defaultHeaders: headers,
}),
gemini: () =>
createGeminiSpeech('gemini-3.1-flash-tts-preview', DUMMY_KEY, {
httpOptions: { baseUrl: llmockBase(aimockPort), headers },
}),
grok: () =>
createGrokSpeech('grok-tts', DUMMY_KEY, {
baseURL: openaiUrl(aimockPort),
Expand Down