Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions src/voice/transcribe.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import { test } from "node:test";
import assert from "node:assert/strict";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { transcribeAudio } from "./transcribe.js";

/**
 * Run `fn` with `process.env[key]` temporarily set to `value`
 * (`undefined` removes the variable), then put the previous state back —
 * even when `fn` rejects.
 */
async function withEnv(
  key: string,
  value: string | undefined,
  fn: () => Promise<void>,
): Promise<void> {
  // Single place that knows how to "set or unset" the variable.
  const assign = (next: string | undefined): void => {
    if (next === undefined) {
      delete process.env[key];
      return;
    }
    process.env[key] = next;
  };

  const previous = process.env[key];
  assign(value);
  try {
    await fn();
  } finally {
    assign(previous);
  }
}

test("no key configured → error names BOTH providers", async () => {
  // The call must fail with a message that mentions both env vars, in this order.
  const expectBothNamed = async (): Promise<void> => {
    await assert.rejects(
      () => transcribeAudio({ audioPath: "/no/such/file.webm" }),
      /ELEVENLABS_API_KEY[\s\S]*OPENAI_API_KEY/,
    );
  };
  // Unset ElevenLabs first, then OpenAI inside it, so both are absent during the call.
  const withoutOpenAI = (): Promise<void> =>
    withEnv("OPENAI_API_KEY", undefined, expectBothNamed);
  await withEnv("ELEVENLABS_API_KEY", undefined, withoutOpenAI);
});

test("ElevenLabs is preferred and POSTs the file with xi-api-key", async () => {
  const audioFile = path.join(os.tmpdir(), `lisa-asr-${process.pid}.webm`);
  fs.writeFileSync(audioFile, Buffer.from([0x1a, 0x45, 0xdf, 0xa3])); // a few bytes
  const originalFetch = globalThis.fetch;

  // Everything the stubbed fetch observes about the outgoing request.
  const seen: { url: string; key: unknown; hasFile: boolean } = {
    url: "",
    key: undefined,
    hasFile: false,
  };

  const stubFetch = async (
    url: unknown,
    init: { headers?: Record<string, string>; body?: unknown },
  ) => {
    seen.url = String(url);
    seen.key = init?.headers?.["xi-api-key"];
    const body = init?.body;
    seen.hasFile = body instanceof FormData && (body as FormData).has("file");
    return new Response(JSON.stringify({ text: "hello world" }), { status: 200 });
  };
  globalThis.fetch = stubFetch as typeof fetch;

  const exercise = async (): Promise<void> => {
    const text = await transcribeAudio({ audioPath: audioFile });
    assert.equal(text, "hello world");
    assert.match(seen.url, /api\.elevenlabs\.io\/v1\/speech-to-text$/);
    assert.equal(seen.key, "sk_test_key");
    assert.ok(seen.hasFile, "posts a `file` field in multipart FormData");
  };

  try {
    await withEnv("ELEVENLABS_API_KEY", "sk_test_key", exercise);
  } finally {
    // Always undo the global stub and remove the temp recording.
    globalThis.fetch = originalFetch;
    fs.rmSync(audioFile, { force: true });
  }
});

test("ElevenLabs non-2xx surfaces a useful error", async () => {
  const audioFile = path.join(os.tmpdir(), `lisa-asr-err-${process.pid}.webm`);
  fs.writeFileSync(audioFile, Buffer.from([1, 2, 3]));
  const originalFetch = globalThis.fetch;

  // Every request is answered with a 401 carrying a short error body.
  const rejectingFetch = async () =>
    new Response("invalid_api_key", { status: 401 });
  globalThis.fetch = rejectingFetch as typeof fetch;

  const expectFailure = async (): Promise<void> => {
    await assert.rejects(
      () => transcribeAudio({ audioPath: audioFile }),
      /ElevenLabs transcription failed \(401\)/,
    );
  };

  try {
    await withEnv("ELEVENLABS_API_KEY", "sk_bad", expectFailure);
  } finally {
    // Always undo the global stub and remove the temp recording.
    globalThis.fetch = originalFetch;
    fs.rmSync(audioFile, { force: true });
  }
});
63 changes: 56 additions & 7 deletions src/voice/transcribe.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,71 @@
import fs from "node:fs";
import path from "node:path";
import OpenAI from "openai";

/** Options accepted by `transcribeAudio`. */
export interface TranscribeOptions {
/** Path to the recorded audio file to transcribe. */
audioPath: string;
/** OpenAI Whisper model override (ignored by the ElevenLabs path). */
model?: string;
/** OpenAI key override (back-compat); ElevenLabs uses ELEVENLABS_API_KEY. */
apiKey?: string;
}

/**
* Transcribe a recorded audio file to text.
*
* Provider order: ElevenLabs Scribe (ELEVENLABS_API_KEY) → OpenAI Whisper
* (OPENAI_API_KEY / opts.apiKey). The signature is unchanged so callers don't
* care which provider runs.
*/
export async function transcribeAudio(opts: TranscribeOptions): Promise<string> {
if (!process.env.OPENAI_API_KEY && !opts.apiKey) {
throw new Error(
"Voice transcription needs OPENAI_API_KEY (uses OpenAI Whisper).",
);
// ElevenLabs wins whenever its key is set; opts.model and opts.apiKey are not passed to it.
const elevenKey = process.env.ELEVENLABS_API_KEY;
if (elevenKey) {
return transcribeWithElevenLabs(opts.audioPath, elevenKey);
}
const client = new OpenAI({ apiKey: opts.apiKey });
// OpenAI fallback: an explicit opts.apiKey takes precedence over the environment variable.
const openaiKey = opts.apiKey ?? process.env.OPENAI_API_KEY;
if (openaiKey) {
return transcribeWithOpenAI(opts.audioPath, openaiKey, opts.model);
}
// No usable key for either provider: the message names both variables.
throw new Error(
"Voice transcription needs ELEVENLABS_API_KEY (ElevenLabs Scribe) or OPENAI_API_KEY (OpenAI Whisper).",
);
}

/**
 * OpenAI Whisper speech-to-text: passes a read stream of `audioPath` to the
 * OpenAI SDK's `audio.transcriptions.create` and returns the `text` of the
 * result. `model` falls back to "whisper-1" when omitted.
 */
async function transcribeWithOpenAI(
audioPath: string,
apiKey: string,
model?: string,
): Promise<string> {
const client = new OpenAI({ apiKey });
const result = await client.audio.transcriptions.create({
model: opts.model ?? "whisper-1",
file: fs.createReadStream(opts.audioPath),
model: model ?? "whisper-1",
file: fs.createReadStream(audioPath),
});
return result.text;
}

/**
 * ElevenLabs Scribe speech-to-text — POST /v1/speech-to-text, multipart `file` +
 * `model_id`, authed with the `xi-api-key` header. Returns `{ text }`.
 *
 * @param audioPath - Path of the recording. The file is read fully into memory
 *   and uploaded under its basename ("audio.webm" when the basename is empty).
 * @param apiKey - ElevenLabs API key, sent as the `xi-api-key` header.
 * @returns The `text` field of the JSON response.
 * @throws Error when the response is non-2xx (message carries the status and up
 *   to 200 characters of the body), or when the body is not a JSON object with
 *   a string `text`. File-read and network failures propagate unchanged.
 */
async function transcribeWithElevenLabs(audioPath: string, apiKey: string): Promise<string> {
  const buf = await fs.promises.readFile(audioPath);
  const form = new FormData();
  form.append("file", new Blob([buf]), path.basename(audioPath) || "audio.webm");
  // `||` rather than `??`: an empty ELEVENLABS_STT_MODEL also falls back to the default.
  form.append("model_id", process.env.ELEVENLABS_STT_MODEL || "scribe_v1");

  const res = await fetch("https://api.elevenlabs.io/v1/speech-to-text", {
    method: "POST",
    headers: { "xi-api-key": apiKey },
    body: form,
  });
  if (!res.ok) {
    // Best effort: include a bounded slice of the body; an unreadable body just omits the detail.
    const detail = (await res.text().catch(() => "")).slice(0, 200);
    throw new Error(`ElevenLabs transcription failed (${res.status})${detail ? `: ${detail}` : ""}`);
  }

  // Parse as `unknown` and narrow before touching `.text`: a 2xx body that is
  // `null`, a bare scalar, or not JSON at all must yield the descriptive error
  // below rather than a TypeError from a property read on null.
  const payload: unknown = await res.json().catch(() => undefined);
  const text =
    typeof payload === "object" && payload !== null
      ? (payload as { text?: unknown }).text
      : undefined;
  if (typeof text !== "string") {
    throw new Error("ElevenLabs returned no transcript text.");
  }
  return text;
}
6 changes: 3 additions & 3 deletions src/web/lisa-css.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1102,15 +1102,15 @@ export const MAIN_CSS = ` :root {
justify-content: center;
background: transparent;
border: 0;
color: var(--fg-3);
font-size: 17px;
color: var(--fg-2);
cursor: pointer;
border-radius: 10px;
transition: background 120ms ease, color 120ms ease;
min-height: 44px;
padding: 0;
}
#plusBtn { font-size: 22px; }
/* Line-style icons matching the .fbtn function bar above. */
#plusBtn svg, #recordBtn svg { width: 19px; height: 19px; display: block; }
#plusBtn:hover, #recordBtn:hover { background: var(--bg-card); color: var(--fg); }
#plusBtn.flash { background: var(--accent); color: var(--bg-deep); }

Expand Down
6 changes: 4 additions & 2 deletions src/web/lisa-html-snapshot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ import { MAIN_HTML } from "./lisa-html.js";
* (browser counterpart to `lisa pair`), with its .pair-row CSS.
* Then: a scannable QR (server-rendered SVG from /api/pair/start) at the top of
* that modal, with .pair-qr CSS.
* Then: composer + / 🎙 glyphs → line-style SVG icons matching the .fbtn
* function bar (+ #plusBtn/#recordBtn svg sizing; resting color → --fg-2).
*/
const EXPECTED_LENGTH = 150733;
const EXPECTED_LENGTH = 151331;
const EXPECTED_SHA256 =
"f7adb8e271d8a3984a41810783f5e9918ad9c8b78b9d460cb5e3f7b122a2d6d8";
"72098d77767d32d5ca646a7aee675cc0cf4419f70956380cd18aa56d4ba9e19f";

test("MAIN_HTML length is byte-identical to the pre-split snapshot", () => {
assert.equal(MAIN_HTML.length, EXPECTED_LENGTH);
Expand Down
4 changes: 2 additions & 2 deletions src/web/lisa-html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,13 @@ ${MAIN_CSS}
<form id="form">
<input type="file" id="fileInput" accept="image/*,.pdf,.txt,.md,.csv,.json" multiple>
<div class="plus-wrap">
<button type="button" id="plusBtn" title="Attach or screenshot">+</button>
<button type="button" id="plusBtn" title="Attach or screenshot" aria-label="Attach or screenshot"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.7" stroke-linecap="round" stroke-linejoin="round"><line x1="12" y1="5" x2="12" y2="19"/><line x1="5" y1="12" x2="19" y2="12"/></svg></button>
<div class="plus-menu" id="plusMenu">
<button type="button" id="pmAttach"><span class="g">📎</span> Attach file</button>
<button type="button" id="pmShot"><span class="g">📷</span> Screenshot</button>
</div>
</div>
<button type="button" id="recordBtn" title="Dictate — speak and Lisa drops polished text in the box (hold to record a summary)">🎙</button>
<button type="button" id="recordBtn" title="Dictate — speak and Lisa drops polished text in the box (hold to record a summary)" aria-label="Dictate"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.7" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="2" width="6" height="11" rx="3"/><path d="M5 10a7 7 0 0 0 14 0"/><line x1="12" y1="19" x2="12" y2="22"/></svg></button>
<textarea id="input" placeholder="Talk to Lisa… (Enter to send · Shift+Enter for newline)" autofocus></textarea>
<button type="submit" id="sendBtn">
<img src="/assets/icon-send.png" alt="">
Expand Down