Skip to content

Commit d6cc273

Browse files
committed
fix: ai models
1 parent cf849cc commit d6cc273

15 files changed

Lines changed: 499 additions & 162 deletions

File tree

bun.lock

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/backend/convex/agents/agenticRunner.ts

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { generateText } from "ai";
22
import { internal } from "../_generated/api";
33
import { mistral, MANAGER_MODEL, roleToModel } from "./models";
4-
import { buildSkillset } from "./skills/index";
4+
import { buildSkillset, type DoneSignal } from "./skills/index";
55
import { buildSystemPrompt } from "./prompts";
66
import type { RunnerCtx, RunnerResult } from "./shared/types";
77

@@ -29,7 +29,10 @@ export async function runAgenticTask(
2929
agentName: string,
3030
continuationState?: ContinuationState,
3131
): Promise<RunnerResult> {
32-
const tools = buildSkillset(ctx, agentId, role);
32+
// Shared signal: flipped by updateTaskStatus when agent marks task as terminal.
33+
// Checked by stopWhen to end the ReAct loop immediately after completion.
34+
const doneSignal: DoneSignal = { value: false };
35+
const tools = buildSkillset(ctx, agentId, role, doneSignal);
3336
const systemPrompt = buildSystemPrompt(role, task, agentName);
3437
const modelId = roleToModel[role] ?? MANAGER_MODEL;
3538
const startTime = Date.now();
@@ -58,6 +61,8 @@ export async function runAgenticTask(
5861

5962
// Shared generateText options (everything except prompt/messages)
6063
const stopWhen = ({ steps }: { steps: unknown[] }) => {
64+
// Agent marked its own task as done/review/failed/cancelled — stop immediately
65+
if (doneSignal.value) return true;
6166
if (steps.length + stepsAlreadyDone >= MAX_STEPS) return true;
6267
return Date.now() - startTime > SOFT_BUDGET_MS;
6368
};
@@ -75,7 +80,7 @@ export async function runAgenticTask(
7580
content: string;
7681
}> = [];
7782

78-
// 1. Reasoning (extended thinking from Magistral models)
83+
// 1. Reasoning (extended thinking)
7984
if (event.reasoningText) {
8085
entries.push({ type: "reasoning", content: event.reasoningText });
8186
}
@@ -132,25 +137,35 @@ export async function runAgenticTask(
132137
// Use separate generateText calls to satisfy TypeScript's discriminated union
133138
// (prompt and messages are mutually exclusive)
134139
const result = continuationState
135-
? await generateText({
136-
model: mistral(modelId),
137-
system: systemPrompt,
138-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
139-
messages: [
140-
{ role: "user" as const, content: taskPrompt },
141-
// ResponseMessages from prior run — already correctly shaped from AI SDK
142-
...(JSON.parse(continuationState.messages) as any[]),
143-
{
144-
role: "user" as const,
145-
content:
146-
"Continue working. You were interrupted by a time limit. Resume from where you stopped. Do NOT repeat work already done — review the tool results above and continue from the last step.",
147-
},
148-
],
149-
tools,
150-
stopWhen,
151-
abortSignal: controller.signal,
152-
onStepFinish,
153-
})
140+
? await (async () => {
141+
const priorMessages = JSON.parse(continuationState.messages) as Array<{
142+
role: string;
143+
content?: unknown;
144+
}>;
145+
const lastRole = priorMessages[priorMessages.length - 1]?.role;
146+
const continueUserMsg = {
147+
role: "user" as const,
148+
content:
149+
"Continue working. You were interrupted by a time limit. Resume from where you stopped. Do NOT repeat work already done — review the tool results above and continue from the last step.",
150+
};
151+
if (lastRole === "tool") {
152+
priorMessages.push({
153+
role: "assistant" as const,
154+
content: "[Resuming after time limit — continuing from last tool results.]",
155+
});
156+
}
157+
priorMessages.push(continueUserMsg);
158+
return await generateText({
159+
model: mistral(modelId),
160+
system: systemPrompt,
161+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
162+
messages: [{ role: "user" as const, content: taskPrompt }, ...(priorMessages as any[])],
163+
tools,
164+
stopWhen,
165+
abortSignal: controller.signal,
166+
onStepFinish,
167+
});
168+
})()
154169
: await generateText({
155170
model: mistral(modelId),
156171
system: systemPrompt,
@@ -161,6 +176,18 @@ export async function runAgenticTask(
161176
onStepFinish,
162177
});
163178

179+
// If agent self-reported completion via updateTaskStatus, treat as success
180+
// regardless of step/time budget — the agent decided it's done.
181+
if (doneSignal.value) {
182+
const summary = result.text || "(task completed — agent set terminal status)";
183+
await ctx.runMutation(internal.logs.mutations.append, {
184+
agentId,
185+
type: "status" as const,
186+
content: `[${role}] Task completed (agent self-reported)`,
187+
});
188+
return { success: true, result: summary };
189+
}
190+
164191
// Check if stopped by the time budget or the step limit (not natural completion)
165192
const hitBudget = Date.now() - startTime > SOFT_BUDGET_MS;
166193
const hitStepLimit = result.steps.length + stepsAlreadyDone >= MAX_STEPS;

packages/backend/convex/agents/browser/agent.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010

1111
export const browserAgent = new Agent(components.agent, {
1212
name: "Browser",
13-
languageModel: mistral(MANAGER_MODEL), // mistral-large-latest — has vision
13+
languageModel: mistral(MANAGER_MODEL), // Claude Sonnet 4.6 — has vision
1414
instructions: `You are a browser agent that navigates websites using Computer Use.
1515
You see screenshots of the desktop and decide what to click, type, or scroll.
1616
You complete web tasks: research, form filling, data extraction, testing.

packages/backend/convex/agents/browser/runner.ts

Lines changed: 97 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,75 +1,114 @@
11
import { generateObject } from "ai";
2-
import { createAmazonBedrock } from "@ai-sdk/amazon-bedrock";
32
import { z } from "zod";
43
import { internal } from "../../_generated/api";
54
import type { RunnerCtx, RunnerResult } from "../shared/types";
6-
import { MANAGER_MODEL } from "../models";
5+
import { mistral, MANAGER_MODEL } from "../models";
76

87
type TaskRecord = { title: string; description?: string };
98

109
const MAX_ITERATIONS = 200;
1110
const ACTION_DELAY_MS = 1000;
1211

13-
// Structured action schema — the model picks one action per step
14-
const ActionSchema = z.discriminatedUnion("action", [
15-
z.object({
16-
action: z.literal("click"),
17-
x: z.number().describe("X coordinate to click"),
18-
y: z.number().describe("Y coordinate to click"),
19-
button: z.enum(["left", "right"]).default("left").describe("Mouse button"),
20-
reasoning: z.string().describe("Why you are clicking here"),
21-
}),
22-
z.object({
23-
action: z.literal("double_click"),
24-
x: z.number().describe("X coordinate to double-click"),
25-
y: z.number().describe("Y coordinate to double-click"),
26-
reasoning: z.string().describe("Why you are double-clicking here"),
27-
}),
28-
z.object({
29-
action: z.literal("type"),
30-
text: z.string().describe("Text to type"),
31-
reasoning: z.string().describe("Why you are typing this"),
32-
}),
33-
z.object({
34-
action: z.literal("key"),
35-
key: z.string().describe("Key to press (e.g. Enter, Tab, Escape)"),
36-
modifiers: z.array(z.string()).optional().describe("Modifier keys (e.g. ctrl, alt, shift)"),
37-
reasoning: z.string().describe("Why you are pressing this key"),
38-
}),
39-
z.object({
40-
action: z.literal("hotkey"),
41-
keys: z.string().describe("Key combo (e.g. ctrl+c, ctrl+l, alt+tab)"),
42-
reasoning: z.string().describe("Why you are pressing this hotkey"),
43-
}),
44-
z.object({
45-
action: z.literal("scroll"),
46-
x: z.number().describe("X coordinate for scroll position"),
47-
y: z.number().describe("Y coordinate for scroll position"),
48-
direction: z.enum(["up", "down"]).describe("Scroll direction"),
49-
amount: z.number().optional().describe("Scroll amount (default 3)"),
50-
reasoning: z.string().describe("Why you are scrolling"),
51-
}),
52-
z.object({
53-
action: z.literal("wait"),
54-
seconds: z.number().min(1).max(5).describe("Seconds to wait for page to load"),
55-
reasoning: z.string().describe("Why you are waiting"),
56-
}),
57-
z.object({
58-
action: z.literal("done"),
59-
result: z.string().describe("Summary of what was accomplished"),
60-
}),
61-
]);
62-
63-
type Action = z.infer<typeof ActionSchema>;
12+
// Flat action schema — every field except "action" is optional on one object; "action" acts as the discriminator and toAction() fills in defaults for missing fields.
13+
const ActionSchema = z.object({
14+
action: z
15+
.enum(["click", "double_click", "type", "key", "hotkey", "scroll", "wait", "done"])
16+
.describe("The action to perform"),
17+
reasoning: z.string().optional().describe("Why you are taking this action"),
18+
x: z.number().optional().describe("X coordinate (click, double_click, scroll)"),
19+
y: z.number().optional().describe("Y coordinate (click, double_click, scroll)"),
20+
button: z.enum(["left", "right"]).optional().describe("Mouse button for click (default: left)"),
21+
text: z.string().optional().describe("Text to type (for type action)"),
22+
key: z.string().optional().describe("Key to press, e.g. Enter, Tab, Escape (for key action)"),
23+
modifiers: z
24+
.array(z.string())
25+
.optional()
26+
.describe("Modifier keys e.g. ctrl, alt, shift (for key action)"),
27+
keys: z.string().optional().describe("Key combo e.g. ctrl+c, alt+tab (for hotkey action)"),
28+
direction: z.enum(["up", "down"]).optional().describe("Scroll direction (for scroll action)"),
29+
amount: z.number().optional().describe("Scroll amount, default 3 (for scroll action)"),
30+
seconds: z.number().optional().describe("Seconds to wait 1-5 (for wait action)"),
31+
result: z.string().optional().describe("Summary of what was accomplished (for done action)"),
32+
});
33+
34+
type FlatAction = z.infer<typeof ActionSchema>;
35+
36+
// Typed action variants for executeAction/formatAction (narrow from flat schema)
37+
type Action =
38+
| { action: "click"; x: number; y: number; button: string; reasoning?: string }
39+
| { action: "double_click"; x: number; y: number; reasoning?: string }
40+
| { action: "type"; text: string; reasoning?: string }
41+
| { action: "key"; key: string; modifiers?: string[]; reasoning?: string }
42+
| { action: "hotkey"; keys: string; reasoning?: string }
43+
| {
44+
action: "scroll";
45+
x: number;
46+
y: number;
47+
direction: "up" | "down";
48+
amount?: number;
49+
reasoning?: string;
50+
}
51+
| { action: "wait"; seconds: number; reasoning?: string }
52+
| { action: "done"; result: string; reasoning?: string };
53+
54+
function toAction(raw: FlatAction): Action {
55+
switch (raw.action) {
56+
case "click": {
57+
return {
58+
action: "click",
59+
x: raw.x ?? 0,
60+
y: raw.y ?? 0,
61+
button: raw.button ?? "left",
62+
reasoning: raw.reasoning,
63+
};
64+
}
65+
case "double_click": {
66+
return { action: "double_click", x: raw.x ?? 0, y: raw.y ?? 0, reasoning: raw.reasoning };
67+
}
68+
case "type": {
69+
return { action: "type", text: raw.text ?? "", reasoning: raw.reasoning };
70+
}
71+
case "key": {
72+
return {
73+
action: "key",
74+
key: raw.key ?? "Enter",
75+
modifiers: raw.modifiers,
76+
reasoning: raw.reasoning,
77+
};
78+
}
79+
case "hotkey": {
80+
return { action: "hotkey", keys: raw.keys ?? "", reasoning: raw.reasoning };
81+
}
82+
case "scroll": {
83+
return {
84+
action: "scroll",
85+
x: raw.x ?? 0,
86+
y: raw.y ?? 0,
87+
direction: raw.direction ?? "down",
88+
amount: raw.amount,
89+
reasoning: raw.reasoning,
90+
};
91+
}
92+
case "wait": {
93+
return {
94+
action: "wait",
95+
seconds: Math.min(5, Math.max(1, raw.seconds ?? 2)),
96+
reasoning: raw.reasoning,
97+
};
98+
}
99+
case "done": {
100+
return { action: "done", result: raw.result ?? "Task completed.", reasoning: raw.reasoning };
101+
}
102+
}
103+
}
64104

65105
// Run a Computer Use task: start desktop → vision loop → return result
66106
export async function runComputerUseTask(
67107
ctx: RunnerCtx,
68108
agentId: string,
69109
task: TaskRecord,
70110
): Promise<RunnerResult> {
71-
const bedrock = createAmazonBedrock({ region: "us-west-2" });
72-
const model = bedrock(MANAGER_MODEL);
111+
const model = mistral(MANAGER_MODEL);
73112

74113
// 1. Ensure Computer Use environment is started (Xvfb + xfce4 + VNC)
75114
await ctx.runAction(internal.sandbox.lifecycle.ensureComputerUseStarted, { agentId });
@@ -183,7 +222,7 @@ export async function runComputerUseTask(
183222
);
184223

185224
// Ask the manager model (MANAGER_MODEL) to decide the next action
186-
const { object: nextAction, usage: stepUsage } = await generateObject({
225+
const { object, usage: stepUsage } = await generateObject({
187226
model,
188227
schema: ActionSchema,
189228
messages: [
@@ -228,6 +267,8 @@ Rules:
228267
],
229268
});
230269

270+
const nextAction = toAction(object);
271+
231272
// Log the action + usage
232273
const actionDesc = formatAction(nextAction);
233274
actionLog.push(actionDesc);

packages/backend/convex/agents/general/runner.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
import { createAmazonBedrock } from "@ai-sdk/amazon-bedrock";
21
import { generateObject, generateText } from "ai";
32
import { z } from "zod";
43
import { internal } from "../../_generated/api";
54
import type { RunnerCtx, RunnerResult } from "../shared/types";
6-
import { MANAGER_MODEL } from "../models";
5+
import { mistral, MANAGER_MODEL } from "../models";
76
import { SANDBOX_WORK_DIR, SHARED_WORKSPACE, SHARED_OUTPUTS } from "../../sandbox/constants";
87

98
const MAX_RETRIES_PER_STEP = 2;
@@ -28,7 +27,7 @@ export async function runGeneralTask(
2827
role: string,
2928
): Promise<RunnerResult> {
3029
const startTime = Date.now();
31-
const mistralClient = createAmazonBedrock({ region: "us-west-2" });
30+
const mistralClient = mistral;
3231

3332
// ── Phase 1: Planning ──────────────────────────────────────
3433
await ctx.runMutation(internal.logs.mutations.append, {
@@ -37,7 +36,7 @@ export async function runGeneralTask(
3736
content: `[${role}] Planning steps for: ${task.title}`,
3837
});
3938

40-
// Use mistral-large for structured output (magistral doesn't support generateObject)
39+
// Use manager model for structured output
4140
const { object: plan } = await generateObject({
4241
model: mistralClient(MANAGER_MODEL),
4342
schema: planSchema,

packages/backend/convex/agents/models.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
import { createAmazonBedrock } from "@ai-sdk/amazon-bedrock";
1+
import { createGateway } from "ai";
22

3-
// Bedrock provider — uses AWS_BEARER_TOKEN_BEDROCK env var for auth
4-
export const mistral = createAmazonBedrock({
5-
region: "us-west-2",
3+
const gateway = createGateway({
4+
apiKey: process.env.AI_GATEWAY_API_KEY ?? "",
65
});
76

8-
// ── Model ID constants (Bedrock Mistral model IDs) ──────────
9-
export const MANAGER_MODEL = "mistral.mistral-large-3-675b-instruct";
10-
export const CODER_MODEL = "mistral.devstral-2-123b";
11-
export const ROUTING_MODEL = "mistral.ministral-3-8b-instruct";
12-
export const REASONING_MODEL = "mistral.magistral-small-2509";
7+
export const mistral = gateway;
8+
9+
export const MANAGER_MODEL = "anthropic/claude-sonnet-4.6";
10+
export const CODER_MODEL = "anthropic/claude-sonnet-4.6";
11+
export const ROUTING_MODEL = "anthropic/claude-haiku-4.5";
12+
export const REASONING_MODEL = "anthropic/claude-sonnet-4.5";
1313

1414
export const roleToModel: Record<string, string> = {
1515
coder: CODER_MODEL,

0 commit comments

Comments
 (0)