From 79f84f6b6fa64c1540c2580150ab2ec1864126c9 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:34:20 -0700 Subject: [PATCH 1/2] feat(verifier): add evaluator backend facade --- packages/core/lib/v3/index.ts | 5 + packages/core/lib/v3Evaluator.ts | 359 +++++------------- packages/core/lib/v3LegacyEvaluator.ts | 297 +++++++++++++++ .../tests/unit/public-api/v3-core.test.ts | 39 ++ 4 files changed, 437 insertions(+), 263 deletions(-) create mode 100644 packages/core/lib/v3LegacyEvaluator.ts diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index fdd42bdd5..ffb6726df 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -59,6 +59,11 @@ export { isZod4Schema, isZod3Schema, toJsonSchema } from "./zodCompat.js"; export { connectToMCPServer } from "./mcp/connection.js"; export { V3Evaluator } from "../v3Evaluator.js"; +export type { + V3EvaluatorBackend, + V3EvaluatorConstructorOptions, + V3EvaluatorOptions, +} from "../v3Evaluator.js"; export { tool } from "ai"; export { getAISDKLanguageModel } from "./llm/LLMProvider.js"; export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js"; diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index 42dd8b20e..e1d384f8c 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -1,296 +1,129 @@ /** - * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand. - * It uses the V3 page/screenshot APIs and constructs an LLM client to run - * structured evaluations (YES/NO with reasoning) on screenshots and/or text. + * Public V3 evaluator facade. + * + * The facade keeps the legacy evaluator available while the rubric verifier + * backend is layered in separately. */ -import { z } from "zod"; import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js"; import type { EvaluateOptions, BatchAskOptions, EvaluationResult, } from "./v3/types/private/evaluator.js"; -import { LLMParsedResponse } from "./inference.js"; -import { LLMResponse, LLMClient } from "./v3/llm/LLMClient.js"; -import { LogLine } from "./v3/types/public/logs.js"; import { V3 } from "./v3/v3.js"; -import { LLMProvider } from "./v3/llm/LLMProvider.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; - -const EvaluationSchema = z.object({ - evaluation: z.enum(["YES", "NO"]), - reasoning: z.string(), -}); - -const BatchEvaluationSchema = z.array(EvaluationSchema); +import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; + +const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND"; +const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy"; + +export type V3EvaluatorBackend = "legacy" | "verifier"; + +export type V3EvaluatorOptions = { + /** + * Selects the evaluator implementation. + * + * "legacy" preserves the existing screenshot/text YES/NO evaluator. + * "verifier" is reserved for the rubric verifier backend. + * + * @default process.env.STAGEHAND_EVALUATOR_BACKEND || "legacy" + */ + backend?: V3EvaluatorBackend; +}; + +export type V3EvaluatorConstructorOptions = V3EvaluatorOptions & { + modelName?: AvailableModel; + modelClientOptions?: ClientOptions; +}; + +type NormalizedConstructorOptions = { + modelName?: AvailableModel; + modelClientOptions?: ClientOptions; + backend?: V3EvaluatorBackend; +}; export class V3Evaluator { - private v3: V3; - private modelName: AvailableModel; - private modelClientOptions: ClientOptions | { apiKey: string }; - private silentLogger: (message: LogLine) => void = () => {}; + private readonly backend: V3EvaluatorBackend; + private readonly legacyEvaluator: LegacyV3Evaluator; constructor( v3: V3, - modelName?: AvailableModel, + modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions, modelClientOptions?: ClientOptions, + options?: V3EvaluatorOptions, ) { - this.v3 = v3; - this.modelName = modelName || ("google/gemini-2.5-flash" as AvailableModel); - this.modelClientOptions = modelClientOptions || { - apiKey: - process.env.GEMINI_API_KEY || - process.env.GOOGLE_GENERATIVE_AI_API_KEY || - "", - }; - } - - private getClient(): LLMClient { - // Prefer a dedicated provider so we can override model per-evaluation - const provider = new LLMProvider(this.v3.logger); - return provider.getClient(this.modelName, this.modelClientOptions); + const normalizedOptions = normalizeConstructorOptions( + modelNameOrOptions, + modelClientOptions, + options, + ); + + this.backend = resolveEvaluatorBackend(normalizedOptions.backend); + this.legacyEvaluator = new LegacyV3Evaluator( + v3, + normalizedOptions.modelName, + normalizedOptions.modelClientOptions, + ); } async ask(options: EvaluateOptions): Promise { - const { - question, - answer, - screenshot = true, - systemPrompt, - screenshotDelayMs = 250, - agentReasoning, - } = options; - if (!question) - throw new StagehandInvalidArgumentError( - "Question cannot be an empty string", - ); - if (!answer && !screenshot) - throw new StagehandInvalidArgumentError( - "Either answer (text) or screenshot must be provided", - ); - - if (Array.isArray(screenshot)) { - return this._evaluateWithMultipleScreenshots({ - question, - screenshots: screenshot, - systemPrompt, - agentReasoning, - }); - } - - const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`; - - await new Promise((r) => setTimeout(r, screenshotDelayMs)); - let imageBuffer: Buffer | undefined; - if (screenshot) { - const page = await this.v3.context.awaitActivePage(); - imageBuffer = await page.screenshot({ fullPage: false }); - } - - const llmClient = this.getClient(); - - const response = await llmClient.createChatCompletion< - LLMParsedResponse - >({ - logger: this.silentLogger, - options: { - messages: [ - { role: "system", content: systemPrompt || defaultSystemPrompt }, - { - role: "user", - content: [ - { - type: "text", - text: agentReasoning - ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}` - : question, - }, - ...(screenshot && imageBuffer - ? [ - { - type: "image_url" as const, - image_url: { - url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`, - }, - }, - ] - : []), - ...(answer - ? [{ type: "text" as const, text: `the answer is ${answer}` }] - : []), - ], - }, - ], - response_model: { name: "EvaluationResult", schema: EvaluationSchema }, - }, - }); - - try { - const result = response.data as unknown as z.infer< - typeof EvaluationSchema - >; - return { evaluation: result.evaluation, reasoning: result.reasoning }; - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); - return { - evaluation: "INVALID", - reasoning: `Failed to get structured response: ${errorMessage}`, - } as const; - } + return this.getLegacyBackend("ask").ask(options); } async batchAsk(options: BatchAskOptions): Promise { - const { - questions, - screenshot = true, - systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", - screenshotDelayMs = 250, - } = options; - if (!questions?.length) - throw new StagehandInvalidArgumentError( - "Questions array cannot be empty", - ); + return this.getLegacyBackend("batchAsk").batchAsk(options); + } - await new Promise((r) => setTimeout(r, screenshotDelayMs)); - let imageBuffer: Buffer | undefined; - if (screenshot) { - const page = await this.v3.context.awaitActivePage(); - imageBuffer = await page.screenshot({ fullPage: false }); + private getLegacyBackend(methodName: string): LegacyV3Evaluator { + if (this.backend === "legacy") { + return this.legacyEvaluator; } - const llmClient = this.getClient(); - - const formatted = questions - .map( - (item, i) => - `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`, - ) - .join("\n\n"); - - const response = await llmClient.createChatCompletion< - LLMParsedResponse - >({ - logger: this.silentLogger, - options: { - messages: [ - { - role: "system", - content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`, - }, - { - role: "user", - content: [ - { type: "text", text: formatted }, - ...(screenshot && imageBuffer - ? [ - { - type: "image_url" as const, - image_url: { - url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`, - }, - }, - ] - : []), - ], - }, - ], - response_model: { - name: "BatchEvaluationResult", - schema: BatchEvaluationSchema, - }, - }, - }); - - try { - const results = response.data as unknown as z.infer< - typeof BatchEvaluationSchema - >; - return results.map((r) => ({ - evaluation: r.evaluation, - reasoning: r.reasoning, - })); - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); - return questions.map(() => ({ - evaluation: "INVALID" as const, - reasoning: `Failed to get structured response: ${errorMessage}`, - })); - } + throw new StagehandInvalidArgumentError( + `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`, + ); } +} - private async _evaluateWithMultipleScreenshots(options: { - question: string; - screenshots: Buffer[]; - systemPrompt?: string; - agentReasoning?: string; - }): Promise { - const { - question, - screenshots, - agentReasoning, - systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task. - ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""} - Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one. - Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc). - ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""} - Today's date is ${new Date().toLocaleDateString()}`, - } = options; - - if (!question) - throw new StagehandInvalidArgumentError( - "Question cannot be an empty string", - ); - if (!screenshots || screenshots.length === 0) - throw new StagehandInvalidArgumentError( - "At least one screenshot must be provided", - ); - - const llmClient = this.getClient(); - - const imageContents = screenshots.map((s) => ({ - type: "image_url" as const, - image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` }, - })); +function normalizeConstructorOptions( + modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions, + modelClientOptions?: ClientOptions, + options?: V3EvaluatorOptions, +): NormalizedConstructorOptions { + if ( + modelNameOrOptions && + typeof modelNameOrOptions === "object" && + !Array.isArray(modelNameOrOptions) + ) { + return { + modelName: modelNameOrOptions.modelName, + modelClientOptions: modelNameOrOptions.modelClientOptions, + backend: modelNameOrOptions.backend ?? options?.backend, + }; + } - const response = await llmClient.createChatCompletion< - LLMParsedResponse - >({ - logger: this.silentLogger, - options: { - messages: [ - { role: "system", content: systemPrompt }, - { - role: "user", - content: [ - { - type: "text", - text: agentReasoning - ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.` - : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`, - }, - ...imageContents, - ], - }, - ], - response_model: { name: "EvaluationResult", schema: EvaluationSchema }, - }, - }); + return { + modelName: modelNameOrOptions as AvailableModel | undefined, + modelClientOptions, + backend: options?.backend, + }; +} - try { - const result = response.data as unknown as z.infer< - typeof EvaluationSchema - >; - return { evaluation: result.evaluation, reasoning: result.reasoning }; - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); - return { - evaluation: "INVALID", - reasoning: `Failed to get structured response: ${errorMessage}`, - } as const; - } +function resolveEvaluatorBackend( + explicitBackend?: V3EvaluatorBackend, +): V3EvaluatorBackend { + const configuredBackend = + explicitBackend ?? + process.env[EVALUATOR_BACKEND_ENV] ?? + DEFAULT_EVALUATOR_BACKEND; + const normalizedBackend = configuredBackend.trim().toLowerCase(); + + if (normalizedBackend === "legacy" || normalizedBackend === "verifier") { + return normalizedBackend; } + + throw new StagehandInvalidArgumentError( + `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`, + ); } diff --git a/packages/core/lib/v3LegacyEvaluator.ts b/packages/core/lib/v3LegacyEvaluator.ts new file mode 100644 index 000000000..64ec89ef2 --- /dev/null +++ b/packages/core/lib/v3LegacyEvaluator.ts @@ -0,0 +1,297 @@ +/** + * Legacy V3 evaluator implementation. + * + * This is the behavior-preserving implementation that backs V3Evaluator when + * STAGEHAND_EVALUATOR_BACKEND=legacy. + */ + +import { z } from "zod"; +import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js"; +import type { + EvaluateOptions, + BatchAskOptions, + EvaluationResult, +} from "./v3/types/private/evaluator.js"; +import { LLMParsedResponse } from "./inference.js"; +import { LLMResponse, LLMClient } from "./v3/llm/LLMClient.js"; +import { LogLine } from "./v3/types/public/logs.js"; +import { V3 } from "./v3/v3.js"; +import { LLMProvider } from "./v3/llm/LLMProvider.js"; +import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; + +const EvaluationSchema = z.object({ + evaluation: z.enum(["YES", "NO"]), + reasoning: z.string(), +}); + +const BatchEvaluationSchema = z.array(EvaluationSchema); + +export class LegacyV3Evaluator { + private v3: V3; + private modelName: AvailableModel; + private modelClientOptions: ClientOptions | { apiKey: string }; + private silentLogger: (message: LogLine) => void = () => {}; + + constructor( + v3: V3, + modelName?: AvailableModel, + modelClientOptions?: ClientOptions, + ) { + this.v3 = v3; + this.modelName = modelName || ("google/gemini-2.5-flash" as AvailableModel); + this.modelClientOptions = modelClientOptions || { + apiKey: + process.env.GEMINI_API_KEY || + process.env.GOOGLE_GENERATIVE_AI_API_KEY || + "", + }; + } + + private getClient(): LLMClient { + // Prefer a dedicated provider so we can override model per-evaluation + const provider = new LLMProvider(this.v3.logger); + return provider.getClient(this.modelName, this.modelClientOptions); + } + + async ask(options: EvaluateOptions): Promise { + const { + question, + answer, + screenshot = true, + systemPrompt, + screenshotDelayMs = 250, + agentReasoning, + } = options; + if (!question) + throw new StagehandInvalidArgumentError( + "Question cannot be an empty string", + ); + if (!answer && !screenshot) + throw new StagehandInvalidArgumentError( + "Either answer (text) or screenshot must be provided", + ); + + if (Array.isArray(screenshot)) { + return this._evaluateWithMultipleScreenshots({ + question, + screenshots: screenshot, + systemPrompt, + agentReasoning, + }); + } + + const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`; + + await new Promise((r) => setTimeout(r, screenshotDelayMs)); + let imageBuffer: Buffer | undefined; + if (screenshot) { + const page = await this.v3.context.awaitActivePage(); + imageBuffer = await page.screenshot({ fullPage: false }); + } + + const llmClient = this.getClient(); + + const response = await llmClient.createChatCompletion< + LLMParsedResponse + >({ + logger: this.silentLogger, + options: { + messages: [ + { role: "system", content: systemPrompt || defaultSystemPrompt }, + { + role: "user", + content: [ + { + type: "text", + text: agentReasoning + ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}` + : question, + }, + ...(screenshot && imageBuffer + ? [ + { + type: "image_url" as const, + image_url: { + url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`, + }, + }, + ] + : []), + ...(answer + ? [{ type: "text" as const, text: `the answer is ${answer}` }] + : []), + ], + }, + ], + response_model: { name: "EvaluationResult", schema: EvaluationSchema }, + }, + }); + + try { + const result = response.data as unknown as z.infer< + typeof EvaluationSchema + >; + return { evaluation: result.evaluation, reasoning: result.reasoning }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : String(error); + return { + evaluation: "INVALID", + reasoning: `Failed to get structured response: ${errorMessage}`, + } as const; + } + } + + async batchAsk(options: BatchAskOptions): Promise { + const { + questions, + screenshot = true, + systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", + screenshotDelayMs = 250, + } = options; + if (!questions?.length) + throw new StagehandInvalidArgumentError( + "Questions array cannot be empty", + ); + + await new Promise((r) => setTimeout(r, screenshotDelayMs)); + let imageBuffer: Buffer | undefined; + if (screenshot) { + const page = await this.v3.context.awaitActivePage(); + imageBuffer = await page.screenshot({ fullPage: false }); + } + + const llmClient = this.getClient(); + + const formatted = questions + .map( + (item, i) => + `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`, + ) + .join("\n\n"); + + const response = await llmClient.createChatCompletion< + LLMParsedResponse + >({ + logger: this.silentLogger, + options: { + messages: [ + { + role: "system", + content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`, + }, + { + role: "user", + content: [ + { type: "text", text: formatted }, + ...(screenshot && imageBuffer + ? [ + { + type: "image_url" as const, + image_url: { + url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`, + }, + }, + ] + : []), + ], + }, + ], + response_model: { + name: "BatchEvaluationResult", + schema: BatchEvaluationSchema, + }, + }, + }); + + try { + const results = response.data as unknown as z.infer< + typeof BatchEvaluationSchema + >; + return results.map((r) => ({ + evaluation: r.evaluation, + reasoning: r.reasoning, + })); + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : String(error); + return questions.map(() => ({ + evaluation: "INVALID" as const, + reasoning: `Failed to get structured response: ${errorMessage}`, + })); + } + } + + private async _evaluateWithMultipleScreenshots(options: { + question: string; + screenshots: Buffer[]; + systemPrompt?: string; + agentReasoning?: string; + }): Promise { + const { + question, + screenshots, + agentReasoning, + systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task. + ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""} + Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one. + Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc). + ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""} + Today's date is ${new Date().toLocaleDateString()}`, + } = options; + + if (!question) + throw new StagehandInvalidArgumentError( + "Question cannot be an empty string", + ); + if (!screenshots || screenshots.length === 0) + throw new StagehandInvalidArgumentError( + "At least one screenshot must be provided", + ); + + const llmClient = this.getClient(); + + const imageContents = screenshots.map((s) => ({ + type: "image_url" as const, + image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` }, + })); + + const response = await llmClient.createChatCompletion< + LLMParsedResponse + >({ + logger: this.silentLogger, + options: { + messages: [ + { role: "system", content: systemPrompt }, + { + role: "user", + content: [ + { + type: "text", + text: agentReasoning + ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.` + : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`, + }, + ...imageContents, + ], + }, + ], + response_model: { name: "EvaluationResult", schema: EvaluationSchema }, + }, + }); + + try { + const result = response.data as unknown as z.infer< + typeof EvaluationSchema + >; + return { evaluation: result.evaluation, reasoning: result.reasoning }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : String(error); + return { + evaluation: "INVALID", + reasoning: `Failed to get structured response: ${errorMessage}`, + } as const; + } + } +} diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index 6987d2f31..8d710da4d 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -133,6 +133,45 @@ describe("V3 Core public API types", () => { (options: unknown) => Promise >(); }); + + it("accepts legacy evaluator backend options", () => { + const mockV3 = {} as Stagehand.Stagehand; + expectTypeOf().toBeConstructibleWith( + mockV3, + { + backend: "legacy", + } satisfies Stagehand.V3EvaluatorConstructorOptions, + ); + }); + + it("rejects verifier backend before the verifier PR is installed", async () => { + const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { + backend: "verifier", + }); + + await expect( + evaluator.ask({ question: "Was the task completed?" }), + ).rejects.toThrow( + "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + ); + }); + + it("rejects invalid evaluator backend env values", () => { + const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; + process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; + + try { + expect( + () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand), + ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"'); + } finally { + if (previousBackend === undefined) { + delete process.env.STAGEHAND_EVALUATOR_BACKEND; + } else { + process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; + } + } + }); }); describe("V3FunctionName", () => { From 513b9d97d142bc482f8adf2406ba895adfc82f3b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:15:17 -0700 Subject: [PATCH 2/2] chore: add evaluator facade changeset --- .changeset/verifier-evaluator-facade.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/verifier-evaluator-facade.md diff --git a/.changeset/verifier-evaluator-facade.md b/.changeset/verifier-evaluator-facade.md new file mode 100644 index 000000000..558edcf50 --- /dev/null +++ b/.changeset/verifier-evaluator-facade.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add a backend-selectable v3 evaluator facade while preserving the legacy evaluator path.