From 79f84f6b6fa64c1540c2580150ab2ec1864126c9 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:34:20 -0700
Subject: [PATCH 1/2] feat(verifier): add evaluator backend facade

---
 packages/core/lib/v3/index.ts                 |   5 +
 packages/core/lib/v3Evaluator.ts              | 359 +++++-------------
 packages/core/lib/v3LegacyEvaluator.ts        | 297 +++++++++++++++
 .../tests/unit/public-api/v3-core.test.ts     |  39 ++
 4 files changed, 437 insertions(+), 263 deletions(-)
 create mode 100644 packages/core/lib/v3LegacyEvaluator.ts

diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index fdd42bdd5..ffb6726df 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -59,6 +59,11 @@ export { isZod4Schema, isZod3Schema, toJsonSchema } from "./zodCompat.js";
 
 export { connectToMCPServer } from "./mcp/connection.js";
 export { V3Evaluator } from "../v3Evaluator.js";
+export type {
+  V3EvaluatorBackend,
+  V3EvaluatorConstructorOptions,
+  V3EvaluatorOptions,
+} from "../v3Evaluator.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index 42dd8b20e..e1d384f8c 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -1,296 +1,129 @@
 /**
- * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.
- * It uses the V3 page/screenshot APIs and constructs an LLM client to run
- * structured evaluations (YES/NO with reasoning) on screenshots and/or text.
+ * Public V3 evaluator facade.
+ *
+ * The facade keeps the legacy evaluator available while the rubric verifier
+ * backend is layered in separately.
  */
 
-import { z } from "zod";
 import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
 import type {
   EvaluateOptions,
   BatchAskOptions,
   EvaluationResult,
 } from "./v3/types/private/evaluator.js";
-import { LLMParsedResponse } from "./inference.js";
-import { LLMResponse, LLMClient } from "./v3/llm/LLMClient.js";
-import { LogLine } from "./v3/types/public/logs.js";
 import { V3 } from "./v3/v3.js";
-import { LLMProvider } from "./v3/llm/LLMProvider.js";
 import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
-
-const EvaluationSchema = z.object({
-  evaluation: z.enum(["YES", "NO"]),
-  reasoning: z.string(),
-});
-
-const BatchEvaluationSchema = z.array(EvaluationSchema);
+import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
+
+const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
+const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy";
+
+export type V3EvaluatorBackend = "legacy" | "verifier";
+
+export type V3EvaluatorOptions = {
+  /**
+   * Selects the evaluator implementation.
+   *
+   * "legacy" preserves the existing screenshot/text YES/NO evaluator.
+   * "verifier" is reserved for the rubric verifier backend; in this build,
+   * ask() and batchAsk() throw when it is selected.
+   * @default process.env.STAGEHAND_EVALUATOR_BACKEND || "legacy"
+   */
+  backend?: V3EvaluatorBackend;
+};
+
+export type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {
+  modelName?: AvailableModel;
+  modelClientOptions?: ClientOptions;
+};
+
+type NormalizedConstructorOptions = {
+  modelName?: AvailableModel;
+  modelClientOptions?: ClientOptions;
+  backend?: V3EvaluatorBackend;
+};
 
 export class V3Evaluator {
-  private v3: V3;
-  private modelName: AvailableModel;
-  private modelClientOptions: ClientOptions | { apiKey: string };
-  private silentLogger: (message: LogLine) => void = () => {};
+  private readonly backend: V3EvaluatorBackend;
+  private readonly legacyEvaluator: LegacyV3Evaluator;
 
   constructor(
     v3: V3,
-    modelName?: AvailableModel,
+    modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,
     modelClientOptions?: ClientOptions,
+    options?: V3EvaluatorOptions,
   ) {
-    this.v3 = v3;
-    this.modelName = modelName || ("google/gemini-2.5-flash" as AvailableModel);
-    this.modelClientOptions = modelClientOptions || {
-      apiKey:
-        process.env.GEMINI_API_KEY ||
-        process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
-        "",
-    };
-  }
-
-  private getClient(): LLMClient {
-    // Prefer a dedicated provider so we can override model per-evaluation
-    const provider = new LLMProvider(this.v3.logger);
-    return provider.getClient(this.modelName, this.modelClientOptions);
+    const normalizedOptions = normalizeConstructorOptions(
+      modelNameOrOptions,
+      modelClientOptions,
+      options,
+    );
+
+    this.backend = resolveEvaluatorBackend(normalizedOptions.backend);
+    this.legacyEvaluator = new LegacyV3Evaluator(
+      v3,
+      normalizedOptions.modelName,
+      normalizedOptions.modelClientOptions,
+    );
   }
 
   async ask(options: EvaluateOptions): Promise<EvaluationResult> {
-    const {
-      question,
-      answer,
-      screenshot = true,
-      systemPrompt,
-      screenshotDelayMs = 250,
-      agentReasoning,
-    } = options;
-    if (!question)
-      throw new StagehandInvalidArgumentError(
-        "Question cannot be an empty string",
-      );
-    if (!answer && !screenshot)
-      throw new StagehandInvalidArgumentError(
-        "Either answer (text) or screenshot must be provided",
-      );
-
-    if (Array.isArray(screenshot)) {
-      return this._evaluateWithMultipleScreenshots({
-        question,
-        screenshots: screenshot,
-        systemPrompt,
-        agentReasoning,
-      });
-    }
-
-    const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to  ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n          Today's date is ${new Date().toLocaleDateString()}`;
-
-    await new Promise((r) => setTimeout(r, screenshotDelayMs));
-    let imageBuffer: Buffer | undefined;
-    if (screenshot) {
-      const page = await this.v3.context.awaitActivePage();
-      imageBuffer = await page.screenshot({ fullPage: false });
-    }
-
-    const llmClient = this.getClient();
-
-    const response = await llmClient.createChatCompletion<
-      LLMParsedResponse<LLMResponse>
-    >({
-      logger: this.silentLogger,
-      options: {
-        messages: [
-          { role: "system", content: systemPrompt || defaultSystemPrompt },
-          {
-            role: "user",
-            content: [
-              {
-                type: "text",
-                text: agentReasoning
-                  ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
-                  : question,
-              },
-              ...(screenshot && imageBuffer
-                ? [
-                    {
-                      type: "image_url" as const,
-                      image_url: {
-                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
-                      },
-                    },
-                  ]
-                : []),
-              ...(answer
-                ? [{ type: "text" as const, text: `the answer is ${answer}` }]
-                : []),
-            ],
-          },
-        ],
-        response_model: { name: "EvaluationResult", schema: EvaluationSchema },
-      },
-    });
-
-    try {
-      const result = response.data as unknown as z.infer<
-        typeof EvaluationSchema
-      >;
-      return { evaluation: result.evaluation, reasoning: result.reasoning };
-    } catch (error) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error);
-      return {
-        evaluation: "INVALID",
-        reasoning: `Failed to get structured response: ${errorMessage}`,
-      } as const;
-    }
+    return this.getLegacyBackend("ask").ask(options);
   }
 
   async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
-    const {
-      questions,
-      screenshot = true,
-      systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.",
-      screenshotDelayMs = 250,
-    } = options;
-    if (!questions?.length)
-      throw new StagehandInvalidArgumentError(
-        "Questions array cannot be empty",
-      );
+    return this.getLegacyBackend("batchAsk").batchAsk(options);
+  }
 
-    await new Promise((r) => setTimeout(r, screenshotDelayMs));
-    let imageBuffer: Buffer | undefined;
-    if (screenshot) {
-      const page = await this.v3.context.awaitActivePage();
-      imageBuffer = await page.screenshot({ fullPage: false });
+  private getLegacyBackend(methodName: string): LegacyV3Evaluator {
+    if (this.backend === "legacy") {
+      return this.legacyEvaluator;
     }
 
-    const llmClient = this.getClient();
-
-    const formatted = questions
-      .map(
-        (item, i) =>
-          `${i + 1}. ${item.question}${item.answer ? `\n   Answer: ${item.answer}` : ""}`,
-      )
-      .join("\n\n");
-
-    const response = await llmClient.createChatCompletion<
-      LLMParsedResponse<LLMResponse>
-    >({
-      logger: this.silentLogger,
-      options: {
-        messages: [
-          {
-            role: "system",
-            content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
-          },
-          {
-            role: "user",
-            content: [
-              { type: "text", text: formatted },
-              ...(screenshot && imageBuffer
-                ? [
-                    {
-                      type: "image_url" as const,
-                      image_url: {
-                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
-                      },
-                    },
-                  ]
-                : []),
-            ],
-          },
-        ],
-        response_model: {
-          name: "BatchEvaluationResult",
-          schema: BatchEvaluationSchema,
-        },
-      },
-    });
-
-    try {
-      const results = response.data as unknown as z.infer<
-        typeof BatchEvaluationSchema
-      >;
-      return results.map((r) => ({
-        evaluation: r.evaluation,
-        reasoning: r.reasoning,
-      }));
-    } catch (error) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error);
-      return questions.map(() => ({
-        evaluation: "INVALID" as const,
-        reasoning: `Failed to get structured response: ${errorMessage}`,
-      }));
-    }
+    throw new StagehandInvalidArgumentError(
+      `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`,
+    );
   }
+}
 
-  private async _evaluateWithMultipleScreenshots(options: {
-    question: string;
-    screenshots: Buffer[];
-    systemPrompt?: string;
-    agentReasoning?: string;
-  }): Promise<EvaluationResult> {
-    const {
-      question,
-      screenshots,
-      agentReasoning,
-      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
-        ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
-        Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
-        Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
-        ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
-        Today's date is ${new Date().toLocaleDateString()}`,
-    } = options;
-
-    if (!question)
-      throw new StagehandInvalidArgumentError(
-        "Question cannot be an empty string",
-      );
-    if (!screenshots || screenshots.length === 0)
-      throw new StagehandInvalidArgumentError(
-        "At least one screenshot must be provided",
-      );
-
-    const llmClient = this.getClient();
-
-    const imageContents = screenshots.map((s) => ({
-      type: "image_url" as const,
-      image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` },
-    }));
+// Accepts either the positional form (modelName, modelClientOptions, options)
+// or a single options object; object fields win, positional values fill gaps.
+function normalizeConstructorOptions(
+  modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,
+  modelClientOptions?: ClientOptions,
+  options?: V3EvaluatorOptions,
+): NormalizedConstructorOptions {
+  if (
+    modelNameOrOptions &&
+    typeof modelNameOrOptions === "object" &&
+    !Array.isArray(modelNameOrOptions)
+  ) {
+    return {
+      modelName: modelNameOrOptions.modelName,
+      modelClientOptions:
+        modelNameOrOptions.modelClientOptions ?? modelClientOptions,
+      backend: modelNameOrOptions.backend ?? options?.backend,
+    };
+  }
 
-    const response = await llmClient.createChatCompletion<
-      LLMParsedResponse<LLMResponse>
-    >({
-      logger: this.silentLogger,
-      options: {
-        messages: [
-          { role: "system", content: systemPrompt },
-          {
-            role: "user",
-            content: [
-              {
-                type: "text",
-                text: agentReasoning
-                  ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
-                  : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
-              },
-              ...imageContents,
-            ],
-          },
-        ],
-        response_model: { name: "EvaluationResult", schema: EvaluationSchema },
-      },
-    });
+  const modelName = modelNameOrOptions as AvailableModel | undefined;
+  return { modelName, modelClientOptions, backend: options?.backend };
+}
 
-    try {
-      const result = response.data as unknown as z.infer<
-        typeof EvaluationSchema
-      >;
-      return { evaluation: result.evaluation, reasoning: result.reasoning };
-    } catch (error) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error);
-      return {
-        evaluation: "INVALID",
-        reasoning: `Failed to get structured response: ${errorMessage}`,
-      } as const;
-    }
+function resolveEvaluatorBackend(
+  explicitBackend?: V3EvaluatorBackend,
+): V3EvaluatorBackend {
+  // An explicit option wins. A blank or whitespace-only env value counts as
+  // unset, so the documented `env || "legacy"` default holds.
+  const configuredBackend =
+    explicitBackend ??
+    (process.env[EVALUATOR_BACKEND_ENV]?.trim() || DEFAULT_EVALUATOR_BACKEND);
+  const normalizedBackend = configuredBackend.trim().toLowerCase();
+  if (normalizedBackend === "legacy" || normalizedBackend === "verifier") {
+    return normalizedBackend;
   }
+
+  throw new StagehandInvalidArgumentError(
+    `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`,
+  );
 }
diff --git a/packages/core/lib/v3LegacyEvaluator.ts b/packages/core/lib/v3LegacyEvaluator.ts
new file mode 100644
index 000000000..64ec89ef2
--- /dev/null
+++ b/packages/core/lib/v3LegacyEvaluator.ts
@@ -0,0 +1,297 @@
+/**
+ * Legacy V3 evaluator implementation.
+ *
+ * This is the behavior-preserving implementation that backs V3Evaluator when
+ * the backend resolves to "legacy" (via option, env var, or the default).
+ */
+
+import { z } from "zod";
+import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
+import type {
+  EvaluateOptions,
+  BatchAskOptions,
+  EvaluationResult,
+} from "./v3/types/private/evaluator.js";
+import { LLMParsedResponse } from "./inference.js";
+import { LLMResponse, LLMClient } from "./v3/llm/LLMClient.js";
+import { LogLine } from "./v3/types/public/logs.js";
+import { V3 } from "./v3/v3.js";
+import { LLMProvider } from "./v3/llm/LLMProvider.js";
+import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
+
+const EvaluationSchema = z.object({
+  evaluation: z.enum(["YES", "NO"]),
+  reasoning: z.string(),
+});
+
+const BatchEvaluationSchema = z.array(EvaluationSchema);
+
+export class LegacyV3Evaluator {
+  private v3: V3;
+  private modelName: AvailableModel;
+  private modelClientOptions: ClientOptions | { apiKey: string };
+  private silentLogger: (message: LogLine) => void = () => {};
+
+  constructor(
+    v3: V3,
+    modelName?: AvailableModel,
+    modelClientOptions?: ClientOptions,
+  ) {
+    this.v3 = v3;
+    this.modelName = modelName || ("google/gemini-2.5-flash" as AvailableModel);
+    this.modelClientOptions = modelClientOptions || {
+      apiKey:
+        process.env.GEMINI_API_KEY ||
+        process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
+        "",
+    };
+  }
+
+  private getClient(): LLMClient {
+    // Prefer a dedicated provider so we can override model per-evaluation
+    const provider = new LLMProvider(this.v3.logger);
+    return provider.getClient(this.modelName, this.modelClientOptions);
+  }
+
+  async ask(options: EvaluateOptions): Promise<EvaluationResult> {
+    const {
+      question,
+      answer,
+      screenshot = true,
+      systemPrompt,
+      screenshotDelayMs = 250,
+      agentReasoning,
+    } = options;
+    if (!question)
+      throw new StagehandInvalidArgumentError(
+        "Question cannot be an empty string",
+      );
+    if (!answer && !screenshot)
+      throw new StagehandInvalidArgumentError(
+        "Either answer (text) or screenshot must be provided",
+      );
+
+    if (Array.isArray(screenshot)) {
+      return this._evaluateWithMultipleScreenshots({
+        question,
+        screenshots: screenshot,
+        systemPrompt,
+        agentReasoning,
+      });
+    }
+
+    const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to  ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n          Today's date is ${new Date().toLocaleDateString()}`;
+
+    await new Promise((r) => setTimeout(r, screenshotDelayMs));
+    let imageBuffer: Buffer | undefined;
+    if (screenshot) {
+      const page = await this.v3.context.awaitActivePage();
+      imageBuffer = await page.screenshot({ fullPage: false });
+    }
+
+    const llmClient = this.getClient();
+
+    const response = await llmClient.createChatCompletion<
+      LLMParsedResponse<LLMResponse>
+    >({
+      logger: this.silentLogger,
+      options: {
+        messages: [
+          { role: "system", content: systemPrompt || defaultSystemPrompt },
+          {
+            role: "user",
+            content: [
+              {
+                type: "text",
+                text: agentReasoning
+                  ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
+                  : question,
+              },
+              ...(screenshot && imageBuffer
+                ? [
+                    {
+                      type: "image_url" as const,
+                      image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                      },
+                    },
+                  ]
+                : []),
+              ...(answer
+                ? [{ type: "text" as const, text: `the answer is ${answer}` }]
+                : []),
+            ],
+          },
+        ],
+        response_model: { name: "EvaluationResult", schema: EvaluationSchema },
+      },
+    });
+
+    try {
+      const result = response.data as unknown as z.infer<
+        typeof EvaluationSchema
+      >;
+      return { evaluation: result.evaluation, reasoning: result.reasoning };
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      return {
+        evaluation: "INVALID",
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      } as const;
+    }
+  }
+
+  async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
+    const {
+      questions,
+      screenshot = true,
+      systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.",
+      screenshotDelayMs = 250,
+    } = options;
+    if (!questions?.length)
+      throw new StagehandInvalidArgumentError(
+        "Questions array cannot be empty",
+      );
+
+    await new Promise((r) => setTimeout(r, screenshotDelayMs));
+    let imageBuffer: Buffer | undefined;
+    if (screenshot) {
+      const page = await this.v3.context.awaitActivePage();
+      imageBuffer = await page.screenshot({ fullPage: false });
+    }
+
+    const llmClient = this.getClient();
+
+    const formatted = questions
+      .map(
+        (item, i) =>
+          `${i + 1}. ${item.question}${item.answer ? `\n   Answer: ${item.answer}` : ""}`,
+      )
+      .join("\n\n");
+
+    const response = await llmClient.createChatCompletion<
+      LLMParsedResponse<LLMResponse>
+    >({
+      logger: this.silentLogger,
+      options: {
+        messages: [
+          {
+            role: "system",
+            content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
+          },
+          {
+            role: "user",
+            content: [
+              { type: "text", text: formatted },
+              ...(screenshot && imageBuffer
+                ? [
+                    {
+                      type: "image_url" as const,
+                      image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                      },
+                    },
+                  ]
+                : []),
+            ],
+          },
+        ],
+        response_model: {
+          name: "BatchEvaluationResult",
+          schema: BatchEvaluationSchema,
+        },
+      },
+    });
+
+    try {
+      const results = response.data as unknown as z.infer<
+        typeof BatchEvaluationSchema
+      >;
+      return results.map((r) => ({
+        evaluation: r.evaluation,
+        reasoning: r.reasoning,
+      }));
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      return questions.map(() => ({
+        evaluation: "INVALID" as const,
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      }));
+    }
+  }
+
+  private async _evaluateWithMultipleScreenshots(options: {
+    question: string;
+    screenshots: Buffer[];
+    systemPrompt?: string;
+    agentReasoning?: string;
+  }): Promise<EvaluationResult> {
+    const {
+      question,
+      screenshots,
+      agentReasoning,
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
+        ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
+        Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
+        Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
+        ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
+        Today's date is ${new Date().toLocaleDateString()}`,
+    } = options;
+
+    if (!question)
+      throw new StagehandInvalidArgumentError(
+        "Question cannot be an empty string",
+      );
+    if (!screenshots || screenshots.length === 0)
+      throw new StagehandInvalidArgumentError(
+        "At least one screenshot must be provided",
+      );
+
+    const llmClient = this.getClient();
+
+    const imageContents = screenshots.map((s) => ({
+      type: "image_url" as const,
+      image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` },
+    }));
+
+    const response = await llmClient.createChatCompletion<
+      LLMParsedResponse<LLMResponse>
+    >({
+      logger: this.silentLogger,
+      options: {
+        messages: [
+          { role: "system", content: systemPrompt },
+          {
+            role: "user",
+            content: [
+              {
+                type: "text",
+                text: agentReasoning
+                  ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
+                  : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
+              },
+              ...imageContents,
+            ],
+          },
+        ],
+        response_model: { name: "EvaluationResult", schema: EvaluationSchema },
+      },
+    });
+
+    try {
+      const result = response.data as unknown as z.infer<
+        typeof EvaluationSchema
+      >;
+      return { evaluation: result.evaluation, reasoning: result.reasoning };
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      return {
+        evaluation: "INVALID",
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      } as const;
+    }
+  }
+}
diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts
index 6987d2f31..8d710da4d 100644
--- a/packages/core/tests/unit/public-api/v3-core.test.ts
+++ b/packages/core/tests/unit/public-api/v3-core.test.ts
@@ -133,6 +133,45 @@ describe("V3 Core public API types", () => {
         (options: unknown) => Promise<unknown[]>
       >();
     });
+
+    it("accepts legacy evaluator backend options", () => {
+      const mockV3 = {} as Stagehand.Stagehand;
+      expectTypeOf<typeof Stagehand.V3Evaluator>().toBeConstructibleWith(
+        mockV3,
+        {
+          backend: "legacy",
+        } satisfies Stagehand.V3EvaluatorConstructorOptions,
+      );
+    });
+
+    it("rejects verifier backend before the verifier PR is installed", async () => {
+      const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, {
+        backend: "verifier",
+      });
+
+      await expect(
+        evaluator.ask({ question: "Was the task completed?" }),
+      ).rejects.toThrow(
+        "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+      );
+    });
+
+    it("rejects invalid evaluator backend env values", () => {
+      const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND;
+      process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend";
+
+      try {
+        expect(
+          () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand),
+        ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"');
+      } finally {
+        if (previousBackend === undefined) {
+          delete process.env.STAGEHAND_EVALUATOR_BACKEND;
+        } else {
+          process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend;
+        }
+      }
+    });
   });
 
   describe("V3FunctionName", () => {

From 513b9d97d142bc482f8adf2406ba895adfc82f3b Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:15:17 -0700
Subject: [PATCH 2/2] chore: add evaluator facade changeset

---
 .changeset/verifier-evaluator-facade.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/verifier-evaluator-facade.md

diff --git a/.changeset/verifier-evaluator-facade.md b/.changeset/verifier-evaluator-facade.md
new file mode 100644
index 000000000..558edcf50
--- /dev/null
+++ b/.changeset/verifier-evaluator-facade.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add a backend-selectable `V3Evaluator` facade (`backend` option or `STAGEHAND_EVALUATOR_BACKEND`); the legacy evaluator remains the default.