Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,218 changes: 609 additions & 609 deletions packages/evals/datasets/webtailbench/WebTailBench_data.jsonl

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions packages/evals/framework/adHocRubric.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
* adHocRubric — synthesize a Rubric from one or more natural-language
* criteria without invoking the LLM-based rubric generator.
*
* Used by migrated custom agent tasks whose original verification was a
* single `V3Evaluator.ask({question})` YES/NO call. Each criterion becomes
* a 1-point rubric item.
*
* For tasks that already have a concrete predicate ("Does the page show
* flights from SF to NY?"), pass the predicate verbatim. For the lazy
* "did the agent complete this task successfully? <instruction>" pattern,
* pass the instruction.
*/
import type { Rubric } from "@browserbasehq/stagehand";

/**
 * Build a Rubric directly from one or more natural-language criteria,
 * assigning each criterion its own 1-point rubric item. Skips the
 * LLM-based rubric generator entirely.
 *
 * @param criteria - Plain-language YES/NO predicates; each becomes one item.
 * @returns A Rubric whose items mirror the criteria in the order given.
 * @throws Error when called with no criteria at all.
 */
export function adHocRubric(...criteria: string[]): Rubric {
  if (!criteria.length) {
    throw new Error("adHocRubric requires at least one criterion");
  }
  // Each criterion doubles as its own description; every item is worth 1 point.
  const items = criteria.map((criterion) => ({
    criterion,
    description: criterion,
    maxPoints: 1,
  }));
  return { items };
}
18 changes: 17 additions & 1 deletion packages/evals/suites/onlineMind2Web.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,23 @@ export const buildOnlineMind2WebTestcases = (
}

const candidates = parseJsonlRows(lines, isMind2WebRow);
const rows = applySampling(candidates, sampleCount, maxCases);

// EVAL_ONLINEMIND2WEB_IDS restricts the suite to exactly those task ids,
// preserving the order given and ignoring sampling / limit knobs.
const explicitIds = process.env.EVAL_ONLINEMIND2WEB_IDS
? process.env.EVAL_ONLINEMIND2WEB_IDS.split(",")
.map((s) => s.trim())
.filter(Boolean)
: null;
let rows: Mind2WebRow[];
if (explicitIds && explicitIds.length > 0) {
const byId = new Map(candidates.map((r) => [r.task_id, r]));
rows = explicitIds
.map((id) => byId.get(id))
.filter((r): r is Mind2WebRow => Boolean(r));
} else {
rows = applySampling(candidates, sampleCount, maxCases);
}

const allTestcases: Testcase[] = [];
for (const modelEntry of normalizeAgentModelEntries(models)) {
Expand Down
76 changes: 33 additions & 43 deletions packages/evals/tasks/bench/agent/alibaba_supplier_search.ts
Original file line number Diff line number Diff line change
@@ -1,69 +1,59 @@
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/alibaba_supplier_search" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.alibaba.com/";
const page = v3.context.pages()[0];
await page.goto("https://www.alibaba.com/");

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
await page.goto(initUrl);

const instruction =
"Search for 'solar panels' on Alibaba and find 3 suppliers. For each supplier, tell me their company name, minimum order quantity, and price range if available.";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});

// Stop and collect all screenshots from the journey
const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});
const taskSpec: TaskSpec = {
id: "agent/alibaba_supplier_search",
instruction,
initUrl,
precomputedRubric: adHocRubric(
`did the agent complete this task successfully? ${instruction}`,
),
};

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
},
});

console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";
const successMode = process.env.EVAL_SUCCESS_MODE;

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
message: errorMessage,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
58 changes: 35 additions & 23 deletions packages/evals/tasks/bench/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,59 @@
import { V3Evaluator } from "@browserbasehq/stagehand";
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/all_recipes" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.allrecipes.com/";
const page = v3.context.pages()[0];
await page.goto("https://www.allrecipes.com/");
const evaluator = new V3Evaluator(v3);
const agentResult = await agent.execute({
instruction:
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});
await page.goto(initUrl);

const { evaluation, reasoning } = await evaluator.ask({
question: "Did the agent find a recipe for Beef Wellington",
});
const instruction =
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.";

logger.log(agentResult);
const taskSpec: TaskSpec = {
id: "agent/all_recipes",
instruction,
initUrl,
precomputedRubric: adHocRubric(
"Did the agent find a recipe for Beef Wellington",
),
};

const success = evaluation === "YES";
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
},
});

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
const successMode = process.env.EVAL_SUCCESS_MODE;

return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
76 changes: 33 additions & 43 deletions packages/evals/tasks/bench/agent/amazon_shoes_cart.ts
Original file line number Diff line number Diff line change
@@ -1,69 +1,59 @@
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/amazon_shoes_cart" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.amazon.com";
const page = v3.context.pages()[0];
await page.goto("https://www.amazon.com");

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
await page.goto(initUrl);

const instruction =
"go to amazon, and add a pair of black running shoes to cart in size 14. stop after you add the item to cart, and reach the login page";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
});

// Stop and collect all screenshots from the journey
const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});
const taskSpec: TaskSpec = {
id: "agent/amazon_shoes_cart",
instruction,
initUrl,
precomputedRubric: adHocRubric(
`did the agent complete this task successfully? ${instruction}`,
),
};

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
},
});

console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";
const successMode = process.env.EVAL_SUCCESS_MODE;

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
message: errorMessage,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
Loading
Loading