Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 126 additions & 2 deletions packages/evals/framework/claudeCodeRunner.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import type { AvailableModel } from "@browserbasehq/stagehand";
import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
import { EvalsError } from "../errors.js";
import type { EvalLogger } from "../logger.js";
import type { TaskResult } from "./types.js";
import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js";
import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
import { evaluationResultToSuccess } from "./verifierAdapter.js";

type ClaudeSdkMessage = Record<string, unknown>;
type ClaudeQuery = AsyncIterable<ClaudeSdkMessage>;
Expand All @@ -16,13 +19,42 @@ export type ClaudeAgentSdk = {
}) => ClaudeQuery;
};

export interface ClaudeCodeVerifierConfig {
  /**
   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
   * instance does NOT need to have had `init()` called — V3Evaluator.verify()
   * uses only `v3.logger` to construct its LLMProvider.
   */
  v3: V3;
  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
  taskSpec: TaskSpec;
  /** Dataset name used to partition the rubric cache (only consulted when no precomputedRubric is set). */
  dataset: string;
  /** Overrides the --success mode. Defaults to the EVAL_SUCCESS_MODE env var, falling back to "outcome". */
  successMode?: "outcome" | "process" | "both";
  /** Overrides the root directory under which trajectories are persisted. */
  trajectoryRoot?: string;
  /** Overrides the run id (defaults to an ISO timestamp). */
  runId?: string;
}

export interface ClaudeCodeRunnerInput {
  /** Task plan for the external-harness run. */
  plan: ExternalHarnessTaskPlan;
  /** Model identifier passed through to the harness. */
  model: AvailableModel;
  /** Logger whose accumulated logs are returned on the TaskResult. */
  logger: EvalLogger;
  /** Optional prepared tool adapter exposing tools to Claude Code. */
  toolAdapter?: PreparedClaudeCodeToolAdapter;
  /** Optional abort signal; forwarded to the runner's internal AbortController. */
  signal?: AbortSignal;
  /** Optional SDK injection (e.g. for tests); defaults to loading the Claude Agent SDK. */
  sdk?: ClaudeAgentSdk;
  /**
   * Optional verifier integration. When provided, the runner builds a
   * Trajectory from the SDK message stream (via claudeCodeAdapter), runs
   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
   * EvaluationResult into the returned TaskResult (the `_success` value
   * follows EVAL_SUCCESS_MODE).
   * When omitted, the runner falls back to parsing the legacy EVAL_RESULT
   * line — this preserves current behavior for callers that haven't migrated.
   */
  verifier?: ClaudeCodeVerifierConfig;
}

export interface ParsedClaudeCodeResult {
Expand Down Expand Up @@ -124,7 +156,9 @@ export async function runClaudeCodeAgent({
toolAdapter,
signal,
sdk: injectedSdk,
verifier,
}: ClaudeCodeRunnerInput): Promise<TaskResult> {
const startedAt = new Date().toISOString();
const sdk = injectedSdk ?? (await loadClaudeAgentSdk());
const abortController = new AbortController();
if (signal) {
Expand Down Expand Up @@ -220,8 +254,10 @@ export async function runClaudeCodeAgent({
parsed.summary ??
stopReason ??
(resultText || transcriptText || "Claude Code did not report success");
const endedAt = new Date().toISOString();
const tokenUsage = extractClaudeCodeTokenUsage(resultMessage);

return {
const baseResult: TaskResult = {
_success: parsed.success,
error: !parsed.success ? errorMessage : undefined,
reasoning: parsed.summary,
Expand All @@ -232,6 +268,94 @@ export async function runClaudeCodeAgent({
logs: logger.getLogs(),
metrics: buildClaudeCodeMetrics(resultMessage),
};

if (!verifier) {
return baseResult;
}

// Build a Trajectory from the SDK message stream and run the rubric verifier.
try {
const trajectory = claudeCodeAdapter.fromHarnessResult(
{
messages,
finalAnswer: parsed.finalAnswer ?? resultText,
status: status === "completed" ? "complete" : "error",
usage: {
input_tokens: tokenUsage.inputTokens,
output_tokens: tokenUsage.outputTokens,
cached_input_tokens: tokenUsage.cacheReadInputTokens,
},
timing: { startedAt, endedAt },
},
verifier.taskSpec,
);

const { V3Evaluator } = await import("@browserbasehq/stagehand");
const { RubricCache } = await import("./rubricCache.js");
const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });

// Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
let rubric = verifier.taskSpec.precomputedRubric;
if (!rubric) {
if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
rubric = await evaluator.generateRubric(verifier.taskSpec);
} else {
const cache = new RubricCache({ dataset: verifier.dataset });
rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
}
}
const hydratedSpec: TaskSpec = {
...verifier.taskSpec,
precomputedRubric: rubric,
};

const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
const verifiedSuccess = evaluationResultToSuccess(
evaluationResult,
successMode,
);

const { directory: trajectoryDir } = await persistAdapterTrajectory({
trajectory,
taskSpec: hydratedSpec,
evaluationResult,
outputRoot: verifier.trajectoryRoot,
runId: verifier.runId,
});

logger.log({
category: "claude_code",
message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
level: 1,
});

return {
...baseResult,
_success: verifiedSuccess,
error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
evidenceInsufficient: evaluationResult.evidenceInsufficient,
criterionCount: rubric.items.length,
stepCount: trajectory.steps.length,
trajectoryDir,
};
} catch (verifyError) {
logger.warn({
category: "claude_code",
message: `verifier integration failed: ${stringifyError(verifyError)}`,
level: 0,
auxiliary: {
error: { value: stringifyError(verifyError), type: "string" },
},
});
return baseResult;
}
}

/** Renders a process score as a two-decimal string, or "n/a" when absent. */
function formatProcessScore(score: number | undefined): string {
  if (typeof score !== "number") {
    return "n/a";
  }
  return score.toFixed(2);
}

function buildClaudeCodeMetrics(
Expand Down
130 changes: 128 additions & 2 deletions packages/evals/framework/codexRunner.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import type { AvailableModel } from "@browserbasehq/stagehand";
import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
import { EvalsError } from "../errors.js";
import type { EvalLogger } from "../logger.js";
import type { TaskResult } from "./types.js";
import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js";
import { codexAdapter } from "./harnesses/codexAdapter.js";
import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
import { evaluationResultToSuccess } from "./verifierAdapter.js";

type MetricValue = { count: number; value: number };
type CodexEvent = Record<string, unknown>;
Expand All @@ -25,13 +28,42 @@ export type CodexSdk = {
startThread: (options?: Record<string, unknown>) => CodexThread;
};

export interface CodexVerifierConfig {
  /**
   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
   * instance does NOT need to have had `init()` called — V3Evaluator.verify()
   * uses only `v3.logger` to construct its LLMProvider.
   */
  v3: V3;
  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
  taskSpec: TaskSpec;
  /** Dataset name used to partition the rubric cache (only consulted when no precomputedRubric is set). */
  dataset: string;
  /** Overrides the --success mode. Defaults to the EVAL_SUCCESS_MODE env var, falling back to "outcome". */
  successMode?: "outcome" | "process" | "both";
  /** Overrides the root directory under which trajectories are persisted. */
  trajectoryRoot?: string;
  /** Overrides the run id (defaults to an ISO timestamp). */
  runId?: string;
}

export interface CodexRunnerInput {
  /** Task plan for the external-harness run. */
  plan: ExternalHarnessTaskPlan;
  /** Model identifier passed through to the harness. */
  model: AvailableModel;
  /** Logger whose accumulated logs are returned on the TaskResult. */
  logger: EvalLogger;
  /** Optional prepared tool adapter; its env is also used when loading the Codex SDK. */
  toolAdapter?: PreparedCodexToolAdapter;
  /** Optional abort signal to cancel the run. */
  signal?: AbortSignal;
  /** Optional SDK injection (e.g. for tests); defaults to loading the Codex SDK. */
  sdk?: CodexSdk;
  /**
   * Optional verifier integration. When provided, the runner builds a
   * Trajectory from the codex event stream (via codexAdapter), runs
   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
   * EvaluationResult into the returned TaskResult (the `_success` value
   * follows EVAL_SUCCESS_MODE).
   * When omitted, the runner falls back to parsing the legacy JSON result —
   * this preserves current behavior for callers that haven't migrated.
   */
  verifier?: CodexVerifierConfig;
}

export interface ParsedCodexResult {
Expand Down Expand Up @@ -114,7 +146,9 @@ export async function runCodexAgent({
toolAdapter,
signal,
sdk: injectedSdk,
verifier,
}: CodexRunnerInput): Promise<TaskResult> {
const startedAt = new Date().toISOString();
const sdk = injectedSdk ?? (await loadCodexSdk(toolAdapter?.env));
const prompt = buildCodexPrompt(plan, toolAdapter?.promptInstructions);
const events: CodexEvent[] = [];
Expand Down Expand Up @@ -191,8 +225,9 @@ export async function runCodexAgent({
finalResponse ||
transcriptText ||
"Codex did not report success");
const endedAt = new Date().toISOString();

return {
const baseResult: TaskResult = {
_success: parsed.success,
error: !parsed.success ? errorMessage : undefined,
reasoning: parsed.summary,
Expand All @@ -203,6 +238,97 @@ export async function runCodexAgent({
logs: logger.getLogs(),
metrics: buildCodexMetrics(usage),
};

if (!verifier) {
return baseResult;
}

try {
const trajectory = codexAdapter.fromHarnessResult(
{
events,
finalAnswer: parsed.finalAnswer ?? finalResponse,
status: status === "completed" ? "complete" : "error",
usage: {
input_tokens: toFiniteNumber(usage?.input_tokens),
output_tokens: toFiniteNumber(usage?.output_tokens),
...(usage?.reasoning_output_tokens !== undefined && {
reasoning_tokens: toFiniteNumber(usage.reasoning_output_tokens),
}),
...(usage?.cached_input_tokens !== undefined && {
cached_input_tokens: toFiniteNumber(usage.cached_input_tokens),
}),
},
timing: { startedAt, endedAt },
},
verifier.taskSpec,
);

const { V3Evaluator } = await import("@browserbasehq/stagehand");
const { RubricCache } = await import("./rubricCache.js");
const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });

let rubric = verifier.taskSpec.precomputedRubric;
if (!rubric) {
if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
rubric = await evaluator.generateRubric(verifier.taskSpec);
} else {
const cache = new RubricCache({ dataset: verifier.dataset });
rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
}
}
const hydratedSpec: TaskSpec = {
...verifier.taskSpec,
precomputedRubric: rubric,
};

const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
const verifiedSuccess = evaluationResultToSuccess(
evaluationResult,
successMode,
);

const { directory: trajectoryDir } = await persistAdapterTrajectory({
trajectory,
taskSpec: hydratedSpec,
evaluationResult,
outputRoot: verifier.trajectoryRoot,
runId: verifier.runId,
});

logger.log({
category: "codex",
message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
level: 1,
});

return {
...baseResult,
_success: verifiedSuccess,
error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
evidenceInsufficient: evaluationResult.evidenceInsufficient,
criterionCount: rubric.items.length,
stepCount: trajectory.steps.length,
trajectoryDir,
};
} catch (verifyError) {
logger.warn({
category: "codex",
message: `verifier integration failed: ${stringifyError(verifyError)}`,
level: 0,
auxiliary: {
error: { value: stringifyError(verifyError), type: "string" },
},
});
return baseResult;
}
}

/** Renders a process score as a two-decimal string, or "n/a" when absent. */
function formatProcessScore(score: number | undefined): string {
  if (typeof score !== "number") {
    return "n/a";
  }
  return score.toFixed(2);
}

function tryParseCodexJson(
Expand Down
Loading
Loading