Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ export const evalRunCommand = command({
long: 'workspace-path',
description: 'Static workspace directory path (used when workspace mode is static)',
}),
keepWorkspaces: flag({
long: 'keep-workspaces',
description:
'Preserve per-test workspaces after eval (default: keep on failure, cleanup on success)',
}),
otelFile: option({
type: optional(string),
long: 'otel-file',
Expand Down Expand Up @@ -241,6 +246,7 @@ export const evalRunCommand = command({
verbose: args.verbose,
workspaceMode: args.workspaceMode,
workspacePath: args.workspacePath,
keepWorkspaces: args.keepWorkspaces,
trace: false,
otelFile: args.otelFile,
exportOtel: args.exportOtel,
Expand Down
33 changes: 25 additions & 8 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ interface NormalizedOptions {
readonly retryErrors?: string;
readonly workspaceMode?: 'pooled' | 'temp' | 'static';
readonly workspacePath?: string;
readonly keepWorkspaces: boolean;
/** Deprecated: benchmark.json is always written to artifact dir */
readonly benchmarkJson?: string;
/** Deprecated: use --output instead */
Expand Down Expand Up @@ -357,6 +358,11 @@ function normalizeOptions(
retryErrors: normalizeString(rawOptions.retryErrors),
workspaceMode,
workspacePath,
// Enabled when any source turns it on: CLI flag, YAML config, or TS config (OR-ed together, so no source can force it off)
keepWorkspaces:
normalizeBoolean(rawOptions.keepWorkspaces) ||
yamlExecution?.keep_workspaces === true ||
config?.execution?.keepWorkspaces === true,
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
artifacts: normalizeString(rawOptions.artifacts),
graderTarget: normalizeString(rawOptions.graderTarget),
Expand Down Expand Up @@ -754,6 +760,7 @@ async function runSingleEvalFile(params: {
maxConcurrency: resolvedWorkers,
workspaceMode: options.workspaceMode,
workspacePath: options.workspacePath,
keepWorkspaces: options.keepWorkspaces,
trials: trialsConfig,
totalBudgetUsd,
failOnError,
Expand Down Expand Up @@ -1455,17 +1462,27 @@ export async function runEvalCommand(
);
}

// Print workspace paths for failed cases (when preserved for debugging)
const failedWithWorkspaces = allResults.filter(
(r) => r.workspacePath && (r.error || r.score < 0.5),
);
if (failedWithWorkspaces.length > 0) {
console.log('\nWorkspaces preserved for debugging:');
for (const result of failedWithWorkspaces) {
console.log(` ${result.testId}: ${result.workspacePath}`);
// Print workspace paths summary
const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath);
const preservedWorkspaces = options.keepWorkspaces
? resultsWithWorkspaces
: resultsWithWorkspaces.filter((r) => r.error || r.score < 0.5);

if (preservedWorkspaces.length > 0) {
console.log('\nPreserved workspaces:');
for (const result of preservedWorkspaces) {
console.log(` ${result.testId} -> ${result.workspacePath}`);
}
}

// Hint about --keep-workspaces whenever the flag is off and workspaces were in use (some may have been cleaned up)
const usedWorkspaces =
resultsWithWorkspaces.length > 0 ||
(options.workspaceMode && options.workspaceMode !== 'static');
if (!options.keepWorkspaces && usedWorkspaces) {
console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
}

if (allResults.length > 0) {
console.log(`\nResults written to: ${outputPath}`);

Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2324,6 +2324,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
}
} else if ((retainOnSuccess ?? (keepWorkspaces ? 'keep' : 'cleanup')) !== 'keep') {
await cleanupWorkspace(workspacePath).catch(() => {});
} else {
return { ...finalResult, workspacePath };
}
}

Expand Down
Loading