Skip to content

Commit 84199a9

Browse files
committed
Switch eval AgentStep to be all printmode events
1 parent 8070b9a commit 84199a9

File tree

2 files changed: 23 additions and 63 deletions.

2 files changed

+23
-63
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 7 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,10 @@ import { withTimeout } from '@codebuff/common/util/promise'
44
import { CodebuffClient } from '../../sdk/src/client'
55
import { withTestRepo } from '../subagents/test-repo-utils'
66

7+
import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
78
import type { EvalCommitV2 } from './types'
89

9-
export interface AgentStep {
10-
response: string
11-
toolCalls: any[]
12-
toolResults: any[]
13-
}
10+
export type AgentStep = PrintModeEvent
1411

1512
export async function runAgentOnCommit({
1613
client,
@@ -50,23 +47,6 @@ export async function runAgentOnCommit({
5047
initCommand,
5148
},
5249
async (repoDir) => {
53-
let responseText = ''
54-
let toolCalls: any[] = []
55-
let toolResults: any[] = []
56-
57-
function flushStep() {
58-
if (
59-
responseText.length > 0 ||
60-
toolCalls.length > 0 ||
61-
toolResults.length > 0
62-
) {
63-
trace.push({ response: responseText, toolCalls, toolResults })
64-
responseText = ''
65-
toolCalls = []
66-
toolResults = []
67-
}
68-
}
69-
7050
const timeoutMs = 30 * 60 * 1000 // 30 minutes
7151
const result = await withTimeout(
7252
client.run({
@@ -75,30 +55,18 @@ export async function runAgentOnCommit({
7555
agentDefinitions: localAgentDefinitions,
7656
cwd: repoDir,
7757
handleEvent: (event) => {
78-
if (event.type === 'text') {
79-
if (toolResults.length > 0) {
80-
flushStep()
81-
}
82-
responseText += event.text
83-
} else if (event.type === 'tool_call') {
84-
if (event.toolName === 'set_messages') {
85-
return
86-
}
87-
toolCalls.push(event)
88-
} else if (event.type === 'tool_result') {
89-
toolResults.push(event)
90-
} else if (event.type === 'finish') {
91-
flushStep()
92-
} else if (event.type === 'error') {
58+
if (event.type === 'tool_call' && event.toolName === 'set_messages') {
59+
return
60+
}
61+
if (event.type === 'error') {
9362
console.error(`[${agentId}] Error event:`, event.message)
9463
}
64+
trace.push(event)
9565
},
9666
}),
9767
timeoutMs,
9868
`Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`,
9969
)
100-
101-
flushStep()
10270
cost = result.sessionState.mainAgentState.creditsUsed / 100
10371

10472
execSync('git add .', { cwd: repoDir, stdio: 'ignore' })

evals/buffbench/trace-analyzer.ts

Lines changed: 16 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -19,17 +19,15 @@ export interface AgentTraceData {
1919
}
2020

2121
function truncateTrace(trace: AgentStep[]): AgentStep[] {
22-
return trace.map((step) => ({
23-
...step,
24-
toolResults: step.toolResults.map((result) => {
25-
// Truncate read_files, run_terminal_command, and code_search results to save tokens
26-
if (result.toolName === 'read_files' && result.output) {
27-
const output = Array.isArray(result.output)
28-
? result.output
29-
: [result.output]
22+
return trace.map((step) => {
23+
// Handle tool_result events
24+
if (step.type === 'tool_result') {
25+
const output = Array.isArray(step.output) ? step.output : [step.output]
26+
27+
// Truncate read_files results
28+
if (step.toolName === 'read_files') {
3029
const truncatedOutput = output.map((item: any) => {
3130
if (item.type === 'json' && Array.isArray(item.value)) {
32-
// Truncate file contents in read_files results
3331
return {
3432
...item,
3533
value: item.value.map((file: any) => {
@@ -47,16 +45,13 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
4745
return item
4846
})
4947
return {
50-
...result,
48+
...step,
5149
output: truncatedOutput,
5250
}
5351
}
5452

5553
// Truncate run_terminal_command results (keep first 500 chars)
56-
if (result.toolName === 'run_terminal_command' && result.output) {
57-
const output = Array.isArray(result.output)
58-
? result.output
59-
: [result.output]
54+
if (step.toolName === 'run_terminal_command') {
6055
const truncatedOutput = output.map((item: any) => {
6156
if (item.type === 'json' && item.value?.stdout) {
6257
return {
@@ -73,16 +68,13 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
7368
return item
7469
})
7570
return {
76-
...result,
71+
...step,
7772
output: truncatedOutput,
7873
}
7974
}
8075

8176
// Truncate code_search results (keep first 500 chars)
82-
if (result.toolName === 'code_search' && result.output) {
83-
const output = Array.isArray(result.output)
84-
? result.output
85-
: [result.output]
77+
if (step.toolName === 'code_search') {
8678
const truncatedOutput = output.map((item: any) => {
8779
if (item.type === 'json' && item.value?.stdout) {
8880
return {
@@ -99,14 +91,14 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
9991
return item
10092
})
10193
return {
102-
...result,
94+
...step,
10395
output: truncatedOutput,
10496
}
10597
}
106-
107-
return result
108-
}),
109-
}))
98+
}
99+
100+
return step
101+
})
110102
}
111103

112104
const traceAnalyzerAgent: AgentDefinition = {

0 commit comments

Comments
 (0)