Switch eval AgentStep to a raw PrintModeEvent so traces record print-mode events directly

jahooma · jahooma · commit 84199a94e378 · 2025-10-17T14:14:17.000-07:00
diff --git a/evals/buffbench/agent-runner.ts b/evals/buffbench/agent-runner.ts
@@ -4,13 +4,10 @@ import { withTimeout } from '@codebuff/common/util/promise'
 import { CodebuffClient } from '../../sdk/src/client'
 import { withTestRepo } from '../subagents/test-repo-utils'
 
+import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
 import type { EvalCommitV2 } from './types'
 
-export interface AgentStep {
-  response: string
-  toolCalls: any[]
-  toolResults: any[]
-}
+export type AgentStep = PrintModeEvent
 
 export async function runAgentOnCommit({
   client,
@@ -50,23 +47,6 @@ export async function runAgentOnCommit({
         initCommand,
       },
       async (repoDir) => {
-        let responseText = ''
-        let toolCalls: any[] = []
-        let toolResults: any[] = []
-
-        function flushStep() {
-          if (
-            responseText.length > 0 ||
-            toolCalls.length > 0 ||
-            toolResults.length > 0
-          ) {
-            trace.push({ response: responseText, toolCalls, toolResults })
-            responseText = ''
-            toolCalls = []
-            toolResults = []
-          }
-        }
-
         const timeoutMs = 30 * 60 * 1000 // 30 minutes
         const result = await withTimeout(
           client.run({
@@ -75,30 +55,18 @@ export async function runAgentOnCommit({
             agentDefinitions: localAgentDefinitions,
             cwd: repoDir,
             handleEvent: (event) => {
-              if (event.type === 'text') {
-                if (toolResults.length > 0) {
-                  flushStep()
-                }
-                responseText += event.text
-              } else if (event.type === 'tool_call') {
-                if (event.toolName === 'set_messages') {
-                  return
-                }
-                toolCalls.push(event)
-              } else if (event.type === 'tool_result') {
-                toolResults.push(event)
-              } else if (event.type === 'finish') {
-                flushStep()
-              } else if (event.type === 'error') {
+              if (event.type === 'tool_call' && event.toolName === 'set_messages') {
+                return
+              }
+              if (event.type === 'error') {
                 console.error(`[${agentId}] Error event:`, event.message)
               }
+              trace.push(event)
             },
           }),
           timeoutMs,
           `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`,
         )
-
-        flushStep()
         cost = result.sessionState.mainAgentState.creditsUsed / 100
 
         execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
diff --git a/evals/buffbench/trace-analyzer.ts b/evals/buffbench/trace-analyzer.ts
@@ -19,17 +19,15 @@ export interface AgentTraceData {
 }
 
 function truncateTrace(trace: AgentStep[]): AgentStep[] {
-  return trace.map((step) => ({
-    ...step,
-    toolResults: step.toolResults.map((result) => {
-      // Truncate read_files, run_terminal_command, and code_search results to save tokens
-      if (result.toolName === 'read_files' && result.output) {
-        const output = Array.isArray(result.output)
-          ? result.output
-          : [result.output]
+  return trace.map((step) => {
+    // Handle tool_result events
+    if (step.type === 'tool_result') {
+      const output = Array.isArray(step.output) ? step.output : [step.output]
+      
+      // Truncate read_files results
+      if (step.toolName === 'read_files') {
         const truncatedOutput = output.map((item: any) => {
           if (item.type === 'json' && Array.isArray(item.value)) {
-            // Truncate file contents in read_files results
             return {
               ...item,
               value: item.value.map((file: any) => {
@@ -47,16 +45,13 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
           return item
         })
         return {
-          ...result,
+          ...step,
           output: truncatedOutput,
         }
       }
 
       // Truncate run_terminal_command results (keep first 500 chars)
-      if (result.toolName === 'run_terminal_command' && result.output) {
-        const output = Array.isArray(result.output)
-          ? result.output
-          : [result.output]
+      if (step.toolName === 'run_terminal_command') {
         const truncatedOutput = output.map((item: any) => {
           if (item.type === 'json' && item.value?.stdout) {
             return {
@@ -73,16 +68,13 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
           return item
         })
         return {
-          ...result,
+          ...step,
           output: truncatedOutput,
         }
       }
 
       // Truncate code_search results (keep first 500 chars)
-      if (result.toolName === 'code_search' && result.output) {
-        const output = Array.isArray(result.output)
-          ? result.output
-          : [result.output]
+      if (step.toolName === 'code_search') {
         const truncatedOutput = output.map((item: any) => {
           if (item.type === 'json' && item.value?.stdout) {
             return {
@@ -99,14 +91,14 @@ function truncateTrace(trace: AgentStep[]): AgentStep[] {
           return item
         })
         return {
-          ...result,
+          ...step,
           output: truncatedOutput,
         }
       }
-
-      return result
-    }),
-  }))
+    }
+    
+    return step
+  })
 }
 
 const traceAnalyzerAgent: AgentDefinition = {