Skip to content

Commit fb858ed

Browse files
committed
evals: Pass agent definitions as context for trace analyzer, meta analyzer
1 parent 8f9a45b commit fb858ed

File tree

4 files changed

+387
-12
lines changed

4 files changed

+387
-12
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import { execSync } from 'child_process'
2-
import path from 'path'
32

43
import { withTimeout } from '@codebuff/common/util/promise'
5-
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
64
import { CodebuffClient } from '../../sdk/src/client'
75
import { withTestRepo } from '../subagents/test-repo-utils'
86

@@ -20,12 +18,14 @@ export async function runAgentOnCommit({
2018
commit,
2119
repoUrl,
2220
initCommand,
21+
localAgentDefinitions,
2322
}: {
2423
client: CodebuffClient
2524
agentId: string
2625
commit: EvalCommitV2
2726
repoUrl: string
2827
initCommand?: string
28+
localAgentDefinitions: any[]
2929
}): Promise<{
3030
diff: string
3131
contextFiles: Record<string, string>
@@ -50,11 +50,6 @@ export async function runAgentOnCommit({
5050
initCommand,
5151
},
5252
async (repoDir) => {
53-
const agentsPath = path.join(__dirname, '../../.agents')
54-
const localAgentDefinitions = Object.values(
55-
await loadLocalAgents({ agentsPath }),
56-
)
57-
5853
let responseText = ''
5954
let toolCalls: any[] = []
6055
let toolResults: any[] = []

evals/buffbench/meta-analyzer.ts

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
import type { CodebuffClient } from '../../sdk/src/client'
2+
import type { AgentDefinition } from '../../sdk/src'
3+
import { withTimeout } from '@codebuff/common/util/promise'
4+
import { getErrorObject } from '@codebuff/common/util/error'
5+
import fs from 'fs'
6+
import path from 'path'
7+
8+
export interface TaskAnalysisData {
9+
commitSha: string
10+
prompt: string
11+
timestamp: string
12+
overallAnalysis: string
13+
agentFeedback: Array<{
14+
agentId: string
15+
strengths: string[]
16+
weaknesses: string[]
17+
recommendations: string[]
18+
}>
19+
results: Array<{
20+
agentId: string
21+
analysis: string
22+
strengths: string[]
23+
weaknesses: string[]
24+
completionScore: number
25+
codeQualityScore: number
26+
overallScore: number
27+
cost: number
28+
durationMs: number
29+
error?: string
30+
}>
31+
}
32+
33+
export interface MetaAnalysisResult {
34+
overallComparison: string
35+
agentInsights: Array<{
36+
agentId: string
37+
consistentStrengths: string[]
38+
consistentWeaknesses: string[]
39+
performanceSummary: string
40+
recommendations: string[]
41+
}>
42+
keyFindings: string[]
43+
}
44+
45+
// Agent definition for the meta analyzer. Its structured output schema must
// stay in sync with the MetaAnalysisResult interface above — the result of
// client.run() is cast to that type in analyzeAllTasks.
const metaAnalyzerAgent: AgentDefinition = {
  id: 'buffbench-meta-analyzer',
  displayName: 'Buffbench Meta Analyzer',
  model: 'openai/gpt-5',
  // Only tool exposed is set_output, forcing the agent to emit structured output.
  toolNames: ['set_output'],
  inputSchema: {
    prompt: { type: 'string', description: 'The meta-analysis prompt' },
  },
  outputMode: 'structured_output',
  // JSON schema mirroring MetaAnalysisResult.
  outputSchema: {
    type: 'object',
    properties: {
      overallComparison: {
        type: 'string',
        description: 'High-level comparison of all agents across all tasks',
      },
      agentInsights: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            agentId: { type: 'string' },
            consistentStrengths: {
              type: 'array',
              items: { type: 'string' },
              description: 'Patterns of strengths across multiple tasks',
            },
            consistentWeaknesses: {
              type: 'array',
              items: { type: 'string' },
              description: 'Patterns of weaknesses across multiple tasks',
            },
            performanceSummary: {
              type: 'string',
              description:
                'Summary of overall performance including scores, cost, and time',
            },
            recommendations: {
              type: 'array',
              items: { type: 'string' },
              description:
                'High-level recommendations for improving this agent based on patterns observed',
            },
          },
          required: [
            'agentId',
            'consistentStrengths',
            'consistentWeaknesses',
            'performanceSummary',
            'recommendations',
          ],
        },
      },
      keyFindings: {
        type: 'array',
        items: { type: 'string' },
        description:
          'Most important insights from the evaluation that should guide development priorities',
      },
    },
    required: ['overallComparison', 'agentInsights', 'keyFindings'],
  },
  systemPrompt: `You are an expert AI system evaluator analyzing patterns across multiple coding tasks and agents.

## Your Role

You will receive:
1. Complete agent definitions showing their configuration, tools, prompts, and capabilities
2. Agent type definitions explaining the available options and structure
3. Trace analyses from multiple tasks showing how agents approached different problems
4. Judge analyses showing the quality of their implementations
5. Performance metrics (scores, costs, times) across all tasks

## Focus on Patterns and Trends

Your analysis should identify consistent patterns across multiple tasks:

Key Analysis Areas:
- **Agent Design Impact**: How does each agent's configuration (tools, model, prompts) affect their behavior and performance?
- **Consistent Behaviors**: What patterns emerge in how each agent approaches problems?
- **Performance Trends**: Which agents consistently score higher/lower? Why?
- **Cost vs Quality Trade-offs**: How do agents balance thoroughness with efficiency?
- **Reliability**: Which agents are more consistent vs variable in their performance?
- **Comparative Analysis**: What are the key differentiators between agents? How do their configurations lead to different outcomes?
- **Prompt Engineering Effectiveness**: Which agents have better-designed prompts that guide behavior effectively?

## Output Format

Provide:
- **Overall Comparison**: High-level assessment comparing all agents' general approaches and performance
- **Agent Insights**: For each agent:
  - Consistent Strengths: Patterns that work well across multiple tasks
  - Consistent Weaknesses: Recurring issues or limitations
  - Performance Summary: Overall scores, costs, times, and reliability
  - Recommendations: What changes would most improve this agent?
- **Key Findings**: 3-5 most actionable insights that should guide development priorities

Focus on actionable patterns that can inform agent improvements, not individual task details.`,
}
144+
145+
export async function analyzeAllTasks(params: {
146+
client: CodebuffClient
147+
logsDir: string
148+
agents: string[]
149+
analyzerContext: {
150+
agentDefinitions: any[]
151+
agentTypeDefinition: string
152+
testedAgentIds: string[]
153+
}
154+
}): Promise<MetaAnalysisResult> {
155+
const { client, logsDir, agents, analyzerContext } = params
156+
157+
try {
158+
// Read all ANALYSIS files from logs directory
159+
const files = fs.readdirSync(logsDir)
160+
const analysisFiles = files.filter((f) => f.includes('ANALYSIS'))
161+
162+
const allTaskAnalyses: TaskAnalysisData[] = []
163+
for (const file of analysisFiles) {
164+
const filePath = path.join(logsDir, file)
165+
const content = fs.readFileSync(filePath, 'utf-8')
166+
const data: TaskAnalysisData = JSON.parse(content)
167+
allTaskAnalyses.push(data)
168+
}
169+
170+
if (allTaskAnalyses.length === 0) {
171+
console.warn('No analysis files found in logs directory')
172+
return {
173+
overallComparison: 'No analysis data available',
174+
agentInsights: [],
175+
keyFindings: [],
176+
}
177+
}
178+
179+
// Create a concise summary for each task (without full agent traces)
180+
const taskSummaries = allTaskAnalyses.map((task) => ({
181+
prompt: task.prompt,
182+
traceAnalysis: {
183+
overallAnalysis: task.overallAnalysis,
184+
agentFeedback: task.agentFeedback,
185+
},
186+
judgeResults: task.results.map((r) => ({
187+
agentId: r.agentId,
188+
overallScore: r.overallScore,
189+
completionScore: r.completionScore,
190+
codeQualityScore: r.codeQualityScore,
191+
cost: r.cost,
192+
durationMs: r.durationMs,
193+
strengths: r.strengths,
194+
weaknesses: r.weaknesses,
195+
error: r.error,
196+
})),
197+
}))
198+
199+
// Filter agent definitions to only include tested agents
200+
const filteredAgentDefinitions = analyzerContext.agentDefinitions.filter(
201+
(def) => analyzerContext.testedAgentIds.includes(def.id),
202+
)
203+
204+
const prompt = `## Agent Definitions Being Evaluated
205+
206+
Below are the complete agent definitions for the agents being tested. Use this to understand their configuration, tools, prompts, and overall design.
207+
208+
${JSON.stringify(filteredAgentDefinitions, null, 2)}
209+
210+
## Agent Type Definition Reference
211+
212+
For reference, here is the TypeScript type definition that agents use:
213+
214+
\`\`\`typescript
215+
${analyzerContext.agentTypeDefinition}
216+
\`\`\`
217+
218+
## All Task Analyses
219+
220+
You are analyzing ${allTaskAnalyses.length} tasks evaluated across ${agents.length} agent(s): ${agents.join(', ')}
221+
222+
${JSON.stringify(taskSummaries, null, 2)}
223+
224+
Analyze these results to identify:
225+
226+
1. **Overall Comparison**: How do the agents compare in general? What are the key differentiators?
227+
228+
2. **Per-Agent Patterns**: For each agent, identify:
229+
- What strengths appear consistently across tasks?
230+
- What weaknesses or issues recur?
231+
- How does their performance (scores, cost, time) compare?
232+
- What patterns emerge in how they approach problems?
233+
234+
3. **Actionable Insights**: What are the 3-5 most important findings that should guide development?
235+
- Which improvements would have the biggest impact?
236+
- What trade-offs are agents making?
237+
- Are there reliability concerns?
238+
239+
Focus on patterns across multiple tasks, not individual task details.`
240+
241+
const agentOutput: string[] = []
242+
const analyzerResult = await withTimeout(
243+
client.run({
244+
agent: 'buffbench-meta-analyzer',
245+
prompt,
246+
agentDefinitions: [metaAnalyzerAgent],
247+
handleEvent: (event) => {
248+
if (event.type === 'text') {
249+
agentOutput.push(event.text)
250+
} else if (event.type === 'tool_call') {
251+
agentOutput.push(JSON.stringify(event, null, 2))
252+
} else if (event.type === 'error') {
253+
console.warn('[Meta Analyzer] Error event:', event.message)
254+
}
255+
},
256+
}),
257+
30 * 60 * 1000,
258+
'Meta analyzer agent timed out after 30 minutes',
259+
)
260+
261+
const { output } = analyzerResult
262+
263+
if (output.type !== 'structuredOutput' || output.value === null) {
264+
console.error(
265+
'Error running meta analyzer - not structured output',
266+
JSON.stringify(output, null, 2),
267+
)
268+
console.error('Meta analyzer output trace:', agentOutput.join(''))
269+
return {
270+
overallComparison:
271+
'Error running meta analyzer - not structured output',
272+
agentInsights: [],
273+
keyFindings: [],
274+
}
275+
}
276+
277+
return output.value as MetaAnalysisResult
278+
} catch (error) {
279+
console.error(`Failed to analyze all tasks:`, getErrorObject(error))
280+
return {
281+
overallComparison: `Error running meta analyzer: ${getErrorObject(error).message}`,
282+
agentInsights: [],
283+
keyFindings: [],
284+
}
285+
}
286+
}

0 commit comments

Comments
 (0)