Skip to content

Commit 56cac3b

Browse files
christsoCopilot
andauthored
refactor(core): normalize ToolCall names at provider layer (#1059) (#1061)
Phase 1: Add normalizeToolCall() pure function with static mapping tables - Canonical names: Skill, Read, Write, Edit, Bash (Claude naming) - Wired into all 9 providers and 2 import parsers - Input normalization: path → file_path for Read tools - 83 unit tests for normalizeToolCall Phase 2: Add stream_log config + auto-write transcript JSONL - New stream_log field on TargetDefinition and all resolved configs - resolveStreamLog() helper with log_format → stream_log fallback - Deprecation warning when log_format used - Auto-writes transcript.jsonl on every eval run Phase 3: Simplify skill-trigger evaluator - Removed ~120 lines of provider-specific code (ToolMatcher, matchers, PROVIDER_TOOL_SEMANTICS, resolveMatcher, readPathFromInput) - Evaluator now only checks canonical names: Skill + Read - Fully provider-agnostic (no ProviderKind import) All 2045 tests pass. Lint clean. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 7c3cd24 commit 56cac3b

21 files changed

Lines changed: 849 additions & 499 deletions

apps/cli/src/commands/eval/artifact-writer.ts

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
import { mkdir, readFile, writeFile } from 'node:fs/promises';
22
import path from 'node:path';
33

4-
import { DEFAULT_THRESHOLD, type EvaluationResult, type EvaluatorResult } from '@agentv/core';
4+
import {
5+
DEFAULT_THRESHOLD,
6+
type EvaluationResult,
7+
type EvaluatorResult,
8+
type TranscriptJsonLine,
9+
} from '@agentv/core';
510
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
611
import { RESULT_INDEX_FILENAME } from './result-layout.js';
712

@@ -766,5 +771,41 @@ export async function writeArtifactsFromResults(
766771

767772
await writeJsonlFile(indexPath, indexRecords);
768773

774+
// Write transcript JSONL (auto-generated on every eval run)
775+
const transcriptPath = path.join(outputDir, 'transcript.jsonl');
776+
const transcriptLines: TranscriptJsonLine[] = results.map((result) => {
777+
let inputText = '';
778+
if (typeof result.input === 'string') {
779+
inputText = result.input;
780+
} else if (Array.isArray(result.input)) {
781+
const firstUserMsg = result.input.find((m) => m.role === 'user');
782+
inputText = typeof firstUserMsg?.content === 'string' ? firstUserMsg.content : '';
783+
}
784+
return {
785+
input: inputText,
786+
output: result.output,
787+
token_usage: result.tokenUsage
788+
? {
789+
input: result.tokenUsage.input,
790+
output: result.tokenUsage.output,
791+
cached: result.tokenUsage.cached,
792+
}
793+
: undefined,
794+
duration_ms: result.durationMs,
795+
cost_usd: result.costUsd,
796+
source: {
797+
provider: result.target,
798+
session_id: result.conversationId ?? result.testId,
799+
timestamp: result.timestamp,
800+
},
801+
};
802+
});
803+
await writeFile(
804+
transcriptPath,
805+
transcriptLines.map((line) => JSON.stringify(line)).join('\n') +
806+
(transcriptLines.length ? '\n' : ''),
807+
'utf8',
808+
);
809+
769810
return { testArtifactDir, timingPath, benchmarkPath, indexPath };
770811
}

apps/cli/test/commands/eval/artifact-writer.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ describe('writeArtifactsFromResults', () => {
586586
'beta',
587587
'index.jsonl',
588588
'timing.json',
589+
'transcript.jsonl',
589590
]);
590591

591592
const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha'));
@@ -624,7 +625,12 @@ describe('writeArtifactsFromResults', () => {
624625
const paths = await writeArtifactsFromResults([], testDir);
625626

626627
const artifactEntries = await readdir(paths.testArtifactDir);
627-
expect(artifactEntries.sort()).toEqual(['benchmark.json', 'index.jsonl', 'timing.json']);
628+
expect(artifactEntries.sort()).toEqual([
629+
'benchmark.json',
630+
'index.jsonl',
631+
'timing.json',
632+
'transcript.jsonl',
633+
]);
628634

629635
const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
630636
expect(timing.total_tokens).toBe(0);
Lines changed: 18 additions & 151 deletions
Original file line numberDiff line numberDiff line change
@@ -1,120 +1,27 @@
11
/**
22
* Built-in skill-trigger evaluator.
33
*
4-
* Detects whether the agent invoked a named skill as its first tool call.
5-
* Supports multiple provider kinds via static tool-name mappings.
6-
* For providers not covered here, use a code-grader instead.
4+
* Detects whether the agent invoked a named skill during a session.
5+
* Works with canonical tool names produced by normalizeToolCall() — no
6+
* provider-specific matching logic needed.
77
*
88
* Detection logic:
9-
* - Only the FIRST tool call matters.
10-
* - Skill tool: checks input.[skillInputField] contains the skill name (case-sensitive substring).
11-
* - Read tool: checks input.[readInputField] contains the skill name (case-sensitive substring).
12-
* - Any other tool as first call means the skill was not triggered.
9+
* - Scans ALL tool calls (not just the first) for skill invocation evidence.
10+
* - Skill tool: checks `tool === 'Skill'` and `input.skill` contains the skill name.
11+
* - Read tool: checks `tool === 'Read'` and `input.file_path` contains a skills/ path.
12+
* - Fallback: checks tool output for skill file path references.
1313
* - Supports negative cases via should_trigger: false.
1414
*
15-
* To add a new provider:
16-
* 1. Create a ToolMatcher with the provider's tool names and input fields.
17-
* 2. Add entries to PROVIDER_TOOL_SEMANTICS mapping the provider kind(s) to the matcher.
18-
* 3. If the provider's tool-call format doesn't fit the ToolMatcher model, use a code-grader instead.
15+
* Prerequisites:
16+
* All providers and import parsers must call normalizeToolCall() when
17+
* constructing ToolCall objects. This ensures canonical tool names
18+
* ("Skill", "Read", "Write", "Edit", "Bash") and canonical input field
19+
* names (input.skill, input.file_path) regardless of provider.
1920
*/
2021

21-
import type { ProviderKind } from '../providers/types.js';
2222
import type { SkillTriggerEvaluatorConfig } from '../types.js';
2323
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
2424

25-
/** Tool-name semantics for different provider kinds. */
26-
interface ToolMatcher {
27-
/** Tool names that indicate skill invocation. */
28-
readonly skillTools: readonly string[];
29-
/** Input field that contains the skill name for skill tools. */
30-
readonly skillInputField: string;
31-
/** Tool names that indicate file read. */
32-
readonly readTools: readonly string[];
33-
/** Input field that contains the skill name for read tools. */
34-
readonly readInputField: string;
35-
/** Tool-name prefixes that encode the skill directly in the tool name. */
36-
readonly skillToolPrefixes?: readonly string[];
37-
/** Tool-name prefixes that encode the file path directly in the tool name. */
38-
readonly readToolPrefixes?: readonly string[];
39-
/** Alternate input field names that may contain the file path. */
40-
readonly readInputFields?: readonly string[];
41-
}
42-
43-
const CLAUDE_MATCHER: ToolMatcher = {
44-
skillTools: ['Skill'],
45-
skillInputField: 'skill',
46-
readTools: ['Read'],
47-
readInputField: 'file_path',
48-
};
49-
50-
/** Copilot uses ACP protocol — tool names vary by version and context. */
51-
const COPILOT_MATCHER: ToolMatcher = {
52-
skillTools: ['Skill', 'skill'],
53-
skillInputField: 'skill',
54-
readTools: ['Read File', 'readFile', 'Read', 'readTextFile'],
55-
readInputField: 'file_path',
56-
skillToolPrefixes: ['Using skill: '],
57-
readToolPrefixes: ['Viewing '],
58-
readInputFields: ['file_path', 'path'],
59-
};
60-
61-
/**
62-
* Pi CLI reads skill files using the lowercase `read` tool with a `path` argument.
63-
* Skills are auto-discovered from `.agents/skills/` relative to the working directory.
64-
*
65-
* Skill lookup order (workspace-scoped first):
66-
* 1. .agents/skills/<skill-name>/SKILL.md (workspace-relative, auto-discovered)
67-
* 2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
68-
*/
69-
const PI_CODING_AGENT_MATCHER: ToolMatcher = {
70-
skillTools: [],
71-
skillInputField: 'skill',
72-
readTools: ['read'],
73-
readInputField: 'path',
74-
readInputFields: ['path', 'file_path', 'filePath'],
75-
};
76-
77-
/**
78-
* Codex reads skill files via command_execution using a bash sed command containing
79-
* the skill file path. The skill name appears in the command string, so we match
80-
* any command_execution whose command field includes the skill name.
81-
*
82-
* Skill lookup order (workspace-scoped first):
83-
* 1. .agents/skills/<skill-name>/SKILL.md (workspace-relative)
84-
* 2. .codex/skills/<skill-name>/SKILL.md (fallback)
85-
* 3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
86-
*
87-
* MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for
88-
* Codex configurations that surface skills as MCP tools.
89-
*/
90-
const CODEX_MATCHER: ToolMatcher = {
91-
skillTools: [],
92-
skillInputField: 'skill',
93-
readTools: ['command_execution'],
94-
readInputField: 'command',
95-
skillToolPrefixes: ['mcp:'],
96-
readToolPrefixes: ['mcp:'],
97-
readInputFields: ['command', 'path', 'file_path', 'filePath'],
98-
};
99-
100-
/**
101-
* Static mapping of provider kinds to their tool-name semantics.
102-
* Providers not listed here fall back to CLAUDE_MATCHER.
103-
*/
104-
const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
105-
claude: CLAUDE_MATCHER,
106-
'claude-cli': CLAUDE_MATCHER,
107-
'claude-sdk': CLAUDE_MATCHER,
108-
codex: CODEX_MATCHER,
109-
'pi-coding-agent': PI_CODING_AGENT_MATCHER,
110-
'pi-cli': PI_CODING_AGENT_MATCHER,
111-
'copilot-cli': COPILOT_MATCHER,
112-
'copilot-log': COPILOT_MATCHER,
113-
'copilot-sdk': COPILOT_MATCHER,
114-
vscode: COPILOT_MATCHER,
115-
'vscode-insiders': COPILOT_MATCHER,
116-
};
117-
11825
export class SkillTriggerEvaluator implements Evaluator {
11926
readonly kind = 'skill-trigger';
12027

@@ -124,19 +31,9 @@ export class SkillTriggerEvaluator implements Evaluator {
12431
this.config = config;
12532
}
12633

127-
private resolveMatcher(providerKind: ProviderKind | undefined): ToolMatcher {
128-
if (providerKind) {
129-
const match = PROVIDER_TOOL_SEMANTICS[providerKind];
130-
if (match) return match;
131-
}
132-
return CLAUDE_MATCHER;
133-
}
134-
13534
evaluate(context: EvaluationContext): EvaluationScore {
13635
const skillName = this.config.skill;
13736
const shouldTrigger = this.config.should_trigger !== false;
138-
const providerKind = context.provider?.kind as ProviderKind | undefined;
139-
const matcher = this.resolveMatcher(providerKind);
14037

14138
const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
14239

@@ -147,42 +44,23 @@ export class SkillTriggerEvaluator implements Evaluator {
14744
const toolName = toolCall.tool ?? '';
14845
const input = (toolCall.input ?? {}) as Record<string, unknown>;
14946

150-
if (matcher.skillTools.includes(toolName)) {
151-
const skillArg = String(input[matcher.skillInputField] ?? '');
47+
if (toolName === 'Skill') {
48+
const skillArg = String(input.skill ?? '');
15249
if (skillArg.includes(skillName)) {
15350
triggered = true;
154-
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
51+
evidence = `Skill tool invoked with skill="${skillArg}"`;
15552
break;
15653
}
157-
} else if (
158-
matcher.skillToolPrefixes?.some(
159-
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
160-
)
161-
) {
162-
triggered = true;
163-
evidence = `Skill tool invoked via tool name "${toolName}"`;
164-
break;
165-
} else if (matcher.readTools.includes(toolName)) {
166-
const filePath = this.readPathFromInput(input, matcher);
167-
if (filePath.includes(skillName)) {
54+
} else if (toolName === 'Read') {
55+
const filePath = String(input.file_path ?? '');
56+
if (filePath.includes(`skills/${skillName}/`)) {
16857
triggered = true;
16958
evidence = `Read tool loaded skill file: ${filePath}`;
17059
break;
17160
}
172-
} else if (
173-
matcher.readToolPrefixes?.some(
174-
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName),
175-
)
176-
) {
177-
triggered = true;
178-
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
179-
break;
18061
}
18162

18263
// Fallback: check if a tool's output contains a skill file path.
183-
// Some providers (e.g., copilot-sdk) discover skill content via search
184-
// tools (grep/glob) whose inputs don't reference the skill name, but
185-
// whose outputs include skill file paths like ".agents/skills/<name>/SKILL.md".
18664
if (!triggered && toolCall.output != null) {
18765
const outputStr =
18866
typeof toolCall.output === 'string' ? toolCall.output : JSON.stringify(toolCall.output);
@@ -228,15 +106,4 @@ export class SkillTriggerEvaluator implements Evaluator {
228106
expectedAspectCount: 1,
229107
};
230108
}
231-
232-
private readPathFromInput(input: Record<string, unknown>, matcher: ToolMatcher): string {
233-
const fields = matcher.readInputFields ?? [matcher.readInputField];
234-
for (const field of fields) {
235-
const value = input[field];
236-
if (value !== undefined && value !== null) {
237-
return String(value);
238-
}
239-
}
240-
return '';
241-
}
242109
}

packages/core/src/evaluation/providers/claude-cli.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import path from 'node:path';
77

88
import { extractTextContent, toContentArray } from './claude-content.js';
99
import { recordClaudeLogEntry } from './claude-log-tracker.js';
10+
import { normalizeToolCall } from './normalize-tool-call.js';
1011
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
1112
import type { ClaudeResolvedConfig } from './targets.js';
1213
import type {
@@ -493,11 +494,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
493494
}
494495
const p = part as Record<string, unknown>;
495496
if (p.type === 'tool_use' && typeof p.name === 'string') {
496-
toolCalls.push({
497-
tool: p.name,
498-
input: p.input,
499-
id: typeof p.id === 'string' ? p.id : undefined,
500-
});
497+
toolCalls.push(
498+
normalizeToolCall('claude-cli', {
499+
tool: p.name,
500+
input: p.input,
501+
id: typeof p.id === 'string' ? p.id : undefined,
502+
}),
503+
);
501504
}
502505
}
503506
return toolCalls;

packages/core/src/evaluation/providers/claude-sdk.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import path from 'node:path';
66

77
import { extractTextContent, toContentArray } from './claude-content.js';
88
import { recordClaudeLogEntry } from './claude-log-tracker.js';
9+
import { normalizeToolCall } from './normalize-tool-call.js';
910
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
1011
import type { ClaudeResolvedConfig } from './targets.js';
1112
import type {
@@ -297,11 +298,13 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
297298
}
298299
const p = part as Record<string, unknown>;
299300
if (p.type === 'tool_use' && typeof p.name === 'string') {
300-
toolCalls.push({
301-
tool: p.name,
302-
input: p.input,
303-
id: typeof p.id === 'string' ? p.id : undefined,
304-
});
301+
toolCalls.push(
302+
normalizeToolCall('claude-sdk', {
303+
tool: p.name,
304+
input: p.input,
305+
id: typeof p.id === 'string' ? p.id : undefined,
306+
}),
307+
);
305308
}
306309
}
307310
return toolCalls;

0 commit comments

Comments
 (0)