Skip to content

Commit babeb19

Browse files
christso and claude authored
feat: transcript import pipeline — grade existing sessions offline (#946)
* feat: transcript import pipeline — grade existing Claude/Codex/Copilot sessions offline (#872) Add `agentv import` command with Claude, Codex, and Copilot subcommands that read existing AI coding sessions from disk and normalize them into a tool-agnostic transcript JSONL format. Add `--transcript` flag to `agentv eval` that skips provider invocation and grades pre-recorded transcripts, enabling offline evaluation without re-running sessions. Rename `agentv trace` → `agentv inspect` (kept trace as deprecated alias). Key changes: - New parsers: codex-parser.ts, transcript-provider.ts - New discovery: codex-session-discovery.ts - Updated import output to spec format (input, output, source, token_usage, etc.) - TranscriptProvider implements Provider interface for eval pipeline integration - Re-export copilot parser/discovery from import barrel for CLI access Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: prevent transcript provider from being used as LLM grader When --transcript is used without --grader-target, the orchestrator's grader resolution would fall back to using the transcript provider as the grader, exhausting the transcript on the second invoke() call. Fix: return undefined from resolveGraderProvider when the target is a transcript provider so LLM-based evaluators skip gracefully. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * refactor: use LLM_GRADER_CAPABLE_KINDS allowlist for grader resolution Replace the transcript-specific point check with a proper allowlist of provider kinds that can return structured JSON for LLM grading. Previously, resolveGraderProvider would blindly fall back to using the eval target as its own grader when no grader_target was configured. This silently broke for transcript, copilot-log, cli, and any other provider that can't produce grader responses. 
Now only providers in LLM_GRADER_CAPABLE_KINDS (openai, openrouter, azure, anthropic, gemini, agentv, mock) are used as fallback graders. All others return undefined, causing LLM-based evaluators to skip with a clear error rather than fail silently. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * refactor: hard-remove agentv trace, replace with agentv inspect Delete the trace/ command directory entirely (no deprecated alias). Update all imports from trace/utils to inspect/utils. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b81e456 commit babeb19

25 files changed

Lines changed: 994 additions & 76 deletions

File tree

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,12 @@ export const evalRunCommand = command({
192192
long: 'exclude-tag',
193193
description: 'Skip eval files that have this tag (repeatable, file skipped if any match)',
194194
}),
195+
transcript: option({
196+
type: optional(string),
197+
long: 'transcript',
198+
description:
199+
'Grade a pre-recorded transcript JSONL instead of invoking a live provider. Ignores targets.',
200+
}),
195201
},
196202
handler: async (args) => {
197203
// Launch interactive wizard when no eval paths and stdin is a TTY
@@ -237,6 +243,7 @@ export const evalRunCommand = command({
237243
threshold: args.threshold,
238244
tag: args.tag,
239245
excludeTag: args.excludeTag,
246+
transcript: args.transcript,
240247
};
241248
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
242249
if (result?.allExecutionErrors) {

apps/cli/src/commands/eval/run-eval.ts

Lines changed: 106 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ interface NormalizedOptions {
9090
readonly threshold?: number;
9191
readonly tags: readonly string[];
9292
readonly excludeTags: readonly string[];
93+
readonly transcript?: string;
9394
}
9495

9596
function normalizeBoolean(value: unknown): boolean {
@@ -357,6 +358,7 @@ function normalizeOptions(
357358
threshold: normalizeOptionalNumber(rawOptions.threshold),
358359
tags: normalizeStringArray(rawOptions.tag),
359360
excludeTags: normalizeStringArray(rawOptions.excludeTag),
361+
transcript: normalizeString(rawOptions.transcript),
360362
} satisfies NormalizedOptions;
361363
}
362364

@@ -507,63 +509,86 @@ async function prepareFileMetadata(params: {
507509
category,
508510
});
509511
const testIds = suite.tests.map((value) => value.id);
510-
511-
// Determine target names: CLI --target flags override YAML
512-
const cliTargets = options.cliTargets;
513512
const suiteTargets = suite.targets;
514513

515-
// Resolve which target names to use (precedence: CLI > suite YAML targets > default)
516-
let targetNames: readonly string[];
517-
if (cliTargets.length > 0) {
518-
targetNames = cliTargets;
519-
} else if (suiteTargets && suiteTargets.length > 0) {
520-
targetNames = suiteTargets;
521-
} else {
522-
targetNames = [];
523-
}
524-
525514
let selections: { selection: TargetSelection; inlineTargetLabel: string }[];
526515

527-
if (targetNames.length > 1) {
528-
// Matrix mode: multiple targets
529-
const multiSelections = await selectMultipleTargets({
530-
testFilePath,
531-
repoRoot,
532-
cwd,
533-
explicitTargetsPath: options.targetsPath,
534-
dryRun: options.dryRun,
535-
dryRunDelay: options.dryRunDelay,
536-
dryRunDelayMin: options.dryRunDelayMin,
537-
dryRunDelayMax: options.dryRunDelayMax,
538-
env: process.env,
539-
targetNames,
540-
});
541-
542-
selections = multiSelections.map((sel) => ({
543-
selection: sel,
544-
inlineTargetLabel: sel.targetName,
545-
}));
546-
} else {
547-
// Single target mode (legacy path)
548-
const selection = await selectTarget({
549-
testFilePath,
550-
repoRoot,
551-
cwd,
552-
explicitTargetsPath: options.targetsPath,
553-
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
554-
dryRun: options.dryRun,
555-
dryRunDelay: options.dryRunDelay,
556-
dryRunDelayMin: options.dryRunDelayMin,
557-
dryRunDelayMax: options.dryRunDelayMax,
558-
env: process.env,
559-
});
560-
516+
if (options.transcript) {
517+
// --transcript mode: bypass target resolution entirely.
518+
// Create a synthetic TargetSelection for the transcript provider.
519+
const transcriptSelection: TargetSelection = {
520+
definitions: [],
521+
resolvedTarget: {
522+
kind: 'transcript',
523+
name: 'transcript',
524+
config: {} as Record<string, never>,
525+
},
526+
targetName: 'transcript',
527+
targetSource: 'cli',
528+
targetsFilePath: options.transcript,
529+
};
561530
selections = [
562531
{
563-
selection,
564-
inlineTargetLabel: selection.targetName,
532+
selection: transcriptSelection,
533+
inlineTargetLabel: `transcript (${path.basename(options.transcript)})`,
565534
},
566535
];
536+
} else {
537+
// Determine target names: CLI --target flags override YAML
538+
const cliTargets = options.cliTargets;
539+
const suiteTargets = suite.targets;
540+
541+
// Resolve which target names to use (precedence: CLI > suite YAML targets > default)
542+
let targetNames: readonly string[];
543+
if (cliTargets.length > 0) {
544+
targetNames = cliTargets;
545+
} else if (suiteTargets && suiteTargets.length > 0) {
546+
targetNames = suiteTargets;
547+
} else {
548+
targetNames = [];
549+
}
550+
551+
if (targetNames.length > 1) {
552+
// Matrix mode: multiple targets
553+
const multiSelections = await selectMultipleTargets({
554+
testFilePath,
555+
repoRoot,
556+
cwd,
557+
explicitTargetsPath: options.targetsPath,
558+
dryRun: options.dryRun,
559+
dryRunDelay: options.dryRunDelay,
560+
dryRunDelayMin: options.dryRunDelayMin,
561+
dryRunDelayMax: options.dryRunDelayMax,
562+
env: process.env,
563+
targetNames,
564+
});
565+
566+
selections = multiSelections.map((sel) => ({
567+
selection: sel,
568+
inlineTargetLabel: sel.targetName,
569+
}));
570+
} else {
571+
// Single target mode (legacy path)
572+
const selection = await selectTarget({
573+
testFilePath,
574+
repoRoot,
575+
cwd,
576+
explicitTargetsPath: options.targetsPath,
577+
cliTargetName: targetNames.length === 1 ? targetNames[0] : options.target,
578+
dryRun: options.dryRun,
579+
dryRunDelay: options.dryRunDelay,
580+
dryRunDelayMin: options.dryRunDelayMin,
581+
dryRunDelayMax: options.dryRunDelayMax,
582+
env: process.env,
583+
});
584+
585+
selections = [
586+
{
587+
selection,
588+
inlineTargetLabel: selection.targetName,
589+
},
590+
];
591+
}
567592
}
568593

569594
return {
@@ -623,6 +648,9 @@ async function runSingleEvalFile(params: {
623648
readonly totalBudgetUsd?: number;
624649
readonly failOnError?: FailOnError;
625650
readonly threshold?: number;
651+
readonly providerFactory?: (
652+
target: import('@agentv/core').ResolvedTarget,
653+
) => import('@agentv/core').Provider;
626654
}): Promise<{ results: EvaluationResult[] }> {
627655
const {
628656
testFilePath,
@@ -645,6 +673,7 @@ async function runSingleEvalFile(params: {
645673
matrixMode,
646674
totalBudgetUsd,
647675
failOnError,
676+
providerFactory,
648677
} = params;
649678

650679
const targetName = selection.targetName;
@@ -742,6 +771,7 @@ async function runSingleEvalFile(params: {
742771
graderTarget: options.graderTarget,
743772
model: options.model,
744773
threshold: options.threshold,
774+
providerFactory,
745775
streamCallbacks: streamingObserver?.getStreamCallbacks(),
746776
onResult: async (result: EvaluationResult) => {
747777
(
@@ -1198,6 +1228,31 @@ export async function runEvalCommand(
11981228
// Use only files that survived tag filtering (fileMetadata keys)
11991229
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
12001230

1231+
// --transcript: create a shared TranscriptProvider and validate line count
1232+
let transcriptProviderFactory:
1233+
| ((target: import('@agentv/core').ResolvedTarget) => import('@agentv/core').Provider)
1234+
| undefined;
1235+
if (options.transcript) {
1236+
const { TranscriptProvider } = await import('@agentv/core');
1237+
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
1238+
1239+
// Validate: transcript lines must match total test cases across all files
1240+
const totalTests = [...fileMetadata.values()].reduce(
1241+
(sum, meta) => sum + meta.testCases.length,
1242+
0,
1243+
);
1244+
if (transcriptProvider.lineCount !== totalTests) {
1245+
throw new Error(
1246+
`Transcript has ${transcriptProvider.lineCount} entry(s) but eval defines ${totalTests} test(s). Each transcript line maps positionally to one test case.`,
1247+
);
1248+
}
1249+
1250+
transcriptProviderFactory = () => transcriptProvider;
1251+
console.log(
1252+
`Using transcript: ${options.transcript} (${transcriptProvider.lineCount} entry(s))`,
1253+
);
1254+
}
1255+
12011256
try {
12021257
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
12031258
const targetPrep = fileMetadata.get(testFilePath);
@@ -1242,11 +1297,12 @@ export async function runEvalCommand(
12421297
selection,
12431298
inlineTargetLabel,
12441299
testCases: applicableTestCases,
1245-
trialsConfig: targetPrep.trialsConfig,
1300+
trialsConfig: options.transcript ? undefined : targetPrep.trialsConfig,
12461301
matrixMode: targetPrep.selections.length > 1,
12471302
totalBudgetUsd: targetPrep.totalBudgetUsd,
12481303
failOnError: targetPrep.failOnError,
12491304
threshold: resolvedThreshold,
1305+
providerFactory: transcriptProviderFactory,
12501306
});
12511307

12521308
return result.results;

apps/cli/src/commands/import/claude.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import { mkdir, writeFile } from 'node:fs/promises';
22
import path from 'node:path';
3-
import { discoverClaudeSessions, parseClaudeSession, readTranscriptFile } from '@agentv/core';
3+
import {
4+
discoverClaudeSessions,
5+
parseClaudeSession,
6+
readTranscriptFile,
7+
toTranscriptJsonLine,
8+
} from '@agentv/core';
49
import { command, flag, option, optional, string } from 'cmd-ts';
510

611
export const importClaudeCommand = command({
@@ -106,9 +111,9 @@ export const importClaudeCommand = command({
106111
// Ensure output directory exists
107112
await mkdir(path.dirname(outputPath), { recursive: true });
108113

109-
// Write transcript as JSONL (one message per line)
110-
const outputLines = transcript.messages.map((msg) => JSON.stringify(msg));
111-
await writeFile(outputPath, `${outputLines.join('\n')}\n`, 'utf8');
114+
// Write transcript as JSONL (one line per test case, snake_case wire format)
115+
const jsonLine = toTranscriptJsonLine(transcript);
116+
await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');
112117

113118
const msgCount = transcript.messages.length;
114119
const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import { mkdir, writeFile } from 'node:fs/promises';
2+
import path from 'node:path';
3+
import {
4+
discoverCodexSessions,
5+
parseCodexSession,
6+
readTranscriptFile,
7+
toTranscriptJsonLine,
8+
} from '@agentv/core';
9+
import { command, flag, option, optional, string } from 'cmd-ts';
10+
11+
/**
 * `codex` import subcommand: selects a Codex CLI session, parses it, and
 * writes it out as a single-line transcript JSONL file.
 *
 * Modes:
 * - `--list` prints up to 20 discovered sessions (id, age, filename) and returns.
 * - `--discover latest` imports the session returned by discovery with
 *   `latest: true`.
 *
 * Any other invocation prints an error and exits with status 1. The output
 * path defaults to `.agentv/transcripts/codex-<id>.jsonl`, where `<id>` is the
 * first 8 characters of the session id.
 */
export const importCodexCommand = command({
  name: 'codex',
  description: 'Import a Codex CLI session transcript for offline grading',
  args: {
    discover: option({
      type: optional(string),
      long: 'discover',
      description: 'Discovery mode: "latest" to import the most recent session',
    }),
    date: option({
      type: optional(string),
      long: 'date',
      description: 'Filter sessions by date (YYYY-MM-DD)',
    }),
    output: option({
      type: optional(string),
      long: 'output',
      short: 'o',
      // The default file name is derived from the session id, not a timestamp.
      description: 'Output file path (default: .agentv/transcripts/codex-<session-id>.jsonl)',
    }),
    sessionsDir: option({
      type: optional(string),
      long: 'sessions-dir',
      description: 'Override the default ~/.codex/sessions directory',
    }),
    list: flag({
      long: 'list',
      description: 'List available sessions instead of importing',
    }),
  },
  handler: async ({ discover, date, output, sessionsDir, list }) => {
    if (list) {
      const sessions = await discoverCodexSessions({
        date,
        sessionsDir,
        limit: 20,
      });

      if (sessions.length === 0) {
        console.log('No Codex CLI sessions found.');
        return;
      }

      console.log(`Found ${sessions.length} session(s):\n`);
      for (const session of sessions) {
        const age = formatAge(session.updatedAt);
        console.log(`  ${session.sessionId}  ${age}  ${session.filename}`);
      }
      return;
    }

    // "latest" is the only selection mode this command supports.
    if (discover !== 'latest') {
      console.error('Error: specify --discover latest to select a session.');
      process.exit(1);
    }

    const sessions = await discoverCodexSessions({
      date,
      sessionsDir,
      latest: true,
    });

    // Destructure + guard instead of length check + index access, so the
    // session is narrowed to a defined value even under noUncheckedIndexedAccess.
    const [session] = sessions;
    if (!session) {
      console.error('Error: no Codex CLI sessions found.');
      process.exit(1);
    }

    console.log(`Discovered latest session: ${session.filename}`);

    // Parse the session
    const rawJsonl = await readTranscriptFile(session.filePath);
    const transcript = parseCodexSession(rawJsonl);

    // Determine output path: explicit --output wins, otherwise derive from the session id
    const shortId = session.sessionId.slice(0, 8);
    const outputPath = output ?? path.join('.agentv', 'transcripts', `codex-${shortId}.jsonl`);

    // Ensure output directory exists
    await mkdir(path.dirname(outputPath), { recursive: true });

    // Write transcript as JSONL (snake_case wire format): one line for the whole session
    const jsonLine = toTranscriptJsonLine(transcript);
    await writeFile(outputPath, `${JSON.stringify(jsonLine)}\n`, 'utf8');

    const msgCount = transcript.messages.length;
    const toolCount = transcript.messages.reduce((sum, m) => sum + (m.toolCalls?.length ?? 0), 0);

    console.log(`Imported ${msgCount} messages (${toolCount} tool calls) → ${outputPath}`);

    if (transcript.source.model) {
      console.log(`  Model: ${transcript.source.model}`);
    }
    if (transcript.durationMs !== undefined) {
      console.log(`  Duration: ${formatDurationMs(transcript.durationMs)}`);
    }
  },
});
109+
110+
/**
 * Format how long ago `date` was, relative to the current time.
 *
 * @param date - The timestamp to describe.
 * @returns `"<n>m ago"` below one hour, `"<n>h ago"` below one day, and
 *   `"<n>d ago"` otherwise. Timestamps in the future are reported as
 *   `"0m ago"`; an invalid `Date` yields `"unknown"`.
 */
function formatAge(date: Date): string {
  const elapsedMs = Date.now() - date.getTime();
  // An invalid Date has a NaN time value, which would otherwise print "NaNd ago".
  if (Number.isNaN(elapsedMs)) return 'unknown';

  // Clamp so a timestamp ahead of the local clock never prints a negative age.
  const diffMin = Math.floor(Math.max(0, elapsedMs) / 60_000);
  if (diffMin < 60) return `${diffMin}m ago`;

  const diffHours = Math.floor(diffMin / 60);
  if (diffHours < 24) return `${diffHours}h ago`;

  const diffDays = Math.floor(diffHours / 24);
  return `${diffDays}d ago`;
}
119+
120+
/**
 * Format a duration given in milliseconds as a short human-readable string.
 *
 * @param ms - Duration in milliseconds. Non-finite or negative values are
 *   treated as 0, and fractional values are truncated to whole milliseconds.
 * @returns `"<n>ms"` below one second, `"<n>s"` below one minute, and
 *   `"<m>m <s>s"` otherwise (minutes are not rolled up into hours).
 */
function formatDurationMs(ms: number): string {
  // Normalize first so NaN, Infinity or negative input cannot leak into the output
  // (e.g. "NaNm NaNs" or "-5ms").
  const wholeMs = Number.isFinite(ms) && ms > 0 ? Math.floor(ms) : 0;
  if (wholeMs < 1000) return `${wholeMs}ms`;

  const seconds = Math.floor(wholeMs / 1000);
  if (seconds < 60) return `${seconds}s`;

  const minutes = Math.floor(seconds / 60);
  const remainingSeconds = seconds % 60;
  return `${minutes}m ${remainingSeconds}s`;
}

0 commit comments

Comments
 (0)