Skip to content

Commit a45959a

Browse files
christso and Copilot authored
refactor(results): remove flat manifest loading (#940)
* refactor(results): remove flat manifest loading
  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
* fix(trace): preserve canonical run timestamps
  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
* fix(compare): require test_id in flat inputs
  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
* refactor(results): drop flat jsonl compatibility
  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---------
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent f22a2ad commit a45959a

47 files changed

Lines changed: 789 additions & 765 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ agentv eval evals/my-eval.yaml
7171

7272
**5. Compare results across targets:**
7373
```bash
74-
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
74+
agentv compare .agentv/results/runs/<timestamp>/index.jsonl
7575
```
7676

7777
## Output formats

apps/cli/src/commands/compare/index.ts

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {
1010
restPositionals,
1111
string,
1212
} from 'cmd-ts';
13+
1314
import { toSnakeCaseDeep } from '../../utils/case-conversion.js';
1415
import { loadLightweightResults, resolveResultSourcePath } from '../results/manifest.js';
1516

@@ -62,23 +63,40 @@ interface MatrixRow {
6263
scores: Record<string, number>;
6364
}
6465

66+
/**
 * An EvalResult extended with an optional `target` string.
 * loadCombinedResults throws when `target` is not a string, so the field is
 * effectively required for single-source comparisons.
 */
interface CompareInputRecord extends EvalResult {
67+
target?: string;
68+
}
69+
70+
/**
 * Loads the lightweight result records behind `filePath` and validates each one
 * before it is used for comparison.
 *
 * The path is passed through resolveResultSourcePath and then
 * loadLightweightResults; every record is reduced to `testId`, `score` and `target`.
 *
 * @param filePath - Result source path as supplied by the caller; echoed in error messages.
 * @returns One entry per loaded record, keeping only testId, score and target.
 * @throws Error when a record's testId is falsy or the literal 'unknown', or when
 *   its score is not a number or is NaN.
 *
 * NOTE(review): Number.isNaN lets +/-Infinity through (JSON.parse('1e999') yields
 * Infinity); Number.isFinite may be the stricter check that is wanted here - confirm.
 */
function loadCompareResults(filePath: string): CompareInputRecord[] {
71+
return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
72+
// 'unknown' is presumably the placeholder the loader uses for a missing id - TODO confirm.
if (!record.testId || record.testId === 'unknown') {
73+
throw new Error(`Missing test_id in result source: ${filePath}`);
74+
}
75+
if (typeof record.score !== 'number' || Number.isNaN(record.score)) {
76+
throw new Error(`Missing or invalid score in result source: ${filePath}`);
77+
}
78+
return {
79+
testId: record.testId,
80+
score: record.score,
81+
target: record.target,
82+
};
83+
});
84+
}
85+
6586
export interface MatrixOutput {
6687
matrix: MatrixRow[];
6788
pairwise: ComparisonOutput[];
6889
targets: string[];
6990
}
7091

7192
export function loadJsonlResults(filePath: string): EvalResult[] {
72-
return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => ({
73-
testId: record.testId,
74-
score: record.score,
75-
}));
93+
return loadCompareResults(filePath).map(({ testId, score }) => ({ testId, score }));
7694
}
7795

7896
export function loadCombinedResults(filePath: string): Map<string, EvalResult[]> {
7997
const groups = new Map<string, EvalResult[]>();
8098

81-
for (const record of loadLightweightResults(resolveResultSourcePath(filePath))) {
99+
for (const record of loadCompareResults(filePath)) {
82100
if (typeof record.target !== 'string') {
83101
throw new Error(`Missing target field in combined result source: ${filePath}`);
84102
}
@@ -413,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
413431
export const compareCommand = command({
414432
name: 'compare',
415433
description:
416-
'Compare evaluation result files: two-file pairwise, combined JSONL pairwise, or N-way matrix',
434+
'Compare evaluation run manifests: two-run pairwise, single-run pairwise, or N-way matrix',
417435
args: {
418436
results: restPositionals({
419437
type: string,
420438
displayName: 'results',
421-
description: 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.',
439+
description:
440+
'Run workspace or index.jsonl manifest path(s). One source: single-run mode. Two sources: pairwise mode.',
422441
}),
423442
threshold: option({
424443
type: optional(number),
@@ -430,13 +449,13 @@ export const compareCommand = command({
430449
type: optional(string),
431450
long: 'baseline',
432451
short: 'b',
433-
description: 'Target name to use as baseline (filters combined JSONL)',
452+
description: 'Target name to use as baseline (filters a single run manifest)',
434453
}),
435454
candidate: option({
436455
type: optional(string),
437456
long: 'candidate',
438457
short: 'c',
439-
description: 'Target name to use as candidate (filters combined JSONL)',
458+
description: 'Target name to use as candidate (filters a single run manifest)',
440459
}),
441460
targets: multioption({
442461
type: array(string),
@@ -460,7 +479,7 @@ export const compareCommand = command({
460479

461480
try {
462481
if (results.length === 0) {
463-
throw new Error('At least one JSONL result file is required');
482+
throw new Error('At least one run workspace or index.jsonl manifest is required');
464483
}
465484

466485
if (results.length === 2) {
@@ -478,7 +497,7 @@ export const compareCommand = command({
478497
const exitCode = determineExitCode(comparison.summary.meanDelta);
479498
process.exit(exitCode);
480499
} else if (results.length === 1) {
481-
// Combined JSONL mode
500+
// Single-run manifest mode
482501
let groups = loadCombinedResults(results[0]);
483502

484503
// Filter by --targets if specified
@@ -514,7 +533,7 @@ export const compareCommand = command({
514533
}
515534

516535
if (baseline && candidate) {
517-
// Pairwise mode from combined JSONL
536+
// Pairwise mode from a single run manifest
518537
const baselineResults = groups.get(baseline);
519538
const candidateResults = groups.get(candidate);
520539
if (!baselineResults) {
@@ -548,7 +567,7 @@ export const compareCommand = command({
548567
process.exit(exitCode);
549568
}
550569
} else {
551-
throw new Error('Expected 1 or 2 JSONL result files');
570+
throw new Error('Expected 1 or 2 run workspaces or index.jsonl manifests');
552571
}
553572
} catch (error) {
554573
console.error(`Error: ${(error as Error).message}`);

apps/cli/src/commands/eval/artifact-writer.ts

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,69 @@ function toCamelCaseDeep(obj: unknown): unknown {
594594
return obj;
595595
}
596596

597+
/**
 * Shape returned by normalizeParsedResult: an open record of arbitrary parsed
 * keys in which the seven fields below are always present and typed.
 */
type ParsedEvaluationResult = Record<string, unknown> & {
598+
timestamp: string;
599+
testId: string;
600+
score: number;
601+
assertions: EvaluationResult['assertions'];
602+
target: string;
603+
output: EvaluationResult['output'];
604+
executionStatus: EvaluationResult['executionStatus'];
605+
};
606+
607+
/** The execution status strings that isExecutionStatus accepts as valid. */
const EXECUTION_STATUSES = new Set<EvaluationResult['executionStatus']>([
608+
'ok',
609+
'quality_failure',
610+
'execution_error',
611+
]);
612+
613+
/**
 * Type guard for a single assertion entry.
 *
 * Accepts only non-null, non-array objects whose `text` is a string, whose
 * `passed` is a boolean, and whose `evidence` is either absent or a string.
 * Other properties on the object are not inspected.
 */
function isAssertionEntry(value: unknown): value is EvaluationResult['assertions'][number] {
614+
if (!value || typeof value !== 'object' || Array.isArray(value)) {
615+
return false;
616+
}
617+
618+
const candidate = value as { text?: unknown; passed?: unknown; evidence?: unknown };
619+
return (
620+
typeof candidate.text === 'string' &&
621+
typeof candidate.passed === 'boolean' &&
622+
(candidate.evidence === undefined || typeof candidate.evidence === 'string')
623+
);
624+
}
625+
626+
/**
 * Type guard for a single output message.
 *
 * Accepts any non-null, non-array object whose `role` is a string; no other
 * property of the message is checked.
 */
function isOutputMessage(value: unknown): value is EvaluationResult['output'][number] {
627+
if (!value || typeof value !== 'object' || Array.isArray(value)) {
628+
return false;
629+
}
630+
631+
const candidate = value as { role?: unknown };
632+
return typeof candidate.role === 'string';
633+
}
634+
635+
/**
 * Type guard: true when `value` is a string that is a member of EXECUTION_STATUSES.
 */
function isExecutionStatus(value: unknown): value is EvaluationResult['executionStatus'] {
636+
return (
637+
typeof value === 'string' &&
638+
EXECUTION_STATUSES.has(value as EvaluationResult['executionStatus'])
639+
);
640+
}
641+
642+
/**
 * Coerces one parsed value into a ParsedEvaluationResult.
 *
 * Returns undefined for anything that is not a non-null, non-array object.
 * Otherwise every original property is kept (spread first) and the seven typed
 * fields are then overwritten with the original value when it has the expected
 * type, or with a fallback:
 *   timestamp -> ISO string of the Unix epoch, testId / target -> 'unknown',
 *   score -> 0, assertions / output -> the array filtered by its guard, or [],
 *   executionStatus -> 'ok'.
 *
 * NOTE(review): a missing or unrecognised executionStatus becomes 'ok', so a
 * malformed record cannot be told apart from a successful one afterwards -
 * confirm this is intended.
 */
function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefined {
643+
if (!value || typeof value !== 'object' || Array.isArray(value)) {
644+
return undefined;
645+
}
646+
647+
const result = value as Record<string, unknown>;
648+
return {
649+
...result,
650+
timestamp: typeof result.timestamp === 'string' ? result.timestamp : new Date(0).toISOString(),
651+
testId: typeof result.testId === 'string' ? result.testId : 'unknown',
652+
score: typeof result.score === 'number' ? result.score : 0,
653+
assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
654+
target: typeof result.target === 'string' ? result.target : 'unknown',
655+
output: Array.isArray(result.output) ? result.output.filter(isOutputMessage) : [],
656+
executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : 'ok',
657+
};
658+
}
659+
597660
// ---------------------------------------------------------------------------
598661
// JSONL parsing
599662
// ---------------------------------------------------------------------------
@@ -610,7 +673,10 @@ export function parseJsonlResults(content: string): EvaluationResult[] {
610673
const parsed = JSON.parse(trimmed);
611674
// JSONL files from AgentV use snake_case; convert back to camelCase
612675
const camelCased = toCamelCaseDeep(parsed);
613-
results.push(camelCased as EvaluationResult);
676+
const normalized = normalizeParsedResult(camelCased);
677+
if (normalized) {
678+
results.push(normalized);
679+
}
614680
} catch {
615681
// Skip malformed lines
616682
}

apps/cli/src/commands/eval/commands/run.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ export const evalRunCommand = command({
141141
retryErrors: option({
142142
type: optional(string),
143143
long: 'retry-errors',
144-
description: 'Path to previous output JSONL — re-run only execution_error test cases',
144+
description:
145+
'Path to a previous run workspace or index.jsonl manifest — re-run only execution_error test cases',
145146
}),
146147
strict: flag({
147148
long: 'strict',

apps/cli/src/commands/eval/result-layout.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ export function resolveRunIndexPath(runDir: string): string {
2020
return path.join(runDir, RESULT_INDEX_FILENAME);
2121
}
2222

23+
/**
 * Reports whether the final segment of `filePath` equals RESULT_INDEX_FILENAME.
 * The check is purely name-based; this function does not access the filesystem.
 */
export function isRunManifestPath(filePath: string): boolean {
24+
return path.basename(filePath) === RESULT_INDEX_FILENAME;
25+
}
26+
2327
export function resolveExistingRunPrimaryPath(runDir: string): string | undefined {
2428
const indexPath = resolveRunIndexPath(runDir);
2529
if (existsSync(indexPath)) {
@@ -49,3 +53,17 @@ export function resolveWorkspaceOrFilePath(filePath: string): string {
4953

5054
return existing;
5155
}
56+
57+
/**
 * Resolves a user-supplied path to a run manifest path.
 *
 * When isDirectoryPath(filePath) is true, resolution is delegated to
 * resolveWorkspaceOrFilePath. Otherwise the path must be named
 * RESULT_INDEX_FILENAME and is returned unchanged; this function itself performs
 * no existence check in that branch.
 *
 * @param filePath - A run workspace directory or a manifest file path.
 * @returns The resolved manifest path.
 * @throws Error when `filePath` is not a directory and its basename is not
 *   RESULT_INDEX_FILENAME.
 */
export function resolveRunManifestPath(filePath: string): string {
58+
if (isDirectoryPath(filePath)) {
59+
return resolveWorkspaceOrFilePath(filePath);
60+
}
61+
62+
if (!isRunManifestPath(filePath)) {
63+
throw new Error(
64+
`Expected a run workspace directory or ${RESULT_INDEX_FILENAME} manifest: ${filePath}`,
65+
);
66+
}
67+
68+
return filePath;
69+
}
Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
import type { EvaluationResult } from '@agentv/core';
22

3-
import {
4-
loadLightweightResults,
5-
loadManifestResults,
6-
resolveResultSourcePath,
7-
} from '../results/manifest.js';
3+
import { loadManifestResults, resolveResultSourcePath } from '../results/manifest.js';
4+
5+
/**
 * Shared loader for the retry helpers: resolves `jsonlPath` with
 * resolveResultSourcePath and returns what loadManifestResults yields for it.
 *
 * NOTE(review): the code this replaces called loadManifestResults(...).filter(...)
 * without awaiting it, which suggests it was synchronous - confirm whether it now
 * returns a promise or the async wrapper is only for a uniform call shape.
 */
async function loadRetrySourceResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
6+
return loadManifestResults(resolveResultSourcePath(jsonlPath));
7+
}
88

99
/**
1010
* Load test IDs from an index/results source that have executionStatus === 'execution_error'.
1111
*/
1212
export async function loadErrorTestIds(jsonlPath: string): Promise<readonly string[]> {
13-
const resolvedPath = resolveResultSourcePath(jsonlPath);
14-
const ids = loadLightweightResults(resolvedPath)
13+
const ids = (await loadRetrySourceResults(jsonlPath))
1514
.filter((result) => result.executionStatus === 'execution_error')
1615
.map((result) => result.testId);
1716

@@ -23,8 +22,7 @@ export async function loadErrorTestIds(jsonlPath: string): Promise<readonly stri
2322
* These are the "good" results that should be preserved when merging retry output.
2423
*/
2524
export async function loadNonErrorResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
26-
const resolvedPath = resolveResultSourcePath(jsonlPath);
27-
return loadManifestResults(resolvedPath).filter(
25+
return (await loadRetrySourceResults(jsonlPath)).filter(
2826
(result) => result.testId && result.executionStatus !== 'execution_error',
2927
);
3028
}

apps/cli/src/commands/eval/run-cache.ts

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,19 @@ const CACHE_FILENAME = 'cache.json';
1616
export interface RunCache {
1717
/** Directory path for new per-run directory format (e.g. .agentv/results/runs/<ts>/) */
1818
readonly lastRunDir?: string;
19-
/** JSONL file path for legacy flat-file format. Kept for backward compat. */
19+
/** @deprecated Legacy flat-file pointer from old cache files. Ignored on read. */
2020
readonly lastResultFile?: string;
2121
readonly timestamp: string;
2222
}
2323

2424
/**
2525
* Resolve the primary result manifest path from a RunCache entry.
26-
* New format: lastRunDir/index.jsonl
27-
* Legacy format: lastResultFile (flat JSONL path)
2826
*/
2927
export function resolveRunCacheFile(cache: RunCache): string {
3028
if (cache.lastRunDir) {
3129
return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir);
3230
}
33-
return cache.lastResultFile ?? '';
31+
return '';
3432
}
3533

3634
function cachePath(cwd: string): string {
@@ -47,18 +45,15 @@ export async function loadRunCache(cwd: string): Promise<RunCache | undefined> {
4745
}
4846

4947
export async function saveRunCache(cwd: string, resultPath: string): Promise<void> {
48+
if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) {
49+
return;
50+
}
51+
5052
const dir = path.join(cwd, '.agentv');
5153
await mkdir(dir, { recursive: true });
52-
const basename = path.basename(resultPath);
53-
const cache: RunCache =
54-
basename === RESULT_INDEX_FILENAME
55-
? {
56-
lastRunDir: path.dirname(resultPath),
57-
timestamp: new Date().toISOString(),
58-
}
59-
: {
60-
lastResultFile: resultPath,
61-
timestamp: new Date().toISOString(),
62-
};
54+
const cache: RunCache = {
55+
lastRunDir: path.dirname(resultPath),
56+
timestamp: new Date().toISOString(),
57+
};
6358
await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8');
6459
}

0 commit comments

Comments
 (0)