@@ -10,6 +10,7 @@ import {
1010 restPositionals ,
1111 string ,
1212} from 'cmd-ts' ;
13+
1314import { toSnakeCaseDeep } from '../../utils/case-conversion.js' ;
1415import { loadLightweightResults , resolveResultSourcePath } from '../results/manifest.js' ;
1516
@@ -62,23 +63,40 @@ interface MatrixRow {
6263 scores : Record < string , number > ;
6364}
6465
/**
 * A result record as handled inside the compare command: an `EvalResult`
 * that may additionally carry the name of the target it belongs to.
 */
interface CompareInputRecord extends EvalResult {
  /** Target name taken from the loaded record; may be absent. */
  target?: string;
}
69+
/**
 * Loads every record from a result source and validates the fields the
 * compare command depends on.
 *
 * The path is passed through `resolveResultSourcePath` and read with
 * `loadLightweightResults`; each record is then reduced to `testId`, `score`
 * and the optional `target`.
 *
 * @param filePath - Result source path as supplied by the caller; echoed in
 *   error messages.
 * @returns One validated record per loaded entry, in the order they were loaded.
 * @throws Error if a record has an empty or `'unknown'` test id, or a score
 *   that is not a finite number.
 */
function loadCompareResults(filePath: string): CompareInputRecord[] {
  return loadLightweightResults(resolveResultSourcePath(filePath)).map((record) => {
    if (!record.testId || record.testId === 'unknown') {
      throw new Error(`Missing test_id in result source: ${filePath}`);
    }
    // Number.isFinite rejects NaN as well as +/-Infinity. A non-finite score
    // (e.g. a JSON numeral like 1e999, which parses to Infinity) would turn
    // any mean or delta computed from these records into NaN/Infinity.
    if (typeof record.score !== 'number' || !Number.isFinite(record.score)) {
      throw new Error(`Missing or invalid score in result source: ${filePath}`);
    }
    return {
      testId: record.testId,
      score: record.score,
      target: record.target,
    };
  });
}
85+
/**
 * Output of a multi-target comparison, as consumed by `formatMatrix`.
 */
export interface MatrixOutput {
  /** Matrix rows; each row carries a `scores` record of numeric values keyed by string. */
  matrix: MatrixRow[];
  /** Pairwise comparison results (presumably one per compared target pair; confirm against the builder). */
  pairwise: ComparisonOutput[];
  /** Target names included in the matrix. */
  targets: string[];
}
7091
/**
 * Loads a result source and returns only the test id and score of each record.
 *
 * Loading and validation are delegated to `loadCompareResults`, so this
 * throws whenever that function rejects a record.
 *
 * @param filePath - Result source path to load.
 * @returns One `{ testId, score }` entry per loaded record, in load order.
 */
export function loadJsonlResults(filePath: string): EvalResult[] {
  const trimmed: EvalResult[] = [];
  for (const entry of loadCompareResults(filePath)) {
    // Drop `target` (and anything else) so callers see plain EvalResult values.
    trimmed.push({ testId: entry.testId, score: entry.score });
  }
  return trimmed;
}
7795
7896export function loadCombinedResults ( filePath : string ) : Map < string , EvalResult [ ] > {
7997 const groups = new Map < string , EvalResult [ ] > ( ) ;
8098
81- for ( const record of loadLightweightResults ( resolveResultSourcePath ( filePath ) ) ) {
99+ for ( const record of loadCompareResults ( filePath ) ) {
82100 if ( typeof record . target !== 'string' ) {
83101 throw new Error ( `Missing target field in combined result source: ${ filePath } ` ) ;
84102 }
@@ -413,12 +431,13 @@ export function formatMatrix(matrixOutput: MatrixOutput, baselineTarget?: string
413431export const compareCommand = command ( {
414432 name : 'compare' ,
415433 description :
416- 'Compare evaluation result files : two-file pairwise, combined JSONL pairwise, or N-way matrix' ,
434+ 'Compare evaluation run manifests : two-run pairwise, single-run pairwise, or N-way matrix' ,
417435 args : {
418436 results : restPositionals ( {
419437 type : string ,
420438 displayName : 'results' ,
421- description : 'JSONL result file path(s). One file: combined mode. Two files: pairwise mode.' ,
439+ description :
440+ 'Run workspace or index.jsonl manifest path(s). One source: single-run mode. Two sources: pairwise mode.' ,
422441 } ) ,
423442 threshold : option ( {
424443 type : optional ( number ) ,
@@ -430,13 +449,13 @@ export const compareCommand = command({
430449 type : optional ( string ) ,
431450 long : 'baseline' ,
432451 short : 'b' ,
433- description : 'Target name to use as baseline (filters combined JSONL )' ,
452+ description : 'Target name to use as baseline (filters a single run manifest )' ,
434453 } ) ,
435454 candidate : option ( {
436455 type : optional ( string ) ,
437456 long : 'candidate' ,
438457 short : 'c' ,
439- description : 'Target name to use as candidate (filters combined JSONL )' ,
458+ description : 'Target name to use as candidate (filters a single run manifest )' ,
440459 } ) ,
441460 targets : multioption ( {
442461 type : array ( string ) ,
@@ -460,7 +479,7 @@ export const compareCommand = command({
460479
461480 try {
462481 if ( results . length === 0 ) {
463- throw new Error ( 'At least one JSONL result file is required' ) ;
482+ throw new Error ( 'At least one run workspace or index.jsonl manifest is required' ) ;
464483 }
465484
466485 if ( results . length === 2 ) {
@@ -478,7 +497,7 @@ export const compareCommand = command({
478497 const exitCode = determineExitCode ( comparison . summary . meanDelta ) ;
479498 process . exit ( exitCode ) ;
480499 } else if ( results . length === 1 ) {
481- // Combined JSONL mode
500+ // Single-run manifest mode
482501 let groups = loadCombinedResults ( results [ 0 ] ) ;
483502
484503 // Filter by --targets if specified
@@ -514,7 +533,7 @@ export const compareCommand = command({
514533 }
515534
516535 if ( baseline && candidate ) {
517- // Pairwise mode from combined JSONL
536+ // Pairwise mode from a single run manifest
518537 const baselineResults = groups . get ( baseline ) ;
519538 const candidateResults = groups . get ( candidate ) ;
520539 if ( ! baselineResults ) {
@@ -548,7 +567,7 @@ export const compareCommand = command({
548567 process . exit ( exitCode ) ;
549568 }
550569 } else {
551- throw new Error ( 'Expected 1 or 2 JSONL result files ' ) ;
570+ throw new Error ( 'Expected 1 or 2 run workspaces or index.jsonl manifests ' ) ;
552571 }
553572 } catch ( error ) {
554573 console . error ( `Error: ${ ( error as Error ) . message } ` ) ;
0 commit comments