11/**
2- * `agentv pipeline grade` — Run code-grader assertions against response.md files
3- * in an export directory produced by `pipeline input`.
2+ * `agentv pipeline grade` — Run code-grader and built-in deterministic assertions
3+ * against response.md files in an export directory produced by `pipeline input`.
44 *
5- * For each test, reads code_graders/<name>.json configs, executes each grader
6- * with the response text on stdin (matching CodeEvaluator payload format),
7- * and writes results to code_grader_results/<name>.json.
5+ * For each test:
6+ * - Reads code_graders/<name>.json configs, executes each grader script,
7+ * and writes results to code_grader_results/<name>.json.
8+ * - Reads builtin_graders/<name>.json configs, evaluates deterministic assertions
9+ * (contains, regex, equals, etc.) in-process, and writes results to
10+ * code_grader_results/<name>.json (same directory, so pipeline bench merges them).
811 *
9- * Graders run concurrently (default: 4 workers) for performance.
10- * Progress is printed to stderr so users see real-time feedback .
12+ * Code graders run concurrently (default: 10 workers) for performance.
13+ * Built-in graders are evaluated in-process, one at a time, after all code graders finish.
1114 *
1215 * Export directory additions:
1316 * <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
1417 */
1518import { mkdir , readFile , readdir , writeFile } from 'node:fs/promises' ;
1619import { join } from 'node:path' ;
1720
18- import { executeScript } from '@agentv/core' ;
21+ import {
22+ type AssertionResult ,
23+ executeScript ,
24+ runContainsAllAssertion ,
25+ runContainsAnyAssertion ,
26+ runContainsAssertion ,
27+ runEndsWithAssertion ,
28+ runEqualsAssertion ,
29+ runIcontainsAllAssertion ,
30+ runIcontainsAnyAssertion ,
31+ runIcontainsAssertion ,
32+ runIsJsonAssertion ,
33+ runRegexAssertion ,
34+ runStartsWithAssertion ,
35+ } from '@agentv/core' ;
1936import { command , number , option , optional , positional , string } from 'cmd-ts' ;
2037
2138const DEFAULT_CONCURRENCY = 10 ;
@@ -175,9 +192,130 @@ export async function runCodeGraders(
175192 return { totalGraders, totalPassed } ;
176193}
177194
/**
 * Evaluate a single built-in deterministic assertion against the response text.
 *
 * Dispatches on `config.type` to the matching assertion runner from
 * `@agentv/core` and returns that runner's result unchanged.
 *
 * `config` originates from a JSON file, so `value` (and `flags` for `regex`)
 * are validated before being handed to a runner. A missing or wrongly-typed
 * field yields a failing result (score 0, one failed assertion describing the
 * problem) instead of being cast blindly and throwing or matching by accident.
 *
 * To add a new built-in assertion type:
 *   1. Import the runner from @agentv/core
 *   2. Add a case to the switch below
 *   3. Add the type to BUILTIN_ASSERTION_TYPES in pipeline/input.ts
 *
 * @param config - Parsed grader config: the assertion `type`, its expected
 *   `value` (a string or string array, depending on the type), and optional
 *   regex `flags`.
 * @param responseText - The response text the assertion is checked against.
 * @returns The runner's result, or a score-0 result with a single failed
 *   assertion when the type is unknown or the config is malformed.
 */
function evaluateBuiltinAssertion(
  config: { type: string; value?: unknown; flags?: string },
  responseText: string,
): AssertionResult {
  const { type, value } = config;

  const isString = (v: unknown): v is string => typeof v === 'string';
  const isStringArray = (v: unknown): v is string[] =>
    Array.isArray(v) && v.every((item) => typeof item === 'string');

  // Failing result used whenever the config cannot be safely passed to a runner.
  const invalid = (problem: string): AssertionResult => ({
    score: 0,
    assertions: [{ text: `Invalid '${type}' assertion config: ${problem}`, passed: false }],
  });
  const needsString = '"value" must be a string';
  const needsStringArray = '"value" must be an array of strings';

  switch (type) {
    case 'contains':
      return isString(value) ? runContainsAssertion(responseText, value) : invalid(needsString);
    case 'contains-any':
      return isStringArray(value)
        ? runContainsAnyAssertion(responseText, value)
        : invalid(needsStringArray);
    case 'contains-all':
      return isStringArray(value)
        ? runContainsAllAssertion(responseText, value)
        : invalid(needsStringArray);
    case 'icontains':
      return isString(value) ? runIcontainsAssertion(responseText, value) : invalid(needsString);
    case 'icontains-any':
      return isStringArray(value)
        ? runIcontainsAnyAssertion(responseText, value)
        : invalid(needsStringArray);
    case 'icontains-all':
      return isStringArray(value)
        ? runIcontainsAllAssertion(responseText, value)
        : invalid(needsStringArray);
    case 'starts-with':
      return isString(value) ? runStartsWithAssertion(responseText, value) : invalid(needsString);
    case 'ends-with':
      return isString(value) ? runEndsWithAssertion(responseText, value) : invalid(needsString);
    case 'regex': {
      if (!isString(value)) return invalid(needsString);
      const flags = config.flags;
      // The static type says string | undefined, but the value comes from JSON.
      if (flags !== undefined && typeof flags !== 'string') {
        return invalid('"flags" must be a string when provided');
      }
      return runRegexAssertion(responseText, value, flags);
    }
    case 'is-json':
      // Takes no expected value; only the response text is inspected.
      return runIsJsonAssertion(responseText);
    case 'equals':
      return isString(value) ? runEqualsAssertion(responseText, value) : invalid(needsString);
    default:
      return {
        score: 0,
        assertions: [{ text: `Unknown assertion type: ${config.type}`, passed: false }],
      };
  }
}
241+
/**
 * Run built-in deterministic assertions for all tests in the export directory.
 *
 * For each test id, reads every `builtin_graders/*.json` config, evaluates it
 * in-process against the test's `response.md`, and writes one result file per
 * config to `code_grader_results/<name>.json`.
 *
 * Tests without a `builtin_graders` directory, without any `.json` config in
 * it, or without a readable `response.md` are skipped and not counted.
 *
 * A config file that cannot be read, is not valid JSON, or is not a JSON
 * object is recorded as a failed grader (score 0) rather than aborting the
 * run, so one broken config does not prevent the remaining graders and tests
 * from being evaluated.
 *
 * @param exportDir - Root of the export directory.
 * @param testIds - Test ids to grade; each maps to a directory under the root.
 * @param safeSuiteName - Suite directory name, or '' when tests sit directly
 *   under `exportDir`.
 * @returns `total`: number of built-in graders evaluated; `passed`: how many
 *   of those ended with a score of at least 0.5.
 */
async function runBuiltinGraders(
  exportDir: string,
  testIds: string[],
  safeSuiteName: string,
): Promise<{ total: number; passed: number }> {
  let total = 0;
  let passed = 0;

  for (const testId of testIds) {
    const testDir = safeSuiteName
      ? join(exportDir, safeSuiteName, testId)
      : join(exportDir, testId);
    const builtinGradersDir = join(testDir, 'builtin_graders');

    let graderFiles: string[];
    try {
      graderFiles = (await readdir(builtinGradersDir)).filter((f) => f.endsWith('.json'));
    } catch {
      continue; // No builtin graders for this test
    }
    if (graderFiles.length === 0) continue;

    let responseText: string;
    try {
      responseText = await readFile(join(testDir, 'response.md'), 'utf8');
    } catch {
      continue; // No response yet — skip
    }

    // Created only once there is something to grade, so skipped tests do not
    // end up with an empty results directory.
    const resultsDir = join(testDir, 'code_grader_results');
    await mkdir(resultsDir, { recursive: true });

    for (const file of graderFiles) {
      // Load and shape-check the config; remember the problem instead of throwing.
      let config: Record<string, unknown> = {};
      let configError: string | undefined;
      try {
        const parsed: unknown = JSON.parse(await readFile(join(builtinGradersDir, file), 'utf8'));
        if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
          config = parsed as Record<string, unknown>;
        } else {
          configError = 'expected a JSON object';
        }
      } catch (err: unknown) {
        configError = err instanceof Error ? err.message : String(err);
      }

      const raw: AssertionResult =
        configError === undefined
          ? evaluateBuiltinAssertion(
              config as { type: string; value?: unknown; flags?: string },
              responseText,
            )
          : {
              score: 0,
              assertions: [
                {
                  text: `Invalid built-in grader config '${file}': ${configError}`,
                  passed: false,
                },
              ],
            };

      // Apply negate if configured: invert the score and every assertion verdict.
      const negate = config.negate === true;
      const score = negate ? 1 - raw.score : raw.score;
      const assertions = negate
        ? raw.assertions.map((a: { text: string; passed: boolean }) => ({
            text: a.text,
            passed: !a.passed,
          }))
        : raw.assertions;

      // Fall back to the config file's base name so a config without a usable
      // `name` is never written to `undefined.json`.
      const name =
        typeof config.name === 'string' && config.name !== ''
          ? config.name
          : file.slice(0, -'.json'.length);

      const result = {
        name,
        type: config.type,
        score,
        weight: config.weight ?? 1.0,
        assertions,
        details: {},
      };

      await writeFile(
        join(resultsDir, `${name}.json`),
        `${JSON.stringify(result, null, 2)}\n`,
        'utf8',
      );

      total++;
      if (score >= 0.5) passed++;
    }
  }

  return { total, passed };
}
315+
178316export const evalGradeCommand = command ( {
179317 name : 'grade' ,
180- description : 'Run code-grader assertions on responses in an export directory' ,
318+ description : 'Run code-grader and built-in assertions on responses in an export directory' ,
181319 args : {
182320 exportDir : positional ( {
183321 type : string ,
@@ -199,7 +337,7 @@ export const evalGradeCommand = command({
199337 const suiteName : string = manifest . suite ?? '' ;
200338 const safeSuiteName = suiteName ? suiteName . replace ( / [ \/ \\ : * ? " < > | ] / g, '_' ) : '' ;
201339
202- // Collect all grader tasks upfront so we know the total count
340+ // Collect all code- grader tasks upfront so we know the total count
203341 const tasks : GraderTask [ ] = [ ] ;
204342
205343 for ( const testId of testIds ) {
@@ -212,22 +350,31 @@ export const evalGradeCommand = command({
212350 try {
213351 graderFiles = ( await readdir ( codeGradersDir ) ) . filter ( ( f ) => f . endsWith ( '.json' ) ) ;
214352 } catch {
215- continue ; // No code graders for this test
353+ graderFiles = [ ] ;
216354 }
217355
218- if ( graderFiles . length === 0 ) continue ;
219- await mkdir ( resultsDir , { recursive : true } ) ;
220-
221- // Read response and input once per test (shared by all graders for this test)
222- const responseText = await readFile ( join ( testDir , 'response.md' ) , 'utf8' ) ;
223- const inputData = JSON . parse ( await readFile ( join ( testDir , 'input.json' ) , 'utf8' ) ) ;
356+ if ( graderFiles . length > 0 ) {
357+ await mkdir ( resultsDir , { recursive : true } ) ;
358+ const responseText = await readFile ( join ( testDir , 'response.md' ) , 'utf8' ) ;
359+ const inputData = JSON . parse ( await readFile ( join ( testDir , 'input.json' ) , 'utf8' ) ) ;
224360
225- for ( const graderFile of graderFiles ) {
226- tasks . push ( { testId, testDir, resultsDir, graderFile, responseText, inputData } ) ;
361+ for ( const graderFile of graderFiles ) {
362+ tasks . push ( { testId, testDir, resultsDir, graderFile, responseText, inputData } ) ;
363+ }
227364 }
228365 }
229366
230367 const { totalGraders, totalPassed } = await runCodeGraders ( tasks , maxWorkers ) ;
231- console . log ( `Graded ${ totalGraders } code-grader(s): ${ totalPassed } passed` ) ;
368+
369+ // Run built-in deterministic assertions (contains, regex, equals, etc.)
370+ const builtin = await runBuiltinGraders ( exportDir , testIds , safeSuiteName ) ;
371+
372+ const totalAll = totalGraders + builtin . total ;
373+ const passedAll = totalPassed + builtin . passed ;
374+ const parts : string [ ] = [ ] ;
375+ if ( totalGraders > 0 ) parts . push ( `${ totalGraders } code-grader(s)` ) ;
376+ if ( builtin . total > 0 ) parts . push ( `${ builtin . total } built-in assertion(s)` ) ;
377+ if ( parts . length === 0 ) parts . push ( '0 grader(s)' ) ;
378+ console . log ( `Graded ${ parts . join ( ' + ' ) } : ${ passedAll } /${ totalAll } passed` ) ;
232379 } ,
233380} ) ;
0 commit comments