Skip to content

Commit 64fdff9

Browse files
christso and claude authored
fix(pipeline): grade built-in deterministic assertions in subagent mode (#1085)
pipeline grade now evaluates contains, regex, equals, starts-with, ends-with, is-json, and other built-in assertion types against response.md. Previously these were silently ignored, producing score: 0 for tests with only deterministic assertions.

Closes #1075

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d55c136 commit 64fdff9

7 files changed

Lines changed: 387 additions & 22 deletions

File tree

apps/cli/src/commands/pipeline/bench.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ export const evalBenchCommand = command({
6262
const result = JSON.parse(await readFile(join(codeResultsDir, file), 'utf8'));
6363
evaluators.push({
6464
name: result.name,
65-
type: 'code-grader',
65+
type: result.type ?? 'code-grader',
6666
score: result.score,
6767
weight: result.weight ?? 1.0,
6868
assertions: result.assertions ?? [],

apps/cli/src/commands/pipeline/grade.ts

Lines changed: 167 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,38 @@
11
/**
2-
* `agentv pipeline grade` — Run code-grader assertions against response.md files
3-
* in an export directory produced by `pipeline input`.
2+
* `agentv pipeline grade` — Run code-grader and built-in deterministic assertions
3+
* against response.md files in an export directory produced by `pipeline input`.
44
*
5-
* For each test, reads code_graders/<name>.json configs, executes each grader
6-
* with the response text on stdin (matching CodeEvaluator payload format),
7-
* and writes results to code_grader_results/<name>.json.
5+
* For each test:
6+
* - Reads code_graders/<name>.json configs, executes each grader script,
7+
* and writes results to code_grader_results/<name>.json.
8+
* - Reads builtin_graders/<name>.json configs, evaluates deterministic assertions
9+
* (contains, regex, equals, etc.) in-process, and writes results to
10+
* code_grader_results/<name>.json (same directory, so pipeline bench merges them).
811
*
9-
* Graders run concurrently (default: 4 workers) for performance.
10-
* Progress is printed to stderr so users see real-time feedback.
12+
* Code graders run concurrently (default: 10 workers) for performance.
13+
* Built-in graders are synchronous and evaluate instantly after code graders finish.
1114
*
1215
* Export directory additions:
1316
* <out-dir>/<suite>/<test-id>/code_grader_results/<name>.json
1417
*/
1518
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
1619
import { join } from 'node:path';
1720

18-
import { executeScript } from '@agentv/core';
21+
import {
22+
type AssertionResult,
23+
executeScript,
24+
runContainsAllAssertion,
25+
runContainsAnyAssertion,
26+
runContainsAssertion,
27+
runEndsWithAssertion,
28+
runEqualsAssertion,
29+
runIcontainsAllAssertion,
30+
runIcontainsAnyAssertion,
31+
runIcontainsAssertion,
32+
runIsJsonAssertion,
33+
runRegexAssertion,
34+
runStartsWithAssertion,
35+
} from '@agentv/core';
1936
import { command, number, option, optional, positional, string } from 'cmd-ts';
2037

2138
const DEFAULT_CONCURRENCY = 10;
@@ -175,9 +192,130 @@ export async function runCodeGraders(
175192
return { totalGraders, totalPassed };
176193
}
177194

195+
/**
196+
* Evaluate a single built-in deterministic assertion against the response text.
197+
*
198+
* Dispatches to the appropriate assertion function based on the config type.
199+
* Returns the assertion result with score and descriptive assertions array.
200+
*
201+
* To add a new built-in assertion type:
202+
* 1. Import the runner from @agentv/core
203+
* 2. Add a case to the switch below
204+
* 3. Add the type to BUILTIN_ASSERTION_TYPES in pipeline/input.ts
205+
*/
206+
function evaluateBuiltinAssertion(
207+
config: { type: string; value?: unknown; flags?: string },
208+
responseText: string,
209+
): AssertionResult {
210+
const value = config.value;
211+
switch (config.type) {
212+
case 'contains':
213+
return runContainsAssertion(responseText, value as string);
214+
case 'contains-any':
215+
return runContainsAnyAssertion(responseText, value as string[]);
216+
case 'contains-all':
217+
return runContainsAllAssertion(responseText, value as string[]);
218+
case 'icontains':
219+
return runIcontainsAssertion(responseText, value as string);
220+
case 'icontains-any':
221+
return runIcontainsAnyAssertion(responseText, value as string[]);
222+
case 'icontains-all':
223+
return runIcontainsAllAssertion(responseText, value as string[]);
224+
case 'starts-with':
225+
return runStartsWithAssertion(responseText, value as string);
226+
case 'ends-with':
227+
return runEndsWithAssertion(responseText, value as string);
228+
case 'regex':
229+
return runRegexAssertion(responseText, value as string, config.flags);
230+
case 'is-json':
231+
return runIsJsonAssertion(responseText);
232+
case 'equals':
233+
return runEqualsAssertion(responseText, value as string);
234+
default:
235+
return {
236+
score: 0,
237+
assertions: [{ text: `Unknown assertion type: ${config.type}`, passed: false }],
238+
};
239+
}
240+
}
241+
242+
/**
243+
* Run built-in deterministic assertions for all tests in the export directory.
244+
* Reads configs from builtin_graders/<name>.json, evaluates in-process,
245+
* and writes results to code_grader_results/<name>.json.
246+
*/
247+
async function runBuiltinGraders(
248+
exportDir: string,
249+
testIds: string[],
250+
safeSuiteName: string,
251+
): Promise<{ total: number; passed: number }> {
252+
let total = 0;
253+
let passed = 0;
254+
255+
for (const testId of testIds) {
256+
const subpath = safeSuiteName ? [safeSuiteName, testId] : [testId];
257+
const testDir = join(exportDir, ...subpath);
258+
const builtinGradersDir = join(testDir, 'builtin_graders');
259+
260+
let graderFiles: string[];
261+
try {
262+
graderFiles = (await readdir(builtinGradersDir)).filter((f) => f.endsWith('.json'));
263+
} catch {
264+
continue; // No builtin graders for this test
265+
}
266+
267+
if (graderFiles.length === 0) continue;
268+
269+
const resultsDir = join(testDir, 'code_grader_results');
270+
await mkdir(resultsDir, { recursive: true });
271+
272+
let responseText: string;
273+
try {
274+
responseText = await readFile(join(testDir, 'response.md'), 'utf8');
275+
} catch {
276+
continue; // No response yet — skip
277+
}
278+
279+
for (const file of graderFiles) {
280+
const config = JSON.parse(await readFile(join(builtinGradersDir, file), 'utf8'));
281+
const raw = evaluateBuiltinAssertion(config, responseText);
282+
283+
// Apply negate if configured
284+
const negate = config.negate === true;
285+
const score = negate ? 1 - raw.score : raw.score;
286+
const assertions = negate
287+
? raw.assertions.map((a: { text: string; passed: boolean }) => ({
288+
text: a.text,
289+
passed: !a.passed,
290+
}))
291+
: raw.assertions;
292+
293+
const result = {
294+
name: config.name,
295+
type: config.type,
296+
score,
297+
weight: config.weight ?? 1.0,
298+
assertions,
299+
details: {},
300+
};
301+
302+
await writeFile(
303+
join(resultsDir, `${config.name}.json`),
304+
`${JSON.stringify(result, null, 2)}\n`,
305+
'utf8',
306+
);
307+
308+
total++;
309+
if (score >= 0.5) passed++;
310+
}
311+
}
312+
313+
return { total, passed };
314+
}
315+
178316
export const evalGradeCommand = command({
179317
name: 'grade',
180-
description: 'Run code-grader assertions on responses in an export directory',
318+
description: 'Run code-grader and built-in assertions on responses in an export directory',
181319
args: {
182320
exportDir: positional({
183321
type: string,
@@ -199,7 +337,7 @@ export const evalGradeCommand = command({
199337
const suiteName: string = manifest.suite ?? '';
200338
const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : '';
201339

202-
// Collect all grader tasks upfront so we know the total count
340+
// Collect all code-grader tasks upfront so we know the total count
203341
const tasks: GraderTask[] = [];
204342

205343
for (const testId of testIds) {
@@ -212,22 +350,31 @@ export const evalGradeCommand = command({
212350
try {
213351
graderFiles = (await readdir(codeGradersDir)).filter((f) => f.endsWith('.json'));
214352
} catch {
215-
continue; // No code graders for this test
353+
graderFiles = [];
216354
}
217355

218-
if (graderFiles.length === 0) continue;
219-
await mkdir(resultsDir, { recursive: true });
220-
221-
// Read response and input once per test (shared by all graders for this test)
222-
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
223-
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
356+
if (graderFiles.length > 0) {
357+
await mkdir(resultsDir, { recursive: true });
358+
const responseText = await readFile(join(testDir, 'response.md'), 'utf8');
359+
const inputData = JSON.parse(await readFile(join(testDir, 'input.json'), 'utf8'));
224360

225-
for (const graderFile of graderFiles) {
226-
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
361+
for (const graderFile of graderFiles) {
362+
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
363+
}
227364
}
228365
}
229366

230367
const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
231-
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
368+
369+
// Run built-in deterministic assertions (contains, regex, equals, etc.)
370+
const builtin = await runBuiltinGraders(exportDir, testIds, safeSuiteName);
371+
372+
const totalAll = totalGraders + builtin.total;
373+
const passedAll = totalPassed + builtin.passed;
374+
const parts: string[] = [];
375+
if (totalGraders > 0) parts.push(`${totalGraders} code-grader(s)`);
376+
if (builtin.total > 0) parts.push(`${builtin.total} built-in assertion(s)`);
377+
if (parts.length === 0) parts.push('0 grader(s)');
378+
console.log(`Graded ${parts.join(' + ')}: ${passedAll}/${totalAll} passed`);
232379
},
233380
});

apps/cli/src/commands/pipeline/input.ts

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,29 @@
1616
* ├── criteria.md
1717
* ├── expected_output.json (if present)
1818
* ├── llm_graders/<name>.json
19-
* └── code_graders/<name>.json
19+
* ├── code_graders/<name>.json
20+
* └── builtin_graders/<name>.json
2021
*/
2122
import { readFile } from 'node:fs/promises';
2223
import { mkdir, writeFile } from 'node:fs/promises';
2324
import { dirname, join, relative, resolve } from 'node:path';
2425

2526
import type { CodeEvaluatorConfig, EvaluatorConfig, LlmGraderEvaluatorConfig } from '@agentv/core';
27+
28+
/** Assertion types that can be graded deterministically without external scripts or LLMs. */
29+
const BUILTIN_ASSERTION_TYPES = new Set([
30+
'contains',
31+
'contains-any',
32+
'contains-all',
33+
'icontains',
34+
'icontains-any',
35+
'icontains-all',
36+
'starts-with',
37+
'ends-with',
38+
'regex',
39+
'is-json',
40+
'equals',
41+
]);
2642
import { deriveCategory, loadTestSuite } from '@agentv/core';
2743
import { command, option, optional, positional, string } from 'cmd-ts';
2844

@@ -190,9 +206,11 @@ async function writeGraderConfigs(
190206
): Promise<void> {
191207
const codeGradersDir = join(testDir, 'code_graders');
192208
const llmGradersDir = join(testDir, 'llm_graders');
209+
const builtinGradersDir = join(testDir, 'builtin_graders');
193210

194211
let hasCodeGraders = false;
195212
let hasLlmGraders = false;
213+
let hasBuiltinGraders = false;
196214

197215
for (const assertion of assertions) {
198216
if (assertion.type === 'code-grader') {
@@ -233,6 +251,20 @@ async function writeGraderConfigs(
233251
threshold: 0.5,
234252
config: {},
235253
});
254+
} else if (BUILTIN_ASSERTION_TYPES.has(assertion.type)) {
255+
if (!hasBuiltinGraders) {
256+
await mkdir(builtinGradersDir, { recursive: true });
257+
hasBuiltinGraders = true;
258+
}
259+
const config = assertion as EvaluatorConfig & { value?: unknown; flags?: string };
260+
await writeJson(join(builtinGradersDir, `${config.name}.json`), {
261+
name: config.name,
262+
type: config.type,
263+
value: config.value,
264+
flags: (config as { flags?: string }).flags,
265+
weight: config.weight ?? 1.0,
266+
negate: config.negate ?? false,
267+
});
236268
}
237269
}
238270
}
Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
name: builtin-test
2+
tests:
3+
- id: test-01
4+
input: hello world
5+
criteria: Response echoes the input
6+
assertions:
7+
- name: has_hello
8+
type: contains
9+
value: hello
10+
- name: matches_pattern
11+
type: regex
12+
value: "h[aeiou]llo"
13+
- name: is_valid_json
14+
type: is-json

0 commit comments

Comments
 (0)