feat: code-grader plain-text fallback + workspace env preflight (#1209)

christso · claude · web-flow · commit 5ae93a37089d · 2026-05-04T13:09:45.000+10:00
* feat: add shell grader and workspace env preflight checks (#1207, #1208) Adds two new eval features: **Shell grader** (`type: shell`): runs a shell command and checks its stdout. - No `expected`: passes when exit code is 0 - `expected` with no `operator`: exact string match (trimmed stdout) - `expected` + `operator` (>, <, >=, <=, ==, !=): numeric float comparison **Workspace env preflight** (`workspace.env`): declares required system dependencies that are checked once before before_all hooks run. Fails fast with a clear diagnostic listing all missing commands/modules. Example: ```yaml workspace: env: required_commands: [ffmpeg, pandoc] required_python_modules: [PIL, openai] assertions: - type: shell command: "pdfinfo report.pdf | grep Pages | awk '{print $2}'" operator: ">=" expected: "5" ``` Closes #1207, #1208 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: resolve lint errors in shell grader and targets-validator imports Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: replace shell grader with code-grader plain-text fallback (#1210) Per design review: the `shell` grader type violated the "audit existing primitives first" principle — `code-grader` already runs shell commands. Promptfoo solves this the same way (javascript/python fallbacks, no dedicated shell type). Remove the `shell` grader type entirely and instead extend `code-grader` to accept plain-text stdout without requiring the JSON protocol: | stdout (trimmed, case-insensitive) | score | |---|---| | empty string | 1 if exit 0, 0 if exit non-zero | | "true", "pass", "1" | 1 | | "false", "fail", "0" | 0 | | numeric string | clamped float | | anything else | 1 if exit 0, 0 if exit non-zero | Scripts that write to stderr on non-zero exit still surface as errors (existing behavior). Silent non-zero exits (e.g. `[ "$pages" -ge 5 ]`) use exit-code convention. 
Usage: # numeric comparison via exit code - type: code-grader command: ["bash", "-c", "[ $(pdfinfo report.pdf | grep Pages | awk '{print $2}') -ge 5 ]"] # score from stdout - type: code-grader command: ["bash", "-c", "echo 0.75"] Closes #1210 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * style: fix biome formatting in code-grader Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * refactor: simplify code-grader plain-text fallback to exit-code + assertion text Replace the string/numeric score interpretation with a clean two-convention model: - Exit code: 0 = score 1 (pass), non-zero = score 0 (fail) - Stdout: becomes the assertion text (human-readable context for the result) - Stderr on non-zero exit: still surfaces as an error For numeric scores or multi-aspect results, use the JSON protocol. This removes the "0"/"1"/numeric string ambiguity and aligns with how Unix tooling (bats, make, shell builtins) already signals pass/fail. Updates docs and tests to reflect the new model. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * style: fix biome formatting Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx
@@ -9,7 +9,7 @@ Code graders are scripts that evaluate agent responses deterministically. Write
 
 ## Contract
 
-Code graders communicate via stdin/stdout JSON:
+Code graders receive eval context via stdin JSON and return a result via stdout.
 
 **Input (stdin):**
 ```json
@@ -19,8 +19,12 @@ Code graders communicate via stdin/stdout JSON:
   "output": "The answer is 42.",
   "expected_output": "42"
 }
+```
+
+### JSON output (full protocol)
+
+Emit a JSON object for numeric scores or multi-aspect results:
 
-**Output (stdout):**
 ```json
 {
   "score": 1.0,
@@ -35,6 +39,43 @@ Code graders communicate via stdin/stdout JSON:
 | `score` | `number` | 0.0 to 1.0 |
 | `assertions` | `Array<{ text, passed, evidence? }>` | Per-aspect results with verdict and optional evidence |
 
+### Plain-text output (exit-code convention)
+
+For simple pass/fail checks, skip the JSON protocol entirely. The exit code determines the score and stdout becomes the assertion text:
+
+| Exit code | Score | Verdict |
+|-----------|-------|---------|
+| 0 | 1.0 | pass |
+| non-zero (no stderr) | 0.0 | fail |
+
+```bash
+#!/bin/bash
+# check-pages.sh — passes when PDF has at least 5 pages
+pages=$(pdfinfo report.pdf | grep Pages | awk '{print $2}')
+if [ "$pages" -ge 5 ]; then
+  echo "PDF has $pages pages (≥5 required)"
+else
+  echo "PDF has only $pages pages (<5 required)"
+  exit 1
+fi
+```
+
+```yaml
+assertions:
+  - type: code-grader
+    command: [bash, scripts/check-pages.sh]
+```
+
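+Silent one-liners work too — stdout is optional. When stdout is empty, the assertion text falls back to `exit 0` on success or `exit <code>` on failure: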
+
+```yaml
+assertions:
+  - type: code-grader
+    command: ["bash", "-c", "[ $(wc -l < output.txt) -ge 10 ]"]
+```
+
+Scripts that exit non-zero and either write to stderr or print stdout that starts with `{` or `[` (i.e. looks like JSON) surface as execution errors rather than quality failures.
+
 ## Python Example
 
 ```python
diff --git a/packages/core/src/evaluation/graders/code-grader.ts b/packages/core/src/evaluation/graders/code-grader.ts
@@ -212,6 +212,8 @@ export class CodeGrader implements Grader {
 
     try {
       let stdout: string;
+      let exitCode = 0;
+      let execStderr = '';
       if (context.dockerConfig) {
         // Docker execution mode: run grader inside a container
         const { DockerWorkspaceProvider } = await import('../workspace/docker-workspace.js');
@@ -221,40 +223,68 @@ export class CodeGrader implements Grader {
           stdin: inputPayload,
           repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos),
         });
-        if (result.exitCode !== 0) {
-          const trimmedErr = result.stderr.trim();
-          throw new Error(
-            trimmedErr.length > 0
-              ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}`
-              : `Code evaluator exited with code ${result.exitCode}`,
-          );
-        }
+        exitCode = result.exitCode;
         stdout = result.stdout.trim();
+        execStderr = result.stderr;
       } else {
-        stdout = await executeScript(
+        const result = await runScriptRaw(
           this.command,
           inputPayload,
           this.agentTimeoutMs,
           this.cwd,
           env,
         );
+        exitCode = result.exitCode;
+        stdout = result.stdout.trim();
+        execStderr = result.stderr;
+      }
+      // Non-zero exit with JSON stdout, or with stderr output, is treated as an error
+      // (script signaled failure through the protocol or wrote an error message).
+      // Non-zero exit with plain stdout and no stderr uses the exit-code convention —
+      // score 0 (fail), stdout becomes the assertion text.
+      const looksLikeJson = stdout.startsWith('{') || stdout.startsWith('[');
+      const hasStderr = execStderr.trim().length > 0;
+      if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
+        const trimmedErr = formatStderr(execStderr);
+        throw new Error(
+          trimmedErr.length > 0
+            ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}`
+            : `Code evaluator exited with code ${exitCode}`,
+        );
       }
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === 'number' ? parsed.score : 0);
-      const assertions: AssertionEntry[] = Array.isArray(parsed?.assertions)
-        ? parsed.assertions
-            .filter(
-              (a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
-                typeof a === 'object' &&
-                a !== null &&
-                typeof (a as Record<string, unknown>).text === 'string',
-            )
-            .map((a) => ({
-              text: String(a.text),
-              passed: Boolean(a.passed),
-              ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
-            }))
-        : [];
+      const rawParsed = parseJsonSafe(stdout);
+      // Only treat stdout as the JSON protocol if it parsed as a plain object.
+      // Bare JSON scalars (numbers, booleans, strings) fall through to the plain-text path.
+      const parsed =
+        rawParsed != null && typeof rawParsed === 'object' && !Array.isArray(rawParsed)
+          ? rawParsed
+          : undefined;
+      // Plain-text fallback: exit code is pass/fail, stdout is the assertion text.
+      // For numeric scores or multi-aspect results, use the JSON protocol instead.
+      const passed = exitCode === 0;
+      const score =
+        parsed != null
+          ? clampScore(typeof parsed.score === 'number' ? parsed.score : 0)
+          : passed
+            ? 1
+            : 0;
+      const assertions: AssertionEntry[] =
+        parsed != null && Array.isArray(parsed?.assertions)
+          ? parsed.assertions
+              .filter(
+                (a: unknown): a is { text: string; passed: boolean; evidence?: string } =>
+                  typeof a === 'object' &&
+                  a !== null &&
+                  typeof (a as Record<string, unknown>).text === 'string',
+              )
+              .map((a) => ({
+                text: String(a.text),
+                passed: Boolean(a.passed),
+                ...(typeof a.evidence === 'string' ? { evidence: a.evidence } : {}),
+              }))
+          : parsed == null
+            ? [{ text: stdout.trim() || (passed ? 'exit 0' : `exit ${exitCode}`), passed }]
+            : [];
       // Capture optional structured details from code judge output
       const details =
         parsed?.details && typeof parsed.details === 'object' && !Array.isArray(parsed.details)
@@ -325,17 +355,33 @@ export class CodeGrader implements Grader {
   }
 }
 
+/**
+ * Execute a grader command with `input` on stdin and return its raw
+ * stdout/stderr/exitCode. This function does not inspect the exit code itself;
+ * interpreting it is left to the caller.
+ *
+ * A string command is dispatched to execShellWithStdin; an argv array is
+ * dispatched to execFileWithStdin. Both receive the same cwd/timeout/env options.
+ */
+async function runScriptRaw(
+  scriptPath: readonly string[] | string,
+  input: string,
+  agentTimeoutMs?: number,
+  cwd?: string,
+  env?: Record<string, string>,
+): Promise<{ stdout: string; stderr: string; exitCode: number }> {
+  const options = { cwd, timeoutMs: agentTimeoutMs, env };
+  if (typeof scriptPath === 'string') {
+    return execShellWithStdin(scriptPath, input, options);
+  }
+  return execFileWithStdin(scriptPath, input, options);
+}
+
 export async function executeScript(
   scriptPath: readonly string[] | string,
   input: string,
   agentTimeoutMs?: number,
   cwd?: string,
   env?: Record<string, string>,
 ): Promise<string> {
-  const { stdout, stderr, exitCode } =
-    typeof scriptPath === 'string'
-      ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env })
-      : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+  const { stdout, stderr, exitCode } = await runScriptRaw(
+    scriptPath,
+    input,
+    agentTimeoutMs,
+    cwd,
+    env,
+  );
 
   if (exitCode !== 0) {
     const trimmedErr = formatStderr(stderr);
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
@@ -958,6 +958,20 @@ export async function runEvaluation(
       setupLog('Docker image pull complete');
     }
 
+    // Run preflight environment checks (fail fast before any hooks or test cases)
+    if (suiteWorkspace?.env) {
+      try {
+        await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? undefined, setupLog);
+        setupLog('preflight checks passed');
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        if (sharedWorkspacePath && !useStaticWorkspace) {
+          await cleanupWorkspace(sharedWorkspacePath).catch(() => {});
+        }
+        throw new Error(message);
+      }
+    }
+
     // Execute before_all (runs ONCE before first test per workspace)
     const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
     const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
@@ -3924,3 +3938,45 @@ function computeWeightedMean(
 
   return totalWeight > 0 ? weightedSum / totalWeight : 0;
 }
+
+/**
+ * Run preflight environment checks for workspace.env config.
+ *
+ * Probes every entry of `required_commands` and `required_python_modules`,
+ * collects all that are missing, and throws one error listing them.
+ * Called once before before_all hooks, so long evals abort immediately on missing deps.
+ *
+ * Names originate from the eval YAML. They are handed to the probe process as
+ * separate argv entries and are never interpolated into the `sh -c` / `python3 -c`
+ * script text, so a value such as `foo; rm -rf .` is looked up literally instead
+ * of being executed.
+ *
+ * @param env - Declared requirements (command names and Python module names).
+ * @param cwd - Passed through as the `cwd` option of each probe process.
+ * @param log - Receives one progress line per check.
+ * @throws Error listing every missing command/module when at least one probe fails.
+ */
+async function runPreflightChecks(
+  env: import('./types.js').WorkspaceEnvConfig,
+  cwd: string | undefined,
+  log: (msg: string) => void,
+): Promise<void> {
+  const execFileAsync = promisify(execFile);
+  const missing: string[] = [];
+
+  for (const cmd of env.required_commands ?? []) {
+    log(`preflight: checking command "${cmd}"`);
+    try {
+      if (process.platform === 'win32') {
+        await execFileAsync('where', [cmd], { cwd });
+      } else {
+        // The operands after the script become $0 ('sh') and $1 (cmd); quoting "$1"
+        // keeps the name a single word with no shell interpretation.
+        await execFileAsync('sh', ['-c', 'command -v "$1"', 'sh', cmd], { cwd });
+      }
+    } catch {
+      // Any probe failure (non-zero exit or spawn error) counts as "missing".
+      missing.push(`command: ${cmd}`);
+    }
+  }
+
+  for (const mod of env.required_python_modules ?? []) {
+    log(`preflight: checking Python module "${mod}"`);
+    try {
+      // With `python3 -c <code> <arg>`, sys.argv[1] is <arg>. import_module accepts
+      // dotted names, so the module name is data rather than Python source.
+      await execFileAsync(
+        'python3',
+        ['-c', 'import importlib, sys; importlib.import_module(sys.argv[1])', mod],
+        { cwd },
+      );
+    } catch {
+      missing.push(`python module: ${mod}`);
+    }
+  }
+
+  if (missing.length > 0) {
+    throw new Error(
+      `Preflight checks failed — missing dependencies:\n${missing.map((m) => `  • ${m}`).join('\n')}\n\nInstall the missing dependencies before running this eval.`,
+    );
+  }
+}
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
@@ -339,6 +339,25 @@ export type DockerWorkspaceConfig = {
   readonly cpus?: number;
 };
 
+/**
+ * System dependencies a workspace declares up front.
+ * They are verified once, before before_all hooks run; the eval aborts with a
+ * list of everything missing if any check fails.
+ *
+ * @example
+ * ```yaml
+ * workspace:
+ *   env:
+ *     required_commands: [ffmpeg, pandoc]
+ *     required_python_modules: [PIL, openai]
+ * ```
+ */
+export type WorkspaceEnvConfig = Readonly<{
+  /** Commands that must resolve on PATH (probed with `command -v`, or `where` on Windows) */
+  required_commands?: readonly string[];
+  /** Python modules that must be importable (probed by importing them with `python3`) */
+  required_python_modules?: readonly string[];
+}>;
+
 export type WorkspaceConfig = {
   /** Template directory or .code-workspace file. Directories are copied to temp workspace.
    *  .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -359,6 +378,8 @@ export type WorkspaceConfig = {
    *  Used as default cwd for hook commands so that file-referenced templates resolve
    *  relative paths from their own directory, not the eval file's directory. */
   readonly workspaceFileDir?: string;
+  /** Preflight environment requirements. Checked before before_all hooks run. */
+  readonly env?: WorkspaceEnvConfig;
 };
 
 export type CodeGraderConfig = {
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
@@ -54,6 +54,7 @@ import type {
   TrialsConfig,
   TurnFailurePolicy,
   WorkspaceConfig,
+  WorkspaceEnvConfig,
   WorkspaceHookConfig,
   WorkspaceHooksConfig,
   WorkspaceScriptConfig,
@@ -853,8 +854,9 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
   const mode = explicitMode ?? (workspacePath ? 'static' : undefined);
 
   const docker = parseDockerWorkspaceConfig(obj.docker);
+  const env = parseWorkspaceEnvConfig(obj.env);
 
-  if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
+  if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
     return undefined;
 
   return {
@@ -865,6 +867,26 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
     ...(mode !== undefined && { mode }),
     ...(workspacePath !== undefined && { path: workspacePath }),
     ...(docker !== undefined && { docker }),
+    ...(env !== undefined && { env }),
+  };
+}
+
+/**
+ * Parse the raw `workspace.env` YAML value into a WorkspaceEnvConfig.
+ *
+ * Non-string list entries are dropped. Returns undefined when `raw` is not an
+ * object or when neither list has any string entries left; otherwise only the
+ * non-empty lists appear as keys on the result.
+ */
+function parseWorkspaceEnvConfig(raw: unknown): WorkspaceEnvConfig | undefined {
+  if (!isJsonObject(raw)) return undefined;
+  const source = raw as Record<string, unknown>;
+
+  // A non-array value is treated the same as an absent (empty) list.
+  const stringsOf = (value: unknown): string[] =>
+    Array.isArray(value) ? value.filter((item): item is string => typeof item === 'string') : [];
+
+  const commands = stringsOf(source.required_commands);
+  const pythonModules = stringsOf(source.required_python_modules);
+  if (commands.length === 0 && pythonModules.length === 0) return undefined;
+
+  return {
+    ...(commands.length > 0 ? { required_commands: commands } : {}),
+    ...(pythonModules.length > 0 ? { required_python_modules: pythonModules } : {}),
   };
 }
 
diff --git a/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts b/packages/core/test/evaluation/graders/code-grader-plain-text.test.ts