@@ -212,6 +212,8 @@ export class CodeGrader implements Grader {
212212
213213 try {
214214 let stdout : string ;
215+ let exitCode = 0 ;
216+ let execStderr = '' ;
215217 if ( context . dockerConfig ) {
216218 // Docker execution mode: run grader inside a container
217219 const { DockerWorkspaceProvider } = await import ( '../workspace/docker-workspace.js' ) ;
@@ -221,40 +223,68 @@ export class CodeGrader implements Grader {
221223 stdin : inputPayload ,
222224 repoCheckouts : getRepoCheckoutTargets ( context . evalCase . workspace ?. repos ) ,
223225 } ) ;
224- if ( result . exitCode !== 0 ) {
225- const trimmedErr = result . stderr . trim ( ) ;
226- throw new Error (
227- trimmedErr . length > 0
228- ? `Code evaluator exited with code ${ result . exitCode } : ${ trimmedErr } `
229- : `Code evaluator exited with code ${ result . exitCode } ` ,
230- ) ;
231- }
226+ exitCode = result . exitCode ;
232227 stdout = result . stdout . trim ( ) ;
228+ execStderr = result . stderr ;
233229 } else {
234- stdout = await executeScript (
230+ const result = await runScriptRaw (
235231 this . command ,
236232 inputPayload ,
237233 this . agentTimeoutMs ,
238234 this . cwd ,
239235 env ,
240236 ) ;
237+ exitCode = result . exitCode ;
238+ stdout = result . stdout . trim ( ) ;
239+ execStderr = result . stderr ;
240+ }
241+ // Non-zero exit with JSON stdout, or with stderr output, is treated as an error
242+ // (script signaled failure through the protocol or wrote an error message).
243+ // Non-zero exit with plain stdout and no stderr uses the exit-code convention —
244+ // score 0 (fail), stdout becomes the assertion text.
245+ const looksLikeJson = stdout . startsWith ( '{' ) || stdout . startsWith ( '[' ) ;
246+ const hasStderr = execStderr . trim ( ) . length > 0 ;
247+ if ( exitCode !== 0 && ( looksLikeJson || hasStderr ) ) {
248+ const trimmedErr = formatStderr ( execStderr ) ;
249+ throw new Error (
250+ trimmedErr . length > 0
251+ ? `Code evaluator exited with code ${ exitCode } : ${ trimmedErr } `
252+ : `Code evaluator exited with code ${ exitCode } ` ,
253+ ) ;
241254 }
242- const parsed = parseJsonSafe ( stdout ) ;
243- const score = clampScore ( typeof parsed ?. score === 'number' ? parsed . score : 0 ) ;
244- const assertions : AssertionEntry [ ] = Array . isArray ( parsed ?. assertions )
245- ? parsed . assertions
246- . filter (
247- ( a : unknown ) : a is { text : string ; passed : boolean ; evidence ?: string } =>
248- typeof a === 'object' &&
249- a !== null &&
250- typeof ( a as Record < string , unknown > ) . text === 'string' ,
251- )
252- . map ( ( a ) => ( {
253- text : String ( a . text ) ,
254- passed : Boolean ( a . passed ) ,
255- ...( typeof a . evidence === 'string' ? { evidence : a . evidence } : { } ) ,
256- } ) )
257- : [ ] ;
255+ const rawParsed = parseJsonSafe ( stdout ) ;
256+ // Only treat stdout as the JSON protocol if it parsed as a plain object.
257+ // Bare JSON scalars (numbers, booleans, strings) fall through to the plain-text path.
258+ const parsed =
259+ rawParsed != null && typeof rawParsed === 'object' && ! Array . isArray ( rawParsed )
260+ ? rawParsed
261+ : undefined ;
262+ // Plain-text fallback: exit code is pass/fail, stdout is the assertion text.
263+ // For numeric scores or multi-aspect results, use the JSON protocol instead.
264+ const passed = exitCode === 0 ;
265+ const score =
266+ parsed != null
267+ ? clampScore ( typeof parsed . score === 'number' ? parsed . score : 0 )
268+ : passed
269+ ? 1
270+ : 0 ;
271+ const assertions : AssertionEntry [ ] =
272+ parsed != null && Array . isArray ( parsed ?. assertions )
273+ ? parsed . assertions
274+ . filter (
275+ ( a : unknown ) : a is { text : string ; passed : boolean ; evidence ?: string } =>
276+ typeof a === 'object' &&
277+ a !== null &&
278+ typeof ( a as Record < string , unknown > ) . text === 'string' ,
279+ )
280+ . map ( ( a ) => ( {
281+ text : String ( a . text ) ,
282+ passed : Boolean ( a . passed ) ,
283+ ...( typeof a . evidence === 'string' ? { evidence : a . evidence } : { } ) ,
284+ } ) )
285+ : parsed == null
286+ ? [ { text : stdout . trim ( ) || ( passed ? 'exit 0' : `exit ${ exitCode } ` ) , passed } ]
287+ : [ ] ;
258288 // Capture optional structured details from code judge output
259289 const details =
260290 parsed ?. details && typeof parsed . details === 'object' && ! Array . isArray ( parsed . details )
@@ -325,17 +355,33 @@ export class CodeGrader implements Grader {
325355 }
326356}
327357
/**
 * Executes a grader script and resolves with its raw process result.
 *
 * A string `scriptPath` is dispatched to `execShellWithStdin`; an argv-style
 * array is dispatched to `execFileWithStdin`. The exit code is passed back
 * untouched — this function never inspects it, so deciding what a non-zero
 * exit means is left to the caller.
 *
 * @param scriptPath - Command string, or the command as an array of arguments.
 * @param input - Payload forwarded to the helper as the script's stdin input.
 * @param agentTimeoutMs - Optional timeout, forwarded as `timeoutMs`.
 * @param cwd - Optional working directory forwarded to the helper.
 * @param env - Optional environment variables forwarded to the helper.
 * @returns The `stdout`, `stderr` and `exitCode` reported by the helper.
 */
async function runScriptRaw(
  scriptPath: readonly string[] | string,
  input: string,
  agentTimeoutMs?: number,
  cwd?: string,
  env?: Record<string, string>,
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  // Both helpers take the same option bag, so build it once.
  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };

  if (typeof scriptPath === 'string') {
    return execShellWithStdin(scriptPath, input, execOptions);
  }
  return execFileWithStdin(scriptPath, input, execOptions);
}
370+
328371export async function executeScript (
329372 scriptPath : readonly string [ ] | string ,
330373 input : string ,
331374 agentTimeoutMs ?: number ,
332375 cwd ?: string ,
333376 env ?: Record < string , string > ,
334377) : Promise < string > {
335- const { stdout, stderr, exitCode } =
336- typeof scriptPath === 'string'
337- ? await execShellWithStdin ( scriptPath , input , { cwd, timeoutMs : agentTimeoutMs , env } )
338- : await execFileWithStdin ( scriptPath , input , { cwd, timeoutMs : agentTimeoutMs , env } ) ;
378+ const { stdout, stderr, exitCode } = await runScriptRaw (
379+ scriptPath ,
380+ input ,
381+ agentTimeoutMs ,
382+ cwd ,
383+ env ,
384+ ) ;
339385
340386 if ( exitCode !== 0 ) {
341387 const trimmedErr = formatStderr ( stderr ) ;
0 commit comments