@@ -216,19 +216,20 @@ async function llmCall(messages, opts = {}) {
216216 // - OpenAI cloud (GPT-5.4+): requires 'max_completion_tokens', rejects 'max_tokens'
217217 // - Local llama-server: requires 'max_tokens', may not understand 'max_completion_tokens'
218218 const isCloudApi = ! opts . vlm && ( LLM_API_TYPE === 'openai' || LLM_BASE_URL . includes ( 'openai.com' ) || LLM_BASE_URL . includes ( 'api.anthropic' ) ) ;
219- const maxTokensParam = opts . maxTokens
220- ? ( isCloudApi ? { max_completion_tokens : opts . maxTokens } : { max_tokens : opts . maxTokens } )
221- : { } ;
219+
220+ // No max_tokens for any API — the streaming loop's 2000-token hard cap is the safety net.
221+ // Sending max_tokens to thinking models (Qwen3.5) starves actual output since
222+ // reasoning_content counts against the limit.
222223
223224 // Build request params
224225 const params = {
225226 messages,
226227 stream : true ,
227- // Request token usage in streaming response (supported by OpenAI, some local servers)
228- stream_options : { include_usage : true } ,
228+ // Request token usage in streaming response (only supported by cloud APIs;
229+ // llama-server crashes with "Failed to parse input" when stream_options is present)
230+ ...( isCloudApi && { stream_options : { include_usage : true } } ) ,
229231 ...( model && { model } ) ,
230232 ...( opts . temperature !== undefined && { temperature : opts . temperature } ) ,
231- ...maxTokensParam ,
232233 ...( opts . expectJSON && opts . temperature === undefined && { temperature : 0.7 } ) ,
233234 ...( opts . expectJSON && { top_p : 0.8 } ) ,
234235 ...( opts . tools && { tools : opts . tools } ) ,
@@ -306,10 +307,10 @@ async function llmCall(messages, opts = {}) {
306307 }
307308
308309 // Smart early abort for JSON-expected tests:
309- // If the model is producing reasoning_content (thinking) for a JSON test,
310- // abort after 100 reasoning tokens — it should output JSON directly .
311- if ( opts . expectJSON && ! isContent && tokenCount > 100 ) {
312- log ( ` ⚠ Aborting: ${ tokenCount } reasoning tokens for JSON test — model is thinking instead of outputting JSON ` ) ;
310+ // Allow thinking models (Qwen3.5) up to 500 reasoning tokens before aborting.
311+ // They legitimately need to reason before outputting JSON.
312+ if ( opts . expectJSON && ! isContent && tokenCount > 500 ) {
313+ log ( ` ⚠ Aborting: ${ tokenCount } reasoning tokens for JSON test — model is thinking too long ` ) ;
313314 controller . abort ( ) ;
314315 break ;
315316 }
@@ -356,8 +357,19 @@ async function llmCall(messages, opts = {}) {
356357
357358 // If the model only produced reasoning_content (thinking) with no content,
358359 // use the reasoning output as the response content for evaluation purposes.
360+ // Try to extract JSON embedded in the reasoning. Note: this runs for every call that produced only reasoning; opts.expectJSON is not checked here.
359361 if ( ! content && reasoningContent ) {
360- content = reasoningContent ;
362+ // Try to find JSON embedded in the reasoning output
363+ try {
364+ const jsonMatch = reasoningContent . match ( / [ { \[ ] [ \s \S ] * [ } \] ] / ) ;
365+ if ( jsonMatch ) {
366+ content = jsonMatch [ 0 ] ;
367+ } else {
368+ content = reasoningContent ;
369+ }
370+ } catch {
371+ content = reasoningContent ;
372+ }
361373 }
362374
363375 // Build per-call token data:
@@ -431,14 +443,23 @@ function parseJSON(text) {
431443 }
432444 // Clean common local model artifacts before parsing:
433445 // - Replace literal "..." or "…" placeholders in arrays/values
434- // - Replace <indices> placeholder tags
446+ // - Replace <any placeholder text> tags (model echoes prompt templates)
435447 jsonStr = jsonStr
436448 . replace ( / , \s * \. { 3 , } \s * (? = [ \] } , ] ) / g, '' ) // trailing ..., before ] } or ,
437449 . replace ( / \. { 3 , } / g, '"..."' ) // standalone ... → string
438450 . replace ( / … / g, '"..."' ) // ellipsis char
439- . replace ( / < [ a - z _ ] + > / gi , '"placeholder"' ) // <indices> etc.
451+ . replace ( / < [ ^ > ] + > / g , '"placeholder"' ) // <any text> → "placeholder" (multi-word)
440452 . replace ( / , \s * ( [ } \] ] ) / g, '$1' ) ; // trailing commas
441- return JSON . parse ( jsonStr . trim ( ) ) ;
453+ try {
454+ return JSON . parse ( jsonStr . trim ( ) ) ;
455+ } catch ( firstErr ) {
456+ // Aggressive retry: strip all non-JSON artifacts
457+ const aggressive = jsonStr
458+ . replace ( / " p l a c e h o l d e r " ( \s * " p l a c e h o l d e r " ) * / g, '"placeholder"' ) // collapse repeated placeholders
459+ . replace ( / \b p l a c e h o l d e r \b / g, '""' ) // placeholder → empty string
460+ . replace ( / , \s * ( [ } \] ] ) / g, '$1' ) ; // re-clean trailing commas
461+ return JSON . parse ( aggressive . trim ( ) ) ;
462+ }
442463}
443464
444465function assert ( condition , msg ) {
@@ -520,11 +541,7 @@ ${userMessage}
5205413. Always keep the last 2 user messages (most recent context)
5215424. Keep system messages (they contain tool results)
522543
523- ## Response Format
524- Respond with ONLY a valid JSON object, no other text:
525- {"keep": [<actual index numbers from the list above>], "summary": "<summary of what was dropped>"}
526-
527- Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
544+ Respond with ONLY valid JSON: {"keep": [0, 18, 22], "summary": "Removed 4 duplicate questions"}
528545If nothing should be dropped, keep ALL indices and set summary to "".` ;
529546}
530547
@@ -2026,7 +2043,7 @@ async function main() {
20262043 log ( ` Base URL: ${ llmBaseUrl } ` ) ;
20272044 log ( ' Check that the LLM server is running.\n' ) ;
20282045 emit ( { event : 'error' , message : `Cannot reach LLM endpoint: ${ err . message } ` } ) ;
2029- process . exit ( 1 ) ;
2046+ process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
20302047 }
20312048
20322049 // Collect system info
@@ -2169,7 +2186,7 @@ if (isDirectRun) {
21692186 main ( ) . catch ( err => {
21702187 log ( `Fatal: ${ err . message } ` ) ;
21712188 emit ( { event : 'error' , message : err . message } ) ;
2172- process . exit ( 1 ) ;
2189+ process . exit ( IS_SKILL_MODE ? 0 : 1 ) ;
21732190 } ) ;
21742191}
21752192
0 commit comments