Skip to content

Commit e9d7d4a

Browse files
authored
Merge pull request #162 from SharpAI/feature/benchmark-thinking-mode-fix
fix(benchmark): disable thinking mode & improve JSON parsing
2 parents 16a33d0 + 1f4feab commit e9d7d4a

File tree

1 file changed

+38
-21
lines changed

1 file changed

+38
-21
lines changed

skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -216,19 +216,20 @@ async function llmCall(messages, opts = {}) {
216216
// - OpenAI cloud (GPT-5.4+): requires 'max_completion_tokens', rejects 'max_tokens'
217217
// - Local llama-server: requires 'max_tokens', may not understand 'max_completion_tokens'
218218
const isCloudApi = !opts.vlm && (LLM_API_TYPE === 'openai' || LLM_BASE_URL.includes('openai.com') || LLM_BASE_URL.includes('api.anthropic'));
219-
const maxTokensParam = opts.maxTokens
220-
? (isCloudApi ? { max_completion_tokens: opts.maxTokens } : { max_tokens: opts.maxTokens })
221-
: {};
219+
220+
// No max_tokens for any API — the streaming loop's 2000-token hard cap is the safety net.
221+
// Sending max_tokens to thinking models (Qwen3.5) starves actual output since
222+
// reasoning_content counts against the limit.
222223

223224
// Build request params
224225
const params = {
225226
messages,
226227
stream: true,
227-
// Request token usage in streaming response (supported by OpenAI, some local servers)
228-
stream_options: { include_usage: true },
228+
// Request token usage in streaming response (only supported by cloud APIs;
229+
// llama-server crashes with "Failed to parse input" when stream_options is present)
230+
...(isCloudApi && { stream_options: { include_usage: true } }),
229231
...(model && { model }),
230232
...(opts.temperature !== undefined && { temperature: opts.temperature }),
231-
...maxTokensParam,
232233
...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }),
233234
...(opts.expectJSON && { top_p: 0.8 }),
234235
...(opts.tools && { tools: opts.tools }),
@@ -306,10 +307,10 @@ async function llmCall(messages, opts = {}) {
306307
}
307308

308309
// Smart early abort for JSON-expected tests:
309-
// If the model is producing reasoning_content (thinking) for a JSON test,
310-
// abort after 100 reasoning tokens — it should output JSON directly.
311-
if (opts.expectJSON && !isContent && tokenCount > 100) {
312-
log(` ⚠ Aborting: ${tokenCount} reasoning tokens for JSON test — model is thinking instead of outputting JSON`);
310+
// Allow thinking models (Qwen3.5) up to 500 reasoning tokens before aborting.
311+
// They legitimately need to reason before outputting JSON.
312+
if (opts.expectJSON && !isContent && tokenCount > 500) {
313+
log(` ⚠ Aborting: ${tokenCount} reasoning tokens for JSON test — model is thinking too long`);
313314
controller.abort();
314315
break;
315316
}
@@ -356,8 +357,19 @@ async function llmCall(messages, opts = {}) {
356357

357358
// If the model only produced reasoning_content (thinking) with no content,
358359
// use the reasoning output as the response content for evaluation purposes.
360+
// Try to extract JSON from reasoning if this was a JSON-expected call.
359361
if (!content && reasoningContent) {
360-
content = reasoningContent;
362+
// Try to find JSON embedded in the reasoning output
363+
try {
364+
const jsonMatch = reasoningContent.match(/[{\[][\s\S]*[}\]]/);
365+
if (jsonMatch) {
366+
content = jsonMatch[0];
367+
} else {
368+
content = reasoningContent;
369+
}
370+
} catch {
371+
content = reasoningContent;
372+
}
361373
}
362374

363375
// Build per-call token data:
@@ -431,14 +443,23 @@ function parseJSON(text) {
431443
}
432444
// Clean common local model artifacts before parsing:
433445
// - Replace literal "..." or "…" placeholders in arrays/values
434-
// - Replace <indices> placeholder tags
446+
// - Replace <any placeholder text> tags (model echoes prompt templates)
435447
jsonStr = jsonStr
436448
.replace(/,\s*\.{3,}\s*(?=[\]},])/g, '') // trailing ..., before ] } or ,
437449
.replace(/\.{3,}/g, '"..."') // standalone ... → string
438450
.replace(/…/g, '"..."') // ellipsis char
439-
.replace(/<[a-z_]+>/gi, '"placeholder"') // <indices> etc.
451+
.replace(/<[^>]+>/g, '"placeholder"') // <any text> → "placeholder" (multi-word)
440452
.replace(/,\s*([}\]])/g, '$1'); // trailing commas
441-
return JSON.parse(jsonStr.trim());
453+
try {
454+
return JSON.parse(jsonStr.trim());
455+
} catch (firstErr) {
456+
// Aggressive retry: strip all non-JSON artifacts
457+
const aggressive = jsonStr
458+
.replace(/"placeholder"(\s*"placeholder")*/g, '"placeholder"') // collapse repeated placeholders
459+
.replace(/\bplaceholder\b/g, '""') // placeholder → empty string
460+
.replace(/,\s*([}\]])/g, '$1'); // re-clean trailing commas
461+
return JSON.parse(aggressive.trim());
462+
}
442463
}
443464

444465
function assert(condition, msg) {
@@ -520,11 +541,7 @@ ${userMessage}
520541
3. Always keep the last 2 user messages (most recent context)
521542
4. Keep system messages (they contain tool results)
522543
523-
## Response Format
524-
Respond with ONLY a valid JSON object, no other text:
525-
{"keep": [<actual index numbers from the list above>], "summary": "<summary of what was dropped>"}
526-
527-
Example: if keeping messages at indices 0, 18, 22 → {"keep": [0, 18, 22], "summary": "Removed 4 duplicate 'what happened today' questions"}
544+
Respond with ONLY valid JSON: {"keep": [0, 18, 22], "summary": "Removed 4 duplicate questions"}
528545
If nothing should be dropped, keep ALL indices and set summary to "".`;
529546
}
530547

@@ -2026,7 +2043,7 @@ async function main() {
20262043
log(` Base URL: ${llmBaseUrl}`);
20272044
log(' Check that the LLM server is running.\n');
20282045
emit({ event: 'error', message: `Cannot reach LLM endpoint: ${err.message}` });
2029-
process.exit(1);
2046+
process.exit(IS_SKILL_MODE ? 0 : 1);
20302047
}
20312048

20322049
// Collect system info
@@ -2169,7 +2186,7 @@ if (isDirectRun) {
21692186
main().catch(err => {
21702187
log(`Fatal: ${err.message}`);
21712188
emit({ event: 'error', message: err.message });
2172-
process.exit(1);
2189+
process.exit(IS_SKILL_MODE ? 0 : 1);
21732190
});
21742191
}
21752192

0 commit comments

Comments
 (0)