diff --git a/.env.example b/.env.example
index 1b2b097..8f9c917 100644
--- a/.env.example
+++ b/.env.example
@@ -217,6 +217,14 @@ WORKSPACE_INDEX_ENABLED=true
 #   - client/passthrough: Return tool calls to CLI for local execution
 TOOL_EXECUTION_MODE=server
 
+# Suggestion mode model override
+# Controls which model handles suggestion mode (predicting next user input).
+# Values:
+#   default      - Use the same model as MODEL_PROVIDER (no change)
+#   none         - Skip suggestion mode LLM calls entirely (saves GPU time)
+#   <model name> - Use a specific model (e.g. "llama3.1" for a lighter model)
+SUGGESTION_MODE_MODEL=default
+
 # Enable/disable automatic tool injection for local models
 INJECT_TOOLS_LLAMACPP=true
 INJECT_TOOLS_OLLAMA=true
diff --git a/src/clients/databricks.js b/src/clients/databricks.js
index 2e1f788..9b536cd 100644
--- a/src/clients/databricks.js
+++ b/src/clients/databricks.js
@@ -309,7 +309,7 @@ async function invokeOllama(body) {
   }
 
   const ollamaBody = {
-    model: config.ollama.model,
+    model: body._suggestionModeModel || config.ollama.model,
     messages: deduplicated,
     stream: false, // Force non-streaming for Ollama - streaming format conversion not yet implemented
     options: {
@@ -410,7 +410,7 @@ async function invokeOpenRouter(body) {
   }
 
   const openRouterBody = {
-    model: config.openrouter.model,
+    model: body._suggestionModeModel || config.openrouter.model,
     messages,
     temperature: body.temperature ?? 0.7,
     max_tokens: body.max_tokens ?? 4096,
@@ -496,7 +496,7 @@ async function invokeAzureOpenAI(body) {
     max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
     top_p: body.top_p ?? 1.0,
     stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
-    model: config.azureOpenAI.deployment
+    model: body._suggestionModeModel || config.azureOpenAI.deployment
   };
 
   // Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -842,7 +842,7 @@ async function invokeOpenAI(body) {
   }
 
   const openAIBody = {
-    model: config.openai.model || "gpt-4o",
+    model: body._suggestionModeModel || config.openai.model || "gpt-4o",
     messages,
     temperature: body.temperature ?? 0.7,
     max_tokens: body.max_tokens ?? 4096,
diff --git a/src/config/index.js b/src/config/index.js
index 6c1a698..466585d 100644
--- a/src/config/index.js
+++ b/src/config/index.js
@@ -136,6 +136,10 @@ const zaiModel = process.env.ZAI_MODEL?.trim() || "GLM-4.7";
 const vertexApiKey = process.env.VERTEX_API_KEY?.trim() || process.env.GOOGLE_API_KEY?.trim() || null;
 const vertexModel = process.env.VERTEX_MODEL?.trim() || "gemini-2.0-flash";
 
+// Suggestion mode model override
+// Values: "default" (use MODEL_DEFAULT), "none" (skip LLM call), or a model name
+const suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();
+
 // Hot reload configuration
 const hotReloadEnabled = process.env.HOT_RELOAD_ENABLED !== "false"; // default true
 const hotReloadDebounceMs = Number.parseInt(process.env.HOT_RELOAD_DEBOUNCE_MS ?? "1000", 10);
@@ -596,6 +600,7 @@ var config = {
   modelProvider: {
     type: modelProvider,
     defaultModel,
+    suggestionModeModel,
     // Hybrid routing settings
     preferOllama,
     fallbackEnabled,
@@ -885,6 +890,7 @@ function reloadConfig() {
   config.modelProvider.preferOllama = process.env.PREFER_OLLAMA === "true";
   config.modelProvider.fallbackEnabled = process.env.FALLBACK_ENABLED !== "false";
   config.modelProvider.fallbackProvider = (process.env.FALLBACK_PROVIDER ?? "databricks").toLowerCase();
+  config.modelProvider.suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();
 
   // Log level
   config.logger.level = process.env.LOG_LEVEL ?? "info";
diff --git a/src/orchestrator/index.js b/src/orchestrator/index.js
index d553b69..bd96483 100644
--- a/src/orchestrator/index.js
+++ b/src/orchestrator/index.js
@@ -1234,6 +1234,15 @@ function sanitizePayload(payload) {
     toolCount: clean.tools?.length ?? 0
   }, '[CONTEXT_FLOW] After sanitizePayload');
 
+  // === Suggestion mode: tag request and override model if configured ===
+  const { isSuggestionMode: isSuggestion } = detectSuggestionMode(clean.messages);
+  clean._requestMode = isSuggestion ? "suggestion" : "main";
+  const smConfig = config.modelProvider?.suggestionModeModel ?? "default";
+  if (isSuggestion && smConfig.toLowerCase() !== "default" && smConfig.toLowerCase() !== "none") {
+    clean.model = smConfig;
+    clean._suggestionModeModel = smConfig;
+  }
+
   return clean;
 }
 
@@ -1876,11 +1885,26 @@ IMPORTANT TOOL USAGE RULES:
     toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
   }
 
+  // Guard: drop hallucinated tool calls when no tools were sent to the model.
+  // Some models (e.g. Llama 3.1) hallucinate tool_call blocks from conversation
+  // history even when the request contained zero tool definitions.
+  const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
+  if (toolCalls.length > 0 && !toolsWereSent) {
+    logger.warn({
+      sessionId: session?.id ?? null,
+      step: steps,
+      hallucinated: toolCalls.map(tc => tc.function?.name || tc.name),
+      noToolInjection: !!cleanPayload._noToolInjection,
+    }, "Dropped hallucinated tool calls (no tools were sent to model)");
+    toolCalls = [];
+    // If there's also no text content, treat as empty response (handled below)
+  }
+
   if (toolCalls.length > 0) {
     // Convert OpenAI/OpenRouter format to Anthropic format for session storage
     let sessionContent;
     if (providerType === "azure-anthropic") {
       // Azure Anthropic already returns content in Anthropic format
       sessionContent = databricksResponse.json?.content ?? [];
     } else {
       // Convert OpenAI/OpenRouter format to Anthropic content blocks
@@ -3217,6 +3241,34 @@ IMPORTANT TOOL USAGE RULES:
   };
 }
 
+/**
+ * Detect if the current request is a suggestion mode call.
+ * Scans the last user message for the [SUGGESTION MODE: marker.
+ * @param {Array} messages - The conversation messages
+ * @returns {{ isSuggestionMode: boolean }}
+ */
+function detectSuggestionMode(messages) {
+  if (!Array.isArray(messages) || messages.length === 0) {
+    return { isSuggestionMode: false };
+  }
+  // Scan from the end to find the last user message
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg?.role !== 'user') continue;
+    const content = typeof msg.content === 'string'
+      ? msg.content
+      : Array.isArray(msg.content)
+        ? msg.content.map(b => b.text || '').join(' ')
+        : '';
+    if (content.includes('[SUGGESTION MODE:')) {
+      return { isSuggestionMode: true };
+    }
+    // Only check the last user message
+    break;
+  }
+  return { isSuggestionMode: false };
+}
+
 async function processMessage({ payload, headers, session, cwd, options = {} }) {
   const requestedModel = payload?.model ??
@@ -3226,6 +3278,32 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
     typeof headers?.["anthropic-beta"] === "string" &&
     headers["anthropic-beta"].includes("interleaved-thinking");
 
+  // === SUGGESTION MODE: Early return when SUGGESTION_MODE_MODEL=none ===
+  const { isSuggestionMode } = detectSuggestionMode(payload?.messages);
+  const suggestionModelConfig = config.modelProvider?.suggestionModeModel ?? "default";
+  if (isSuggestionMode && suggestionModelConfig.toLowerCase() === "none") {
+    logger.info('Suggestion mode: skipping LLM call (SUGGESTION_MODE_MODEL=none)');
+    return {
+      response: {
+        json: {
+          id: `msg_suggestion_skip_${Date.now()}`,
+          type: "message",
+          role: "assistant",
+          content: [{ type: "text", text: "" }],
+          model: requestedModel,
+          stop_reason: "end_turn",
+          stop_sequence: null,
+          usage: { input_tokens: 0, output_tokens: 0 },
+        },
+        ok: true,
+        status: 200,
+      },
+      steps: 0,
+      durationMs: 0,
+      terminationReason: "suggestion_mode_skip",
+    };
+  }
+
   // === TOOL LOOP GUARD (EARLY CHECK) ===
   // Check BEFORE sanitization since sanitizePayload removes conversation history
   const toolLoopThreshold = config.policy?.toolLoopThreshold ?? 3;
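
Usage note (not part of the patch): the sketch below is a minimal, standalone illustration of how a SUGGESTION_MODE_MODEL value is intended to affect a request, mirroring the marker scan in detectSuggestionMode() and the override/skip logic added to sanitizePayload() and processMessage() above. The resolveSuggestionModel() helper and the sample marker text are hypothetical and exist only for illustration.

// Illustrative only: expected per-request behaviour of SUGGESTION_MODE_MODEL,
// based on the logic introduced by this diff.

// Same scan as detectSuggestionMode(): only the last user message is checked.
function isSuggestionRequest(messages) {
  if (!Array.isArray(messages)) return false;
  for (let i = messages.length - 1; i >= 0; i--) {
    const msg = messages[i];
    if (msg?.role !== "user") continue;
    const text = typeof msg.content === "string"
      ? msg.content
      : Array.isArray(msg.content) ? msg.content.map(b => b.text || "").join(" ") : "";
    return text.includes("[SUGGESTION MODE:");
  }
  return false;
}

// Hypothetical helper (not in the patch): what each config value means for a request.
function resolveSuggestionModel(suggestionModeModel, messages) {
  if (!isSuggestionRequest(messages)) return { mode: "main" };
  const value = (suggestionModeModel ?? "default").trim().toLowerCase();
  if (value === "none") return { mode: "suggestion", skip: true };   // stub response, no LLM call
  if (value === "default") return { mode: "suggestion" };            // keep the provider's model
  return { mode: "suggestion", model: suggestionModeModel };         // override, e.g. "llama3.1"
}

// Example: a suggestion-mode request routed to a lighter model.
// (The marker text after "[SUGGESTION MODE:" is illustrative.)
const sample = [{ role: "user", content: "[SUGGESTION MODE: predict the next user input]" }];
console.log(resolveSuggestionModel("llama3.1", sample));
// -> { mode: "suggestion", model: "llama3.1" }

Note that the patch writes the raw, non-lowercased value into clean.model (only the comparisons are lowercased), so mixed-case model names are preserved.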
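
Similarly, a minimal sketch (again not part of the patch, with a hypothetical dropHallucinatedToolCalls() name) of the hallucinated tool-call guard added to src/orchestrator/index.js: tool calls returned by the model are only honoured when the outgoing request actually contained tool definitions.

// Illustrative only: keep tool calls only when tools were sent with the request.
function dropHallucinatedToolCalls(toolCalls, requestTools, warn = console.warn) {
  const toolsWereSent = Array.isArray(requestTools) && requestTools.length > 0;
  if (toolCalls.length > 0 && !toolsWereSent) {
    warn("Dropped hallucinated tool calls:", toolCalls.map(tc => tc.function?.name || tc.name));
    return [];
  }
  return toolCalls;
}

// Example: the model invents a tool call even though the request carried no tools.
const invented = [{ function: { name: "read_file", arguments: "{}" } }];
console.log(dropHallucinatedToolCalls(invented, []));                      // -> []
console.log(dropHallucinatedToolCalls(invented, [{ name: "read_file" }])); // -> kept as-is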