8 changes: 8 additions & 0 deletions .env.example
@@ -217,6 +217,14 @@ WORKSPACE_INDEX_ENABLED=true
# - client/passthrough: Return tool calls to CLI for local execution
TOOL_EXECUTION_MODE=server

# Suggestion mode model override
# Controls which model handles suggestion mode (predicting next user input).
# Values:
# default - No override; use the model configured for the active MODEL_PROVIDER
# none - Skip suggestion mode LLM calls entirely (saves GPU time)
# <model> - Use a specific model (e.g. "llama3.1" for a lighter model)
SUGGESTION_MODE_MODEL=default

# Enable/disable automatic tool injection for local models
INJECT_TOOLS_LLAMACPP=true
INJECT_TOOLS_OLLAMA=true
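
The three values above map to three behaviors; a minimal sketch of the resolution logic (the helper name is illustrative, the PR implements this inline across src/config/index.js and src/orchestrator/index.js):

// Illustrative helper, not part of the PR; mirrors the documented semantics.
function resolveSuggestionModel(setting, providerModel) {
  const value = (setting ?? "default").trim();
  const normalized = value.toLowerCase();
  if (normalized === "none") return null;             // skip the suggestion LLM call
  if (normalized === "default") return providerModel; // no override
  return value;                                       // explicit model name wins
}

// resolveSuggestionModel("llama3.1", "gpt-4o") === "llama3.1"
// resolveSuggestionModel("default", "gpt-4o")  === "gpt-4o"
// resolveSuggestionModel("none", "gpt-4o")     === null
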
8 changes: 4 additions & 4 deletions src/clients/databricks.js
@@ -309,7 +309,7 @@ async function invokeOllama(body) {
}

const ollamaBody = {
- model: config.ollama.model,
+ model: body._suggestionModeModel || config.ollama.model,
messages: deduplicated,
stream: false, // Force non-streaming for Ollama - streaming format conversion not yet implemented
options: {
@@ -410,7 +410,7 @@ async function invokeOpenRouter(body) {
}

const openRouterBody = {
- model: config.openrouter.model,
+ model: body._suggestionModeModel || config.openrouter.model,
messages,
temperature: body.temperature ?? 0.7,
max_tokens: body.max_tokens ?? 4096,
@@ -496,7 +496,7 @@ async function invokeAzureOpenAI(body) {
max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
top_p: body.top_p ?? 1.0,
stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
- model: config.azureOpenAI.deployment
+ model: body._suggestionModeModel || config.azureOpenAI.deployment
};

// Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -842,7 +842,7 @@ async function invokeOpenAI(body) {
}

const openAIBody = {
- model: config.openai.model || "gpt-4o",
+ model: body._suggestionModeModel || config.openai.model || "gpt-4o",
messages,
temperature: body.temperature ?? 0.7,
max_tokens: body.max_tokens ?? 4096,
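
All four provider bodies apply the same precedence: a _suggestionModeModel set by the orchestrator wins over the provider's configured model. A hypothetical helper that captures the repeated expression (the PR keeps it inline in each invoke* function):

// Sketch only; extracts the repeated `body._suggestionModeModel || ...` pattern.
// `hardDefault` covers invokeOpenAI, which additionally falls back to "gpt-4o".
function pickModel(body, configuredModel, hardDefault) {
  return body._suggestionModeModel || configuredModel || hardDefault;
}

// pickModel({ _suggestionModeModel: "llama3.1" }, "gpt-4o") === "llama3.1"
// pickModel({}, undefined, "gpt-4o")                        === "gpt-4o"
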
6 changes: 6 additions & 0 deletions src/config/index.js
@@ -136,6 +136,10 @@ const zaiModel = process.env.ZAI_MODEL?.trim() || "GLM-4.7";
const vertexApiKey = process.env.VERTEX_API_KEY?.trim() || process.env.GOOGLE_API_KEY?.trim() || null;
const vertexModel = process.env.VERTEX_MODEL?.trim() || "gemini-2.0-flash";

// Suggestion mode model override
// Values: "default" (no override), "none" (skip the LLM call), or an explicit model name
const suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();

// Hot reload configuration
const hotReloadEnabled = process.env.HOT_RELOAD_ENABLED !== "false"; // default true
const hotReloadDebounceMs = Number.parseInt(process.env.HOT_RELOAD_DEBOUNCE_MS ?? "1000", 10);
@@ -596,6 +600,7 @@ var config = {
modelProvider: {
type: modelProvider,
defaultModel,
suggestionModeModel,
// Hybrid routing settings
preferOllama,
fallbackEnabled,
@@ -885,6 +890,7 @@ function reloadConfig() {
config.modelProvider.preferOllama = process.env.PREFER_OLLAMA === "true";
config.modelProvider.fallbackEnabled = process.env.FALLBACK_ENABLED !== "false";
config.modelProvider.fallbackProvider = (process.env.FALLBACK_PROVIDER ?? "databricks").toLowerCase();
config.modelProvider.suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();

// Log level
config.logger.level = process.env.LOG_LEVEL ?? "info";
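
Since reloadConfig() re-reads the variable, the override can be changed without restarting the gateway when hot reload is enabled. A usage sketch (assumes reloadConfig and config are exported from src/config/index.js; the export is not shown in this diff):

// Hypothetical hot-reload flow, e.g. after editing .env on disk.
process.env.SUGGESTION_MODE_MODEL = "none";
reloadConfig();
console.log(config.modelProvider.suggestionModeModel); // "none"
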
80 changes: 79 additions & 1 deletion src/orchestrator/index.js
@@ -1234,6 +1234,15 @@ function sanitizePayload(payload) {
toolCount: clean.tools?.length ?? 0
}, '[CONTEXT_FLOW] After sanitizePayload');

// === Suggestion mode: tag request and override model if configured ===
const { isSuggestionMode: isSuggestion } = detectSuggestionMode(clean.messages);
clean._requestMode = isSuggestion ? "suggestion" : "main";
const smConfig = config.modelProvider?.suggestionModeModel ?? "default";
if (isSuggestion && smConfig.toLowerCase() !== "default" && smConfig.toLowerCase() !== "none") {
clean.model = smConfig;
clean._suggestionModeModel = smConfig;
}

return clean;
}
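
A short walk-through of the tagging above, using an assumed marker string (only the literal "[SUGGESTION MODE:" substring matters for detection; the exact text the CLI sends is not shown in this diff):

// Hypothetical suggestion-mode request as detectSuggestionMode (defined below) sees it.
const msgs = [{ role: "user", content: "[SUGGESTION MODE: predict next input] git st" }];
detectSuggestionMode(msgs); // => { isSuggestionMode: true }

// With SUGGESTION_MODE_MODEL=llama3.1, sanitizePayload then sets:
//   clean._requestMode = "suggestion"
//   clean.model = "llama3.1"
//   clean._suggestionModeModel = "llama3.1"   (read later by the provider clients)
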

@@ -1876,11 +1885,26 @@ IMPORTANT TOOL USAGE RULES:
toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
}

// Guard: drop hallucinated tool calls when no tools were sent to the model.
// Some models (e.g. Llama 3.1) hallucinate tool_call blocks from conversation
// history even when the request contained zero tool definitions.
const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
if (toolCalls.length > 0 && !toolsWereSent) {
logger.warn({
sessionId: session?.id ?? null,
step: steps,
hallucinated: toolCalls.map(tc => tc.function?.name || tc.name),
noToolInjection: !!cleanPayload._noToolInjection,
}, "Dropped hallucinated tool calls (no tools were sent to model)");
toolCalls = [];
// If there's also no text content, treat as empty response (handled below)
}

if (toolCalls.length > 0) {
// Convert OpenAI/OpenRouter format to Anthropic format for session storage
let sessionContent;
if (providerType === "azure-anthropic") {
// Azure Anthropic already returns content in Anthropic format
sessionContent = databricksResponse.json?.content ?? [];
} else {
// Convert OpenAI/OpenRouter format to Anthropic content blocks
@@ -3217,6 +3241,34 @@ IMPORTANT TOOL USAGE RULES:
};
}

/**
* Detect if the current request is a suggestion mode call.
* Scans the last user message for the [SUGGESTION MODE: marker.
* @param {Array} messages - The conversation messages
* @returns {{ isSuggestionMode: boolean }}
*/
function detectSuggestionMode(messages) {
if (!Array.isArray(messages) || messages.length === 0) {
return { isSuggestionMode: false };
}
// Scan from the end to find the last user message
for (let i = messages.length - 1; i >= 0; i--) {
const msg = messages[i];
if (msg?.role !== 'user') continue;
const content = typeof msg.content === 'string'
? msg.content
: Array.isArray(msg.content)
? msg.content.map(b => b.text || '').join(' ')
: '';
if (content.includes('[SUGGESTION MODE:')) {
return { isSuggestionMode: true };
}
// Only check the last user message
break;
}
return { isSuggestionMode: false };
}

async function processMessage({ payload, headers, session, cwd, options = {} }) {
const requestedModel =
payload?.model ??
@@ -3226,6 +3278,32 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
typeof headers?.["anthropic-beta"] === "string" &&
headers["anthropic-beta"].includes("interleaved-thinking");

// === SUGGESTION MODE: Early return when SUGGESTION_MODE_MODEL=none ===
const { isSuggestionMode } = detectSuggestionMode(payload?.messages);
const suggestionModelConfig = config.modelProvider?.suggestionModeModel ?? "default";
if (isSuggestionMode && suggestionModelConfig.toLowerCase() === "none") {
logger.info('Suggestion mode: skipping LLM call (SUGGESTION_MODE_MODEL=none)');
return {
response: {
json: {
id: `msg_suggestion_skip_${Date.now()}`,
type: "message",
role: "assistant",
content: [{ type: "text", text: "" }],
model: requestedModel,
stop_reason: "end_turn",
stop_sequence: null,
usage: { input_tokens: 0, output_tokens: 0 },
},
ok: true,
status: 200,
},
steps: 0,
durationMs: 0,
terminationReason: "suggestion_mode_skip",
};
}

// === TOOL LOOP GUARD (EARLY CHECK) ===
// Check BEFORE sanitization since sanitizePayload removes conversation history
const toolLoopThreshold = config.policy?.toolLoopThreshold ?? 3;
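
The hallucinated-tool-call guard added in src/orchestrator/index.js is self-contained enough to check in isolation; a hypothetical unit-style sketch of its behavior (harness and names assumed, not part of this PR):

// Reproduces the guard's condition: the model returned tool calls,
// but the request carried no tool definitions.
const cleanPayload = { tools: [] };
let toolCalls = [{ function: { name: "read_file" } }]; // hallucinated by the model
const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
if (toolCalls.length > 0 && !toolsWereSent) {
  toolCalls = []; // dropped, as the orchestrator does after logging a warning
}
console.assert(toolCalls.length === 0, "hallucinated tool calls are dropped");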