8 changes: 8 additions & 0 deletions .env.example
@@ -217,6 +217,14 @@ WORKSPACE_INDEX_ENABLED=true
# - client/passthrough: Return tool calls to CLI for local execution
TOOL_EXECUTION_MODE=server

# Suggestion mode model override
# Controls which model handles suggestion mode (predicting next user input).
# Values:
# default - No override; use the model configured for the active MODEL_PROVIDER
# none - Skip suggestion mode LLM calls entirely (saves GPU time)
# <model> - Use a specific model (e.g. "llama3.1" for a lighter model)
SUGGESTION_MODE_MODEL=default

# Enable/disable automatic tool injection for local models
INJECT_TOOLS_LLAMACPP=true
INJECT_TOOLS_OLLAMA=true
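
The three values above map to three behaviors; a minimal sketch of the resolution logic (the helper name is illustrative, the PR implements this inline across src/config/index.js and src/orchestrator/index.js):

// Illustrative helper, not part of the PR; mirrors the documented semantics.
function resolveSuggestionModel(setting, providerModel) {
  const value = (setting ?? "default").trim();
  const normalized = value.toLowerCase();
  if (normalized === "none") return null;             // skip the suggestion LLM call
  if (normalized === "default") return providerModel; // no override
  return value;                                       // explicit model name wins
}

// resolveSuggestionModel("llama3.1", "gpt-4o") === "llama3.1"
// resolveSuggestionModel("default", "gpt-4o")  === "gpt-4o"
// resolveSuggestionModel("none", "gpt-4o")     === null
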
8 changes: 4 additions & 4 deletions src/clients/databricks.js
@@ -309,7 +309,7 @@ async function invokeOllama(body) {
}

const ollamaBody = {
- model: config.ollama.model,
+ model: body._suggestionModeModel || config.ollama.model,
messages: deduplicated,
stream: false, // Force non-streaming for Ollama - streaming format conversion not yet implemented
options: {
@@ -410,7 +410,7 @@ async function invokeOpenRouter(body) {
}

const openRouterBody = {
- model: config.openrouter.model,
+ model: body._suggestionModeModel || config.openrouter.model,
messages,
temperature: body.temperature ?? 0.7,
max_tokens: body.max_tokens ?? 4096,
@@ -496,7 +496,7 @@ async function invokeAzureOpenAI(body) {
max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
top_p: body.top_p ?? 1.0,
stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
- model: config.azureOpenAI.deployment
+ model: body._suggestionModeModel || config.azureOpenAI.deployment
};

// Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -842,7 +842,7 @@ async function invokeOpenAI(body) {
}

const openAIBody = {
- model: config.openai.model || "gpt-4o",
+ model: body._suggestionModeModel || config.openai.model || "gpt-4o",
messages,
temperature: body.temperature ?? 0.7,
max_tokens: body.max_tokens ?? 4096,
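
All four provider bodies apply the same precedence: a _suggestionModeModel set by the orchestrator wins over the provider's configured model. A hypothetical helper that captures the repeated expression (the PR keeps it inline in each invoke* function):

// Sketch only; extracts the repeated `body._suggestionModeModel || ...` pattern.
// `hardDefault` covers invokeOpenAI, which additionally falls back to "gpt-4o".
function pickModel(body, configuredModel, hardDefault) {
  return body._suggestionModeModel || configuredModel || hardDefault;
}

// pickModel({ _suggestionModeModel: "llama3.1" }, "gpt-4o") === "llama3.1"
// pickModel({}, undefined, "gpt-4o")                        === "gpt-4o"
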
6 changes: 6 additions & 0 deletions src/config/index.js
@@ -136,6 +136,10 @@ const zaiModel = process.env.ZAI_MODEL?.trim() || "GLM-4.7";
const vertexApiKey = process.env.VERTEX_API_KEY?.trim() || process.env.GOOGLE_API_KEY?.trim() || null;
const vertexModel = process.env.VERTEX_MODEL?.trim() || "gemini-2.0-flash";

// Suggestion mode model override
// Values: "default" (no override), "none" (skip the LLM call), or an explicit model name
const suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();

// Hot reload configuration
const hotReloadEnabled = process.env.HOT_RELOAD_ENABLED !== "false"; // default true
const hotReloadDebounceMs = Number.parseInt(process.env.HOT_RELOAD_DEBOUNCE_MS ?? "1000", 10);
@@ -596,6 +600,7 @@ var config = {
modelProvider: {
type: modelProvider,
defaultModel,
suggestionModeModel,
// Hybrid routing settings
preferOllama,
fallbackEnabled,
@@ -885,6 +890,7 @@ function reloadConfig() {
config.modelProvider.preferOllama = process.env.PREFER_OLLAMA === "true";
config.modelProvider.fallbackEnabled = process.env.FALLBACK_ENABLED !== "false";
config.modelProvider.fallbackProvider = (process.env.FALLBACK_PROVIDER ?? "databricks").toLowerCase();
config.modelProvider.suggestionModeModel = (process.env.SUGGESTION_MODE_MODEL ?? "default").trim();

// Log level
config.logger.level = process.env.LOG_LEVEL ?? "info";
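
Since reloadConfig() re-reads the variable, the override can be changed without restarting the gateway when hot reload is enabled. A usage sketch (assumes reloadConfig and config are exported from src/config/index.js; the export is not shown in this diff):

// Hypothetical hot-reload flow, e.g. after editing .env on disk.
process.env.SUGGESTION_MODE_MODEL = "none";
reloadConfig();
console.log(config.modelProvider.suggestionModeModel); // "none"
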
80 changes: 79 additions & 1 deletion src/orchestrator/index.js
@@ -1234,6 +1234,15 @@ function sanitizePayload(payload) {
toolCount: clean.tools?.length ?? 0
}, '[CONTEXT_FLOW] After sanitizePayload');

// === Suggestion mode: tag request and override model if configured ===
const { isSuggestionMode: isSuggestion } = detectSuggestionMode(clean.messages);
clean._requestMode = isSuggestion ? "suggestion" : "main";
const smConfig = config.modelProvider?.suggestionModeModel ?? "default";
if (isSuggestion && smConfig.toLowerCase() !== "default" && smConfig.toLowerCase() !== "none") {
clean.model = smConfig;
clean._suggestionModeModel = smConfig;
}

return clean;
}
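
A short walk-through of the tagging above, using an assumed marker string (only the literal "[SUGGESTION MODE:" substring matters for detection; the exact text the CLI sends is not shown in this diff):

// Hypothetical suggestion-mode request as detectSuggestionMode (defined below) sees it.
const msgs = [{ role: "user", content: "[SUGGESTION MODE: predict next input] git st" }];
detectSuggestionMode(msgs); // => { isSuggestionMode: true }

// With SUGGESTION_MODE_MODEL=llama3.1, sanitizePayload then sets:
//   clean._requestMode = "suggestion"
//   clean.model = "llama3.1"
//   clean._suggestionModeModel = "llama3.1"   (read later by the provider clients)
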

@@ -1876,11 +1885,26 @@ IMPORTANT TOOL USAGE RULES:
toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
}

// Guard: drop hallucinated tool calls when no tools were sent to the model.
// Some models (e.g. Llama 3.1) hallucinate tool_call blocks from conversation
// history even when the request contained zero tool definitions.
const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
if (toolCalls.length > 0 && !toolsWereSent) {
logger.warn({
sessionId: session?.id ?? null,
step: steps,
hallucinated: toolCalls.map(tc => tc.function?.name || tc.name),
noToolInjection: !!cleanPayload._noToolInjection,
}, "Dropped hallucinated tool calls (no tools were sent to model)");
toolCalls = [];
// If there's also no text content, treat as empty response (handled below)
}

if (toolCalls.length > 0) {
// Convert OpenAI/OpenRouter format to Anthropic format for session storage
let sessionContent;
if (providerType === "azure-anthropic") {
// Azure Anthropic already returns content in Anthropic format
sessionContent = databricksResponse.json?.content ?? [];
} else {
// Convert OpenAI/OpenRouter format to Anthropic content blocks
@@ -3217,6 +3241,34 @@ IMPORTANT TOOL USAGE RULES:
};
}

/**
* Detect if the current request is a suggestion mode call.
* Scans the last user message for the [SUGGESTION MODE: marker.
* @param {Array} messages - The conversation messages
* @returns {{ isSuggestionMode: boolean }}
*/
function detectSuggestionMode(messages) {
if (!Array.isArray(messages) || messages.length === 0) {
return { isSuggestionMode: false };
}
// Scan from the end to find the last user message
for (let i = messages.length - 1; i >= 0; i--) {
const msg = messages[i];
if (msg?.role !== 'user') continue;
const content = typeof msg.content === 'string'
? msg.content
: Array.isArray(msg.content)
? msg.content.map(b => b.text || '').join(' ')
: '';
if (content.includes('[SUGGESTION MODE:')) {
return { isSuggestionMode: true };
}
// Only check the last user message
break;
}
return { isSuggestionMode: false };
}

async function processMessage({ payload, headers, session, cwd, options = {} }) {
const requestedModel =
payload?.model ??
@@ -3226,6 +3278,32 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
typeof headers?.["anthropic-beta"] === "string" &&
headers["anthropic-beta"].includes("interleaved-thinking");

// === SUGGESTION MODE: Early return when SUGGESTION_MODE_MODEL=none ===
const { isSuggestionMode } = detectSuggestionMode(payload?.messages);
const suggestionModelConfig = config.modelProvider?.suggestionModeModel ?? "default";
if (isSuggestionMode && suggestionModelConfig.toLowerCase() === "none") {
logger.info('Suggestion mode: skipping LLM call (SUGGESTION_MODE_MODEL=none)');
return {
response: {
json: {
id: `msg_suggestion_skip_${Date.now()}`,
type: "message",
role: "assistant",
content: [{ type: "text", text: "" }],
model: requestedModel,
stop_reason: "end_turn",
stop_sequence: null,
usage: { input_tokens: 0, output_tokens: 0 },
},
ok: true,
status: 200,
},
steps: 0,
durationMs: 0,
terminationReason: "suggestion_mode_skip",
};
}

// === TOOL LOOP GUARD (EARLY CHECK) ===
// Check BEFORE sanitization since sanitizePayload removes conversation history
const toolLoopThreshold = config.policy?.toolLoopThreshold ?? 3;
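
The hallucinated-tool-call guard added in src/orchestrator/index.js is self-contained enough to check in isolation; a hypothetical unit-style sketch of its behavior (harness and names assumed, not part of this PR):

// Reproduces the guard's condition: the model returned tool calls,
// but the request carried no tool definitions.
const cleanPayload = { tools: [] };
let toolCalls = [{ function: { name: "read_file" } }]; // hallucinated by the model
const toolsWereSent = Array.isArray(cleanPayload.tools) && cleanPayload.tools.length > 0;
if (toolCalls.length > 0 && !toolsWereSent) {
  toolCalls = []; // dropped, as the orchestrator does after logging a warning
}
console.assert(toolCalls.length === 0, "hallucinated tool calls are dropped");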