Skip to content

Commit c691a29

Browse files
author
1bcMax
committed
fix: route reasoning keywords only in user prompt, replace o3 with DeepSeek
- Fix bug where system prompt reasoning keywords triggered REASONING tier - Replace expensive o3 ($10/M) with DeepSeek Reasoner ($0.42/M) for ~10x savings - Add tier & reasoning to debug logging for easier troubleshooting - Add test case for system prompt with reasoning keywords - Fix formatting for CI
1 parent 32aa00c commit c691a29

8 files changed

Lines changed: 79 additions & 26 deletions

File tree

README.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ One wallet, 30+ models, zero API keys.
2222
"What is 2+2?" → DeepSeek $0.27/M saved 99%
2323
"Summarize this article" → GPT-4o-mini $0.60/M saved 99%
2424
"Build a React component" → Claude Sonnet $15.00/M best balance
25-
"Prove this theorem" → o3 $10.00/M reasoning
25+
"Prove this theorem" → DeepSeek-R $0.42/M reasoning
2626
"Run 50 parallel searches"→ Kimi K2.5 $2.40/M agentic swarm
2727
```
2828

@@ -117,12 +117,12 @@ Weighted sum → sigmoid confidence calibration → tier selection.
117117

118118
### Tier → Model Mapping
119119

120-
| Tier | Primary Model | Cost/M | Savings vs Opus |
121-
| --------- | --------------- | ------ | --------------- |
122-
| SIMPLE | deepseek-chat | $0.27 | **99.6%** |
123-
| MEDIUM | gpt-4o-mini | $0.60 | **99.2%** |
124-
| COMPLEX | claude-sonnet-4 | $15.00 | **80%** |
125-
| REASONING | o3 | $10.00 | **87%** |
120+
| Tier | Primary Model | Cost/M | Savings vs Opus |
121+
| --------- | ----------------- | ------ | --------------- |
122+
| SIMPLE | gemini-2.5-flash | $0.60 | **99.2%** |
123+
| MEDIUM | deepseek-chat | $0.42 | **99.4%** |
124+
| COMPLEX | claude-opus-4 | $75.00 | baseline |
125+
| REASONING | deepseek-reasoner | $0.42 | **99.4%** |
126126

127127
Special rule: 2+ reasoning markers → REASONING at 0.97 confidence.
128128

@@ -365,12 +365,12 @@ const decision = route("Prove sqrt(2) is irrational", undefined, 4096, {
365365

366366
console.log(decision);
367367
// {
368-
// model: "openai/o3",
368+
// model: "deepseek/deepseek-reasoner",
369369
// tier: "REASONING",
370370
// confidence: 0.97,
371371
// method: "rules",
372-
// savings: 0.87,
373-
// costEstimate: 0.041,
372+
// savings: 0.994,
373+
// costEstimate: 0.002,
374374
// }
375375
```
376376

@@ -471,9 +471,9 @@ openclaw logs --follow
471471
You should see model selection for each request:
472472

473473
```
474-
[plugins] google/gemini-2.5-flash $0.0012 (saved 99%)
475-
[plugins] deepseek/deepseek-chat $0.0003 (saved 99%)
476-
[plugins] anthropic/claude-sonnet-4 $0.0450 (saved 80%)
474+
[plugins] [SIMPLE] google/gemini-2.5-flash $0.0012 (saved 99%)
475+
[plugins] [MEDIUM] deepseek/deepseek-chat $0.0003 (saved 99%)
476+
[plugins] [REASONING] deepseek/deepseek-reasoner $0.0005 (saved 99%)
477477
```
478478

479479
---

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@blockrun/clawrouter",
3-
"version": "0.3.28",
3+
"version": "0.3.29",
44
"description": "Smart LLM router — save 78% on inference costs. 30+ models, one wallet, x402 micropayments.",
55
"type": "module",
66
"main": "dist/index.js",

src/index.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,10 @@ function injectAuthProfile(logger: { info: (msg: string) => void }): void {
124124

125125
// Load or create auth-profiles.json with correct OpenClaw format
126126
// Format: { version: 1, profiles: { "provider:profileId": { type, provider, key } } }
127-
let store: { version: number; profiles: Record<string, unknown> } = { version: 1, profiles: {} };
127+
let store: { version: number; profiles: Record<string, unknown> } = {
128+
version: 1,
129+
profiles: {},
130+
};
128131
if (existsSync(authPath)) {
129132
try {
130133
const existing = JSON.parse(readFileSync(authPath, "utf-8"));
@@ -219,7 +222,9 @@ async function startProxyInBackground(api: OpenClawPluginApi): Promise<void> {
219222
onRouted: (decision) => {
220223
const cost = decision.costEstimate.toFixed(4);
221224
const saved = (decision.savings * 100).toFixed(0);
222-
api.logger.info(`${decision.model} $${cost} (saved ${saved}%)`);
225+
api.logger.info(
226+
`[${decision.tier}] ${decision.model} $${cost} (saved ${saved}%) | ${decision.reasoning}`,
227+
);
223228
},
224229
onLowBalance: (info) => {
225230
api.logger.warn(`[!] Low balance: ${info.balanceUSD}. Fund wallet: ${info.walletAddress}`);

src/proxy.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,16 @@ async function proxyRequest(
317317
}
318318

319319
// Normalize model name for comparison (trim whitespace, lowercase)
320-
const normalizedModel = typeof parsed.model === "string" ? parsed.model.trim().toLowerCase() : "";
321-
const isAutoModel = normalizedModel === AUTO_MODEL.toLowerCase() || normalizedModel === AUTO_MODEL_SHORT.toLowerCase();
320+
const normalizedModel =
321+
typeof parsed.model === "string" ? parsed.model.trim().toLowerCase() : "";
322+
const isAutoModel =
323+
normalizedModel === AUTO_MODEL.toLowerCase() ||
324+
normalizedModel === AUTO_MODEL_SHORT.toLowerCase();
322325

323326
// Debug: log received model name
324-
console.log(`[ClawRouter] Received model: "${parsed.model}" -> normalized: "${normalizedModel}", isAuto: ${isAutoModel}`);
327+
console.log(
328+
`[ClawRouter] Received model: "${parsed.model}" -> normalized: "${normalizedModel}", isAuto: ${isAutoModel}`,
329+
);
325330

326331
if (isAutoModel) {
327332
// Extract prompt from messages

src/router/config.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,8 @@ export const DEFAULT_ROUTING_CONFIG: RoutingConfig = {
187187
fallback: ["anthropic/claude-sonnet-4", "openai/gpt-4o"],
188188
},
189189
REASONING: {
190-
primary: "openai/o3",
191-
fallback: ["google/gemini-2.5-pro", "anthropic/claude-sonnet-4"],
190+
primary: "deepseek/deepseek-reasoner",
191+
fallback: ["moonshot/kimi-k2.5", "google/gemini-2.5-pro"],
192192
},
193193
},
194194

src/router/rules.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ export function classifyByRules(
8080
config: ScoringConfig,
8181
): ScoringResult {
8282
const text = `${systemPrompt ?? ""} ${prompt}`.toLowerCase();
83+
// User prompt only — used for reasoning markers (system prompt shouldn't influence complexity)
84+
const userText = prompt.toLowerCase();
8385

8486
// Score all 14 dimensions
8587
const dimensions: DimensionScore[] = [
@@ -93,8 +95,9 @@ export function classifyByRules(
9395
{ low: 1, high: 2 },
9496
{ none: 0, low: 0.5, high: 1.0 },
9597
),
98+
// Reasoning markers use USER prompt only — system prompt "step by step" shouldn't trigger reasoning
9699
scoreKeywordMatch(
97-
text,
100+
userText,
98101
config.reasoningKeywords,
99102
"reasoningMarkers",
100103
"reasoning",
@@ -190,8 +193,11 @@ export function classifyByRules(
190193
weightedScore += d.score * w;
191194
}
192195

193-
// Count reasoning markers for override
194-
const reasoningMatches = config.reasoningKeywords.filter((kw) => text.includes(kw.toLowerCase()));
196+
// Count reasoning markers for override — only check USER prompt, not system prompt
197+
// This prevents system prompts with "step by step" from triggering REASONING for simple queries
198+
const reasoningMatches = config.reasoningKeywords.filter((kw) =>
199+
userText.includes(kw.toLowerCase()),
200+
);
195201

196202
// Direct reasoning override: 2+ reasoning markers = high confidence REASONING
197203
if (reasoningMatches.length >= 2) {

test/e2e.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,43 @@ const config = DEFAULT_ROUTING_CONFIG;
7575
);
7676
}
7777

78+
// System prompt with reasoning keywords should NOT trigger REASONING for simple queries
79+
// This was a bug: if client's system prompt had "step by step" or "logically", ALL queries became REASONING
80+
{
81+
console.log("\nSystem prompt with reasoning keywords (should NOT affect simple queries):");
82+
const systemPrompt = "Think step by step and reason logically about the user's question.";
83+
84+
const r1 = classifyByRules("What is 2+2?", systemPrompt, 10, config.scoring);
85+
assert(
86+
r1.tier === "SIMPLE",
87+
`"2+2" with reasoning system prompt → ${r1.tier} (should be SIMPLE)`,
88+
);
89+
90+
const r2 = classifyByRules("Hello", systemPrompt, 5, config.scoring);
91+
assert(
92+
r2.tier === "SIMPLE",
93+
`"Hello" with reasoning system prompt → ${r2.tier} (should be SIMPLE)`,
94+
);
95+
96+
const r3 = classifyByRules("What is the capital of France?", systemPrompt, 12, config.scoring);
97+
assert(
98+
r3.tier === "SIMPLE",
99+
`"Capital of France" with reasoning system prompt → ${r3.tier} (should be SIMPLE)`,
100+
);
101+
102+
// But if USER explicitly asks for step-by-step, it SHOULD trigger REASONING
103+
const r4 = classifyByRules(
104+
"Prove step by step that sqrt(2) is irrational",
105+
systemPrompt,
106+
50,
107+
config.scoring,
108+
);
109+
assert(
110+
r4.tier === "REASONING",
111+
`User asks for step-by-step proof → ${r4.tier} (should be REASONING)`,
112+
);
113+
}
114+
78115
// Medium queries (may be ambiguous — that's ok, LLM classifier handles them)
79116
{
80117
console.log("\nMedium/Ambiguous queries:");

0 commit comments

Comments
 (0)