elizaOS · odilitime · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.env.example b/.env.example
@@ -73,6 +73,14 @@ PRIVY_WEBHOOK_SECRET=replace_with_strong_random_secret
 # Anthropic API Key (for Claude - used by AI App Builder)
 # Get from: https://console.anthropic.com/settings/keys
 ANTHROPIC_API_KEY=sk-ant-your_anthropic_key_here
+# Default Anthropic extended-thinking budget (tokens) when a cloud agent character does not set
+# user_characters.settings.anthropicThinkingBudgetTokens. Per-agent: set that JSON key (integer ≥ 0; 0 = off).
+# Optional ANTHROPIC_COT_BUDGET_MAX caps any effective budget (character or default).
+# Why budgets are not read from API request bodies: untrusted clients must not raise thinking cost; agent owners set policy via stored character settings.
+# Unset, empty, or 0 = no default budget (an agent can still set a positive per-character budget unless ANTHROPIC_COT_BUDGET_MAX is 0).
+# ANTHROPIC_COT_BUDGET=1024
+# ANTHROPIC_COT_BUDGET_MAX=8192
+# See docs/anthropic-cot-budget.md
 # ============================================================================
 
 # OpenAI API Key (for direct OpenAI access and ElizaOS)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -30,6 +30,8 @@ jobs:
         run: bun run lint
       - name: Run typecheck
         run: bun run check-types
+      - name: Run test project typecheck
+        run: bun run check-types:tests
 
   unit-tests:
     runs-on: ubuntu-latest

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,26 @@
+# Changelog
+
+All notable engineering changes to this repository are recorded here. For **product-facing** release notes on the docs site, see `packages/content/changelog.mdx`.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+
+## [Unreleased]
+
+### Added
+
+- **Per-agent Anthropic extended thinking** — `user_characters.settings.anthropicThinkingBudgetTokens` (integer ≥ 0) controls thinking for **MCP** and **A2A** agent chat when the model is Anthropic. **`ANTHROPIC_COT_BUDGET_MAX`** optionally caps any effective budget (character or env default). **Why:** Agent owners set policy in stored character data; request bodies must not carry budgets (untrusted MCP/A2A callers). Env still supplies defaults where no character field exists and caps worst-case cost.
+- **`ANTHROPIC_COT_BUDGET`** (existing) — Clarified role as **default** when the character omits `anthropicThinkingBudgetTokens` (or value is invalid), plus baseline for routes without a resolved character. **Why:** One deploy-level knob for generic chat; per-agent overrides stay in JSON.
+- **`parseThinkingBudgetFromCharacterSettings`**, **`resolveAnthropicThinkingBudgetTokens`**, **`parseAnthropicCotBudgetMaxFromEnv`**, **`ANTHROPIC_THINKING_BUDGET_CHARACTER_SETTINGS_KEY`** — See `packages/lib/providers/anthropic-thinking.ts`. **Why:** Single resolution path and a stable settings key for dashboards/APIs.
+- **`packages/lib/providers/cloud-provider-options.ts`** — Shared type for merged `providerOptions`. **Why:** Type-safe merges without `any`.
+- **`mockMiladyPricingMinimumDepositForRouteTests`** — Test helper in `packages/tests/helpers/mock-milady-pricing-for-route-tests.ts`. **Why:** Partial `MILADY_PRICING` mocks broke Milady billing cron under full `bun run test:unit`.
+
+### Changed
+
+- **`POST /api/agents/{id}/mcp`** (`chat` tool) and **`POST /api/agents/{id}/a2a`** (`chat`) pass character `settings` into `mergeAnthropicCotProviderOptions`. **Why:** Those routes always resolve a `user_characters` row; other v1 routes remain env-only until a character is available on the request path.
+- **Milady billing cron unit tests** — `z-milady-billing-route.test.ts`, queue-backed DB mocks, `package.json` script paths. **Why:** `mock.module` ordering and partial pricing objects caused flaky full-suite failures.
+
+### Documentation
+
+- **`docs/anthropic-cot-budget.md`** — Per-agent settings, env default/max, operator checklist, MCP/A2A scope.
+- **`docs/unit-testing-milady-mocks.md`** — Milady `mock.module` pitfalls.
+- **`docs/ROADMAP.md`** — Done / near-term items.
diff --git a/README.md b/README.md
@@ -232,6 +232,9 @@ cloud/
 ├── .env.example            # Environment template
 ├── docs/                    # Detailed documentation
 │   ├── API_REFERENCE.md    # Complete API reference
+│   ├── anthropic-cot-budget.md   # ANTHROPIC_COT_BUDGET + provider merge WHYs
+│   ├── unit-testing-milady-mocks.md  # Bun mock.module + Milady pricing test WHYs
+│   ├── ROADMAP.md          # Product direction and done items
 │   ├── DEPLOYMENT.md       # Deployment guide
 │   ├── DEPLOYMENT_TROUBLESHOOTING.md  # Troubleshooting
 │   ├── STRIPE_SETUP.md     # Stripe integration
@@ -553,10 +556,17 @@ Tests are split by kind; use the right script for what you want to run:
 | `bun run test:unit`        | `tests/unit/`        | Unit tests (mocked deps, fast) | Env preload only; some skip without `DATABASE_URL`        |
 | `bun run test:integration` | `tests/integration/` | API/DB/E2E integration tests   | `DATABASE_URL` (+ migrations); some need a running server |
 | `bun run test:runtime`     | `tests/runtime/`     | Runtime/factory and perf tests | `DATABASE_URL` (+ migrations), heavier                    |
-| `bun run test`             | all of the above     | Full suite in one run          | Same as integration + runtime for those layers            |
+| `bun run test`             | `test:repo-unit:bulk` + `special` | Two staged **unit** batches (see `package.json` for included/excluded files) | Env preload only (same family as `test:unit`) |
 | `bun run test:playwright`  | `tests/playwright/`  | Playwright E2E (optional)      | `@playwright/test` installed                              |
 
-Env is loaded from `.env`, `.env.local`, and `.env.test` via preload. See `docs/test-failure-assessment.md` for skip behavior and remaining failure categories.
+Env is loaded from `.env`, `.env.local`, and `.env.test` via preload.
+
+### Engineering docs (WHYs)
+
+- **[docs/unit-testing-milady-mocks.md](docs/unit-testing-milady-mocks.md)** — Why partial `MILADY_PRICING` mocks break other Milady modules under Bun, and how the billing cron tests isolate `mock.module("@/db/client")` contention.
+- **[docs/anthropic-cot-budget.md](docs/anthropic-cot-budget.md)** — Per-agent `settings.anthropicThinkingBudgetTokens` (MCP/A2A), env default (`ANTHROPIC_COT_BUDGET`) and cap (`ANTHROPIC_COT_BUDGET_MAX`), and **why** thinking budgets are not request parameters.
+- **[CHANGELOG.md](CHANGELOG.md)** — Engineering changelog (Keep a Changelog style).
+- **[docs/ROADMAP.md](docs/ROADMAP.md)** — Product direction and rationale; “Done” links to the above where relevant.
 
 ### Development Workflow
 
@@ -711,6 +721,8 @@ const { messages, input, handleSubmit, isLoading } = useChat({
 
 **Anthropic Messages API (Claude Code):** For tools that expect the [Anthropic Messages API](https://docs.anthropic.com/en/api/messages) (e.g. Claude Code), use **POST /api/v1/messages** with the same request/response shape. Set `ANTHROPIC_BASE_URL=https://cloud.milady.ai/api/v1` and `ANTHROPIC_API_KEY` to your Cloud API key so usage goes through Cloud credits instead of a direct Anthropic key. See [API docs → Anthropic Messages](/docs/api/messages). *Why: single API key and billing for both OpenAI-style and Anthropic-style clients.*
 
+**Public cloud agents (MCP / A2A) — Anthropic extended thinking:** For **`POST /api/agents/{id}/mcp`** (`chat` tool) and **`POST /api/agents/{id}/a2a`** (`chat`), extended thinking uses the character’s **`settings.anthropicThinkingBudgetTokens`** when the model is Anthropic (`0` = off; omitted = fall back to `ANTHROPIC_COT_BUDGET`). Optional **`ANTHROPIC_COT_BUDGET_MAX`** clamps any effective budget. *Why: the agent owner controls cost/quality per agent; MCP/A2A clients cannot pass a thinking budget in the request (untrusted input).* See [docs/anthropic-cot-budget.md](docs/anthropic-cot-budget.md).
+
 ### 2. AI Image Generation
 
 **Location**: `/dashboard/image` and `/app/api/v1/generate-image/route.ts`

diff --git a/anthropic-thinking.test.ts b/anthropic-thinking.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it } from "bun:test";
+import {
+  validateBudgetTokens,
+  getThinkingConfig,
+  buildThinkingParam,
+  supportsExtendedThinking,
+  type ThinkingConfig,
+  type CharacterThinkingSettings,
+  // Every helper and type under test is imported from the sibling anthropic-thinking module.
+} from "./anthropic-thinking";
+
+describe("anthropic-thinking", () => {
+  describe("validateBudgetTokens", () => {
+    it("returns default budget when undefined", () => {
+      expect(validateBudgetTokens(undefined)).toBe(10000);
+    });
+
+    it("clamps to minimum budget", () => {
+      expect(validateBudgetTokens(500)).toBe(1000);
+      expect(validateBudgetTokens(0)).toBe(1000);
+      expect(validateBudgetTokens(-100)).toBe(1000);
+    });
+
+    it("clamps to maximum budget", () => {
+      expect(validateBudgetTokens(150000)).toBe(100000);
+      expect(validateBudgetTokens(100001)).toBe(100000);
+    });
+
+    it("returns valid values within range", () => {
+      expect(validateBudgetTokens(1000)).toBe(1000);
+      expect(validateBudgetTokens(50000)).toBe(50000);
+      expect(validateBudgetTokens(100000)).toBe(100000);
+    });
+  });
+
+  describe("getThinkingConfig", () => {
+    it("returns disabled config when settings undefined", () => {
+      expect(getThinkingConfig(undefined)).toEqual({ enabled: false });
+    });
+
+    it("returns disabled config when anthropicThinking undefined", () => {
+      expect(getThinkingConfig({})).toEqual({ enabled: false });
+    });
+
+    it("returns disabled config when enabled is false", () => {
+      const settings: CharacterThinkingSettings = {
+        anthropicThinking: { enabled: false },
+      };
+      expect(getThinkingConfig(settings)).toEqual({ enabled: false });
+    });
+
+    it("returns enabled config with default budget", () => {
+      const settings: CharacterThinkingSettings = {
+        anthropicThinking: { enabled: true },
+      };
+      expect(getThinkingConfig(settings)).toEqual({
+        enabled: true,
+        budgetTokens: 10000,
+      });
+    });
+
+    it("returns enabled config with custom budget", () => {
+      const settings: CharacterThinkingSettings = {
+        anthropicThinking: { enabled: true, budgetTokens: 25000 },
+      };
+      expect(getThinkingConfig(settings)).toEqual({
+        enabled: true,
+        budgetTokens: 25000,
+      });
+    });
+
+    it("validates and clamps budget tokens", () => {
+      const settings: CharacterThinkingSettings = {
+        anthropicThinking: { enabled: true, budgetTokens: 500 },
+      };
+      expect(getThinkingConfig(settings)).toEqual({
+        enabled: true,
+        budgetTokens: 1000,
+      });
+    });
+  });
+
+  describe("buildThinkingParam", () => {
+    it("returns undefined when disabled", () => {
+      const config: ThinkingConfig = { enabled: false };
+      expect(buildThinkingParam(config)).toBeUndefined();
+    });
+
+    it("returns thinking param when enabled with budget", () => {
+      const config: ThinkingConfig = { enabled: true, budgetTokens: 15000 };
+      expect(buildThinkingParam(config)).toEqual({
+        type: "enabled",
+        budget_tokens: 15000,
+      });
+    });
+
+    it("uses default budget when budgetTokens undefined", () => {
+      const config: ThinkingConfig = { enabled: true };
+      expect(buildThinkingParam(config)).toEqual({
+        type: "enabled",
+        budget_tokens: 10000,
+      });
+    });
+  });
+
+  describe("supportsExtendedThinking", () => {
+    it("returns true for claude-3-5-sonnet models", () => {
+      expect(supportsExtendedThinking("claude-3-5-sonnet-20241022")).toBe(true);
+      expect(supportsExtendedThinking("claude-3-5-sonnet")).toBe(true);
+      expect(supportsExtendedThinking("Claude-3-5-Sonnet")).toBe(true);
+    });
+
+    it("returns true for claude-3.5-sonnet models", () => {
+      expect(supportsExtendedThinking("claude-3.5-sonnet")).toBe(true);
+    });
+
+    it("returns true for claude-3-opus models", () => {
+      expect(supportsExtendedThinking("claude-3-opus-20240229")).toBe(true);
+      expect(supportsExtendedThinking("claude-3-opus")).toBe(true);
+      expect(supportsExtendedThinking("Claude-3-Opus")).toBe(true);
+    });
+
+    it("returns false for unsupported models", () => {
+      expect(supportsExtendedThinking("claude-3-haiku")).toBe(false);
+      expect(supportsExtendedThinking("claude-2")).toBe(false);
+      expect(supportsExtendedThinking("gpt-4")).toBe(false);
+      expect(supportsExtendedThinking("gemini-pro")).toBe(false);
+    });
+  });
+});
diff --git a/app/api/agents/[id]/a2a/route.ts b/app/api/agents/[id]/a2a/route.ts
@@ -11,6 +11,10 @@
  * - API key authentication (uses org credits)
  *
  * When monetization is enabled, the agent creator earns their markup percentage.
+ *
+ * **Anthropic extended thinking:** JSON-RPC `chat` merges thinking `providerOptions` derived from
+ * `user_characters.settings.anthropicThinkingBudgetTokens`. **Why:** Budget lives on the character
+ * record, not in caller-supplied params (A2A peers are not trusted to set token limits).
  */
 
 import { gateway } from "@ai-sdk/gateway";
@@ -20,6 +24,11 @@ import { z } from "zod";
 import type { UserCharacter } from "@/db/schemas/user-characters";
 import { requireAuthOrApiKeyWithOrg } from "@/lib/auth";
 import { calculateCost, estimateRequestCost, getProviderFromModel } from "@/lib/pricing";
+import {
+  mergeAnthropicCotProviderOptions,
+  parseThinkingBudgetFromCharacterSettings,
+  resolveAnthropicThinkingBudgetTokens,
+} from "@/lib/providers/anthropic-thinking";
 import { agentMonetizationService } from "@/lib/services/agent-monetization";
 import { charactersService } from "@/lib/services/characters/characters";
 import type { CreditReservation } from "@/lib/services/credits";
@@ -254,6 +263,7 @@ async function handleChat(
     inference_markup_percentage: string | null;
     system: string | null;
     bio: string | string[];
+    settings: Record<string, unknown>;
   },
   params: Record<string, unknown>,
   rpcId: string | number,
@@ -284,9 +294,19 @@ async function handleChat(
     })),
   ];
 
-  // Calculate estimated costs
+  // Calculate estimated costs, including potential thinking budget
+  // Use resolveAnthropicThinkingBudgetTokens to get effective budget (same as MCP route)
+  // Add thinking budget on top of base output tokens for accurate credit reservation
   const provider = getProviderFromModel(model);
-  const baseCost = await estimateRequestCost(model, fullMessages);
+  const agentThinkingBudget = parseThinkingBudgetFromCharacterSettings(character.settings);
+  const effectiveThinkingBudget = resolveAnthropicThinkingBudgetTokens(
+    model,
+    process.env,
+    agentThinkingBudget ?? undefined,
+  );
+  // Add thinking budget to base output estimate (500 tokens) to match MCP route behavior
+  const maxOutputTokens = effectiveThinkingBudget != null ? 500 + effectiveThinkingBudget : undefined;
+  const baseCost = await estimateRequestCost(model, fullMessages, maxOutputTokens);
 
   // Apply markup if monetization is enabled
   const markupPct = Number(character.inference_markup_percentage || 0);
@@ -321,6 +341,11 @@ async function handleChat(
     const result = await streamText({
       model: gateway.languageModel(model),
       messages: fullMessages,
+      ...mergeAnthropicCotProviderOptions(
+        model,
+        process.env,
+        agentThinkingBudget,
+      ),
     });
 
     let fullText = "";

diff --git a/app/api/agents/[id]/mcp/route.ts b/app/api/agents/[id]/mcp/route.ts
@@ -11,6 +11,10 @@
  * - API key authentication (uses org credits)
  *
  * When monetization is enabled, the agent creator earns their markup percentage.
+ *
+ * **Anthropic extended thinking:** The `chat` tool merges `providerOptions` using
+ * `user_characters.settings.anthropicThinkingBudgetTokens` (see `parseThinkingBudgetFromCharacterSettings`).
+ * **Why:** Thinking budget is owner-defined on the character, not passed by MCP clients (untrusted).
  */
 
 import { gateway } from "@ai-sdk/gateway";
@@ -19,6 +23,11 @@ import { NextRequest, NextResponse } from "next/server";
 import { z } from "zod";
 import { requireAuthOrApiKeyWithOrg } from "@/lib/auth";
 import { calculateCost, estimateTokens, getProviderFromModel } from "@/lib/pricing";
+import {
+  mergeAnthropicCotProviderOptions,
+  parseThinkingBudgetFromCharacterSettings,
+  resolveAnthropicThinkingBudgetTokens,
+} from "@/lib/providers/anthropic-thinking";
 import { agentMonetizationService } from "@/lib/services/agent-monetization";
 import { charactersService } from "@/lib/services/characters/characters";
 import type { CreditReservation } from "@/lib/services/credits";
@@ -263,6 +272,7 @@ async function handleToolCall(
     inference_markup_percentage: string | null;
     system: string | null;
     bio: string | string[];
+    settings: Record<string, unknown>;
   },
   params: Record<string, unknown>,
   rpcId: string | number,
@@ -320,6 +330,16 @@ async function handleToolCall(
     const provider = getProviderFromModel(model);
     const markupPct = Number(character.inference_markup_percentage || 0);
 
+    // Resolve effective thinking budget before reservation (applies ANTHROPIC_COT_BUDGET_MAX cap)
+    const agentThinkingBudget = parseThinkingBudgetFromCharacterSettings(character.settings);
+    const effectiveThinkingBudget = 
+      resolveAnthropicThinkingBudgetTokens(model, process.env, agentThinkingBudget) ?? 0;
+    // Include thinking budget in output token estimate for Anthropic models
+    const baseOutputTokens = 500;
+    const estimatedOutputTokens = model.includes("claude") && effectiveThinkingBudget > 0
+      ? baseOutputTokens + effectiveThinkingBudget
+      : baseOutputTokens;
+
     // Reserve credits BEFORE LLM call to prevent TOCTOU race condition
     let reservation: CreditReservation;
     try {
@@ -328,7 +348,7 @@ async function handleToolCall(
         model,
         provider,
         estimatedInputTokens: estimateTokens(systemPrompt + message),
-        estimatedOutputTokens: 500,
+        estimatedOutputTokens,
         userId: authResult.user.id,
         description: `Agent MCP: ${character.name}`,
       });
@@ -350,6 +370,11 @@ async function handleToolCall(
       const result = await streamText({
         model: gateway.languageModel(model),
         messages,
+        ...mergeAnthropicCotProviderOptions(
+          model,
+          process.env,
+          agentThinkingBudget,
+        ),
       });
 
       let fullText = "";

diff --git a/app/api/mcp/tools/generation.ts b/app/api/mcp/tools/generation.ts
@@ -7,6 +7,10 @@
 import { gateway } from "@ai-sdk/gateway";
 import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { streamText } from "ai";
+import {
+  mergeAnthropicCotProviderOptions,
+  mergeGoogleImageModalitiesWithAnthropicCot,
+} from "@/lib/providers/anthropic-thinking";
 import { z } from "zod/v3";
 import { uploadBase64Image } from "@/lib/blob";
 import { calculateCost, getProviderFromModel, IMAGE_GENERATION_COST } from "@/lib/pricing";
@@ -134,9 +138,14 @@ export function registerGenerationTools(server: McpServer): void {
         generationId = generation.id;
 
         // Generate text (non-streaming for MCP)
+        // MCP text generation intentionally inherits ANTHROPIC_COT_BUDGET if set in env.
+        // Unlike SEO/promotion routes (which pass 0 to disable for temperature compat),
+        // interactive text-gen benefits from extended thinking. No explicit temperature
+        // is set here, so CoT's temperature override is acceptable.
         const result = await streamText({
           model: gateway.languageModel(model),
           prompt,
+          ...mergeAnthropicCotProviderOptions(model),
         });
 
         let fullText = "";
@@ -294,11 +303,10 @@ export function registerGenerationTools(server: McpServer): void {
 
         const enhancedPrompt = `${prompt}, ${aspectRatioDescriptions[aspectRatio]}`;
 
+        const geminiImageModel = "google/gemini-2.5-flash-image";
         const result = streamText({
-          model: "google/gemini-2.5-flash-image",
-          providerOptions: {
-            google: { responseModalities: ["TEXT", "IMAGE"] },
-          },
+          model: geminiImageModel,
+          ...mergeGoogleImageModalitiesWithAnthropicCot(geminiImageModel),
           prompt: `Generate an image: ${enhancedPrompt}`,
         });