From 8c98feaf0e86ea981060655b0c5f3e09f77c0e27 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 21:42:04 +0800
Subject: [PATCH 1/9] refactor: rewrite context compression with scratchpad
 support and smart token counting

- Rewrite infinite_context.ts: class -> function, structured output (summary + recent messages)
- Rewrite infinite_context_chain.ts: class -> simple compressChunk function
- Add scratchpad compression in agent loop (legacy-executor.ts)
- Extract shared countMessageTokens/countMessagesTokens to utils/count_tokens.ts
  with usage_metadata baseline optimization
- Update chat_history.ts and model.ts cropMessages to use baseline optimization
- Fix multimodal warning: 'chatluna-multimodal-service' -> 'multimodal-service'
---
 .../src/llm-core/agent/legacy-executor.ts     | 157 +++++-
 .../llm-core/chain/infinite_context_chain.ts  | 133 ++---
 packages/core/src/llm-core/chat/app.ts        |  58 +-
 .../src/llm-core/chat/infinite_context.ts     | 509 +++++++++---------
 packages/core/src/llm-core/platform/model.ts  | 110 +++-
 .../core/src/llm-core/prompt/chat_history.ts  | 138 ++++-
 .../src/llm-core/prompt/system_prompts.ts     |  47 +-
 .../core/src/llm-core/utils/count_tokens.ts   |  89 ++-
 .../src/middlewares/chat/read_chat_message.ts |   2 +-
 9 files changed, 775 insertions(+), 468 deletions(-)
diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index e6287140f..bedb5cfbe 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -1,5 +1,5 @@
 import { CallbackManagerForChainRun } from '@langchain/core/callbacks/manager'
-import { AIMessage, AIMessageChunk } from '@langchain/core/messages'
+import { AIMessage, AIMessageChunk, BaseMessage, HumanMessage } from '@langchain/core/messages'
 import { isDirectToolOutput } from '@langchain/core/messages/tool'
 import { OutputParserException } from '@langchain/core/output_parsers'
 import {
@@ -34,6 +34,8 @@ import {
     MessageQueue,
     ScratchpadEntry
 } from './types'
+import { compressChunk } from '../chain/infinite_context_chain'
+import type { ChatLunaChatModel } from '../platform/model'
 
 async function executeTools(
     actions: AgentAction[],
@@ -289,6 +291,19 @@ export async function* runAgent(
             }
         }
 
+        // Compress scratchpad if it's getting too large
+        const model = config?.configurable?.['model'] as
+            | ChatLunaChatModel
+            | undefined
+        if (model && scratchpad.length > 6) {
+            await compressScratchpad(
+                scratchpad,
+                options.input,
+                model,
+                config?.configurable?.['conversationId'] ?? ''
+            )
+        }
+
         const last = newSteps[newSteps.length - 1]
         const tool = last ? toolMap[last.action.tool?.toLowerCase()] : undefined
 
@@ -345,6 +360,146 @@ export async function* runAgent(
     }
 }
 
+/**
+ * Compress the scratchpad in place when it grows too large during tool-call loops.
+ * Summarizes chat_history plus all but the 3 most recent scratchpad entries into one summary,
+ * replaces input.chat_history with [summary], and removes the summarized scratchpad entries.
+ */
+async function compressScratchpad(
+    scratchpad: ScratchpadEntry[],
+    input: ChainValues,
+    model: ChatLunaChatModel,
+    conversationId: string
+): Promise<void> {
+    const tokenCounter = (text: string) => model.getNumTokens(text)
+
+    // Estimate scratchpad tokens from text content
+    const scratchpadText = formatScratchpadForCount(scratchpad)
+    const scratchpadTokens = await tokenCounter(scratchpadText)
+
+    const invocation = model.invocationParams()
+    const maxTokenLimit =
+        invocation.maxTokenLimit && invocation.maxTokenLimit > 0
+            ? invocation.maxTokenLimit
+            : model.getModelMaxContextSize()
+
+    if (!maxTokenLimit || maxTokenLimit <= 0) return
+
+    // Skip compression while the scratchpad is below 50% of the token limit
+    if (scratchpadTokens < maxTokenLimit * 0.5) return
+
+    logger.info(
+        '[ScratchpadCompress] Scratchpad tokens %d exceed 50%% of %d, compressing',
+        scratchpadTokens,
+        maxTokenLimit
+    )
+
+    // Keep the last 3 entries (most recent tool calls), compress the rest
+    const keepCount = Math.min(3, scratchpad.length)
+    const toCompress = scratchpad.slice(0, scratchpad.length - keepCount)
+
+    if (toCompress.length === 0) return
+
+    // Build transcript from chat_history + early scratchpad
+    const chatHistory = (input['chat_history'] ?? []) as BaseMessage[]
+    const chatTranscript = chatHistory
+        .map((msg) => {
+            const role = msg.getType().toUpperCase()
+            const name = msg.name ? ` (${msg.name})` : ''
+            const content =
+                typeof msg.content === 'string'
+                    ? msg.content.trim()
+                    : JSON.stringify(msg.content)
+            return `[${role}${name}]\n${content || '(empty)'}`
+        })
+        .join('\n\n---\n\n')
+
+    const scratchTranscript = formatScratchpadTranscript(toCompress)
+    const transcript = chatTranscript
+        ? `${chatTranscript}\n\n---\n\n${scratchTranscript}`
+        : scratchTranscript
+
+    if (!transcript.trim()) return
+
+    try {
+        const summary = await compressChunk(
+            model,
+            transcript,
+            conversationId
+        )
+
+        if (!summary?.text.trim()) return
+
+        // Replace chat_history with summary
+        input['chat_history'] = [
+            new HumanMessage({
+                content: summary.text.trim(),
+                name: 'infinite_context',
+                additional_kwargs: { source: 'scratchpad-compression' }
+            })
+        ]
+
+        // Trim scratchpad: remove early entries, keep recent
+        scratchpad.splice(0, scratchpad.length - keepCount)
+
+        logger.info(
+            '[ScratchpadCompress] Compressed %d entries, kept %d',
+            toCompress.length,
+            keepCount
+        )
+    } catch (e) {
+        logger.error('[ScratchpadCompress] Failed:', e)
+    }
+}
+
+function formatScratchpadForCount(entries: ScratchpadEntry[]): string {
+    return entries
+        .map((entry) => {
+            if ('messages' in entry) {
+                return entry.messages
+                    .map((m) =>
+                        typeof m.content === 'string'
+                            ? m.content
+                            : JSON.stringify(m.content)
+                    )
+                    .join('\n')
+            }
+            const obs = observationToMessageContent(entry.observation)
+            return `${entry.action.tool}: ${typeof entry.action.toolInput === 'string' ? entry.action.toolInput : JSON.stringify(entry.action.toolInput)}\n${obs}`
+        })
+        .join('\n')
+}
+
+function formatScratchpadTranscript(entries: ScratchpadEntry[]): string {
+    return entries
+        .map((entry) => {
+            if ('messages' in entry) {
+                return entry.messages
+                    .map((m) => {
+                        const content =
+                            typeof m.content === 'string'
+                                ? m.content.trim()
+                                : JSON.stringify(m.content)
+                        return `[HUMAN]\n${content}`
+                    })
+                    .join('\n\n---\n\n')
+            }
+            const toolInput =
+                typeof entry.action.toolInput === 'string'
+                    ? entry.action.toolInput
+                    : JSON.stringify(entry.action.toolInput)
+            const truncatedInput =
+                toolInput.length > 300
+                    ? toolInput.slice(0, 300) + '...'
+                    : toolInput
+            const obs = observationToMessageContent(entry.observation)
+            const truncatedObs =
+                obs.length > 500 ? obs.slice(0, 500) + '...' : obs
+            return `[AI Tool Call: ${entry.action.tool}]\n${truncatedInput}\n\n[TOOL Result]\n${truncatedObs}`
+        })
+        .join('\n\n---\n\n')
+}
+
 export async function emitAgentEvent(
     runManager: CallbackManagerForChainRun | undefined,
     configurable: AgentRuntimeConfigurable,
diff --git a/packages/core/src/llm-core/chain/infinite_context_chain.ts b/packages/core/src/llm-core/chain/infinite_context_chain.ts
index bb1a2c2ef..084950f35 100644
--- a/packages/core/src/llm-core/chain/infinite_context_chain.ts
+++ b/packages/core/src/llm-core/chain/infinite_context_chain.ts
@@ -1,57 +1,16 @@
-import { ChainValues } from '@langchain/core/utils/types'
 import { PromptTemplate } from '@langchain/core/prompts'
 import { AIMessage, type UsageMetadata } from '@langchain/core/messages'
-import { BufferMemory } from 'koishi-plugin-chatluna/llm-core/memory/langchain'
-import {
-    ChatLunaLLMCallArg,
-    ChatLunaLLMChain,
-    ChatLunaLLMChainWrapper
-} from 'koishi-plugin-chatluna/llm-core/chain/base'
+import { ChatLunaLLMChain } from 'koishi-plugin-chatluna/llm-core/chain/base'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
-import {
-    ChatLunaError,
-    ChatLunaErrorCode
-} from 'koishi-plugin-chatluna/utils/error'
 import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
 
-export interface ChatLunaInfiniteContextChainInput {
-    historyMemory: BufferMemory
-}
-
-export interface ChatLunaInfiniteContextChunkArg {
-    chunk: string
-    conversationId: string
-    signal?: AbortSignal
-}
-
-export interface ChatLunaInfiniteContextChunkResult {
+export interface CompressChunkResult {
     text: string
     usageMetadata?: UsageMetadata
 }
 
-export class ChatLunaInfiniteContextChain
-    extends ChatLunaLLMChainWrapper
-    implements ChatLunaInfiniteContextChainInput
-{
-    historyMemory: BufferMemory
-
-    private chain: ChatLunaLLMChain
-
-    constructor({
-        historyMemory,
-        chain
-    }: ChatLunaInfiniteContextChainInput & { chain: ChatLunaLLMChain }) {
-        super()
-        this.historyMemory = historyMemory
-        this.chain = chain
-    }
-
-    static fromLLM(
-        llm: ChatLunaChatModel,
-        { historyMemory }: ChatLunaInfiniteContextChainInput
-    ) {
-        const prompt =
-            PromptTemplate.fromTemplate(`You are a helpful AI assistant tasked with summarizing conversations.
+const COMPRESS_PROMPT =
+    PromptTemplate.fromTemplate(`You are a helpful AI assistant tasked with summarizing conversations.
 
 When asked to summarize, provide a detailed but concise summary of the conversation.
 Focus on information that would be helpful for continuing the conversation, including:
@@ -61,6 +20,7 @@ Focus on information that would be helpful for continuing the conversation, incl
 - What needs to be done next
 - Key user requests, constraints, or preferences that should persist
 - Important technical decisions and why they were made
+- Tool calls that were made and their results (summarize the key outcomes)
 
 Some old tool result messages may say that the original tool output expired and was removed.
 Treat those as intentional retention placeholders, not as meaningful tool output.
@@ -72,72 +32,39 @@ Do not respond to any questions in the conversation, only output the summary.
 Conversation:
 {conversation_chunk}`)
 
-        const chain = new ChatLunaLLMChain({ llm, prompt })
+export async function compressChunk(
+    model: ChatLunaChatModel,
+    transcript: string,
+    conversationId: string,
+    signal?: AbortSignal
+): Promise<CompressChunkResult | null> {
+    const trimmed = transcript?.trim()
 
-        return new ChatLunaInfiniteContextChain({
-            historyMemory,
-            chain
-        })
+    if (!trimmed) {
+        return null
     }
 
-    async compressChunk({
-        chunk,
-        conversationId,
-        signal
-    }: ChatLunaInfiniteContextChunkArg): Promise<ChatLunaInfiniteContextChunkResult | null> {
-        const trimmedChunk = chunk?.trim()
-
-        if (!trimmedChunk) {
-            return null
-        }
-
-        const result = await this.chain.invoke({
-            conversation_chunk: trimmedChunk,
-            id: conversationId,
-            stream: false,
-            signal
-        })
-
-        const rawMessage = (result['message'] ?? null) as AIMessage | null
+    const chain = new ChatLunaLLMChain({ llm: model, prompt: COMPRESS_PROMPT })
 
-        const text =
-            (result['text'] ?? '').toString().trim() ||
-            (rawMessage ? getMessageContent(rawMessage.content).trim() : '')
-
-        if (!text) {
-            return null
-        }
-
-        return {
-            text,
-            usageMetadata: rawMessage?.usage_metadata
-        }
-    }
+    const result = await chain.invoke({
+        conversation_chunk: trimmed,
+        id: conversationId,
+        stream: false,
+        signal
+    })
 
-    async call(
-        arg: ChatLunaLLMCallArg & { chunk?: string }
-    ): Promise<ChainValues> {
-        const chunk = arg['chunk'] ?? getMessageContent(arg.message.content)
+    const rawMessage = (result['message'] ?? null) as AIMessage | null
 
-        if (!chunk?.trim()) {
-            throw new ChatLunaError(
-                ChatLunaErrorCode.UNKNOWN_ERROR,
-                new Error(
-                    'Empty context chunk passed to Infinite Context chain'
-                )
-            )
-        }
+    const text =
+        (result['text'] ?? '').toString().trim() ||
+        (rawMessage ? getMessageContent(rawMessage.content).trim() : '')
 
-        return this.chain.invoke({
-            conversation_chunk: chunk,
-            id: arg.conversationId,
-            stream: arg.stream,
-            signal: arg.signal,
-            maxTokens: arg.maxToken
-        })
+    if (!text) {
+        return null
     }
 
-    get model(): ChatLunaChatModel {
-        return this.chain.llm
+    return {
+        text,
+        usageMetadata: rawMessage?.usage_metadata
     }
 }
diff --git a/packages/core/src/llm-core/chat/app.ts b/packages/core/src/llm-core/chat/app.ts
index a3fa3a0b4..ffc33c616 100644
--- a/packages/core/src/llm-core/chat/app.ts
+++ b/packages/core/src/llm-core/chat/app.ts
@@ -24,8 +24,10 @@ import {
     initModel,
     supportChatMode
 } from './helper'
-import type { CompressContextResult } from './infinite_context'
-import { InfiniteContextManager } from './infinite_context'
+import {
+    type CompressContextResult,
+    compressIfNeeded
+} from './infinite_context'
 import type {
     ArchiveRecord,
     BindingRecord,
@@ -41,7 +43,6 @@ export class ChatInterface {
     private _embeddings: ComputedRef<Embeddings>
 
     private _historyMemory?: BufferMemory
-    private _infiniteContextManager?: InfiniteContextManager
 
     private _chatCount = 0
 
@@ -58,7 +59,6 @@ export class ChatInterface {
         this._chain = undefined
         this._embeddings = undefined
         this._historyMemory = undefined
-        this._infiniteContextManager = undefined
     }
 
     private async handleChatError(
@@ -157,10 +157,17 @@ export class ChatInterface {
             hasSavedUser = true
         }
 
-        try {
-            if (this.chatluna.currentConfig.infiniteContext) {
-                const manager = this._ensureInfiniteContextManager()
-                const result = await manager?.compressIfNeeded(wrapper)
+        // Compress chat history before starting
+        if (this.chatluna.currentConfig.infiniteContext && this._chatHistory) {
+            try {
+                const result = await compressIfNeeded({
+                    chatHistory: this._chatHistory,
+                    model: wrapper.model,
+                    conversationId: this._input.conversationId,
+                    preset: this._input.preset,
+                    threshold:
+                        this.chatluna.currentConfig.infiniteContextThreshold
+                })
                 if (result?.messages) {
                     await this._chatHistory.replaceMessages(result.messages)
                 }
@@ -170,9 +177,9 @@ export class ChatInterface {
                         result
                     )
                 }
+            } catch (error) {
+                logger.error('Error compressing context:', error)
             }
-        } catch (error) {
-            logger.error('Error compressing context:', error)
         }
 
         const response = (await wrapper.call({
@@ -387,15 +394,21 @@ export class ChatInterface {
 
     async compressContext(force = false): Promise<CompressContextResult> {
         const wrapper = await this.getChatLunaLLMChainWrapper()
-        const manager = this._ensureInfiniteContextManager()
-        if (!manager) {
+        if (!this._chatHistory) {
             throw new ChatLunaError(
                 ChatLunaErrorCode.CHAT_HISTORY_INIT_ERROR,
                 new Error('Chat history is not initialized')
             )
         }
 
-        const result = await manager.compressIfNeeded(wrapper, force)
+        const result = await compressIfNeeded({
+            chatHistory: this._chatHistory,
+            model: wrapper.model,
+            conversationId: this._input.conversationId,
+            preset: this._input.preset,
+            threshold: this.chatluna.currentConfig.infiniteContextThreshold,
+            force
+        })
         if (result.messages) {
             await this._chatHistory.replaceMessages(result.messages)
         }
@@ -441,25 +454,6 @@ export class ChatInterface {
 
         return this._historyMemory
     }
-
-    private _ensureInfiniteContextManager():
-        | InfiniteContextManager
-        | undefined {
-        if (!this._chatHistory) {
-            return undefined
-        }
-
-        if (!this._infiniteContextManager) {
-            this._infiniteContextManager = new InfiniteContextManager({
-                chatHistory: this._chatHistory,
-                conversationId: this._input.conversationId,
-                preset: this._input.preset,
-                threshold: this.chatluna.currentConfig.infiniteContextThreshold
-            })
-        }
-
-        return this._infiniteContextManager
-    }
 }
 
 async function autoSummarizeTitle(
diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index 3af3c4f5f..27ad84fdd 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -1,18 +1,18 @@
-/* eslint-disable max-len */
 import {
     BaseMessage,
     HumanMessage,
     mapStoredMessageToChatMessage
 } from '@langchain/core/messages'
-import { ComputedRef } from '@vue/reactivity'
 import { logger } from 'koishi-plugin-chatluna'
-import { ChatLunaLLMChainWrapper } from 'koishi-plugin-chatluna/llm-core/chain/base'
 import { KoishiChatMessageHistory } from 'koishi-plugin-chatluna/llm-core/memory/message'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
 import { PresetTemplate } from 'koishi-plugin-chatluna/llm-core/prompt'
 import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
-import { ChatLunaInfiniteContextChain } from '../chain/infinite_context_chain'
+import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
+import { countMessagesTokens } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
+import { compressChunk } from '../chain/infinite_context_chain'
 import type { ChatLunaMessageMeta } from '../../types'
+import { ComputedRef } from '@vue/reactivity'
 
 export interface CompressContextResult {
     inputTokens: number
@@ -25,298 +25,301 @@ export interface CompressContextResult {
     messages?: BaseMessage[]
 }
 
-function formatTranscript(messages: BaseMessage[]) {
-    return messages
-        .map((message) => {
-            const role = message.getType().toUpperCase()
-            const name = message.name ? ` (${message.name})` : ''
-            const content = getMessageContent(message.content).trim()
-            return `[${role}${name}]\n${content || '(empty)'}`
-        })
-        .join('\n\n---\n\n')
-}
-
-export interface InfiniteContextManagerOptions {
+export interface CompressContextOptions {
     chatHistory: KoishiChatMessageHistory
+    model: ChatLunaChatModel
     conversationId: string
     preset?: ComputedRef<PresetTemplate>
     threshold?: number
+    force?: boolean
 }
 
-export class InfiniteContextManager {
-    private _chain?: ChatLunaInfiniteContextChain
-
-    constructor(private readonly options: InfiniteContextManagerOptions) {}
-
-    async compressIfNeeded(
-        wrapper: ChatLunaLLMChainWrapper,
-        force = false
-    ): Promise<CompressContextResult> {
-        const model = wrapper.model
-
-        if (!model) {
-            return {
-                inputTokens: 0,
-                outputTokens: 0,
-                reducedTokens: 0,
-                reducedPercent: 0,
-                compressed: false,
-                originalMessageCount: 0,
-                remainingMessageCount: 0
-            }
-        }
-
-        const messages = await this.options.chatHistory.getMessages()
-
-        if (messages.length === 0) {
-            return {
-                inputTokens: 0,
-                outputTokens: 0,
-                reducedTokens: 0,
-                reducedPercent: 0,
-                compressed: false,
-                originalMessageCount: 0,
-                remainingMessageCount: 0
-            }
-        }
+/**
+ * Compress chat history when token usage exceeds the threshold (`force` skips that check).
+ * Produces structured output: [summary message, ...recent messages].
+ */
+export async function compressIfNeeded(
+    opts: CompressContextOptions
+): Promise<CompressContextResult> {
+    const { chatHistory, model, conversationId, force } = opts
 
-        const inputTokens = await this._countMessagesTokens(model, messages)
-        const expiredToolResultText =
-            'This tool result expired after 1 hour, so the original output was removed.'
-        let compactedCount = 0
-        const compactedIndexes = new Set<number>()
+    const messages = await chatHistory.getMessages()
 
-        for (let idx = 0; idx < messages.length; idx++) {
-            const message = messages[idx]
-
-            if (message.getType() !== 'tool') {
-                continue
-            }
-
-            const meta = message.response_metadata?.chatluna as
-                | ChatLunaMessageMeta
-                | undefined
-
-            if (meta?.createdAt == null) {
-                continue
-            }
-
-            if (Date.now() - new Date(meta.createdAt).getTime() < 3600000) {
-                continue
-            }
-
-            if (
-                getMessageContent(message.content).trim() ===
-                expiredToolResultText
-            ) {
-                continue
-            }
-
-            compactedCount++
-            compactedIndexes.add(idx)
-        }
+    if (messages.length === 0) {
+        return emptyResult()
+    }
 
-        const compactedMessages =
-            compactedCount > 0
-                ? messages.map((message, idx) => {
-                      const cloned = mapStoredMessageToChatMessage(
-                          message.toDict()
-                      )
-
-                      if (compactedIndexes.has(idx)) {
-                          cloned.content = expiredToolResultText
-                      }
-
-                      return cloned
-                  })
-                : messages
-        const nextMessages = compactedCount > 0 ? compactedMessages : messages
-        const compactedTokens =
-            compactedCount > 0
-                ? await this._countMessagesTokens(model, nextMessages)
-                : inputTokens
-        let presetTokens = 0
-        let threshold: number | undefined
-
-        if (compactedCount > 0) {
-            logger.info(
-                '[InfiniteContext] Replaced %d expired tool results before compression',
-                compactedCount
-            )
-        }
+    // Step 1: compact expired tool results in-place
+    const compacted = compactExpiredToolResults(messages)
 
-        if (!force) {
-            const invocation = model.invocationParams()
-            const maxTokenLimit =
-                invocation.maxTokenLimit && invocation.maxTokenLimit > 0
-                    ? invocation.maxTokenLimit
-                    : model.getModelMaxContextSize()
-
-            if (!maxTokenLimit || maxTokenLimit <= 0) {
-                return {
-                    inputTokens,
-                    outputTokens: compactedTokens,
-                    reducedTokens: inputTokens - compactedTokens,
-                    reducedPercent:
-                        inputTokens > 0
-                            ? ((inputTokens - compactedTokens) / inputTokens) *
-                              100
-                            : 0,
-                    compressed: false,
-                    originalMessageCount: messages.length,
-                    remainingMessageCount: nextMessages.length,
-                    messages: compactedCount > 0 ? nextMessages : undefined
-                }
-            }
+    // Step 2: count tokens
+    const tokenCounter = (text: string) => model.getNumTokens(text)
+    const inputTokens = await countMessagesTokens(compacted, tokenCounter)
 
-            const presetMessages = Array.isArray(
-                this.options.preset?.value?.messages
-            )
-                ? (this.options.preset?.value.messages as BaseMessage[])
-                : []
-
-            presetTokens = await this._countMessagesTokens(
-                model,
-                presetMessages
-            )
-            threshold = Math.floor(
-                maxTokenLimit * (this.options.threshold ?? 0.85)
-            )
-
-            if (compactedTokens + presetTokens <= threshold) {
-                return {
-                    inputTokens,
-                    outputTokens: compactedTokens,
-                    reducedTokens: inputTokens - compactedTokens,
-                    reducedPercent:
-                        inputTokens > 0
-                            ? ((inputTokens - compactedTokens) / inputTokens) *
-                              100
-                            : 0,
-                    compressed: false,
-                    originalMessageCount: messages.length,
-                    remainingMessageCount: nextMessages.length,
-                    messages: compactedCount > 0 ? nextMessages : undefined
-                }
-            }
+    // Step 3: determine if compression is needed
+    if (!force) {
+        const invocation = model.invocationParams()
+        const maxTokenLimit =
+            invocation.maxTokenLimit && invocation.maxTokenLimit > 0
+                ? invocation.maxTokenLimit
+                : model.getModelMaxContextSize()
 
-            logger.info(
-                '[InfiniteContext] Start compression with history tokens: %d, total tokens: %d, threshold: %d',
-                compactedTokens,
-                compactedTokens + presetTokens,
-                threshold
-            )
-        } else {
-            logger.info(
-                '[InfiniteContext] Start manual compression with history tokens: %d',
-                compactedTokens
-            )
-        }
-
-        const transcript = formatTranscript(nextMessages)
-
-        if (!transcript.trim()) {
+        if (!maxTokenLimit || maxTokenLimit <= 0) {
             return {
+                ...emptyResult(),
                 inputTokens,
-                outputTokens: compactedTokens,
-                reducedTokens: inputTokens - compactedTokens,
-                reducedPercent:
-                    inputTokens > 0
-                        ? ((inputTokens - compactedTokens) / inputTokens) * 100
-                        : 0,
-                compressed: false,
                 originalMessageCount: messages.length,
-                remainingMessageCount: nextMessages.length,
-                messages: compactedCount > 0 ? nextMessages : undefined
+                remainingMessageCount: compacted.length,
+                messages:
+                    compacted.length !== messages.length
+                        ? compacted
+                        : undefined
             }
         }
 
-        const summary = await this._ensureInfiniteContextChain(
-            wrapper
-        ).compressChunk({
-            chunk: transcript,
-            conversationId: this.options.conversationId
-        })
+        const presetMessages = Array.isArray(opts.preset?.value?.messages)
+            ? (opts.preset.value.messages as BaseMessage[])
+            : []
+        const presetTokens = await countMessagesTokens(
+            presetMessages,
+            tokenCounter
+        )
+        const threshold = Math.floor(
+            maxTokenLimit * (opts.threshold ?? 0.85)
+        )
 
-        if (!summary?.text.trim()) {
+        if (inputTokens + presetTokens <= threshold) {
             return {
+                ...emptyResult(),
                 inputTokens,
-                outputTokens: compactedTokens,
-                reducedTokens: inputTokens - compactedTokens,
-                reducedPercent:
-                    inputTokens > 0
-                        ? ((inputTokens - compactedTokens) / inputTokens) * 100
-                        : 0,
-                compressed: false,
                 originalMessageCount: messages.length,
-                remainingMessageCount: nextMessages.length,
-                messages: compactedCount > 0 ? nextMessages : undefined
+                remainingMessageCount: compacted.length,
+                messages:
+                    compacted.length !== messages.length
+                        ? compacted
+                        : undefined
             }
         }
 
-        const message = new HumanMessage({
-            content: summary.text.trim(),
-            name: 'infinite_context',
-            additional_kwargs: {
-                source: 'infinite-context'
-            }
-        })
-
-        const outputTokens = summary.usageMetadata?.output_tokens ?? 0
-        const reducedTokens = inputTokens - outputTokens
-        const reducedPercent =
-            inputTokens > 0 ? (reducedTokens / inputTokens) * 100 : 0
-
         logger.info(
-            '[InfiniteContext] Compressed history from %d to %d (-%d, %s%%)',
+            '[InfiniteContext] Start compression: history=%d tokens, total=%d, threshold=%d',
             inputTokens,
-            outputTokens,
-            reducedTokens,
-            reducedPercent.toFixed(2)
+            inputTokens + presetTokens,
+            threshold
         )
+    } else {
+        logger.info(
+            '[InfiniteContext] Manual compression: history=%d tokens',
+            inputTokens
+        )
+    }
 
-        if (threshold != null && outputTokens + presetTokens > threshold) {
-            logger.warn(
-                '[InfiniteContext] Tokens remain above threshold after compression: %d > %d',
-                outputTokens + presetTokens,
-                threshold
-            )
+    // Step 4: split messages into [to-compress, to-keep]
+    // Keep the most recent complete rounds; splitMessages picks the cut (no threshold is passed)
+    const { toCompress, toKeep } = splitMessages(compacted)
+
+    if (toCompress.length === 0) {
+        return {
+            ...emptyResult(),
+            inputTokens,
+            originalMessageCount: messages.length,
+            remainingMessageCount: compacted.length,
+            messages:
+                compacted.length !== messages.length ? compacted : undefined
         }
+    }
 
+    // Step 5: generate summary from early messages
+    const transcript = formatTranscript(toCompress)
+
+    if (!transcript.trim()) {
         return {
+            ...emptyResult(),
             inputTokens,
-            outputTokens,
-            reducedTokens,
-            reducedPercent,
-            compressed: true,
             originalMessageCount: messages.length,
-            remainingMessageCount: 1,
-            messages: [message]
+            remainingMessageCount: compacted.length,
+            messages:
+                compacted.length !== messages.length ? compacted : undefined
         }
     }
 
-    private async _countMessagesTokens(
-        model: ChatLunaChatModel,
-        messages: BaseMessage[]
-    ): Promise<number> {
-        let total = 0
+    const summary = await compressChunk(model, transcript, conversationId)
+
+    if (!summary?.text.trim()) {
+        return {
+            ...emptyResult(),
+            inputTokens,
+            originalMessageCount: messages.length,
+            remainingMessageCount: compacted.length,
+            messages:
+                compacted.length !== messages.length ? compacted : undefined
+        }
+    }
 
-        for (const message of messages) {
-            total += await model.countMessageTokens(message)
+    // Step 6: build structured output
+    const summaryMessage = new HumanMessage({
+        content: summary.text.trim(),
+        name: 'infinite_context',
+        additional_kwargs: {
+            source: 'infinite-context'
         }
+    })
+
+    const resultMessages = [summaryMessage, ...toKeep]
+    const outputTokens = await countMessagesTokens(resultMessages, tokenCounter)
+    const reducedTokens = inputTokens - outputTokens
+    const reducedPercent =
+        inputTokens > 0 ? (reducedTokens / inputTokens) * 100 : 0
+
+    logger.info(
+        '[InfiniteContext] Compressed: %d → %d tokens (-%d, %s%%), kept %d recent messages',
+        inputTokens,
+        outputTokens,
+        reducedTokens,
+        reducedPercent.toFixed(2), // pre-formatted: logger has no %.2f
+        toKeep.length
+    )
+
+    return {
+        inputTokens,
+        outputTokens,
+        reducedTokens,
+        reducedPercent,
+        compressed: true,
+        originalMessageCount: messages.length,
+        remainingMessageCount: resultMessages.length,
+        messages: resultMessages
+    }
+}
 
-        return total
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function emptyResult(): CompressContextResult {
+    return {
+        inputTokens: 0,
+        outputTokens: 0,
+        reducedTokens: 0,
+        reducedPercent: 0,
+        compressed: false,
+        originalMessageCount: 0,
+        remainingMessageCount: 0
     }
+}
+
+/**
+ * Replace expired (>1h) tool result content with a placeholder.
+ * Returns a new array if any were compacted, otherwise the original.
+ */
+function compactExpiredToolResults(messages: BaseMessage[]): BaseMessage[] {
+    const placeholder =
+        'This tool result expired after 1 hour, so the original output was removed.'
+    let changed = false
+
+    const result = messages.map((msg) => {
+        if (msg.getType() !== 'tool') return msg
+
+        const meta = msg.response_metadata?.chatluna as
+            | ChatLunaMessageMeta
+            | undefined
+        if (meta?.createdAt == null) return msg
+        if (Date.now() - new Date(meta.createdAt).getTime() < 3600000)
+            return msg
+        if (getMessageContent(msg.content).trim() === placeholder) return msg
+
+        changed = true
+        const cloned = msg.toDict()
+        cloned.data.content = placeholder
+        return mapStoredMessageToChatMessage(cloned)
+    })
+
+    if (!changed) return messages
+
+    logger.info(
+        '[InfiniteContext] Compacted %d expired tool results',
+        result.filter((_, i) => result[i] !== messages[i]).length
+    )
+
+    return result
+}
 
-    private _ensureInfiniteContextChain(
-        wrapper: ChatLunaLLMChainWrapper
-    ): ChatLunaInfiniteContextChain {
-        if (!this._chain || this._chain.model !== wrapper.model) {
-            this._chain = ChatLunaInfiniteContextChain.fromLLM(wrapper.model, {
-                historyMemory: wrapper.historyMemory
-            })
+/**
+ * Split messages into [toCompress, toKeep].
+ * Keep the most recent complete conversation rounds.
+ * A round starts at a user message (HumanMessage or ChatLuna user message).
+ */
+function splitMessages(messages: BaseMessage[]): {
+    toCompress: BaseMessage[]
+    toKeep: BaseMessage[]
+} {
+    // Build rounds from the end
+    const rounds: BaseMessage[][] = []
+    let current: BaseMessage[] = []
+
+    for (let i = messages.length - 1; i >= 0; i--) {
+        const msg = messages[i]
+        current.unshift(msg)
+
+        const isRoundStart =
+            isChatLunaUserMessage(msg) || msg.getType() === 'human'
+
+        if (isRoundStart && i > 0) {
+            rounds.unshift(current)
+            current = []
         }
+    }
 
-        return this._chain
+    if (current.length > 0) {
+        rounds.unshift(current)
     }
+
+    // Keep at least the last round, at most last 3 rounds
+    const keepCount = Math.min(Math.max(1, Math.ceil(rounds.length * 0.3)), 3)
+    const splitIdx = rounds.length - keepCount
+
+    const toCompress = rounds.slice(0, splitIdx).flat()
+    const toKeep = rounds.slice(splitIdx).flat()
+
+    return { toCompress, toKeep }
+}
+
+/**
+ * Format messages into a transcript string for the LLM summarizer.
+ * Preserves tool-call structure information.
+ */
+function formatTranscript(messages: BaseMessage[]): string {
+    return messages
+        .map((msg) => {
+            const role = msg.getType().toUpperCase()
+            const name = msg.name ? ` (${msg.name})` : ''
+            const content = getMessageContent(msg.content).trim()
+
+            // Include tool_calls info for AI messages
+            const toolCalls = msg['tool_calls'] as
+                | Array<{ name: string; args: unknown }>
+                | undefined
+            let toolInfo = ''
+            if (toolCalls?.length > 0) {
+                toolInfo =
+                    '\nTool calls: ' +
+                    toolCalls
+                        .map((tc) => {
+                            const args = JSON.stringify(tc.args)
+                            const truncated =
+                                args.length > 200
+                                    ? args.slice(0, 200) + '...'
+                                    : args
+                            return `${tc.name}(${truncated})`
+                        })
+                        .join(', ')
+            }
+
+            // Include tool_call_id for tool messages
+            const toolCallId = msg['tool_call_id'] as string | undefined
+            const idInfo = toolCallId ? ` [call_id: ${toolCallId}]` : ''
+
+            return `[${role}${name}${idInfo}]\n${content || '(empty)'}${toolInfo}`
+        })
+        .join('\n\n---\n\n')
 }
diff --git a/packages/core/src/llm-core/platform/model.ts b/packages/core/src/llm-core/platform/model.ts
index ad657c126..d04fcd041 100644
--- a/packages/core/src/llm-core/platform/model.ts
+++ b/packages/core/src/llm-core/platform/model.ts
@@ -806,7 +806,8 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
             return rounds
         }
 
-        const countMessagesTokens = async (items: BaseMessage[]) => {
+        const tokenCounter = (text: string) => this.getNumTokens(text)
+        const countRoundTokens = async (items: BaseMessage[]) => {
             let tokens = 0
             for (const item of items) {
                 tokens += await this.countMessageTokens(item)
@@ -818,31 +819,106 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
         const selectedRounds: BaseMessage[][] = []
         let truncated = false
 
-        for (let i = conversationRounds.length - 1; i >= 0; i--) {
-            const round = conversationRounds[i]
-            const roundTokens = await countMessagesTokens(round)
-            const exceedsLimit =
-                maxTokenLimit != null && maxTokenLimit > 0
-                    ? totalTokens + roundTokens > maxTokenLimit
-                    : false
-
-            if (exceedsLimit && selectedRounds.length > 0) {
-                truncated = true
+        // Find baseline: last AI message with usage_metadata in the conversation
+        let baselineIdx = -1
+        let baselineTokens = 0
+        for (let i = messages.length - 1; i >= 0; i--) {
+            if (messages[i].getType() !== 'ai') continue
+            const usage = (messages[i] as AIMessage).usage_metadata
+            if (usage?.input_tokens > 0) {
+                baselineIdx = i
+                // input_tokens includes system messages we already counted
+                baselineTokens = usage.input_tokens - totalTokens
                 break
             }
+        }
 
-            totalTokens += roundTokens
-            selectedRounds.unshift(round)
+        if (baselineIdx >= 0 && maxTokenLimit != null && maxTokenLimit > 0) {
+            // Find which round the baseline falls in
+            let msgCount = 0
+            let baselineRoundIdx = -1
+            for (let r = 0; r < conversationRounds.length; r++) {
+                msgCount += conversationRounds[r].length
+                if (msgCount > baselineIdx) {
+                    baselineRoundIdx = r
+                    break
+                }
+            }
+            if (baselineRoundIdx < 0) {
+                baselineRoundIdx = conversationRounds.length - 1
+            }
 
-            if (exceedsLimit) {
-                truncated = true
-                break
+            // Iterate from end; when we reach baseline region, add all at once
+            for (let i = conversationRounds.length - 1; i >= 0; i--) {
+                if (i <= baselineRoundIdx && selectedRounds.length === 0) {
+                    // Bulk add all rounds up to baseline
+                    const exceedsLimit =
+                        totalTokens + baselineTokens > maxTokenLimit
+
+                    if (exceedsLimit && selectedRounds.length > 0) {
+                        truncated = true
+                        break
+                    }
+
+                    totalTokens += baselineTokens
+                    for (let j = 0; j <= baselineRoundIdx; j++) {
+                        selectedRounds.unshift(
+                            conversationRounds[baselineRoundIdx - j]
+                        )
+                    }
+
+                    if (exceedsLimit) {
+                        truncated = true
+                    }
+                    break
+                }
+
+                const round = conversationRounds[i]
+                const roundTokens = await countRoundTokens(round)
+                const exceedsLimit =
+                    totalTokens + roundTokens > maxTokenLimit
+
+                if (exceedsLimit && selectedRounds.length > 0) {
+                    truncated = true
+                    break
+                }
+
+                totalTokens += roundTokens
+                selectedRounds.unshift(round)
+
+                if (exceedsLimit) {
+                    truncated = true
+                    break
+                }
+            }
+        } else {
+            // No baseline or no limit, fallback to counting each round
+            for (let i = conversationRounds.length - 1; i >= 0; i--) {
+                const round = conversationRounds[i]
+                const roundTokens = await countRoundTokens(round)
+                const exceedsLimit =
+                    maxTokenLimit != null && maxTokenLimit > 0
+                        ? totalTokens + roundTokens > maxTokenLimit
+                        : false
+
+                if (exceedsLimit && selectedRounds.length > 0) {
+                    truncated = true
+                    break
+                }
+
+                totalTokens += roundTokens
+                selectedRounds.unshift(round)
+
+                if (exceedsLimit) {
+                    truncated = true
+                    break
+                }
             }
         }
 
         if (conversationRounds.length > 0 && selectedRounds.length === 0) {
             const round = conversationRounds[conversationRounds.length - 1]
-            totalTokens += await countMessagesTokens(round)
+            totalTokens += await countRoundTokens(round)
             selectedRounds.unshift(round)
             truncated = maxTokenLimit != null && maxTokenLimit > 0
         }
diff --git a/packages/core/src/llm-core/prompt/chat_history.ts b/packages/core/src/llm-core/prompt/chat_history.ts
index 43a1d69cc..4f4fe7955 100644
--- a/packages/core/src/llm-core/prompt/chat_history.ts
+++ b/packages/core/src/llm-core/prompt/chat_history.ts
@@ -1,4 +1,4 @@
-import { BaseMessage } from '@langchain/core/messages'
+import { AIMessage, BaseMessage } from '@langchain/core/messages'
 import {
     ChatLunaContextManagerService,
     PromptContextRuntime,
@@ -16,6 +16,9 @@ import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
  * Truncates conversation history to fit within the token budget, keeping
  * the most recent complete turns.  Also accounts for input + scratchpad
  * token consumption so that downstream stages know the remaining budget.
+ *
+ * Uses usage_metadata from AI messages as a baseline to avoid re-counting
+ * tokens for messages that were already counted by the LLM.
  */
 export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
     return async (runtime: PromptContextRuntime, next) => {
@@ -64,27 +67,98 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
         let truncated = false
         let usedTokens = runtime.usedTokens
 
-        for (let i = rounds.length - 1; i >= 0; i--) {
-            const round = rounds[i]
-            const roundTokens = await countMessagesTokens(
-                round,
-                runtime.tokenCounter
-            )
-            const exceedsLimit = hasValidLimit
-                ? usedTokens + roundTokens > availableLimit
-                : false
+        // Find baseline: last AI message with usage_metadata in chatHistory
+        // Everything up to and including that message has a known token count
+        const baseline = findBaseline(chatHistory, runtime.usedTokens)
 
-            if (exceedsLimit && selectedRounds.length > 0) {
-                truncated = true
-                break
+        if (baseline && hasValidLimit) {
+            // We know the total tokens for all messages up to baseline index.
+            // Find which rounds are fully before the baseline, which are after.
+            let msgIdx = 0
+            let baselineRoundIdx = -1
+            for (let r = 0; r < rounds.length; r++) {
+                msgIdx += rounds[r].length
+                if (msgIdx > baseline.idx) {
+                    baselineRoundIdx = r
+                    break
+                }
             }
+            if (baselineRoundIdx < 0) baselineRoundIdx = rounds.length - 1
+
+            // Rounds after baselineRoundIdx: count individually
+            // Rounds 0..baselineRoundIdx (inclusive): total is baseline.tokens
+            // We iterate from the end, adding rounds until budget is exceeded
+            for (let i = rounds.length - 1; i >= 0; i--) {
+                const round = rounds[i]
+                let roundTokens: number
+
+                if (i <= baselineRoundIdx && selectedRounds.length === 0) {
+                    // First time hitting baseline region from the end:
+                    // all rounds [0..baselineRoundIdx] together = baseline.tokens
+                    // Add them all at once
+                    const bulkRounds = rounds.slice(0, baselineRoundIdx + 1)
+                    const bulkTokens = baseline.tokens
+                    const exceedsLimit = usedTokens + bulkTokens > availableLimit
 
-            usedTokens += roundTokens
-            selectedRounds.unshift(round)
+                    if (exceedsLimit && selectedRounds.length > 0) {
+                        truncated = true
+                        break
+                    }
 
-            if (exceedsLimit) {
-                truncated = true
-                break
+                    usedTokens += bulkTokens
+                    for (let j = 0; j <= baselineRoundIdx; j++) {
+                        selectedRounds.unshift(bulkRounds[baselineRoundIdx - j])
+                    }
+
+                    if (exceedsLimit) {
+                        truncated = true
+                    }
+                    break
+                }
+
+                roundTokens = await countMessagesTokens(
+                    round,
+                    runtime.tokenCounter
+                )
+                const exceedsLimit = usedTokens + roundTokens > availableLimit
+
+                if (exceedsLimit && selectedRounds.length > 0) {
+                    truncated = true
+                    break
+                }
+
+                usedTokens += roundTokens
+                selectedRounds.unshift(round)
+
+                if (exceedsLimit) {
+                    truncated = true
+                    break
+                }
+            }
+        } else {
+            // No baseline, fallback to counting each round
+            for (let i = rounds.length - 1; i >= 0; i--) {
+                const round = rounds[i]
+                const roundTokens = await countMessagesTokens(
+                    round,
+                    runtime.tokenCounter
+                )
+                const exceedsLimit = hasValidLimit
+                    ? usedTokens + roundTokens > availableLimit
+                    : false
+
+                if (exceedsLimit && selectedRounds.length > 0) {
+                    truncated = true
+                    break
+                }
+
+                usedTokens += roundTokens
+                selectedRounds.unshift(round)
+
+                if (exceedsLimit) {
+                    truncated = true
+                    break
+                }
             }
         }
 
@@ -117,6 +191,34 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
     }
 }
 
+/**
+ * Find the last AI message with usage_metadata.input_tokens in the history.
+ * Returns the index and the estimated history-only tokens up to that point.
+ */
+function findBaseline(
+    messages: BaseMessage[],
+    preAccountedTokens: number
+): { idx: number; tokens: number } | null {
+    for (let i = messages.length - 1; i >= 0; i--) {
+        const msg = messages[i]
+        if (msg.getType() !== 'ai') continue
+
+        const usage = (msg as AIMessage).usage_metadata
+        if (usage?.input_tokens > 0) {
+            // input_tokens includes system prompts + history + input.
+            // preAccountedTokens already covers system + input + scratchpad.
+            // The history portion is roughly: input_tokens - (system + input)
+            // But we don't know exact system tokens here. Use a simpler model:
+            // The baseline tells us "all messages up to this AI response
+            // plus the AI response itself" consumed input_tokens total input.
+            // For truncation purposes, we treat it as the token cost of
+            // messages[0..i] in the history array.
+            return { idx: i, tokens: usage.input_tokens - preAccountedTokens }
+        }
+    }
+    return null
+}
+
 /**
  * Split a flat message list into conversation rounds. Marked ChatLuna user
  * messages start rounds; old unmarked human messages still start rounds.
diff --git a/packages/core/src/llm-core/prompt/system_prompts.ts b/packages/core/src/llm-core/prompt/system_prompts.ts
index 5269dcbc4..06a9d963e 100644
--- a/packages/core/src/llm-core/prompt/system_prompts.ts
+++ b/packages/core/src/llm-core/prompt/system_prompts.ts
@@ -6,49 +6,12 @@ import {
     PromptPipelineMiddleware
 } from './context_manager'
 import { logger } from 'koishi-plugin-chatluna'
-import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
-import { messageTypeToOpenAIRole } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
-
-// ---------------------------------------------------------------------------
-// Token counting helpers (shared by multiple middlewares)
-// ---------------------------------------------------------------------------
-
-export async function countMessageTokens(
-    message: BaseMessage,
-    tokenCounter: (text: string) => Promise<number>
-): Promise<number> {
-    let content = getMessageContent(message.content)
-
-    if (
-        content.includes('![image]') &&
-        content.includes('base64') &&
-        message.additional_kwargs?.['images']
-    ) {
-        content = content.replaceAll(/!\[.*?\]\(.*?\)/g, '')
-        message.content = content
-    }
-
-    let result =
-        (await tokenCounter(getMessageContent(message.content))) +
-        (await tokenCounter(messageTypeToOpenAIRole(message.getType())))
-
-    if (message.name) {
-        result += await tokenCounter(message.name)
-    }
-
-    return result
-}
+import {
+    countMessageTokens,
+    countMessagesTokens
+} from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
 
-export async function countMessagesTokens(
-    messages: BaseMessage[],
-    tokenCounter: (text: string) => Promise<number>
-): Promise<number> {
-    let total = 0
-    for (const message of messages) {
-        total += await countMessageTokens(message, tokenCounter)
-    }
-    return total
-}
+export { countMessageTokens, countMessagesTokens }
 
 // ---------------------------------------------------------------------------
 // system_prompts pipeline middleware
diff --git a/packages/core/src/llm-core/utils/count_tokens.ts b/packages/core/src/llm-core/utils/count_tokens.ts
index 3e9d3a489..6bd6c5c8c 100644
--- a/packages/core/src/llm-core/utils/count_tokens.ts
+++ b/packages/core/src/llm-core/utils/count_tokens.ts
@@ -1,6 +1,7 @@
-import { MessageType } from '@langchain/core/messages'
+import { AIMessage, BaseMessage, MessageType } from '@langchain/core/messages'
 import { type TiktokenModel } from 'js-tiktoken/lite'
 import { encodingForModel } from './tiktoken'
+import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
 
 // https://www.npmjs.com/package/js-tiktoken
 
@@ -207,3 +208,89 @@ export function parseRawModelName(
 
     return [value.slice(0, index), value.slice(index + 1)]
 }
+
+// ---------------------------------------------------------------------------
+// Smart token counting with usage_metadata optimization
+// ---------------------------------------------------------------------------
+
+/**
+ * Count tokens for a single message using a tokenCounter function.
+ * Strips base64 image markdown before counting.
+ */
+export async function countMessageTokens(
+    message: BaseMessage,
+    tokenCounter: (text: string) => Promise<number>
+): Promise<number> {
+    let content = getMessageContent(message.content)
+
+    if (
+        content.includes('![image]') &&
+        content.includes('base64') &&
+        message.additional_kwargs?.['images']
+    ) {
+        content = content.replaceAll(/!\[.*?\]\(.*?\)/g, '')
+    }
+
+    let result =
+        (await tokenCounter(content)) +
+        (await tokenCounter(messageTypeToOpenAIRole(message.getType())))
+
+    if (message.name) {
+        result += await tokenCounter(message.name)
+    }
+
+    return result
+}
+
+/**
+ * Count tokens for a list of messages, using usage_metadata from the last
+ * AI message as a baseline when available.
+ *
+ * If an AI message has usage_metadata.input_tokens, that value represents
+ * the total input tokens at that LLM call (all prior messages + system prompt).
+ * We use the last such message as a baseline and only count messages after it.
+ *
+ * @param messages - The message list to count
+ * @param tokenCounter - Function to count tokens for a string
+ * @param presetTokens - Token count of system/preset messages (subtracted from
+ *                       baseline since usage_metadata.input_tokens includes them)
+ */
+export async function countMessagesTokens(
+    messages: BaseMessage[],
+    tokenCounter: (text: string) => Promise<number>,
+    presetTokens = 0
+): Promise<number> {
+    // Find the last AI message with usage_metadata.input_tokens
+    let baselineIdx = -1
+    let baselineTokens = 0
+
+    for (let i = messages.length - 1; i >= 0; i--) {
+        const msg = messages[i]
+        if (msg.getType() !== 'ai') continue
+
+        const usage = (msg as AIMessage).usage_metadata
+        if (usage?.input_tokens > 0) {
+            baselineIdx = i
+            // input_tokens includes preset, subtract to get history-only tokens
+            baselineTokens = usage.input_tokens - presetTokens
+            break
+        }
+    }
+
+    if (baselineIdx >= 0 && baselineIdx < messages.length - 1) {
+        // Count only messages from the baseline AI message onward
+        // (the AI message's output becomes part of next call's input)
+        let tail = 0
+        for (let i = baselineIdx; i < messages.length; i++) {
+            tail += await countMessageTokens(messages[i], tokenCounter)
+        }
+        return Math.max(baselineTokens + tail, 0)
+    }
+
+    // Fallback: count all messages
+    let total = 0
+    for (const msg of messages) {
+        total += await countMessageTokens(msg, tokenCounter)
+    }
+    return total
+}
diff --git a/packages/core/src/middlewares/chat/read_chat_message.ts b/packages/core/src/middlewares/chat/read_chat_message.ts
index 7c97418a0..e68be9a02 100644
--- a/packages/core/src/middlewares/chat/read_chat_message.ts
+++ b/packages/core/src/middlewares/chat/read_chat_message.ts
@@ -249,7 +249,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
                     : undefined
 
             const isInstalledImageService =
-                ctx.chatluna.getPlugin('chatluna-multimodal-service') != null
+                ctx.chatluna.getPlugin('multimodal-service') != null
 
             if (
                 parsedModelInfo?.value != null &&

From 384630d8b451f4ee3c2ef46b489a818e4f718418 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 21:43:11 +0800
Subject: [PATCH 2/9] fix: scratchpad compression threshold 50% -> 84%

---
 packages/core/src/llm-core/agent/legacy-executor.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index bedb5cfbe..b2aac575e 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -385,11 +385,11 @@ async function compressScratchpad(
 
     if (!maxTokenLimit || maxTokenLimit <= 0) return
 
-    // Only compress if scratchpad exceeds 50% of context window
-    if (scratchpadTokens < maxTokenLimit * 0.5) return
+    // Only compress if scratchpad exceeds 84% of context window
+    if (scratchpadTokens < maxTokenLimit * 0.84) return
 
     logger.info(
-        '[ScratchpadCompress] Scratchpad tokens %d exceed 50%% of %d, compressing',
+        '[ScratchpadCompress] Scratchpad tokens %d exceed 84%% of %d, compressing',
         scratchpadTokens,
         maxTokenLimit
     )

From 9256c50b336e911520e271b710ff55a55dbf17d9 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 21:47:02 +0800
Subject: [PATCH 3/9] [Refactor] streamline context compression

---
 .../src/llm-core/agent/legacy-executor.ts     | 19 ++++++++++++-------
 .../src/llm-core/chat/infinite_context.ts     | 14 ++++----------
 packages/core/src/llm-core/platform/model.ts  |  4 +---
 .../core/src/llm-core/prompt/chat_history.ts  |  6 +++---
 .../src/llm-core/prompt/system_prompts.ts     |  6 +++---
 5 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index b2aac575e..fef902b1a 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -1,5 +1,10 @@
 import { CallbackManagerForChainRun } from '@langchain/core/callbacks/manager'
-import { AIMessage, AIMessageChunk, BaseMessage, HumanMessage } from '@langchain/core/messages'
+import {
+    AIMessage,
+    AIMessageChunk,
+    BaseMessage,
+    HumanMessage
+} from '@langchain/core/messages'
 import { isDirectToolOutput } from '@langchain/core/messages/tool'
 import { OutputParserException } from '@langchain/core/output_parsers'
 import {
@@ -422,11 +427,7 @@ async function compressScratchpad(
     if (!transcript.trim()) return
 
     try {
-        const summary = await compressChunk(
-            model,
-            transcript,
-            conversationId
-        )
+        const summary = await compressChunk(model, transcript, conversationId)
 
         if (!summary?.text.trim()) return
 
@@ -465,7 +466,11 @@ function formatScratchpadForCount(entries: ScratchpadEntry[]): string {
                     .join('\n')
             }
             const obs = observationToMessageContent(entry.observation)
-            return `${entry.action.tool}: ${typeof entry.action.toolInput === 'string' ? entry.action.toolInput : JSON.stringify(entry.action.toolInput)}\n${obs}`
+            const input =
+                typeof entry.action.toolInput === 'string'
+                    ? entry.action.toolInput
+                    : JSON.stringify(entry.action.toolInput)
+            return `${entry.action.tool}: ${input}\n${obs}`
         })
         .join('\n')
 }
diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index 27ad84fdd..8f2d3545c 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -71,9 +71,7 @@ export async function compressIfNeeded(
                 originalMessageCount: messages.length,
                 remainingMessageCount: compacted.length,
                 messages:
-                    compacted.length !== messages.length
-                        ? compacted
-                        : undefined
+                    compacted.length !== messages.length ? compacted : undefined
             }
         }
 
@@ -84,9 +82,7 @@ export async function compressIfNeeded(
             presetMessages,
             tokenCounter
         )
-        const threshold = Math.floor(
-            maxTokenLimit * (opts.threshold ?? 0.85)
-        )
+        const threshold = Math.floor(maxTokenLimit * (opts.threshold ?? 0.85))
 
         if (inputTokens + presetTokens <= threshold) {
             return {
@@ -95,9 +91,7 @@ export async function compressIfNeeded(
                 originalMessageCount: messages.length,
                 remainingMessageCount: compacted.length,
                 messages:
-                    compacted.length !== messages.length
-                        ? compacted
-                        : undefined
+                    compacted.length !== messages.length ? compacted : undefined
             }
         }
 
@@ -297,7 +291,7 @@ function formatTranscript(messages: BaseMessage[]): string {
 
             // Include tool_calls info for AI messages
             const toolCalls = msg['tool_calls'] as
-                | Array<{ name: string; args: unknown }>
+                | { name: string; args: unknown }[]
                 | undefined
             let toolInfo = ''
             if (toolCalls?.length > 0) {
diff --git a/packages/core/src/llm-core/platform/model.ts b/packages/core/src/llm-core/platform/model.ts
index d04fcd041..0c9746b89 100644
--- a/packages/core/src/llm-core/platform/model.ts
+++ b/packages/core/src/llm-core/platform/model.ts
@@ -806,7 +806,6 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
             return rounds
         }
 
-        const tokenCounter = (text: string) => this.getNumTokens(text)
         const countRoundTokens = async (items: BaseMessage[]) => {
             let tokens = 0
             for (const item of items) {
@@ -875,8 +874,7 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
 
                 const round = conversationRounds[i]
                 const roundTokens = await countRoundTokens(round)
-                const exceedsLimit =
-                    totalTokens + roundTokens > maxTokenLimit
+                const exceedsLimit = totalTokens + roundTokens > maxTokenLimit
 
                 if (exceedsLimit && selectedRounds.length > 0) {
                     truncated = true
diff --git a/packages/core/src/llm-core/prompt/chat_history.ts b/packages/core/src/llm-core/prompt/chat_history.ts
index 4f4fe7955..895fc3bf7 100644
--- a/packages/core/src/llm-core/prompt/chat_history.ts
+++ b/packages/core/src/llm-core/prompt/chat_history.ts
@@ -90,7 +90,6 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
             // We iterate from the end, adding rounds until budget is exceeded
             for (let i = rounds.length - 1; i >= 0; i--) {
                 const round = rounds[i]
-                let roundTokens: number
 
                 if (i <= baselineRoundIdx && selectedRounds.length === 0) {
                     // First time hitting baseline region from the end:
@@ -98,7 +97,8 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
                     // Add them all at once
                     const bulkRounds = rounds.slice(0, baselineRoundIdx + 1)
                     const bulkTokens = baseline.tokens
-                    const exceedsLimit = usedTokens + bulkTokens > availableLimit
+                    const exceedsLimit =
+                        usedTokens + bulkTokens > availableLimit
 
                     if (exceedsLimit && selectedRounds.length > 0) {
                         truncated = true
@@ -116,7 +116,7 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
                     break
                 }
 
-                roundTokens = await countMessagesTokens(
+                const roundTokens = await countMessagesTokens(
                     round,
                     runtime.tokenCounter
                 )
diff --git a/packages/core/src/llm-core/prompt/system_prompts.ts b/packages/core/src/llm-core/prompt/system_prompts.ts
index 06a9d963e..b6c679e59 100644
--- a/packages/core/src/llm-core/prompt/system_prompts.ts
+++ b/packages/core/src/llm-core/prompt/system_prompts.ts
@@ -1,4 +1,4 @@
-import { BaseMessage, SystemMessage } from '@langchain/core/messages'
+import { SystemMessage } from '@langchain/core/messages'
 import { HumanMessagePromptTemplate } from '@langchain/core/prompts'
 import {
     ChatLunaContextManagerService,
@@ -7,8 +7,8 @@ import {
 } from './context_manager'
 import { logger } from 'koishi-plugin-chatluna'
 import {
-    countMessageTokens,
-    countMessagesTokens
+    countMessagesTokens,
+    countMessageTokens
 } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
 
 export { countMessageTokens, countMessagesTokens }

From a6c686b0bd2c923fd06ddca788bb18c5312c9bd0 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 21:53:46 +0800
Subject: [PATCH 4/9] fix: use actual usage_metadata.input_tokens for
 scratchpad compression trigger

Instead of estimating tokens by formatting scratchpad text, use the real
input_tokens from the AI message's usage_metadata returned by the LLM call.
This is accurate since it's what the model actually consumed.
---
 .../src/llm-core/agent/legacy-executor.ts     | 74 ++++++++-----------
 1 file changed, 31 insertions(+), 43 deletions(-)

diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index fef902b1a..0644fd423 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -296,17 +296,26 @@ export async function* runAgent(
             }
         }
 
-        // Compress scratchpad if it's getting too large
+        // Compress scratchpad if input tokens are approaching context limit
         const model = config?.configurable?.['model'] as
             | ChatLunaChatModel
             | undefined
         if (model && scratchpad.length > 6) {
-            await compressScratchpad(
-                scratchpad,
-                options.input,
-                model,
-                config?.configurable?.['conversationId'] ?? ''
-            )
+            // Get input_tokens from the AI message that triggered tool calls
+            const aiMsg = output[0]?.['messageLog']?.[0] as
+                | AIMessage
+                | undefined
+            const inputTokens = (aiMsg as AIMessage)?.usage_metadata
+                ?.input_tokens
+            if (inputTokens > 0) {
+                await compressScratchpad(
+                    scratchpad,
+                    options.input,
+                    model,
+                    config?.configurable?.['conversationId'] ?? '',
+                    inputTokens
+                )
+            }
         }
 
         const last = newSteps[newSteps.length - 1]
@@ -367,21 +376,18 @@ export async function* runAgent(
 
 /**
  * Compress scratchpad when it grows too large during tool-call loops.
- * Summarizes early scratchpad entries + chat_history into a single summary,
- * replaces input.chat_history with [summary], and keeps only recent scratchpad entries.
+ * Uses the actual input_tokens from the last LLM call to determine if
+ * compression is needed. Summarizes early scratchpad entries + chat_history
+ * into a single summary, replaces input.chat_history, and keeps only recent
+ * scratchpad entries.
  */
 async function compressScratchpad(
     scratchpad: ScratchpadEntry[],
     input: ChainValues,
     model: ChatLunaChatModel,
-    conversationId: string
+    conversationId: string,
+    inputTokens: number
 ): Promise<void> {
-    const tokenCounter = (text: string) => model.getNumTokens(text)
-
-    // Estimate scratchpad tokens from text content
-    const scratchpadText = formatScratchpadForCount(scratchpad)
-    const scratchpadTokens = await tokenCounter(scratchpadText)
-
     const invocation = model.invocationParams()
     const maxTokenLimit =
         invocation.maxTokenLimit && invocation.maxTokenLimit > 0
@@ -390,12 +396,12 @@ async function compressScratchpad(
 
     if (!maxTokenLimit || maxTokenLimit <= 0) return
 
-    // Only compress if scratchpad exceeds 84% of context window
-    if (scratchpadTokens < maxTokenLimit * 0.84) return
+    // Only compress if input tokens exceed 84% of context window
+    if (inputTokens < maxTokenLimit * 0.84) return
 
     logger.info(
-        '[ScratchpadCompress] Scratchpad tokens %d exceed 84%% of %d, compressing',
-        scratchpadTokens,
+        '[ScratchpadCompress] Input tokens %d exceed 84%% of %d, compressing',
+        inputTokens,
         maxTokenLimit
     )
 
@@ -427,7 +433,11 @@ async function compressScratchpad(
     if (!transcript.trim()) return
 
     try {
-        const summary = await compressChunk(model, transcript, conversationId)
+        const summary = await compressChunk(
+            model,
+            transcript,
+            conversationId
+        )
 
         if (!summary?.text.trim()) return
 
@@ -453,28 +463,6 @@ async function compressScratchpad(
     }
 }
 
-function formatScratchpadForCount(entries: ScratchpadEntry[]): string {
-    return entries
-        .map((entry) => {
-            if ('messages' in entry) {
-                return entry.messages
-                    .map((m) =>
-                        typeof m.content === 'string'
-                            ? m.content
-                            : JSON.stringify(m.content)
-                    )
-                    .join('\n')
-            }
-            const obs = observationToMessageContent(entry.observation)
-            const input =
-                typeof entry.action.toolInput === 'string'
-                    ? entry.action.toolInput
-                    : JSON.stringify(entry.action.toolInput)
-            return `${entry.action.tool}: ${input}\n${obs}`
-        })
-        .join('\n')
-}
-
 function formatScratchpadTranscript(entries: ScratchpadEntry[]): string {
     return entries
         .map((entry) => {

From 876cfe165de39193503829bc7dd9aa59c2e0fc1a Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 22:00:21 +0800
Subject: [PATCH 5/9] fix: address PR review feedback

- count_tokens.ts: allow baseline when it's the last message (baselineIdx >= 0)
- Pass AbortSignal through compression chain (app.ts -> infinite_context -> compressChunk, legacy-executor -> compressScratchpad -> compressChunk)
- Unify compression threshold to 0.85
- Fix compacted messages detection: use reference equality (compacted !== messages) instead of length comparison
- Revert chat_history.ts baseline optimization (unreliable in prompt pipeline context where system tokens differ between calls)
---
 .../src/llm-core/agent/legacy-executor.ts     |  15 +-
 packages/core/src/llm-core/chat/app.ts        |   3 +-
 .../src/llm-core/chat/infinite_context.ts     |  19 ++-
 .../core/src/llm-core/prompt/chat_history.ts  | 138 +++---------------
 .../core/src/llm-core/utils/count_tokens.ts   |   2 +-
 5 files changed, 42 insertions(+), 135 deletions(-)

diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index 0644fd423..89231b28e 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -313,7 +313,8 @@ export async function* runAgent(
                     options.input,
                     model,
                     config?.configurable?.['conversationId'] ?? '',
-                    inputTokens
+                    inputTokens,
+                    signal
                 )
             }
         }
@@ -386,7 +387,8 @@ async function compressScratchpad(
     input: ChainValues,
     model: ChatLunaChatModel,
     conversationId: string,
-    inputTokens: number
+    inputTokens: number,
+    signal?: AbortSignal
 ): Promise<void> {
     const invocation = model.invocationParams()
     const maxTokenLimit =
@@ -396,11 +398,11 @@ async function compressScratchpad(
 
     if (!maxTokenLimit || maxTokenLimit <= 0) return
 
-    // Only compress if input tokens exceed 84% of context window
-    if (inputTokens < maxTokenLimit * 0.84) return
+    // Only compress if input tokens exceed 85% of context window
+    if (inputTokens < maxTokenLimit * 0.85) return
 
     logger.info(
-        '[ScratchpadCompress] Input tokens %d exceed 84%% of %d, compressing',
+        '[ScratchpadCompress] Input tokens %d exceed 85%% of %d, compressing',
         inputTokens,
         maxTokenLimit
     )
@@ -436,7 +438,8 @@ async function compressScratchpad(
         const summary = await compressChunk(
             model,
             transcript,
-            conversationId
+            conversationId,
+            signal
         )
 
         if (!summary?.text.trim()) return
diff --git a/packages/core/src/llm-core/chat/app.ts b/packages/core/src/llm-core/chat/app.ts
index ffc33c616..8c2aa77ca 100644
--- a/packages/core/src/llm-core/chat/app.ts
+++ b/packages/core/src/llm-core/chat/app.ts
@@ -166,7 +166,8 @@ export class ChatInterface {
                     conversationId: this._input.conversationId,
                     preset: this._input.preset,
                     threshold:
-                        this.chatluna.currentConfig.infiniteContextThreshold
+                        this.chatluna.currentConfig.infiniteContextThreshold,
+                    signal: arg.signal
                 })
                 if (result?.messages) {
                     await this._chatHistory.replaceMessages(result.messages)
diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index 8f2d3545c..252bb9f60 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -32,6 +32,7 @@ export interface CompressContextOptions {
     preset?: ComputedRef<PresetTemplate>
     threshold?: number
     force?: boolean
+    signal?: AbortSignal
 }
 
 /**
@@ -71,7 +72,7 @@ export async function compressIfNeeded(
                 originalMessageCount: messages.length,
                 remainingMessageCount: compacted.length,
                 messages:
-                    compacted.length !== messages.length ? compacted : undefined
+                    compacted !== messages ? compacted : undefined
             }
         }
 
@@ -90,8 +91,7 @@ export async function compressIfNeeded(
                 inputTokens,
                 originalMessageCount: messages.length,
                 remainingMessageCount: compacted.length,
-                messages:
-                    compacted.length !== messages.length ? compacted : undefined
+                messages: compacted !== messages ? compacted : undefined
             }
         }
 
@@ -119,7 +119,7 @@ export async function compressIfNeeded(
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
             messages:
-                compacted.length !== messages.length ? compacted : undefined
+                compacted !== messages ? compacted : undefined
         }
     }
 
@@ -133,11 +133,16 @@ export async function compressIfNeeded(
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
             messages:
-                compacted.length !== messages.length ? compacted : undefined
+                compacted !== messages ? compacted : undefined
         }
     }
 
-    const summary = await compressChunk(model, transcript, conversationId)
+    const summary = await compressChunk(
+        model,
+        transcript,
+        conversationId,
+        opts.signal
+    )
 
     if (!summary?.text.trim()) {
         return {
@@ -146,7 +151,7 @@ export async function compressIfNeeded(
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
             messages:
-                compacted.length !== messages.length ? compacted : undefined
+                compacted !== messages ? compacted : undefined
         }
     }
 
diff --git a/packages/core/src/llm-core/prompt/chat_history.ts b/packages/core/src/llm-core/prompt/chat_history.ts
index 895fc3bf7..43a1d69cc 100644
--- a/packages/core/src/llm-core/prompt/chat_history.ts
+++ b/packages/core/src/llm-core/prompt/chat_history.ts
@@ -1,4 +1,4 @@
-import { AIMessage, BaseMessage } from '@langchain/core/messages'
+import { BaseMessage } from '@langchain/core/messages'
 import {
     ChatLunaContextManagerService,
     PromptContextRuntime,
@@ -16,9 +16,6 @@ import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
  * Truncates conversation history to fit within the token budget, keeping
  * the most recent complete turns.  Also accounts for input + scratchpad
  * token consumption so that downstream stages know the remaining budget.
- *
- * Uses usage_metadata from AI messages as a baseline to avoid re-counting
- * tokens for messages that were already counted by the LLM.
  */
 export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
     return async (runtime: PromptContextRuntime, next) => {
@@ -67,98 +64,27 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
         let truncated = false
         let usedTokens = runtime.usedTokens
 
-        // Find baseline: last AI message with usage_metadata in chatHistory
-        // Everything up to and including that message has a known token count
-        const baseline = findBaseline(chatHistory, runtime.usedTokens)
-
-        if (baseline && hasValidLimit) {
-            // We know the total tokens for all messages up to baseline index.
-            // Find which rounds are fully before the baseline, which are after.
-            let msgIdx = 0
-            let baselineRoundIdx = -1
-            for (let r = 0; r < rounds.length; r++) {
-                msgIdx += rounds[r].length
-                if (msgIdx > baseline.idx) {
-                    baselineRoundIdx = r
-                    break
-                }
-            }
-            if (baselineRoundIdx < 0) baselineRoundIdx = rounds.length - 1
-
-            // Rounds from baselineRoundIdx onward: count individually
-            // Rounds before baselineRoundIdx: total is baseline.tokens
-            // We iterate from the end, adding rounds until budget is exceeded
-            for (let i = rounds.length - 1; i >= 0; i--) {
-                const round = rounds[i]
-
-                if (i <= baselineRoundIdx && selectedRounds.length === 0) {
-                    // First time hitting baseline region from the end:
-                    // all rounds [0..baselineRoundIdx] together = baseline.tokens
-                    // Add them all at once
-                    const bulkRounds = rounds.slice(0, baselineRoundIdx + 1)
-                    const bulkTokens = baseline.tokens
-                    const exceedsLimit =
-                        usedTokens + bulkTokens > availableLimit
-
-                    if (exceedsLimit && selectedRounds.length > 0) {
-                        truncated = true
-                        break
-                    }
-
-                    usedTokens += bulkTokens
-                    for (let j = 0; j <= baselineRoundIdx; j++) {
-                        selectedRounds.unshift(bulkRounds[baselineRoundIdx - j])
-                    }
-
-                    if (exceedsLimit) {
-                        truncated = true
-                    }
-                    break
-                }
-
-                const roundTokens = await countMessagesTokens(
-                    round,
-                    runtime.tokenCounter
-                )
-                const exceedsLimit = usedTokens + roundTokens > availableLimit
-
-                if (exceedsLimit && selectedRounds.length > 0) {
-                    truncated = true
-                    break
-                }
-
-                usedTokens += roundTokens
-                selectedRounds.unshift(round)
+        for (let i = rounds.length - 1; i >= 0; i--) {
+            const round = rounds[i]
+            const roundTokens = await countMessagesTokens(
+                round,
+                runtime.tokenCounter
+            )
+            const exceedsLimit = hasValidLimit
+                ? usedTokens + roundTokens > availableLimit
+                : false
 
-                if (exceedsLimit) {
-                    truncated = true
-                    break
-                }
+            if (exceedsLimit && selectedRounds.length > 0) {
+                truncated = true
+                break
             }
-        } else {
-            // No baseline, fallback to counting each round
-            for (let i = rounds.length - 1; i >= 0; i--) {
-                const round = rounds[i]
-                const roundTokens = await countMessagesTokens(
-                    round,
-                    runtime.tokenCounter
-                )
-                const exceedsLimit = hasValidLimit
-                    ? usedTokens + roundTokens > availableLimit
-                    : false
-
-                if (exceedsLimit && selectedRounds.length > 0) {
-                    truncated = true
-                    break
-                }
 
-                usedTokens += roundTokens
-                selectedRounds.unshift(round)
+            usedTokens += roundTokens
+            selectedRounds.unshift(round)
 
-                if (exceedsLimit) {
-                    truncated = true
-                    break
-                }
+            if (exceedsLimit) {
+                truncated = true
+                break
             }
         }
 
@@ -191,34 +117,6 @@ export function createChatHistoryMiddleware(): PromptPipelineMiddleware {
     }
 }
 
-/**
- * Find the last AI message with usage_metadata.input_tokens in the history.
- * Returns the index and the estimated history-only tokens up to that point.
- */
-function findBaseline(
-    messages: BaseMessage[],
-    preAccountedTokens: number
-): { idx: number; tokens: number } | null {
-    for (let i = messages.length - 1; i >= 0; i--) {
-        const msg = messages[i]
-        if (msg.getType() !== 'ai') continue
-
-        const usage = (msg as AIMessage).usage_metadata
-        if (usage?.input_tokens > 0) {
-            // input_tokens includes system prompts + history + input.
-            // preAccountedTokens already covers system + input + scratchpad.
-            // The history portion is roughly: input_tokens - (system + input)
-            // But we don't know exact system tokens here. Use a simpler model:
-            // The baseline tells us "all messages up to this AI response
-            // plus the AI response itself" consumed input_tokens total input.
-            // For truncation purposes, we treat it as the token cost of
-            // messages[0..i] in the history array.
-            return { idx: i, tokens: usage.input_tokens - preAccountedTokens }
-        }
-    }
-    return null
-}
-
 /**
  * Split a flat message list into conversation rounds. Marked ChatLuna user
  * messages start rounds; old unmarked human messages still start rounds.
diff --git a/packages/core/src/llm-core/utils/count_tokens.ts b/packages/core/src/llm-core/utils/count_tokens.ts
index 6bd6c5c8c..e504c1d55 100644
--- a/packages/core/src/llm-core/utils/count_tokens.ts
+++ b/packages/core/src/llm-core/utils/count_tokens.ts
@@ -277,7 +277,7 @@ export async function countMessagesTokens(
         }
     }
 
-    if (baselineIdx >= 0 && baselineIdx < messages.length - 1) {
+    if (baselineIdx >= 0) {
         // Count only messages from the baseline AI message onward
         // (the AI message's output becomes part of next call's input)
         let tail = 0

From 96a45b8a2d8c4f7c238011025f7be0f746f8d068 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 22:02:19 +0800
Subject: [PATCH 6/9] fix: model.ts baseline counts AI+tool messages in round,
 fix warning text

- cropMessages baseline now counts the AI message itself and subsequent
  tool messages in the same round (usage_metadata.input_tokens only covers
  messages before the AI response)
- Update warning messages to show both plugin names for clarity
---
 packages/core/src/llm-core/platform/model.ts      | 15 +++++++++++++++
 .../src/middlewares/chat/read_chat_message.ts     |  8 ++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/packages/core/src/llm-core/platform/model.ts b/packages/core/src/llm-core/platform/model.ts
index 0c9746b89..2f4dc74eb 100644
--- a/packages/core/src/llm-core/platform/model.ts
+++ b/packages/core/src/llm-core/platform/model.ts
@@ -847,6 +847,21 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
                 baselineRoundIdx = conversationRounds.length - 1
             }
 
+            // baselineTokens covers messages[0..baselineIdx-1].
+            // We also need to count the AI message itself and any messages
+            // after it within the same round (tool messages, etc.)
+            let roundStartIdx = 0
+            for (let r = 0; r < baselineRoundIdx; r++) {
+                roundStartIdx += conversationRounds[r].length
+            }
+            for (
+                let i = baselineIdx;
+                i < roundStartIdx + conversationRounds[baselineRoundIdx].length;
+                i++
+            ) {
+                baselineTokens += await this.countMessageTokens(messages[i])
+            }
+
             // Iterate from end; when we reach baseline region, add all at once
             for (let i = conversationRounds.length - 1; i >= 0; i--) {
                 if (i <= baselineRoundIdx && selectedRounds.length === 0) {
diff --git a/packages/core/src/middlewares/chat/read_chat_message.ts b/packages/core/src/middlewares/chat/read_chat_message.ts
index e68be9a02..3bfa4f6bd 100644
--- a/packages/core/src/middlewares/chat/read_chat_message.ts
+++ b/packages/core/src/middlewares/chat/read_chat_message.ts
@@ -261,7 +261,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
                     logger.warn(
                         `Model "${model}" does not support image input. ` +
                             'Please use a model that supports vision capabilities, ' +
-                            'or install chatluna-multimodal-service plugin to enable image description.'
+                            'or install chatluna-multimodal-service (multimodal-service) plugin to enable image description.'
                     )
                 }
                 return false
@@ -292,7 +292,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
             if (ext === 'image/gif') {
                 if (!isInstalledImageService) {
                     logger.warn(
-                        `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service plugin to parse GIF animations.`
+                        `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.`
                     )
                 }
                 return false
@@ -625,7 +625,7 @@ async function handleFileElement(
                 ctx.chatluna.getPlugin('multimodal-service') != null
             if (!isInstalledMultimodalService) {
                 logger.warn(
-                    `Unsupported audio format "${mimeType}". Please install chatluna-multimodal-service plugin to handle this format.`
+                    `Unsupported audio format "${mimeType}". Please install chatluna-multimodal-service (multimodal-service) plugin to handle this format.`
                 )
             }
             return false
@@ -737,7 +737,7 @@ async function oldImageRead(
         if (ext === 'image/gif') {
             if (!isInstalledImageService) {
                 logger.warn(
-                    `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service plugin to parse GIF animations.`
+                    `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.`
                 )
             }
             return false

From ff19d58587f0b270a8ae87ddba3045200ceff22d Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 22:04:39 +0800
Subject: [PATCH 7/9] [Fix] collapse wrapped ternaries in compressIfNeeded

---
 packages/core/src/llm-core/chat/infinite_context.ts | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index 252bb9f60..e92c2034b 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -71,8 +71,7 @@ export async function compressIfNeeded(
                 inputTokens,
                 originalMessageCount: messages.length,
                 remainingMessageCount: compacted.length,
-                messages:
-                    compacted !== messages ? compacted : undefined
+                messages: compacted !== messages ? compacted : undefined
             }
         }
 
@@ -118,8 +117,7 @@ export async function compressIfNeeded(
             inputTokens,
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
-            messages:
-                compacted !== messages ? compacted : undefined
+            messages: compacted !== messages ? compacted : undefined
         }
     }
 
@@ -132,8 +130,7 @@ export async function compressIfNeeded(
             inputTokens,
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
-            messages:
-                compacted !== messages ? compacted : undefined
+            messages: compacted !== messages ? compacted : undefined
         }
     }
 
@@ -150,8 +147,7 @@ export async function compressIfNeeded(
             inputTokens,
             originalMessageCount: messages.length,
             remainingMessageCount: compacted.length,
-            messages:
-                compacted !== messages ? compacted : undefined
+            messages: compacted !== messages ? compacted : undefined
         }
     }
 

From c8b84c108f5ca66ad805e2cfcb2ba06303d87891 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sat, 23 May 2026 22:17:38 +0800
Subject: [PATCH 8/9] [Fix] simplify context token counting

---
 .../src/llm-core/agent/legacy-executor.ts     |  89 ++++-------
 .../src/llm-core/chat/infinite_context.ts     | 126 +++++-----------
 packages/core/src/llm-core/platform/model.ts  | 138 ++++++------------
 .../core/src/llm-core/utils/count_tokens.ts   |  63 ++------
 4 files changed, 127 insertions(+), 289 deletions(-)

diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index 89231b28e..bf411cdfd 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -376,11 +376,8 @@ export async function* runAgent(
 }
 
 /**
- * Compress scratchpad when it grows too large during tool-call loops.
- * Uses the actual input_tokens from the last LLM call to determine if
- * compression is needed. Summarizes early scratchpad entries + chat_history
- * into a single summary, replaces input.chat_history, and keeps only recent
- * scratchpad entries.
+ * Compress scratchpad when input tokens approach context limit.
+ * Summarizes early scratchpad + chat_history, keeps recent entries.
  */
 async function compressScratchpad(
     scratchpad: ScratchpadEntry[],
@@ -391,47 +388,59 @@ async function compressScratchpad(
     signal?: AbortSignal
 ): Promise<void> {
     const invocation = model.invocationParams()
-    const maxTokenLimit =
+    const limit =
         invocation.maxTokenLimit && invocation.maxTokenLimit > 0
             ? invocation.maxTokenLimit
             : model.getModelMaxContextSize()
 
-    if (!maxTokenLimit || maxTokenLimit <= 0) return
-
-    // Only compress if input tokens exceed 85% of context window
-    if (inputTokens < maxTokenLimit * 0.85) return
+    if (!limit || limit <= 0 || inputTokens < limit * 0.85) return
 
     logger.info(
-        '[ScratchpadCompress] Input tokens %d exceed 85%% of %d, compressing',
+        '[ScratchpadCompress] %d tokens exceed 85%% of %d, compressing',
         inputTokens,
-        maxTokenLimit
+        limit
     )
 
-    // Keep the last 3 entries (most recent tool calls), compress the rest
     const keepCount = Math.min(3, scratchpad.length)
     const toCompress = scratchpad.slice(0, scratchpad.length - keepCount)
-
     if (toCompress.length === 0) return
 
-    // Build transcript from chat_history + early scratchpad
     const chatHistory = (input['chat_history'] ?? []) as BaseMessage[]
-    const chatTranscript = chatHistory
+    const chatPart = chatHistory
         .map((msg) => {
-            const role = msg.getType().toUpperCase()
-            const name = msg.name ? ` (${msg.name})` : ''
             const content =
                 typeof msg.content === 'string'
                     ? msg.content.trim()
                     : JSON.stringify(msg.content)
-            return `[${role}${name}]\n${content || '(empty)'}`
+            return `[${msg.getType().toUpperCase()}${msg.name ? ` (${msg.name})` : ''}]\n${content || '(empty)'}`
         })
         .join('\n\n---\n\n')
 
-    const scratchTranscript = formatScratchpadTranscript(toCompress)
-    const transcript = chatTranscript
-        ? `${chatTranscript}\n\n---\n\n${scratchTranscript}`
-        : scratchTranscript
+    const scratchPart = toCompress
+        .map((entry) => {
+            if ('messages' in entry) {
+                return entry.messages
+                    .map((m) => {
+                        const c =
+                            typeof m.content === 'string'
+                                ? m.content.trim()
+                                : JSON.stringify(m.content)
+                        return `[HUMAN]\n${c}`
+                    })
+                    .join('\n\n---\n\n')
+            }
+            const inp =
+                typeof entry.action.toolInput === 'string'
+                    ? entry.action.toolInput
+                    : JSON.stringify(entry.action.toolInput)
+            const obs = observationToMessageContent(entry.observation)
+            return `[AI Tool Call: ${entry.action.tool}]\n${inp.slice(0, 300)}\n\n[TOOL Result]\n${obs.slice(0, 500)}`
+        })
+        .join('\n\n---\n\n')
 
+    const transcript = chatPart
+        ? `${chatPart}\n\n---\n\n${scratchPart}`
+        : scratchPart
     if (!transcript.trim()) return
 
     try {
@@ -441,10 +450,8 @@ async function compressScratchpad(
             conversationId,
             signal
         )
-
         if (!summary?.text.trim()) return
 
-        // Replace chat_history with summary
         input['chat_history'] = [
             new HumanMessage({
                 content: summary.text.trim(),
@@ -452,8 +459,6 @@ async function compressScratchpad(
                 additional_kwargs: { source: 'scratchpad-compression' }
             })
         ]
-
-        // Trim scratchpad: remove early entries, keep recent
         scratchpad.splice(0, scratchpad.length - keepCount)
 
         logger.info(
@@ -466,36 +471,6 @@ async function compressScratchpad(
     }
 }
 
-function formatScratchpadTranscript(entries: ScratchpadEntry[]): string {
-    return entries
-        .map((entry) => {
-            if ('messages' in entry) {
-                return entry.messages
-                    .map((m) => {
-                        const content =
-                            typeof m.content === 'string'
-                                ? m.content.trim()
-                                : JSON.stringify(m.content)
-                        return `[HUMAN]\n${content}`
-                    })
-                    .join('\n\n---\n\n')
-            }
-            const toolInput =
-                typeof entry.action.toolInput === 'string'
-                    ? entry.action.toolInput
-                    : JSON.stringify(entry.action.toolInput)
-            const truncatedInput =
-                toolInput.length > 300
-                    ? toolInput.slice(0, 300) + '...'
-                    : toolInput
-            const obs = observationToMessageContent(entry.observation)
-            const truncatedObs =
-                obs.length > 500 ? obs.slice(0, 500) + '...' : obs
-            return `[AI Tool Call: ${entry.action.tool}]\n${truncatedInput}\n\n[TOOL Result]\n${truncatedObs}`
-        })
-        .join('\n\n---\n\n')
-}
-
 export async function emitAgentEvent(
     runManager: CallbackManagerForChainRun | undefined,
     configurable: AgentRuntimeConfigurable,
diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index e92c2034b..43d192c9b 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -43,20 +43,25 @@ export async function compressIfNeeded(
     opts: CompressContextOptions
 ): Promise<CompressContextResult> {
     const { chatHistory, model, conversationId, force } = opts
-
     const messages = await chatHistory.getMessages()
 
-    if (messages.length === 0) {
-        return emptyResult()
-    }
+    if (messages.length === 0) return emptyResult()
 
-    // Step 1: compact expired tool results in-place
+    // Step 1: compact expired tool results
     const compacted = compactExpiredToolResults(messages)
 
     // Step 2: count tokens
     const tokenCounter = (text: string) => model.getNumTokens(text)
     const inputTokens = await countMessagesTokens(compacted, tokenCounter)
 
+    const noCompressResult = (): CompressContextResult => ({
+        ...emptyResult(),
+        inputTokens,
+        originalMessageCount: messages.length,
+        remainingMessageCount: compacted.length,
+        messages: compacted !== messages ? compacted : undefined
+    })
+
     // Step 3: determine if compression is needed
     if (!force) {
         const invocation = model.invocationParams()
@@ -65,15 +70,7 @@ export async function compressIfNeeded(
                 ? invocation.maxTokenLimit
                 : model.getModelMaxContextSize()
 
-        if (!maxTokenLimit || maxTokenLimit <= 0) {
-            return {
-                ...emptyResult(),
-                inputTokens,
-                originalMessageCount: messages.length,
-                remainingMessageCount: compacted.length,
-                messages: compacted !== messages ? compacted : undefined
-            }
-        }
+        if (!maxTokenLimit || maxTokenLimit <= 0) return noCompressResult()
 
         const presetMessages = Array.isArray(opts.preset?.value?.messages)
             ? (opts.preset.value.messages as BaseMessage[])
@@ -84,15 +81,7 @@ export async function compressIfNeeded(
         )
         const threshold = Math.floor(maxTokenLimit * (opts.threshold ?? 0.85))
 
-        if (inputTokens + presetTokens <= threshold) {
-            return {
-                ...emptyResult(),
-                inputTokens,
-                originalMessageCount: messages.length,
-                remainingMessageCount: compacted.length,
-                messages: compacted !== messages ? compacted : undefined
-            }
-        }
+        if (inputTokens + presetTokens <= threshold) return noCompressResult()
 
         logger.info(
             '[InfiniteContext] Start compression: history=%d tokens, total=%d, threshold=%d',
@@ -107,32 +96,13 @@ export async function compressIfNeeded(
         )
     }
 
-    // Step 4: split messages into [to-compress, to-keep]
-    // Keep the most recent complete rounds that fit within 40% of threshold
+    // Step 4: split messages
     const { toCompress, toKeep } = splitMessages(compacted)
+    if (toCompress.length === 0) return noCompressResult()
 
-    if (toCompress.length === 0) {
-        return {
-            ...emptyResult(),
-            inputTokens,
-            originalMessageCount: messages.length,
-            remainingMessageCount: compacted.length,
-            messages: compacted !== messages ? compacted : undefined
-        }
-    }
-
-    // Step 5: generate summary from early messages
+    // Step 5: generate summary
     const transcript = formatTranscript(toCompress)
-
-    if (!transcript.trim()) {
-        return {
-            ...emptyResult(),
-            inputTokens,
-            originalMessageCount: messages.length,
-            remainingMessageCount: compacted.length,
-            messages: compacted !== messages ? compacted : undefined
-        }
-    }
+    if (!transcript.trim()) return noCompressResult()
 
     const summary = await compressChunk(
         model,
@@ -140,24 +110,13 @@ export async function compressIfNeeded(
         conversationId,
         opts.signal
     )
+    if (!summary?.text.trim()) return noCompressResult()
 
-    if (!summary?.text.trim()) {
-        return {
-            ...emptyResult(),
-            inputTokens,
-            originalMessageCount: messages.length,
-            remainingMessageCount: compacted.length,
-            messages: compacted !== messages ? compacted : undefined
-        }
-    }
-
-    // Step 6: build structured output
+    // Step 6: build result
     const summaryMessage = new HumanMessage({
         content: summary.text.trim(),
         name: 'infinite_context',
-        additional_kwargs: {
-            source: 'infinite-context'
-        }
+        additional_kwargs: { source: 'infinite-context' }
     })
 
     const resultMessages = [summaryMessage, ...toKeep]
@@ -205,7 +164,6 @@ function emptyResult(): CompressContextResult {
 
 /**
  * Replace expired (>1h) tool result content with a placeholder.
- * Returns a new array if any were compacted, otherwise the original.
  */
 function compactExpiredToolResults(messages: BaseMessage[]): BaseMessage[] {
     const placeholder =
@@ -214,11 +172,10 @@ function compactExpiredToolResults(messages: BaseMessage[]): BaseMessage[] {
 
     const result = messages.map((msg) => {
         if (msg.getType() !== 'tool') return msg
-
         const meta = msg.response_metadata?.chatluna as
             | ChatLunaMessageMeta
             | undefined
-        if (meta?.createdAt == null) return msg
+        if (!meta?.createdAt) return msg
         if (Date.now() - new Date(meta.createdAt).getTime() < 3600000)
             return msg
         if (getMessageContent(msg.content).trim() === placeholder) return msg
@@ -229,14 +186,7 @@ function compactExpiredToolResults(messages: BaseMessage[]): BaseMessage[] {
         return mapStoredMessageToChatMessage(cloned)
     })
 
-    if (!changed) return messages
-
-    logger.info(
-        '[InfiniteContext] Compacted %d expired tool results',
-        result.filter((_, i) => result[i] !== messages[i]).length
-    )
-
-    return result
+    return changed ? result : messages
 }
 
 /**
@@ -281,7 +231,6 @@ function splitMessages(messages: BaseMessage[]): {
 
 /**
  * Format messages into a transcript string for the LLM summarizer.
- * Preserves tool-call structure information.
  */
 function formatTranscript(messages: BaseMessage[]): string {
     return messages
@@ -290,29 +239,22 @@ function formatTranscript(messages: BaseMessage[]): string {
             const name = msg.name ? ` (${msg.name})` : ''
             const content = getMessageContent(msg.content).trim()
 
-            // Include tool_calls info for AI messages
             const toolCalls = msg['tool_calls'] as
                 | { name: string; args: unknown }[]
                 | undefined
-            let toolInfo = ''
-            if (toolCalls?.length > 0) {
-                toolInfo =
-                    '\nTool calls: ' +
-                    toolCalls
-                        .map((tc) => {
-                            const args = JSON.stringify(tc.args)
-                            const truncated =
-                                args.length > 200
-                                    ? args.slice(0, 200) + '...'
-                                    : args
-                            return `${tc.name}(${truncated})`
-                        })
-                        .join(', ')
-            }
-
-            // Include tool_call_id for tool messages
-            const toolCallId = msg['tool_call_id'] as string | undefined
-            const idInfo = toolCallId ? ` [call_id: ${toolCallId}]` : ''
+            const toolInfo =
+                toolCalls?.length > 0
+                    ? '\nTool calls: ' +
+                      toolCalls
+                          .map((tc) => {
+                              const args = JSON.stringify(tc.args)
+                              return `${tc.name}(${args.length > 200 ? args.slice(0, 200) + '...' : args})`
+                          })
+                          .join(', ')
+                    : ''
+
+            const callId = msg['tool_call_id'] as string | undefined
+            const idInfo = callId ? ` [call_id: ${callId}]` : ''
 
             return `[${role}${name}${idInfo}]\n${content || '(empty)'}${toolInfo}`
         })
diff --git a/packages/core/src/llm-core/platform/model.ts b/packages/core/src/llm-core/platform/model.ts
index 2f4dc74eb..9ad10c0ef 100644
--- a/packages/core/src/llm-core/platform/model.ts
+++ b/packages/core/src/llm-core/platform/model.ts
@@ -817,115 +817,67 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
         const conversationRounds = buildConversationRounds(messages)
         const selectedRounds: BaseMessage[][] = []
         let truncated = false
+        const hasLimit = maxTokenLimit != null && maxTokenLimit > 0
 
-        // Find baseline: last AI message with usage_metadata in the conversation
-        let baselineIdx = -1
+        // Find baseline: last AI message with usage_metadata
+        let baselineRoundIdx = -1
         let baselineTokens = 0
-        for (let i = messages.length - 1; i >= 0; i--) {
-            if (messages[i].getType() !== 'ai') continue
-            const usage = (messages[i] as AIMessage).usage_metadata
-            if (usage?.input_tokens > 0) {
-                baselineIdx = i
-                // input_tokens includes system messages we already counted
-                baselineTokens = usage.input_tokens - totalTokens
-                break
-            }
-        }
-
-        if (baselineIdx >= 0 && maxTokenLimit != null && maxTokenLimit > 0) {
-            // Find which round the baseline falls in
-            let msgCount = 0
-            let baselineRoundIdx = -1
+        if (hasLimit) {
+            let msgOffset = 0
             for (let r = 0; r < conversationRounds.length; r++) {
-                msgCount += conversationRounds[r].length
-                if (msgCount > baselineIdx) {
-                    baselineRoundIdx = r
-                    break
+                for (let j = 0; j < conversationRounds[r].length; j++) {
+                    const msg = messages[msgOffset + j]
+                    if (msg.getType() === 'ai') {
+                        const usage = (msg as AIMessage).usage_metadata
+                        if (usage?.input_tokens > 0) {
+                            baselineRoundIdx = r
+                            baselineTokens = usage.input_tokens - totalTokens
+                        }
+                    }
                 }
+                msgOffset += conversationRounds[r].length
             }
-            if (baselineRoundIdx < 0) {
-                baselineRoundIdx = conversationRounds.length - 1
-            }
-
-            // baselineTokens covers messages[0..baselineIdx-1].
-            // We also need to count the AI message itself and any messages
-            // after it within the same round (tool messages, etc.)
-            let roundStartIdx = 0
-            for (let r = 0; r < baselineRoundIdx; r++) {
-                roundStartIdx += conversationRounds[r].length
-            }
-            for (
-                let i = baselineIdx;
-                i < roundStartIdx + conversationRounds[baselineRoundIdx].length;
-                i++
-            ) {
-                baselineTokens += await this.countMessageTokens(messages[i])
-            }
-
-            // Iterate from end; when we reach baseline region, add all at once
-            for (let i = conversationRounds.length - 1; i >= 0; i--) {
-                if (i <= baselineRoundIdx && selectedRounds.length === 0) {
-                    // Bulk add all rounds up to baseline
-                    const exceedsLimit =
-                        totalTokens + baselineTokens > maxTokenLimit
-
-                    if (exceedsLimit && selectedRounds.length > 0) {
-                        truncated = true
-                        break
+            // Add the baseline round's own AI/tool messages on top of the usage baseline
+            if (baselineRoundIdx >= 0) {
+                // Counts every AI and tool message in the round, including any before the baseline AI msg
+                for (const msg of conversationRounds[baselineRoundIdx]) {
+                    if (msg.getType() === 'ai' || msg.getType() === 'tool') {
+                        baselineTokens += await this.countMessageTokens(msg)
                     }
-
-                    totalTokens += baselineTokens
-                    for (let j = 0; j <= baselineRoundIdx; j++) {
-                        selectedRounds.unshift(
-                            conversationRounds[baselineRoundIdx - j]
-                        )
-                    }
-
-                    if (exceedsLimit) {
-                        truncated = true
-                    }
-                    break
                 }
+            }
+        }
 
-                const round = conversationRounds[i]
-                const roundTokens = await countRoundTokens(round)
-                const exceedsLimit = totalTokens + roundTokens > maxTokenLimit
-
-                if (exceedsLimit && selectedRounds.length > 0) {
+        // Select rounds from end to start
+        for (let i = conversationRounds.length - 1; i >= 0; i--) {
+            // At the baseline region: bulk-add rounds 0..i, or drop them all if over the limit
+            if (baselineRoundIdx >= 0 && i <= baselineRoundIdx) {
+                if (hasLimit && totalTokens + baselineTokens > maxTokenLimit) {
                     truncated = true
                     break
                 }
+                totalTokens += baselineTokens
+                for (let j = 0; j <= i; j++) {
+                    selectedRounds.unshift(conversationRounds[j])
+                }
+                break
+            }
 
-                totalTokens += roundTokens
-                selectedRounds.unshift(round)
+            const roundTokens = await countRoundTokens(conversationRounds[i])
+            const exceeds =
+                hasLimit && totalTokens + roundTokens > maxTokenLimit
 
-                if (exceedsLimit) {
-                    truncated = true
-                    break
-                }
+            if (exceeds && selectedRounds.length > 0) {
+                truncated = true
+                break
             }
-        } else {
-            // No baseline or no limit, fallback to counting each round
-            for (let i = conversationRounds.length - 1; i >= 0; i--) {
-                const round = conversationRounds[i]
-                const roundTokens = await countRoundTokens(round)
-                const exceedsLimit =
-                    maxTokenLimit != null && maxTokenLimit > 0
-                        ? totalTokens + roundTokens > maxTokenLimit
-                        : false
-
-                if (exceedsLimit && selectedRounds.length > 0) {
-                    truncated = true
-                    break
-                }
 
-                totalTokens += roundTokens
-                selectedRounds.unshift(round)
+            totalTokens += roundTokens
+            selectedRounds.unshift(conversationRounds[i])
 
-                if (exceedsLimit) {
-                    truncated = true
-                    break
-                }
+            if (exceeds) {
+                truncated = true
+                break
             }
         }
 
diff --git a/packages/core/src/llm-core/utils/count_tokens.ts b/packages/core/src/llm-core/utils/count_tokens.ts
index e504c1d55..1813589c2 100644
--- a/packages/core/src/llm-core/utils/count_tokens.ts
+++ b/packages/core/src/llm-core/utils/count_tokens.ts
@@ -210,13 +210,9 @@ export function parseRawModelName(
 }
 
 // ---------------------------------------------------------------------------
-// Smart token counting with usage_metadata optimization
+// Token counting with usage_metadata baseline optimization
 // ---------------------------------------------------------------------------
 
-/**
- * Count tokens for a single message using a tokenCounter function.
- * Strips base64 image markdown before counting.
- */
 export async function countMessageTokens(
     message: BaseMessage,
     tokenCounter: (text: string) => Promise<number>
@@ -231,63 +227,36 @@ export async function countMessageTokens(
         content = content.replaceAll(/!\[.*?\]\(.*?\)/g, '')
     }
 
-    let result =
+    return (
         (await tokenCounter(content)) +
-        (await tokenCounter(messageTypeToOpenAIRole(message.getType())))
-
-    if (message.name) {
-        result += await tokenCounter(message.name)
-    }
-
-    return result
+        (await tokenCounter(messageTypeToOpenAIRole(message.getType()))) +
+        (message.name ? await tokenCounter(message.name) : 0)
+    )
 }
 
 /**
- * Count tokens for a list of messages, using usage_metadata from the last
- * AI message as a baseline when available.
- *
- * If an AI message has usage_metadata.input_tokens, that value represents
- * the total input tokens at that LLM call (all prior messages + system prompt).
- * We use the last such message as a baseline and only count messages after it.
- *
- * @param messages - The message list to count
- * @param tokenCounter - Function to count tokens for a string
- * @param presetTokens - Token count of system/preset messages (subtracted from
- *                       baseline since usage_metadata.input_tokens includes them)
+ * Count tokens for messages. The last AI message with usage_metadata.input_tokens
+ * is the baseline: presetTokens is subtracted, then it and later messages are counted.
  */
 export async function countMessagesTokens(
     messages: BaseMessage[],
     tokenCounter: (text: string) => Promise<number>,
     presetTokens = 0
 ): Promise<number> {
-    // Find the last AI message with usage_metadata.input_tokens
-    let baselineIdx = -1
-    let baselineTokens = 0
-
+    // Find last AI message with usage_metadata as baseline
     for (let i = messages.length - 1; i >= 0; i--) {
-        const msg = messages[i]
-        if (msg.getType() !== 'ai') continue
-
-        const usage = (msg as AIMessage).usage_metadata
+        if (messages[i].getType() !== 'ai') continue
+        const usage = (messages[i] as AIMessage).usage_metadata
         if (usage?.input_tokens > 0) {
-            baselineIdx = i
-            // input_tokens includes preset, subtract to get history-only tokens
-            baselineTokens = usage.input_tokens - presetTokens
-            break
-        }
-    }
-
-    if (baselineIdx >= 0) {
-        // Count only messages from the baseline AI message onward
-        // (the AI message's output becomes part of next call's input)
-        let tail = 0
-        for (let i = baselineIdx; i < messages.length; i++) {
-            tail += await countMessageTokens(messages[i], tokenCounter)
+            let tail = 0
+            for (let j = i; j < messages.length; j++) {
+                tail += await countMessageTokens(messages[j], tokenCounter)
+            }
+            return Math.max(usage.input_tokens - presetTokens + tail, 0)
         }
-        return Math.max(baselineTokens + tail, 0)
     }
 
-    // Fallback: count all messages
+    // Fallback: count all
     let total = 0
     for (const msg of messages) {
         total += await countMessageTokens(msg, tokenCounter)

From 28562b32499aa62c5d6676b7d2a6e991490233ae Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Sun, 24 May 2026 14:13:44 +0800
Subject: [PATCH 9/9] fix(extension-usage): hide zero-value models in chart
 tooltips

---
 packages/extension-usage/client/charts/token-line.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/extension-usage/client/charts/token-line.ts b/packages/extension-usage/client/charts/token-line.ts
index 8f1bee9e7..376405a77 100644
--- a/packages/extension-usage/client/charts/token-line.ts
+++ b/packages/extension-usage/client/charts/token-line.ts
@@ -55,7 +55,7 @@ function hour(date: string | Date) {
 function tooltip(
     params: { marker?: string; seriesName: string; value: number }[],
     theme: typeof chartTheme.value,
-    skipZero = false
+    skipZero = true
 ) {
     const row =
         'display:flex;align-items:center;justify-content:space-between;' +