diff --git a/packages/core/src/llm-core/agent/legacy-executor.ts b/packages/core/src/llm-core/agent/legacy-executor.ts
index e6287140f..bf411cdfd 100644
--- a/packages/core/src/llm-core/agent/legacy-executor.ts
+++ b/packages/core/src/llm-core/agent/legacy-executor.ts
@@ -1,5 +1,10 @@
 import { CallbackManagerForChainRun } from '@langchain/core/callbacks/manager'
-import { AIMessage, AIMessageChunk } from '@langchain/core/messages'
+import {
+    AIMessage,
+    AIMessageChunk,
+    BaseMessage,
+    HumanMessage
+} from '@langchain/core/messages'
 import { isDirectToolOutput } from '@langchain/core/messages/tool'
 import { OutputParserException } from '@langchain/core/output_parsers'
 import {
@@ -34,6 +39,8 @@ import {
     MessageQueue,
     ScratchpadEntry
 } from './types'
+import { compressChunk } from '../chain/infinite_context_chain'
+import type { ChatLunaChatModel } from '../platform/model'
 
 async function executeTools(
     actions: AgentAction[],
@@ -289,6 +296,29 @@ export async function* runAgent(
             }
         }
 
+        // Compress scratchpad if input tokens are approaching context limit
+        const model = config?.configurable?.['model'] as
+            | ChatLunaChatModel
+            | undefined
+        if (model && scratchpad.length > 6) {
+            // Get input_tokens from the AI message that triggered tool calls
+            const aiMsg = output[0]?.['messageLog']?.[0] as
+                | AIMessage
+                | undefined
+            const inputTokens = (aiMsg as AIMessage)?.usage_metadata
+                ?.input_tokens
+            if (inputTokens > 0) {
+                await compressScratchpad(
+                    scratchpad,
+                    options.input,
+                    model,
+                    config?.configurable?.['conversationId'] ?? '',
+                    inputTokens,
+                    signal
+                )
+            }
+        }
+
         const last = newSteps[newSteps.length - 1]
         const tool = last ? toolMap[last.action.tool?.toLowerCase()] : undefined
 
@@ -345,6 +375,102 @@ export async function* runAgent(
     }
 }
 
+/**
+ * Compress scratchpad when input tokens approach context limit.
+ * Summarizes early scratchpad + chat_history, keeps recent entries.
+ */
+async function compressScratchpad(
+    scratchpad: ScratchpadEntry[],
+    input: ChainValues,
+    model: ChatLunaChatModel,
+    conversationId: string,
+    inputTokens: number,
+    signal?: AbortSignal
+): Promise<void> {
+    const invocation = model.invocationParams()
+    const limit =
+        invocation.maxTokenLimit && invocation.maxTokenLimit > 0
+            ? invocation.maxTokenLimit
+            : model.getModelMaxContextSize()
+
+    if (!limit || limit <= 0 || inputTokens < limit * 0.85) return
+
+    logger.info(
+        '[ScratchpadCompress] %d tokens exceed 85%% of %d, compressing',
+        inputTokens,
+        limit
+    )
+
+    const keepCount = Math.min(3, scratchpad.length)
+    const toCompress = scratchpad.slice(0, scratchpad.length - keepCount)
+    if (toCompress.length === 0) return
+
+    const chatHistory = (input['chat_history'] ?? []) as BaseMessage[]
+    const chatPart = chatHistory
+        .map((msg) => {
+            const content =
+                typeof msg.content === 'string'
+                    ? msg.content.trim()
+                    : JSON.stringify(msg.content)
+            return `[${msg.getType().toUpperCase()}${msg.name ? ` (${msg.name})` : ''}]\n${content || '(empty)'}`
+        })
+        .join('\n\n---\n\n')
+
+    const scratchPart = toCompress
+        .map((entry) => {
+            if ('messages' in entry) {
+                return entry.messages
+                    .map((m) => {
+                        const c =
+                            typeof m.content === 'string'
+                                ? m.content.trim()
+                                : JSON.stringify(m.content)
+                        return `[HUMAN]\n${c}`
+                    })
+                    .join('\n\n---\n\n')
+            }
+            const inp =
+                typeof entry.action.toolInput === 'string'
+                    ? entry.action.toolInput
+                    : JSON.stringify(entry.action.toolInput)
+            const obs = observationToMessageContent(entry.observation)
+            return `[AI Tool Call: ${entry.action.tool}]\n${inp.slice(0, 300)}\n\n[TOOL Result]\n${obs.slice(0, 500)}`
+        })
+        .join('\n\n---\n\n')
+
+    const transcript = chatPart
+        ? `${chatPart}\n\n---\n\n${scratchPart}`
+        : scratchPart
+    if (!transcript.trim()) return
+
+    try {
+        const summary = await compressChunk(
+            model,
+            transcript,
+            conversationId,
+            signal
+        )
+        if (!summary?.text.trim()) return
+
+        input['chat_history'] = [
+            new HumanMessage({
+                content: summary.text.trim(),
+                name: 'infinite_context',
+                additional_kwargs: { source: 'scratchpad-compression' }
+            })
+        ]
+        scratchpad.splice(0, scratchpad.length - keepCount)
+
+        logger.info(
+            '[ScratchpadCompress] Compressed %d entries, kept %d',
+            toCompress.length,
+            keepCount
+        )
+    } catch (e) {
+        logger.error('[ScratchpadCompress] Failed:', e)
+    }
+}
+
 export async function emitAgentEvent(
     runManager: CallbackManagerForChainRun | undefined,
     configurable: AgentRuntimeConfigurable,
diff --git a/packages/core/src/llm-core/chain/infinite_context_chain.ts b/packages/core/src/llm-core/chain/infinite_context_chain.ts
index bb1a2c2ef..084950f35 100644
--- a/packages/core/src/llm-core/chain/infinite_context_chain.ts
+++ b/packages/core/src/llm-core/chain/infinite_context_chain.ts
@@ -1,57 +1,16 @@
-import { ChainValues } from '@langchain/core/utils/types'
 import { PromptTemplate } from '@langchain/core/prompts'
 import { AIMessage, type UsageMetadata } from '@langchain/core/messages'
-import { BufferMemory } from 'koishi-plugin-chatluna/llm-core/memory/langchain'
-import {
-    ChatLunaLLMCallArg,
-    ChatLunaLLMChain,
-    ChatLunaLLMChainWrapper
-} from 'koishi-plugin-chatluna/llm-core/chain/base'
+import { ChatLunaLLMChain } from 'koishi-plugin-chatluna/llm-core/chain/base'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
-import {
-    ChatLunaError,
-    ChatLunaErrorCode
-} from 'koishi-plugin-chatluna/utils/error'
 import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
 
-export interface ChatLunaInfiniteContextChainInput {
-    historyMemory: BufferMemory
-}
-
-export interface ChatLunaInfiniteContextChunkArg {
-    chunk: string
-    conversationId: string
-    signal?: AbortSignal
-}
-
-export interface ChatLunaInfiniteContextChunkResult {
+export interface CompressChunkResult {
     text: string
     usageMetadata?: UsageMetadata
 }
 
-export class ChatLunaInfiniteContextChain
-    extends ChatLunaLLMChainWrapper
-    implements ChatLunaInfiniteContextChainInput
-{
-    historyMemory: BufferMemory
-
-    private chain: ChatLunaLLMChain
-
-    constructor({
-        historyMemory,
-        chain
-    }: ChatLunaInfiniteContextChainInput & { chain: ChatLunaLLMChain }) {
-        super()
-        this.historyMemory = historyMemory
-        this.chain = chain
-    }
-
-    static fromLLM(
-        llm: ChatLunaChatModel,
-        { historyMemory }: ChatLunaInfiniteContextChainInput
-    ) {
-        const prompt =
-            PromptTemplate.fromTemplate(`You are a helpful AI assistant tasked with summarizing conversations.
+const COMPRESS_PROMPT =
+    PromptTemplate.fromTemplate(`You are a helpful AI assistant tasked with summarizing conversations.
 
 When asked to summarize, provide a detailed but concise summary of the conversation.
 Focus on information that would be helpful for continuing the conversation, including:
@@ -61,6 +20,7 @@ Focus on information that would be helpful for continuing the conversation, incl
 - What needs to be done next
 - Key user requests, constraints, or preferences that should persist
 - Important technical decisions and why they were made
+- Tool calls that were made and their results (summarize the key outcomes)
 
 Some old tool result messages may say that the original tool output expired and was removed.
 Treat those as intentional retention placeholders, not as meaningful tool output.
@@ -72,72 +32,39 @@ Do not respond to any questions in the conversation, only output the summary.
 Conversation:
 {conversation_chunk}`)
 
-        const chain = new ChatLunaLLMChain({ llm, prompt })
+export async function compressChunk(
+    model: ChatLunaChatModel,
+    transcript: string,
+    conversationId: string,
+    signal?: AbortSignal
+): Promise<CompressChunkResult | null> {
+    const trimmed = transcript?.trim()
 
-        return new ChatLunaInfiniteContextChain({
-            historyMemory,
-            chain
-        })
+    if (!trimmed) {
+        return null
     }
 
-    async compressChunk({
-        chunk,
-        conversationId,
-        signal
-    }: ChatLunaInfiniteContextChunkArg): Promise<ChatLunaInfiniteContextChunkResult | null> {
-        const trimmedChunk = chunk?.trim()
-
-        if (!trimmedChunk) {
-            return null
-        }
-
-        const result = await this.chain.invoke({
-            conversation_chunk: trimmedChunk,
-            id: conversationId,
-            stream: false,
-            signal
-        })
-
-        const rawMessage = (result['message'] ?? null) as AIMessage | null
+    const chain = new ChatLunaLLMChain({ llm: model, prompt: COMPRESS_PROMPT })
 
-        const text =
-            (result['text'] ?? '').toString().trim() ||
-            (rawMessage ? getMessageContent(rawMessage.content).trim() : '')
-
-        if (!text) {
-            return null
-        }
-
-        return {
-            text,
-            usageMetadata: rawMessage?.usage_metadata
-        }
-    }
+    const result = await chain.invoke({
+        conversation_chunk: trimmed,
+        id: conversationId,
+        stream: false,
+        signal
+    })
 
-    async call(
-        arg: ChatLunaLLMCallArg & { chunk?: string }
-    ): Promise<ChainValues> {
-        const chunk = arg['chunk'] ?? getMessageContent(arg.message.content)
+    const rawMessage = (result['message'] ?? null) as AIMessage | null
 
-        if (!chunk?.trim()) {
-            throw new ChatLunaError(
-                ChatLunaErrorCode.UNKNOWN_ERROR,
-                new Error(
-                    'Empty context chunk passed to Infinite Context chain'
-                )
-            )
-        }
+    const text =
+        (result['text'] ?? '').toString().trim() ||
+        (rawMessage ? getMessageContent(rawMessage.content).trim() : '')
 
-        return this.chain.invoke({
-            conversation_chunk: chunk,
-            id: arg.conversationId,
-            stream: arg.stream,
-            signal: arg.signal,
-            maxTokens: arg.maxToken
-        })
+    if (!text) {
+        return null
     }
 
-    get model(): ChatLunaChatModel {
-        return this.chain.llm
+    return {
+        text,
+        usageMetadata: rawMessage?.usage_metadata
     }
 }
diff --git a/packages/core/src/llm-core/chat/app.ts b/packages/core/src/llm-core/chat/app.ts
index a3fa3a0b4..8c2aa77ca 100644
--- a/packages/core/src/llm-core/chat/app.ts
+++ b/packages/core/src/llm-core/chat/app.ts
@@ -24,8 +24,10 @@ import {
     initModel,
     supportChatMode
 } from './helper'
-import type { CompressContextResult } from './infinite_context'
-import { InfiniteContextManager } from './infinite_context'
+import {
+    type CompressContextResult,
+    compressIfNeeded
+} from './infinite_context'
 import type {
     ArchiveRecord,
     BindingRecord,
@@ -41,7 +43,6 @@ export class ChatInterface {
     private _embeddings: ComputedRef<Embeddings>
 
     private _historyMemory?: BufferMemory
-    private _infiniteContextManager?: InfiniteContextManager
 
     private _chatCount = 0
 
@@ -58,7 +59,6 @@ export class ChatInterface {
         this._chain = undefined
         this._embeddings = undefined
         this._historyMemory = undefined
-        this._infiniteContextManager = undefined
     }
 
     private async handleChatError(
@@ -157,10 +157,18 @@ export class ChatInterface {
             hasSavedUser = true
         }
 
-        try {
-            if (this.chatluna.currentConfig.infiniteContext) {
-                const manager = this._ensureInfiniteContextManager()
-                const result = await manager?.compressIfNeeded(wrapper)
+        // Compress chat history before starting
+        if (this.chatluna.currentConfig.infiniteContext && this._chatHistory) {
+            try {
+                const result = await compressIfNeeded({
+                    chatHistory: this._chatHistory,
+                    model: wrapper.model,
+                    conversationId: this._input.conversationId,
+                    preset: this._input.preset,
+                    threshold:
+                        this.chatluna.currentConfig.infiniteContextThreshold,
+                    signal: arg.signal
+                })
                 if (result?.messages) {
                     await this._chatHistory.replaceMessages(result.messages)
                 }
@@ -170,9 +178,9 @@ export class ChatInterface {
                         result
                     )
                 }
+            } catch (error) {
+                logger.error('Error compressing context:', error)
             }
-        } catch (error) {
-            logger.error('Error compressing context:', error)
         }
 
         const response = (await wrapper.call({
@@ -387,15 +395,21 @@ export class ChatInterface {
 
     async compressContext(force = false): Promise<CompressContextResult> {
         const wrapper = await this.getChatLunaLLMChainWrapper()
-        const manager = this._ensureInfiniteContextManager()
-        if (!manager) {
+        if (!this._chatHistory) {
             throw new ChatLunaError(
                 ChatLunaErrorCode.CHAT_HISTORY_INIT_ERROR,
                 new Error('Chat history is not initialized')
             )
         }
 
-        const result = await manager.compressIfNeeded(wrapper, force)
+        const result = await compressIfNeeded({
+            chatHistory: this._chatHistory,
+            model: wrapper.model,
+            conversationId: this._input.conversationId,
+            preset: this._input.preset,
+            threshold: this.chatluna.currentConfig.infiniteContextThreshold,
+            force
+        })
         if (result.messages) {
             await this._chatHistory.replaceMessages(result.messages)
         }
@@ -441,25 +455,6 @@ export class ChatInterface {
 
         return this._historyMemory
     }
-
-    private _ensureInfiniteContextManager():
-        | InfiniteContextManager
-        | undefined {
-        if (!this._chatHistory) {
-            return undefined
-        }
-
-        if (!this._infiniteContextManager) {
-            this._infiniteContextManager = new InfiniteContextManager({
-                chatHistory: this._chatHistory,
-                conversationId: this._input.conversationId,
-                preset: this._input.preset,
-                threshold: this.chatluna.currentConfig.infiniteContextThreshold
-            })
-        }
-
-        return this._infiniteContextManager
-    }
 }
 
 async function autoSummarizeTitle(
diff --git a/packages/core/src/llm-core/chat/infinite_context.ts b/packages/core/src/llm-core/chat/infinite_context.ts
index 3af3c4f5f..43d192c9b 100644
--- a/packages/core/src/llm-core/chat/infinite_context.ts
+++ b/packages/core/src/llm-core/chat/infinite_context.ts
@@ -1,18 +1,18 @@
-/* eslint-disable max-len */
 import {
     BaseMessage,
     HumanMessage,
     mapStoredMessageToChatMessage
 } from '@langchain/core/messages'
-import { ComputedRef } from '@vue/reactivity'
 import { logger } from 'koishi-plugin-chatluna'
-import { ChatLunaLLMChainWrapper } from 'koishi-plugin-chatluna/llm-core/chain/base'
 import { KoishiChatMessageHistory } from 'koishi-plugin-chatluna/llm-core/memory/message'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
 import { PresetTemplate } from 'koishi-plugin-chatluna/llm-core/prompt'
 import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
-import { ChatLunaInfiniteContextChain } from '../chain/infinite_context_chain'
+import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
+import { countMessagesTokens } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
+import { compressChunk } from '../chain/infinite_context_chain'
 import type { ChatLunaMessageMeta } from '../../types'
+import { ComputedRef } from '@vue/reactivity'
 
 export interface CompressContextResult {
     inputTokens: number
@@ -25,298 +25,238 @@ export interface CompressContextResult {
     messages?: BaseMessage[]
 }
 
-function formatTranscript(messages: BaseMessage[]) {
-    return messages
-        .map((message) => {
-            const role = message.getType().toUpperCase()
-            const name = message.name ? ` (${message.name})` : ''
-            const content = getMessageContent(message.content).trim()
-            return `[${role}${name}]\n${content || '(empty)'}`
-        })
-        .join('\n\n---\n\n')
-}
-
-export interface InfiniteContextManagerOptions {
+export interface CompressContextOptions {
     chatHistory: KoishiChatMessageHistory
+    model: ChatLunaChatModel
     conversationId: string
     preset?: ComputedRef<PresetTemplate>
     threshold?: number
+    force?: boolean
+    signal?: AbortSignal
 }
 
-export class InfiniteContextManager {
-    private _chain?: ChatLunaInfiniteContextChain
-
-    constructor(private readonly options: InfiniteContextManagerOptions) {}
-
-    async compressIfNeeded(
-        wrapper: ChatLunaLLMChainWrapper,
-        force = false
-    ): Promise<CompressContextResult> {
-        const model = wrapper.model
-
-        if (!model) {
-            return {
-                inputTokens: 0,
-                outputTokens: 0,
-                reducedTokens: 0,
-                reducedPercent: 0,
-                compressed: false,
-                originalMessageCount: 0,
-                remainingMessageCount: 0
-            }
-        }
-
-        const messages = await this.options.chatHistory.getMessages()
-
-        if (messages.length === 0) {
-            return {
-                inputTokens: 0,
-                outputTokens: 0,
-                reducedTokens: 0,
-                reducedPercent: 0,
-                compressed: false,
-                originalMessageCount: 0,
-                remainingMessageCount: 0
-            }
-        }
-
-        const inputTokens = await this._countMessagesTokens(model, messages)
-        const expiredToolResultText =
-            'This tool result expired after 1 hour, so the original output was removed.'
-        let compactedCount = 0
-        const compactedIndexes = new Set<number>()
-
-        for (let idx = 0; idx < messages.length; idx++) {
-            const message = messages[idx]
-
-            if (message.getType() !== 'tool') {
-                continue
-            }
-
-            const meta = message.response_metadata?.chatluna as
-                | ChatLunaMessageMeta
-                | undefined
-
-            if (meta?.createdAt == null) {
-                continue
-            }
-
-            if (Date.now() - new Date(meta.createdAt).getTime() < 3600000) {
-                continue
-            }
-
-            if (
-                getMessageContent(message.content).trim() ===
-                expiredToolResultText
-            ) {
-                continue
-            }
-
-            compactedCount++
-            compactedIndexes.add(idx)
-        }
-
-        const compactedMessages =
-            compactedCount > 0
-                ? messages.map((message, idx) => {
-                      const cloned = mapStoredMessageToChatMessage(
-                          message.toDict()
-                      )
-
-                      if (compactedIndexes.has(idx)) {
-                          cloned.content = expiredToolResultText
-                      }
-
-                      return cloned
-                  })
-                : messages
-        const nextMessages = compactedCount > 0 ? compactedMessages : messages
-        const compactedTokens =
-            compactedCount > 0
-                ? await this._countMessagesTokens(model, nextMessages)
-                : inputTokens
-        let presetTokens = 0
-        let threshold: number | undefined
-
-        if (compactedCount > 0) {
-            logger.info(
-                '[InfiniteContext] Replaced %d expired tool results before compression',
-                compactedCount
-            )
-        }
-
-        if (!force) {
-            const invocation = model.invocationParams()
-            const maxTokenLimit =
-                invocation.maxTokenLimit && invocation.maxTokenLimit > 0
-                    ? invocation.maxTokenLimit
-                    : model.getModelMaxContextSize()
-
-            if (!maxTokenLimit || maxTokenLimit <= 0) {
-                return {
-                    inputTokens,
-                    outputTokens: compactedTokens,
-                    reducedTokens: inputTokens - compactedTokens,
-                    reducedPercent:
-                        inputTokens > 0
-                            ? ((inputTokens - compactedTokens) / inputTokens) *
-                              100
-                            : 0,
-                    compressed: false,
-                    originalMessageCount: messages.length,
-                    remainingMessageCount: nextMessages.length,
-                    messages: compactedCount > 0 ? nextMessages : undefined
-                }
-            }
-
-            const presetMessages = Array.isArray(
-                this.options.preset?.value?.messages
-            )
-                ? (this.options.preset?.value.messages as BaseMessage[])
-                : []
-
-            presetTokens = await this._countMessagesTokens(
-                model,
-                presetMessages
-            )
-            threshold = Math.floor(
-                maxTokenLimit * (this.options.threshold ?? 0.85)
-            )
-
-            if (compactedTokens + presetTokens <= threshold) {
-                return {
-                    inputTokens,
-                    outputTokens: compactedTokens,
-                    reducedTokens: inputTokens - compactedTokens,
-                    reducedPercent:
-                        inputTokens > 0
-                            ? ((inputTokens - compactedTokens) / inputTokens) *
-                              100
-                            : 0,
-                    compressed: false,
-                    originalMessageCount: messages.length,
-                    remainingMessageCount: nextMessages.length,
-                    messages: compactedCount > 0 ? nextMessages : undefined
-                }
-            }
-
-            logger.info(
-                '[InfiniteContext] Start compression with history tokens: %d, total tokens: %d, threshold: %d',
-                compactedTokens,
-                compactedTokens + presetTokens,
-                threshold
-            )
-        } else {
-            logger.info(
-                '[InfiniteContext] Start manual compression with history tokens: %d',
-                compactedTokens
-            )
-        }
-
-        const transcript = formatTranscript(nextMessages)
-
-        if (!transcript.trim()) {
-            return {
-                inputTokens,
-                outputTokens: compactedTokens,
-                reducedTokens: inputTokens - compactedTokens,
-                reducedPercent:
-                    inputTokens > 0
-                        ? ((inputTokens - compactedTokens) / inputTokens) * 100
-                        : 0,
-                compressed: false,
-                originalMessageCount: messages.length,
-                remainingMessageCount: nextMessages.length,
-                messages: compactedCount > 0 ? nextMessages : undefined
-            }
-        }
-
-        const summary = await this._ensureInfiniteContextChain(
-            wrapper
-        ).compressChunk({
-            chunk: transcript,
-            conversationId: this.options.conversationId
-        })
-
-        if (!summary?.text.trim()) {
-            return {
-                inputTokens,
-                outputTokens: compactedTokens,
-                reducedTokens: inputTokens - compactedTokens,
-                reducedPercent:
-                    inputTokens > 0
-                        ? ((inputTokens - compactedTokens) / inputTokens) * 100
-                        : 0,
-                compressed: false,
-                originalMessageCount: messages.length,
-                remainingMessageCount: nextMessages.length,
-                messages: compactedCount > 0 ? nextMessages : undefined
-            }
-        }
-
-        const message = new HumanMessage({
-            content: summary.text.trim(),
-            name: 'infinite_context',
-            additional_kwargs: {
-                source: 'infinite-context'
-            }
-        })
+/**
+ * Compress chat history when token usage exceeds threshold.
+ * Produces structured output: [summary message, ...recent messages].
+ */
+export async function compressIfNeeded(
+    opts: CompressContextOptions
+): Promise<CompressContextResult> {
+    const { chatHistory, model, conversationId, force } = opts
+    const messages = await chatHistory.getMessages()
+
+    if (messages.length === 0) return emptyResult()
+
+    // Step 1: compact expired tool results
+    const compacted = compactExpiredToolResults(messages)
+
+    // Step 2: count tokens
+    const tokenCounter = (text: string) => model.getNumTokens(text)
+    const inputTokens = await countMessagesTokens(compacted, tokenCounter)
+
+    const noCompressResult = (): CompressContextResult => ({
+        ...emptyResult(),
+        inputTokens,
+        originalMessageCount: messages.length,
+        remainingMessageCount: compacted.length,
+        messages: compacted !== messages ? compacted : undefined
+    })
+
+    // Step 3: determine if compression is needed
+    if (!force) {
+        const invocation = model.invocationParams()
+        const maxTokenLimit =
+            invocation.maxTokenLimit && invocation.maxTokenLimit > 0
+                ? invocation.maxTokenLimit
+                : model.getModelMaxContextSize()
+
+        if (!maxTokenLimit || maxTokenLimit <= 0) return noCompressResult()
+
+        const presetMessages = Array.isArray(opts.preset?.value?.messages)
+            ? (opts.preset.value.messages as BaseMessage[])
+            : []
+        const presetTokens = await countMessagesTokens(
+            presetMessages,
+            tokenCounter
+        )
+        const threshold = Math.floor(maxTokenLimit * (opts.threshold ?? 0.85))
 
-        const outputTokens = summary.usageMetadata?.output_tokens ?? 0
-        const reducedTokens = inputTokens - outputTokens
-        const reducedPercent =
-            inputTokens > 0 ? (reducedTokens / inputTokens) * 100 : 0
+        if (inputTokens + presetTokens <= threshold) return noCompressResult()
 
         logger.info(
-            '[InfiniteContext] Compressed history from %d to %d (-%d, %s%%)',
+            '[InfiniteContext] Start compression: history=%d tokens, total=%d, threshold=%d',
             inputTokens,
-            outputTokens,
-            reducedTokens,
-            reducedPercent.toFixed(2)
+            inputTokens + presetTokens,
+            threshold
+        )
+    } else {
+        logger.info(
+            '[InfiniteContext] Manual compression: history=%d tokens',
+            inputTokens
         )
+    }
 
-        if (threshold != null && outputTokens + presetTokens > threshold) {
-            logger.warn(
-                '[InfiniteContext] Tokens remain above threshold after compression: %d > %d',
-                outputTokens + presetTokens,
-                threshold
-            )
-        }
+    // Step 4: split messages
+    const { toCompress, toKeep } = splitMessages(compacted)
+    if (toCompress.length === 0) return noCompressResult()
+
+    // Step 5: generate summary
+    const transcript = formatTranscript(toCompress)
+    if (!transcript.trim()) return noCompressResult()
+
+    const summary = await compressChunk(
+        model,
+        transcript,
+        conversationId,
+        opts.signal
+    )
+    if (!summary?.text.trim()) return noCompressResult()
+
+    // Step 6: build result
+    const summaryMessage = new HumanMessage({
+        content: summary.text.trim(),
+        name: 'infinite_context',
+        additional_kwargs: { source: 'infinite-context' }
+    })
+
+    const resultMessages = [summaryMessage, ...toKeep]
+    const outputTokens = await countMessagesTokens(resultMessages, tokenCounter)
+    const reducedTokens = inputTokens - outputTokens
+    const reducedPercent =
+        inputTokens > 0 ? (reducedTokens / inputTokens) * 100 : 0
+
+    logger.info(
+        '[InfiniteContext] Compressed: %d → %d tokens (-%d, %.2f%%), kept %d recent messages',
+        inputTokens,
+        outputTokens,
+        reducedTokens,
+        reducedPercent,
+        toKeep.length
+    )
+
+    return {
+        inputTokens,
+        outputTokens,
+        reducedTokens,
+        reducedPercent,
+        compressed: true,
+        originalMessageCount: messages.length,
+        remainingMessageCount: resultMessages.length,
+        messages: resultMessages
+    }
+}
 
-        return {
-            inputTokens,
-            outputTokens,
-            reducedTokens,
-            reducedPercent,
-            compressed: true,
-            originalMessageCount: messages.length,
-            remainingMessageCount: 1,
-            messages: [message]
-        }
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function emptyResult(): CompressContextResult {
+    return {
+        inputTokens: 0,
+        outputTokens: 0,
+        reducedTokens: 0,
+        reducedPercent: 0,
+        compressed: false,
+        originalMessageCount: 0,
+        remainingMessageCount: 0
     }
+}
 
-    private async _countMessagesTokens(
-        model: ChatLunaChatModel,
-        messages: BaseMessage[]
-    ): Promise<number> {
-        let total = 0
+/**
+ * Replace expired (>1h) tool result content with a placeholder.
+ */
+function compactExpiredToolResults(messages: BaseMessage[]): BaseMessage[] {
+    const placeholder =
+        'This tool result expired after 1 hour, so the original output was removed.'
+    let changed = false
+
+    const result = messages.map((msg) => {
+        if (msg.getType() !== 'tool') return msg
+        const meta = msg.response_metadata?.chatluna as
+            | ChatLunaMessageMeta
+            | undefined
+        if (!meta?.createdAt) return msg
+        if (Date.now() - new Date(meta.createdAt).getTime() < 3600000)
+            return msg
+        if (getMessageContent(msg.content).trim() === placeholder) return msg
+
+        changed = true
+        const cloned = msg.toDict()
+        cloned.data.content = placeholder
+        return mapStoredMessageToChatMessage(cloned)
+    })
+
+    return changed ? result : messages
+}
 
-        for (const message of messages) {
-            total += await model.countMessageTokens(message)
+/**
+ * Split messages into [toCompress, toKeep].
+ * Keep the most recent complete conversation rounds.
+ * A round starts at a user message (HumanMessage or ChatLuna user message).
+ */
+function splitMessages(messages: BaseMessage[]): {
+    toCompress: BaseMessage[]
+    toKeep: BaseMessage[]
+} {
+    // Build rounds from the end
+    const rounds: BaseMessage[][] = []
+    let current: BaseMessage[] = []
+
+    for (let i = messages.length - 1; i >= 0; i--) {
+        const msg = messages[i]
+        current.unshift(msg)
+
+        const isRoundStart =
+            isChatLunaUserMessage(msg) || msg.getType() === 'human'
+
+        if (isRoundStart && i > 0) {
+            rounds.unshift(current)
+            current = []
         }
+    }
 
-        return total
+    if (current.length > 0) {
+        rounds.unshift(current)
     }
 
-    private _ensureInfiniteContextChain(
-        wrapper: ChatLunaLLMChainWrapper
-    ): ChatLunaInfiniteContextChain {
-        if (!this._chain || this._chain.model !== wrapper.model) {
-            this._chain = ChatLunaInfiniteContextChain.fromLLM(wrapper.model, {
-                historyMemory: wrapper.historyMemory
-            })
-        }
+    // Keep at least the last round, at most last 3 rounds
+    const keepCount = Math.min(Math.max(1, Math.ceil(rounds.length * 0.3)), 3)
+    const splitIdx = rounds.length - keepCount
 
-        return this._chain
-    }
+    const toCompress = rounds.slice(0, splitIdx).flat()
+    const toKeep = rounds.slice(splitIdx).flat()
+
+    return { toCompress, toKeep }
+}
+
+/**
+ * Format messages into a transcript string for the LLM summarizer.
+ */
+function formatTranscript(messages: BaseMessage[]): string {
+    return messages
+        .map((msg) => {
+            const role = msg.getType().toUpperCase()
+            const name = msg.name ? ` (${msg.name})` : ''
+            const content = getMessageContent(msg.content).trim()
+
+            const toolCalls = msg['tool_calls'] as
+                | { name: string; args: unknown }[]
+                | undefined
+            const toolInfo =
+                toolCalls?.length > 0
+                    ? '\nTool calls: ' +
+                      toolCalls
+                          .map((tc) => {
+                              const args = JSON.stringify(tc.args)
+                              return `${tc.name}(${args.length > 200 ? args.slice(0, 200) + '...' : args})`
+                          })
+                          .join(', ')
+                    : ''
+
+            const callId = msg['tool_call_id'] as string | undefined
+            const idInfo = callId ? ` [call_id: ${callId}]` : ''
+
+            return `[${role}${name}${idInfo}]\n${content || '(empty)'}${toolInfo}`
+        })
+        .join('\n\n---\n\n')
 }
diff --git a/packages/core/src/llm-core/platform/model.ts b/packages/core/src/llm-core/platform/model.ts
index ad657c126..9ad10c0ef 100644
--- a/packages/core/src/llm-core/platform/model.ts
+++ b/packages/core/src/llm-core/platform/model.ts
@@ -806,7 +806,7 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
             return rounds
         }
 
-        const countMessagesTokens = async (items: BaseMessage[]) => {
+        const countRoundTokens = async (items: BaseMessage[]) => {
             let tokens = 0
             for (const item of items) {
                 tokens += await this.countMessageTokens(item)
@@ -817,24 +817,65 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
         const conversationRounds = buildConversationRounds(messages)
         const selectedRounds: BaseMessage[][] = []
         let truncated = false
+        const hasLimit = maxTokenLimit != null && maxTokenLimit > 0
+
+        // Find baseline: last AI message with usage_metadata
+        let baselineRoundIdx = -1
+        let baselineTokens = 0
+        if (hasLimit) {
+            let msgOffset = 0
+            for (let r = 0; r < conversationRounds.length; r++) {
+                for (let j = 0; j < conversationRounds[r].length; j++) {
+                    const msg = messages[msgOffset + j]
+                    if (msg.getType() === 'ai') {
+                        const usage = (msg as AIMessage).usage_metadata
+                        if (usage?.input_tokens > 0) {
+                            baselineRoundIdx = r
+                            baselineTokens = usage.input_tokens - totalTokens
+                        }
+                    }
+                }
+                msgOffset += conversationRounds[r].length
+            }
+            // Add tokens for messages after the baseline AI msg within its round
+            if (baselineRoundIdx >= 0) {
+                // Count the tail of the baseline round (AI msg itself + tool msgs)
+                for (const msg of conversationRounds[baselineRoundIdx]) {
+                    if (msg.getType() === 'ai' || msg.getType() === 'tool') {
+                        baselineTokens += await this.countMessageTokens(msg)
+                    }
+                }
+            }
+        }
 
+        // Select rounds from end to start
         for (let i = conversationRounds.length - 1; i >= 0; i--) {
-            const round = conversationRounds[i]
-            const roundTokens = await countMessagesTokens(round)
-            const exceedsLimit =
-                maxTokenLimit != null && maxTokenLimit > 0
-                    ? totalTokens + roundTokens > maxTokenLimit
-                    : false
-
-            if (exceedsLimit && selectedRounds.length > 0) {
+            // If we hit the baseline region, bulk-add everything up to it
+            if (baselineRoundIdx >= 0 && i <= baselineRoundIdx) {
+                if (hasLimit && totalTokens + baselineTokens > maxTokenLimit) {
+                    truncated = true
+                    break
+                }
+                totalTokens += baselineTokens
+                for (let j = 0; j <= i; j++) {
+                    selectedRounds.unshift(conversationRounds[j])
+                }
+                break
+            }
+
+            const roundTokens = await countRoundTokens(conversationRounds[i])
+            const exceeds =
+                hasLimit && totalTokens + roundTokens > maxTokenLimit
+
+            if (exceeds && selectedRounds.length > 0) {
                 truncated = true
                 break
             }
 
             totalTokens += roundTokens
-            selectedRounds.unshift(round)
+            selectedRounds.unshift(conversationRounds[i])
 
-            if (exceedsLimit) {
+            if (exceeds) {
                 truncated = true
                 break
             }
@@ -842,7 +883,7 @@ export class ChatLunaChatModel extends BaseChatModel<ChatLunaModelCallOptions> {
 
         if (conversationRounds.length > 0 && selectedRounds.length === 0) {
             const round = conversationRounds[conversationRounds.length - 1]
-            totalTokens += await countMessagesTokens(round)
+            totalTokens += await countRoundTokens(round)
             selectedRounds.unshift(round)
             truncated = maxTokenLimit != null && maxTokenLimit > 0
         }
diff --git a/packages/core/src/llm-core/prompt/system_prompts.ts b/packages/core/src/llm-core/prompt/system_prompts.ts
index 5269dcbc4..b6c679e59 100644
--- a/packages/core/src/llm-core/prompt/system_prompts.ts
+++ b/packages/core/src/llm-core/prompt/system_prompts.ts
@@ -1,4 +1,4 @@
-import { BaseMessage, SystemMessage } from '@langchain/core/messages'
+import { SystemMessage } from '@langchain/core/messages'
 import { HumanMessagePromptTemplate } from '@langchain/core/prompts'
 import {
     ChatLunaContextManagerService,
@@ -6,49 +6,12 @@ import {
     PromptPipelineMiddleware
 } from './context_manager'
 import { logger } from 'koishi-plugin-chatluna'
-import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
-import { messageTypeToOpenAIRole } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
-
-// ---------------------------------------------------------------------------
-// Token counting helpers (shared by multiple middlewares)
-// ---------------------------------------------------------------------------
-
-export async function countMessageTokens(
-    message: BaseMessage,
-    tokenCounter: (text: string) => Promise<number>
-): Promise<number> {
-    let content = getMessageContent(message.content)
-
-    if (
-        content.includes('![image]') &&
-        content.includes('base64') &&
-        message.additional_kwargs?.['images']
-    ) {
-        content = content.replaceAll(/!\[.*?\]\(.*?\)/g, '')
-        message.content = content
-    }
-
-    let result =
-        (await tokenCounter(getMessageContent(message.content))) +
-        (await tokenCounter(messageTypeToOpenAIRole(message.getType())))
-
-    if (message.name) {
-        result += await tokenCounter(message.name)
-    }
-
-    return result
-}
+import {
+    countMessagesTokens,
+    countMessageTokens
+} from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
 
-export async function countMessagesTokens(
-    messages: BaseMessage[],
-    tokenCounter: (text: string) => Promise<number>
-): Promise<number> {
-    let total = 0
-    for (const message of messages) {
-        total += await countMessageTokens(message, tokenCounter)
-    }
-    return total
-}
+export { countMessageTokens, countMessagesTokens }
 
 // ---------------------------------------------------------------------------
 // system_prompts pipeline middleware
diff --git a/packages/core/src/llm-core/utils/count_tokens.ts b/packages/core/src/llm-core/utils/count_tokens.ts
index 3e9d3a489..1813589c2 100644
--- a/packages/core/src/llm-core/utils/count_tokens.ts
+++ b/packages/core/src/llm-core/utils/count_tokens.ts
@@ -1,6 +1,7 @@
-import { MessageType } from '@langchain/core/messages'
+import { AIMessage, BaseMessage, MessageType } from '@langchain/core/messages'
 import { type TiktokenModel } from 'js-tiktoken/lite'
 import { encodingForModel } from './tiktoken'
+import { getMessageContent } from 'koishi-plugin-chatluna/utils/string'
 
 // https://www.npmjs.com/package/js-tiktoken
 
@@ -207,3 +208,58 @@ export function parseRawModelName(
 
     return [value.slice(0, index), value.slice(index + 1)]
 }
+
+// ---------------------------------------------------------------------------
+// Token counting with usage_metadata baseline optimization
+// ---------------------------------------------------------------------------
+
+export async function countMessageTokens(
+    message: BaseMessage,
+    tokenCounter: (text: string) => Promise<number>
+): Promise<number> {
+    let content = getMessageContent(message.content)
+
+    if (
+        content.includes('![image]') &&
+        content.includes('base64') &&
+        message.additional_kwargs?.['images']
+    ) {
+        content = content.replaceAll(/!\[.*?\]\(.*?\)/g, '')
+    }
+
+    return (
+        (await tokenCounter(content)) +
+        (await tokenCounter(messageTypeToOpenAIRole(message.getType()))) +
+        (message.name ? await tokenCounter(message.name) : 0)
+    )
+}
+
+/**
+ * Count tokens for messages. Uses the last AI message's usage_metadata as
+ * baseline to skip re-counting earlier messages.
+ */
+export async function countMessagesTokens(
+    messages: BaseMessage[],
+    tokenCounter: (text: string) => Promise<number>,
+    presetTokens = 0
+): Promise<number> {
+    // Find last AI message with usage_metadata as baseline
+    for (let i = messages.length - 1; i >= 0; i--) {
+        if (messages[i].getType() !== 'ai') continue
+        const usage = (messages[i] as AIMessage).usage_metadata
+        if (usage?.input_tokens > 0) {
+            let tail = 0
+            for (let j = i; j < messages.length; j++) {
+                tail += await countMessageTokens(messages[j], tokenCounter)
+            }
+            return Math.max(usage.input_tokens - presetTokens + tail, 0)
+        }
+    }
+
+    // Fallback: count all
+    let total = 0
+    for (const msg of messages) {
+        total += await countMessageTokens(msg, tokenCounter)
+    }
+    return total
+}
diff --git a/packages/core/src/middlewares/chat/read_chat_message.ts b/packages/core/src/middlewares/chat/read_chat_message.ts
index 7c97418a0..3bfa4f6bd 100644
--- a/packages/core/src/middlewares/chat/read_chat_message.ts
+++ b/packages/core/src/middlewares/chat/read_chat_message.ts
@@ -249,7 +249,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
                     : undefined
 
             const isInstalledImageService =
-                ctx.chatluna.getPlugin('chatluna-multimodal-service') != null
+                ctx.chatluna.getPlugin('multimodal-service') != null
 
             if (
                 parsedModelInfo?.value != null &&
@@ -261,7 +261,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
                     logger.warn(
                         `Model "${model}" does not support image input. ` +
                             'Please use a model that supports vision capabilities, ' +
-                            'or install chatluna-multimodal-service plugin to enable image description.'
+                            'or install chatluna-multimodal-service (multimodal-service) plugin to enable image description.'
                     )
                 }
                 return false
@@ -292,7 +292,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) {
             if (ext === 'image/gif') {
                 if (!isInstalledImageService) {
                     logger.warn(
-                        `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service plugin to parse GIF animations.`
+                        `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.`
                     )
                 }
                 return false
@@ -625,7 +625,7 @@ async function handleFileElement(
                 ctx.chatluna.getPlugin('multimodal-service') != null
             if (!isInstalledMultimodalService) {
                 logger.warn(
-                    `Unsupported audio format "${mimeType}". Please install chatluna-multimodal-service plugin to handle this format.`
+                    `Unsupported audio format "${mimeType}". Please install chatluna-multimodal-service (multimodal-service) plugin to handle this format.`
                 )
             }
             return false
@@ -737,7 +737,7 @@ async function oldImageRead(
         if (ext === 'image/gif') {
             if (!isInstalledImageService) {
                 logger.warn(
-                    `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service plugin to parse GIF animations.`
+                    `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.`
                 )
             }
             return false
diff --git a/packages/extension-usage/client/charts/token-line.ts b/packages/extension-usage/client/charts/token-line.ts
index 8f1bee9e7..376405a77 100644
--- a/packages/extension-usage/client/charts/token-line.ts
+++ b/packages/extension-usage/client/charts/token-line.ts
@@ -55,7 +55,7 @@ function hour(date: string | Date) {
 function tooltip(
     params: { marker?: string; seriesName: string; value: number }[],
     theme: typeof chartTheme.value,
-    skipZero = false
+    skipZero = true
 ) {
     const row =
         'display:flex;align-items:center;justify-content:space-between;' +