Commits
93 commits
bdb3791
[None][feat] Wire KVCacheBlock to UnifiedBlockTree, replacing mPrevBl…
SimengLiu-nv Mar 4, 2026
bd4810a
Address comments.
SimengLiu-nv Mar 5, 2026
27574b9
block allocation and reusing works for linear attention
VALLIS-NERIA Jan 28, 2026
3543bbe
copy states during context shifts
VALLIS-NERIA Jan 30, 2026
36aa474
fix corner cases
VALLIS-NERIA Feb 4, 2026
cd1a67b
temp stage: accuracy w/o reuse ok
VALLIS-NERIA Mar 2, 2026
94d4312
temp stage: accuracy with reuse ok
VALLIS-NERIA Mar 2, 2026
d885842
fix merge conflicts
VALLIS-NERIA Mar 9, 2026
603d822
Merge remote-tracking branch 'origin/main' into pr-11919
VALLIS-NERIA Mar 9, 2026
b398561
temporary stage
VALLIS-NERIA Mar 13, 2026
df7284a
fix multiple issues
VALLIS-NERIA Mar 14, 2026
ce9674a
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 14, 2026
cab2412
use pre calculated buffers
VALLIS-NERIA Mar 14, 2026
a1889b8
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 17, 2026
22e7fd2
scheduler support
VALLIS-NERIA Mar 18, 2026
6475692
FIFO placeholder management
VALLIS-NERIA Mar 18, 2026
3312fa9
remove debug prints in module/op level
VALLIS-NERIA Mar 18, 2026
9b73cbf
change memory layout to layer first
VALLIS-NERIA Mar 18, 2026
efbb815
fix scheduler
VALLIS-NERIA Mar 18, 2026
aa15395
auto choose mamba cache manager impl
VALLIS-NERIA Mar 19, 2026
5bfda48
format code
VALLIS-NERIA Mar 19, 2026
f9e2ad0
fix unhandled kFORCE_CHUNK enum in switch statement
VALLIS-NERIA Mar 20, 2026
1810dba
fix config of current implementation
VALLIS-NERIA Mar 20, 2026
cf50425
merge upstream main and resolve conflicts
VALLIS-NERIA Mar 20, 2026
4dd57bf
fix missing is_nemotron_hybrid/is_qwen3_hybrid imports
VALLIS-NERIA Mar 20, 2026
b4e54e7
remove some hacks
VALLIS-NERIA Mar 20, 2026
ee0b690
[Agent fix] restore block reuse defaults and fix AutoDeploy mamba_lay…
VALLIS-NERIA Mar 20, 2026
782c46f
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 21, 2026
75b9438
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 22, 2026
c27c351
revert to use old mambacachemanager as default
VALLIS-NERIA Mar 22, 2026
850bd66
[Agent fix] Remove debug prints, commented debug code, and tensor dum…
VALLIS-NERIA Mar 22, 2026
b0921fb
fix not mine unit tests
VALLIS-NERIA Mar 22, 2026
7f03f58
temporary disable my unit tests to run CI
VALLIS-NERIA Mar 22, 2026
d020bf6
Revert "revert to use old mambacachemanager as default"
VALLIS-NERIA Mar 22, 2026
98da518
only auto-deploy uses old mambacachemanager & fix beam search
VALLIS-NERIA Mar 22, 2026
eb0044d
use ceil div for head split
VALLIS-NERIA Mar 23, 2026
325e454
get rid of model_config
VALLIS-NERIA Mar 23, 2026
27ef0bf
[TRTLLM-10061][fix] Use ceil_div for head/size calculations in model_…
VALLIS-NERIA Mar 23, 2026
8620daa
[TRTLLM-10061][feat] Add stride support for conv1d and fused_sigmoid_…
VALLIS-NERIA Mar 23, 2026
398495f
fix memory usage and model_config check
VALLIS-NERIA Mar 23, 2026
41f1b77
Remove index bounds checking in h0_source store
VALLIS-NERIA Mar 23, 2026
1281d40
Merge remote-tracking branch 'fork/user/xiweny/ceil_div_model_config'…
VALLIS-NERIA Mar 23, 2026
96c62e7
Merge remote-tracking branch 'fork/user/xiweny/stride_support' into u…
VALLIS-NERIA Mar 23, 2026
1c83f1e
refine evictionpolicy
VALLIS-NERIA Mar 23, 2026
7dacd9e
refine mamba cache manager
VALLIS-NERIA Mar 24, 2026
ab3bc32
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 24, 2026
b31dd85
clean up unnecessary chagnes
VALLIS-NERIA Mar 24, 2026
81eb415
fix
VALLIS-NERIA Mar 24, 2026
12d8dda
add tests for scheduler
VALLIS-NERIA Mar 24, 2026
5d3a46e
[TRTLLM-10061][feat] Add FORCE_CHUNK context chunking policy
VALLIS-NERIA Mar 24, 2026
730d6ea
Merge remote-tracking branch 'origin' into user/xiweny/force_chunk_po…
VALLIS-NERIA Mar 24, 2026
6ed49f3
add tests for scheduler
VALLIS-NERIA Mar 24, 2026
807e9d3
improve comments
VALLIS-NERIA Mar 24, 2026
45f0fa5
fix
VALLIS-NERIA Mar 24, 2026
5f54054
Merge branch 'main' into user/xiweny/force_chunk_policy
VALLIS-NERIA Mar 24, 2026
a8dea92
fix kvcache manager ut
VALLIS-NERIA Mar 24, 2026
4717171
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 24, 2026
43469ec
Merge branch 'main' into user/xiweny/force_chunk_policy
VALLIS-NERIA Mar 24, 2026
422b80f
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 25, 2026
bda1763
Merge branch 'main' into user/xiweny/force_chunk_policy
VALLIS-NERIA Mar 25, 2026
86b437a
clean c++ code
VALLIS-NERIA Mar 25, 2026
6f0de1b
Merge remote-tracking branch 'fork/user/xiweny/force_chunk_policy' in…
VALLIS-NERIA Mar 25, 2026
414c4d2
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Mar 25, 2026
bf5599d
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Apr 3, 2026
c1129e4
remove warning at exit
VALLIS-NERIA Apr 3, 2026
82b7049
rename
VALLIS-NERIA Apr 3, 2026
42cd76b
[TRTLLM-10061][fix] Address review items for linear attention hybrid …
VALLIS-NERIA Apr 3, 2026
d616e76
fix style
VALLIS-NERIA Apr 3, 2026
d8dbaf0
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Apr 3, 2026
162a5eb
[Agent fix] Add missing triton import in modeling_qwen3_next.py
VALLIS-NERIA Apr 3, 2026
18d8650
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Apr 4, 2026
7a1b3eb
[Agent fix] Remove extra blank line in cuda_graph_runner.py
VALLIS-NERIA Apr 4, 2026
9319bf7
refine tests
VALLIS-NERIA Apr 4, 2026
fe01292
[Agent fix] Add missing imports for ruff-legacy compliance (F821/F811)
VALLIS-NERIA Apr 4, 2026
ea36783
[Agent fix] Remove duplicate ids kwarg from parametrize_with_ids call
VALLIS-NERIA Apr 5, 2026
f40cc41
[Agent fix] Update test list entries for parametrized test_fp8
VALLIS-NERIA Apr 5, 2026
67a4a55
[Agent fix] Fix CppMambaHybridCacheManager.get_state_indices signature
VALLIS-NERIA Apr 5, 2026
cc52b20
[Agent fix] Use MixedMambaHybridCacheManager in test_mamba_transfer f…
VALLIS-NERIA Apr 5, 2026
9247acb
Merge remote-tracking branch 'origin/main' into user/xiweny/linear_re…
VALLIS-NERIA Apr 5, 2026
e56e95e
[Agent fix] Add missing spec_metadata parameter to Qwen3NextGatedDelt…
VALLIS-NERIA Apr 6, 2026
605c6af
[Agent fix] Remove duplicate Qwen3NextGatedDeltaNet from modeling_qwe…
VALLIS-NERIA Apr 6, 2026
ee653a6
fix silly AI, unify naming and test
VALLIS-NERIA Apr 6, 2026
d3c7589
WAR block save issue
VALLIS-NERIA Apr 6, 2026
e8e8520
address comments
VALLIS-NERIA Apr 6, 2026
079f8bf
fix attention DP sharding
VALLIS-NERIA Apr 7, 2026
0115a80
Merge branch 'main' into user/xiweny/linear_reuse_new
xinhe-nv Apr 7, 2026
67a3c57
Merge remote-tracking branch 'fork/user/xiweny/linear_reuse_new' into…
VALLIS-NERIA Apr 7, 2026
ad051e9
address commentes
VALLIS-NERIA Apr 7, 2026
9a34a49
fix the placeholder issue
VALLIS-NERIA Apr 7, 2026
daa6320
Update l0_gb200_multi_gpus.yml
VALLIS-NERIA Apr 8, 2026
866dfcc
address comments
VALLIS-NERIA Apr 8, 2026
3915d7d
address comments
VALLIS-NERIA Apr 9, 2026
95923ce
[None][feat] Enable mamba/linear attention cache reuse in scheduler (…
VALLIS-NERIA Apr 9, 2026
14 changes: 10 additions & 4 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -189,6 +189,10 @@ struct LinearAttentionMetadata
// take a snapshot every `blockAlignment` blocks.
auto perBlockBytes = allRecurrentStatesBytes * numLayers;
auto numDynamicBlocks = (memoryBudget / perBlockBytes);
TLLM_LOG_INFO(
"Calculated max memory blocks for linear cache with recurrent states: memoryBudget=%zu, "
"perBlockBytes=%zu, numDynamicBlocks=%d",
memoryBudget, perBlockBytes, numDynamicBlocks);
return static_cast<SizeType32>(numDynamicBlocks);
}
TLLM_THROW("Unknown linear cache type");
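The block-count arithmetic in this hunk is straightforward integer division; a minimal Python sketch (function name hypothetical, mirroring the C++ variables above):

```python
def num_dynamic_blocks(memory_budget: int, all_recurrent_states_bytes: int,
                       num_layers: int) -> int:
    """Mirror of the C++ calculation above: one snapshot block stores the
    recurrent states of every layer, so the block count is the memory
    budget divided by the per-block footprint (floor division)."""
    per_block_bytes = all_recurrent_states_bytes * num_layers
    return memory_budget // per_block_bytes
```

With a 1024-byte budget, 16 bytes of recurrent state per layer, and 4 layers, this yields 16 dynamic blocks.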
@@ -356,10 +360,7 @@ class KVCacheBlock : public std::enable_shared_from_this<KVCacheBlock>
static BlockPtr createPlaceholder(IdType blockId, SizeType32 windowSize);

void detachDescendantsFromLookupTree();
//! \brief Detach all placeholder blocks in the previous-block chain from the lookup tree.
//! \details Walks upward via getPrevBlock() and calls detachFromLookupNode() on each
//! block that is a placeholder. Stops at the root (kCachedBlocksRootId).
void detachPreviousPlaceholdersFromLookupTree() const;

void freeBlockAndAllDescendants();

//! \brief Find block matching blockKey. If allowPartial is true, the returned block may match only a prefix of
@@ -520,6 +521,11 @@ class GenerationRequest
return mCacheBlockIds.at(windowSize);
}

[[nodiscard]] std::vector<std::vector<SizeType32>>& getCacheBlockIds(SizeType32 windowSize)
{
return mCacheBlockIds.at(windowSize);
}

[[nodiscard]] runtime::ITensor& getCacheBlockIndices(SizeType32 windowSize)
{
return *(mCacheBlockIndices.at(windowSize));
69 changes: 42 additions & 27 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -494,34 +494,9 @@ void KVCacheBlock::detachDescendantsFromLookupTree()
}
}

void KVCacheBlock::detachPreviousPlaceholdersFromLookupTree() const
{
BlockPtr current = getPrevBlock();
while (current != nullptr && current->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
{
if (!current->isPlaceholder())
{
return;
}
auto siblings = current->getNextBlocks();
for (auto const& [key, block] : siblings)
{
if (!block->isPlaceholder() && block.get() != this)
{
return;
}
}
BlockPtr prev = current->getPrevBlock();
current->detachFromLookupNode();
current->setPrevBlockInSeq(nullptr);
current = prev;
}
}

void KVCacheBlock::freeBlockAndAllDescendants()
{
detachDescendantsFromLookupTree();
detachPreviousPlaceholdersFromLookupTree();
detachFromLookupNode();
}

@@ -861,7 +836,9 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
mLogPrefix.c_str(), numPlaceholderBlocks, KVCacheBlock::kCachedBlocksRootId - 1 - numPlaceholderBlocks,
KVCacheBlock::kCachedBlocksRootId - 2);
TLLM_CHECK_WITH_INFO(isRecurrentState(),
"numPlaceholderBlocks > 0 is only supported for recurrent-state (kRecurrentStates) managers");
"numPlaceholderBlocks > 0 is only supported for recurrent-state (kRecurrentStates) managers, but this "
"manager has windowSize=%d and isSWA=%d",
windowSize, isSWA);
mAllPlaceholderBlocksById.resize(numPlaceholderBlocks + 2, nullptr);
for (SizeType32 i = 0; i < numPlaceholderBlocks; ++i)
{
@@ -1859,7 +1836,16 @@ bool WindowBlockManager::tryAllocatePlaceholderForLinearAttention(GenerationRequ
for (auto beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
auto lastBlockId = lastBlockIds[beamIdx];
TLLM_CHECK(lastBlockId >= 0);
if (lastBlockId < 0)
{
TLLM_THROW("%s::tryAllocatePlaceholderForLinearAttention - invalid last block id %d for beam %d; "
"the sequence has no allocated block to swap with the placeholder",
mLogPrefix.c_str(), lastBlockId, beamIdx);
}
}
TLLM_LOG_DEBUG("%s::allocateBlock - Swapping placeholder with last block %d for beam %d", mLogPrefix.c_str(),
lastBlockId, beamIdx);
auto lastBlock = getBlockById(lastBlockId);
@@ -2198,6 +2184,35 @@ void BlockManager::releaseLastBlock(GenerationRequest& sequence, SizeType32 wind

void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence)
{
if (isRecurrentState())
{
// In recurrent state, the last block always contains the current state and must not be released.
// The only caller of releaseLastBlock is speculative-decoding rewind, which happens only during the
// decode phase, so we pop off the second-to-last block instead; it is expected to be a placeholder.
auto const requestId = sequence.getRequestId();
auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId);
TLLM_CHECK(allocatedBlocks.size() >= 2);
auto it = allocatedBlocks.rbegin();
auto& lastBlock = *it;
auto& secondLastBlock = *(++it);
TLLM_CHECK(secondLastBlock->isPlaceholder());
// Decrease ref count of the second last block (placeholder)
secondLastBlock->decRefCount();
if (!secondLastBlock->hasRefs())
{
mEvictionPolicy->releaseBlock(secondLastBlock, true);
}
// Remove the second last block from allocated blocks
allocatedBlocks.erase((++it).base());
// Remove stored block ids in sequence
auto beamWidth = sequence.getBeamWidth();
for (auto beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
sequence.getCacheBlockIds(mWindowSize)[beamIdx].erase(
sequence.getCacheBlockIds(mWindowSize)[beamIdx].end() - 2);
}
return;
}
auto const requestId = sequence.getRequestId();
auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId);
auto it = allocatedBlocks.rbegin();
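The reverse-iterator erase above (`allocatedBlocks.erase((++it).base())`) removes the second-to-last element: for a reverse iterator sitting on index `n-3`, `.base()` yields the forward iterator at index `n-2`. The same bookkeeping can be sketched in Python (a hypothetical simplification that ignores ref counting and the eviction policy):

```python
def release_second_last_block(allocated_blocks: list):
    """For recurrent-state managers the last block holds the live state,
    so rewinding removes the second-to-last (placeholder) block instead
    of the last one. Returns the removed block."""
    assert len(allocated_blocks) >= 2, "need at least a placeholder plus the state block"
    # list.pop(-2) is the Python analogue of erase((++it).base()) on the
    # reverse iterator positioned two elements from the end.
    return allocated_blocks.pop(-2)
```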
Empty diff: cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp — file mode changed 100755 → 100644, no content changes.
5 changes: 2 additions & 3 deletions tensorrt_llm/_torch/attention_backend/interface.py
@@ -22,7 +22,7 @@

from ..memory_buffer_utils import Buffers
from ..metadata import KVCacheParams
from ..pyexecutor.mamba_cache_manager import MambaCacheManager
from ..pyexecutor.mamba_cache_manager import BaseMambaCacheManager
from ..pyexecutor.resource_manager import KVCacheManager, KVCacheManagerV2
from ..utils import get_model_extra_attrs

@@ -305,8 +305,7 @@ def _prepare_mamba_metadata(self):
return

if self.mamba_metadata is None:
if (self.kv_cache_manager is not None
and isinstance(self.kv_cache_manager, MambaCacheManager)):
if isinstance(self.kv_cache_manager, BaseMambaCacheManager):
from ..modules.mamba.mamba2_metadata import Mamba2Metadata
self.mamba_metadata = Mamba2Metadata(self.max_num_requests,
self.mamba_chunk_size)
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -51,7 +51,7 @@
from ...._utils import get_free_port, mpi_rank, mpi_world_size
from ....mapping import Mapping
from ...distributed import Distributed
from ...pyexecutor.mamba_cache_manager import MambaHybridCacheManager
from ...pyexecutor.mamba_cache_manager import BaseMambaCacheManager
from ...pyexecutor.model_engine import ModelEngine, PyTorchModelEngine
from ...pyexecutor.py_executor import PyExecutor
from ...pyexecutor.resource_manager import (
@@ -292,7 +292,7 @@ def _generate_dummy_request(
)

# check if it's a hybrid kv-cache manager
is_hybrid_cache = isinstance(kv_cache_manager, MambaHybridCacheManager)
is_hybrid_cache = isinstance(kv_cache_manager, BaseMambaCacheManager)

# check if we have a free page and free state available
if not kv_cache_manager.get_num_free_blocks():
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/auto_deploy/shim/interface.py
@@ -11,7 +11,7 @@
from tensorrt_llm.mapping import Mapping

from ...._utils import torch_dtype_to_binding
from ...pyexecutor.mamba_cache_manager import MambaHybridCacheManager
from ...pyexecutor.mamba_cache_manager import MambaHybridCacheManager, MixedMambaHybridCacheManager
from ...pyexecutor.resource_manager import KVCacheManager
from ..custom_ops.attention_interface import (
CausalConvResourceHandler,
@@ -511,7 +511,7 @@ def _create_and_assign_state_views(
num_managed_mamba_layers = mamba_params["mamba_num_layers"]

# Create the hybrid cache manager
manager = MambaHybridCacheManager(
manager = MixedMambaHybridCacheManager(
**mamba_params,
**kv_cache_kwargs,
)
56 changes: 44 additions & 12 deletions tensorrt_llm/_torch/model_config.py
@@ -4,27 +4,32 @@
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Generic, List, Optional, TypeVar
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Optional, TypeVar

import filelock
import torch
import transformers
from transformers.utils import HF_MODULES_CACHE

from tensorrt_llm import logger
from tensorrt_llm._torch.pyexecutor.config_utils import (
get_qwen3_hybrid_num_attention_layers, is_nemotron_hybrid, is_qwen3_hybrid,
load_pretrained_config)
get_qwen3_hybrid_num_attention_layers, is_hybrid_linear, is_nemotron_hybrid,
is_qwen3_hybrid, load_pretrained_config)
from tensorrt_llm._utils import get_sm_version, torch_dtype_to_binding
from tensorrt_llm.bindings import LayerType as LayerTypeCpp
from tensorrt_llm.functional import AllReduceStrategy
from tensorrt_llm.llmapi.llm_args import (DeepSeekSparseAttentionConfig,
MoeLoadBalancerConfig)
KvCacheConfig, MoeLoadBalancerConfig)
from tensorrt_llm.logger import logger
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization.mode import QuantAlgo

if TYPE_CHECKING:
from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp
from tensorrt_llm.llmapi.llm_args import (DecodingBaseConfig, LoraConfig,
SparseAttentionConfig,
SpeculativeConfig)

TConfig = TypeVar("TConfig", bound=transformers.PretrainedConfig)


@@ -636,9 +641,12 @@ def _recursive_update_config(config: transformers.PretrainedConfig,
model_config._frozen = True
return model_config

def get_bindings_model_config(self,
tokens_per_block: Optional[int] = None
) -> "ModelConfigCpp":
def get_bindings_model_config(
self,
tokens_per_block: Optional[int] = None,
kv_cache_config: Optional[KvCacheConfig] = None,
spec_config: Optional['SpeculativeConfig'] = None,
) -> "ModelConfigCpp":
"""
This method is used to construct the bindings config for the model.
Currently it adheres to gptJsonConfig.cpp::createModelConfig, which assumes
@@ -667,7 +675,8 @@ def ceil_div(a, b):

hidden_size = ceil_div(self.pretrained_config.hidden_size, attn_tp_size)
num_layers = self.pretrained_config.num_hidden_layers
num_attention_layers = self.get_num_attention_layers()
num_attention_layers = self.get_num_attention_layers(
kv_cache_config, spec_config)
if (self.spec_config is not None
and self.spec_config.spec_dec_mode.is_mtp_one_model()):
num_layers += self.spec_config.num_nextn_predict_layers
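`ceil_div` here rounds the per-rank split of hidden size and heads upward so that no tensor-parallel rank comes up short when the totals are not evenly divisible; a one-line sketch of the helper the hunk calls:

```python
def ceil_div(a: int, b: int) -> int:
    # Round-up integer division, as used above for per-rank head/size splits:
    # e.g. 5 KV heads over 4 TP ranks must allocate 2 heads per rank, not 1.
    return (a + b - 1) // b
```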
@@ -693,6 +702,7 @@ def ceil_div(a, b):

num_key_value_heads = getattr(self.pretrained_config,
"num_key_value_heads", num_heads)

if isinstance(num_key_value_heads, (list, tuple)):
# Per-layer KV heads (e.g., Nemotron-NAS, variable GQA models)
num_kv_heads_per_layer = [
@@ -796,10 +806,32 @@ def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
else:
return None

def get_num_attention_layers(self):
if is_nemotron_hybrid(self.pretrained_config):
def get_num_attention_layers(
self,
kv_cache_config: Optional[KvCacheConfig] = None,
spec_config: Optional['SpeculativeConfig'] = None):
"""Return the number of layers that need KV cache blocks.

For hybrid models using the MixedMambaHybridCacheManager path
(TRTLLM_USE_CPP_MAMBA=1 for disagg), only attention layers need KV
cache blocks, so we return the attention-only count.

For the default CppMambaHybridCacheManager path (including speculative
decoding), both attention and mamba layers are managed in the unified
KV cache pool, so we return num_hidden_layers (all layers).
"""
use_disagg = os.environ.get('TRTLLM_USE_CPP_MAMBA', '0') == '1'
use_reuse = kv_cache_config is not None and kv_cache_config.enable_block_reuse

use_v1_mamba_manager = use_disagg
if is_hybrid_linear(
self.pretrained_config) and use_v1_mamba_manager and use_reuse:
logger.warning(
"Block reuse does not work with MTP or disagg for hybrid linear models"
)
if is_nemotron_hybrid(self.pretrained_config) and use_v1_mamba_manager:
return self.pretrained_config.hybrid_override_pattern.count("*")
elif is_qwen3_hybrid(self.pretrained_config):
elif is_qwen3_hybrid(self.pretrained_config) and use_v1_mamba_manager:
return get_qwen3_hybrid_num_attention_layers(self.pretrained_config)
else:
return self.pretrained_config.num_hidden_layers
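For the Nemotron-style branch above, the attention-only count reduces to a character count over `hybrid_override_pattern`, where `*` marks an attention layer and other characters (e.g. `M` for mamba, `-` for MLP) mark layers that need no KV cache blocks on the mixed-manager path; a minimal sketch (the example pattern string is hypothetical):

```python
def count_attention_layers(hybrid_override_pattern: str) -> int:
    # '*' marks an attention layer in the hybrid override pattern;
    # mamba/MLP layers are denoted by other characters and are managed
    # separately, so only '*' layers need KV cache blocks here.
    return hybrid_override_pattern.count("*")
```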
7 changes: 6 additions & 1 deletion tensorrt_llm/_torch/modules/mamba/gdn_mixer.py
@@ -763,9 +763,14 @@ def forward(

state_indices_p, state_indices_d = torch.split(state_indices, batch_split_size)
if num_prefills > 0:
ssm_states[state_indices_p] = torch.zeros(
# PyExecutor guarantees prefill requests are placed before decode requests
has_initial_states_p = has_initial_states[:num_prefills]
ssm_states[state_indices_p[~has_initial_states_p]] = torch.zeros(
(), dtype=ssm_states.dtype, device=ssm_states.device
)
conv_states[state_indices_p[~has_initial_states_p]] = torch.zeros(
(), dtype=conv_states.dtype, device=conv_states.device
)

is_target_verify = (
num_decodes > 0
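The masked reset above zeroes conv/ssm states only for prefill requests that start from scratch (`~has_initial_states_p`), while requests resuming from a reused prefix keep their cached states. The index selection can be sketched without torch (names mirror the diff; a pure-Python stand-in for the boolean-mask indexing):

```python
def states_to_reset(state_indices_prefill, has_initial_states_prefill):
    """Return the state slots of prefill requests that have no cached
    prefix and must therefore be zero-initialized; slots of requests
    resuming from a reused prefix are excluded."""
    return [
        idx
        for idx, has_init in zip(state_indices_prefill, has_initial_states_prefill)
        if not has_init  # equivalent to tensor indexing with ~has_initial_states_p
    ]
```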
3 changes: 1 addition & 2 deletions tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
@@ -383,9 +383,8 @@ def forward(
is_target_verify = attn_metadata.kv_cache_manager.is_speculative(
) and spec_metadata is not None
if is_target_verify:
# Speculative decoding only supported with Python path
assert layer_cache is not None, \
"Speculative decoding requires Python MambaCacheManager"
"Speculative decoding requires mamba_layer_cache() support"
# TODO: support dynamic speculation, will add current_draft_len later [TRTLLM-10319]
draft_token_num = spec_metadata.max_draft_len + 1
intermediate_conv_states = layer_cache.intermediate_conv_window