Skip to content

Commit c4925bd

Browse files
committed
feat: support prefix cache for multi-modal model.
1 parent 5137f4e commit c4925bd

55 files changed

Lines changed: 1059 additions & 325 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

xllm/core/distributed_runtime/vlm_engine.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ bool VLMEngine::allocate_kv_cache(const Engine::KVCacheCapacity& kv_cache_cap) {
288288
.block_size(block_size)
289289
.enable_prefix_cache(options_.enable_prefix_cache())
290290
.enable_disagg_pd(options_.enable_disagg_pd())
291-
.enable_cache_upload(options_.enable_cache_upload());
291+
.enable_cache_upload(options_.enable_cache_upload())
292+
.enable_mm_prefix_cache(options_.enable_prefix_cache());
292293
kv_cache_manager_ = std::make_unique<BlockManagerPool>(options);
293294

294295
// init kv cache for each worker in parallel

xllm/core/distributed_runtime/vlm_master.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ std::shared_ptr<Request> VLMMaster::generate_request(
418418
"Image processor process failed.");
419419
return nullptr;
420420
}
421-
421+
input_processor_->hash_mm_items(mm_inputs, mm_data);
422422
auto prompt = chat_template_->apply(messages);
423423
if (!prompt.has_value()) {
424424
CALLBACK_WITH_ERROR(StatusCode::INVALID_ARGUMENT,

xllm/core/framework/batch/batch.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,6 @@ void Batch::add(Sequence* sequence, uint32_t allowed_max_token) {
5353
if (input_embedding.defined())
5454
input_embeddings_vec_.emplace_back(input_embedding);
5555

56-
const auto& mm_data = sequence->get_mm_data();
57-
// if (sequence->is_chunked_prefill_stage() && mm_data.valid())
58-
// TODO:Compatible With Chunked Prefill
59-
if ((sequence->stage() == SequenceStage::PREFILL) && mm_data.valid()) {
60-
mm_data_vec_.emplace_back(mm_data);
61-
}
6256
update_forward_type(sequence);
6357
}
6458

@@ -315,9 +309,8 @@ void Batch::process_sample_output(const RawForwardOutput& raw_output,
315309
}
316310
CHECK_LT(output_idx, num_seqs);
317311

318-
// mm embed task
319-
if (raw_output.mm_embeddings.size() > 0) {
320-
int64_t n_images = seq->get_mm_data().size();
312+
if (raw_output.mm_embeddings.size() > 0) { // mm embed task
313+
int64_t n_images = seq->mm_data().size();
321314
if (n_images > 0) {
322315
std::vector<torch::Tensor> seq_mm_embeddings;
323316
seq_mm_embeddings.reserve(n_images);

xllm/core/framework/batch/batch_input_builder.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ limitations under the License.
2828
#include "framework/model/model_input_params.h"
2929
#include "framework/request/sequence.h"
3030
#include "framework/sampling/sampling_params.h"
31+
#include "request/mm_data_visitor.h"
3132
#include "runtime/params_utils.h"
3233
#include "util/blocking_counter.h"
3334
#include "util/slice.h"
@@ -303,6 +304,8 @@ void BatchInputBuilder::process_single_sequence(
303304
state.seq_lens.push_back(state.seq_lens.back() + seq_len + offset);
304305
state.q_seq_lens.push_back(state.q_seq_lens.back() + q_seq_len);
305306
#endif
307+
// Process multi-modal input
308+
process_multi_modal_inputs(sequence, n_kv_cache_tokens, q_seq_len);
306309
// Process tokens and positions
307310
extract_tokens_and_positions(sequence, n_kv_cache_tokens, seq_len, state_ptr);
308311

@@ -340,7 +343,11 @@ void BatchInputBuilder::extract_tokens_and_positions(Sequence* sequence,
340343
if (use_mrope_) {
341344
const auto& args = *args_;
342345
MPositionHelper helper(*sequence, args);
343-
state.mrope_positions_vec.emplace_back(helper.get_positions());
346+
const auto& whole_positions = helper.get_positions();
347+
auto position = (sequence->stage() == SequenceStage::DECODE)
348+
? whole_positions
349+
: whole_positions.slice(1, n_kv_cache_tokens, seq_len);
350+
state.mrope_positions_vec.push_back(position);
344351
}
345352

346353
// Process each token
@@ -734,4 +741,16 @@ void BatchInputBuilder::process_swap_block_infos(
734741
swap_block_transfer_infos_->end());
735742
}
736743
}
744+
745+
void BatchInputBuilder::process_multi_modal_inputs(Sequence* sequence,
746+
uint32_t n_kv_cache_tokens,
747+
uint32_t q_seq_len) {
748+
MMData& mm_data = sequence->mutable_mm_data();
749+
if ((sequence->stage() != SequenceStage::DECODE) && mm_data.valid()) {
750+
UpdateMMItemScheduleStateVisitor visitor(n_kv_cache_tokens, q_seq_len);
751+
mm_data.foreach (visitor);
752+
MMType ty{static_cast<MMType::Value>(mm_data.type())};
753+
mm_data_vec_.emplace_back(MMData(ty, std::move(visitor.mm_data_items_)));
754+
}
755+
}
737756
} // namespace xllm

xllm/core/framework/batch/batch_input_builder.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ class BatchInputBuilder {
5555
void process_sequences_multithreaded();
5656
void padding_decode_batch_size(uint32_t num_decoding_tokens,
5757
uint32_t min_decoding_batch_size);
58+
void process_multi_modal_inputs(Sequence* sequence,
59+
uint32_t n_kv_cache_tokens,
60+
uint32_t q_seq_len);
5861
ForwardInput state_to_forward_input();
5962
RawForwardInput state_to_raw_forward_input();
6063

@@ -145,7 +148,7 @@ class BatchInputBuilder {
145148
const std::vector<Sequence*>& sequences_;
146149
const std::vector<uint32_t>& allowed_max_tokens_;
147150
const std::vector<torch::Tensor>& input_embeddings_vec_;
148-
const std::vector<MMData>& mm_data_vec_;
151+
std::vector<MMData> mm_data_vec_;
149152
const ModelArgs* args_;
150153

151154
// Builder state

xllm/core/framework/batch/mposition.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#include <absl/strings/match.h>
1919

2020
#include "framework/model/model_args.h"
21+
#include "framework/request/mm_batch_data.h"
2122
#include "framework/request/sequence.h"
2223

2324
namespace xllm {
@@ -46,7 +47,8 @@ std::vector<std::tuple<std::string, int, int>> groupByTokenType(
4647
torch::Tensor MPositionHelper::get_positions() {
4748
// if (seq_.is_chunked_prefill_stage()) {
4849
if (seq_.kv_state().kv_cache_tokens_num() < seq_.num_prompt_tokens()) {
49-
auto& mm_data = seq_.get_mm_data();
50+
auto& data = seq_.mm_data();
51+
MMBatchData mm_data({data});
5052

5153
torch::Tensor image_grid_thw;
5254
if (auto res = mm_data.get<torch::Tensor>("image_grid_thw"))

xllm/core/framework/batch/onerec_batch_input_builder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ ForwardInput OneRecBatchInputBuilder::build_rec_forward_input(
172172
src_ptr + group_encoder_seq_len);
173173
}
174174
// Collect sparse_embedding
175-
auto mm_data = sequence->get_mm_data();
175+
auto mm_data = sequence->mm_data();
176176
auto sparse_embedding_optional =
177177
mm_data.get<torch::Tensor>(Sequence::ENCODER_SPARSE_EMBEDDING_NAME);
178178
if (sparse_embedding_optional.has_value()) {

xllm/core/framework/block/block_manager.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class BlockManager {
4545
PROPERTY(int32_t, block_size) = 0;
4646
PROPERTY(bool, enable_prefix_cache) = true;
4747
PROPERTY(bool, enable_disagg_pd) = false;
48+
PROPERTY(bool, enable_mm_prefix_cache) = false;
4849
PROPERTY(bool, enable_cache_upload) = false;
4950
};
5051

@@ -56,10 +57,12 @@ class BlockManager {
5657
virtual std::vector<Block> allocate(size_t num_blocks) = 0;
5758

5859
virtual std::vector<Block> allocate_shared(
60+
Sequence* sequence,
5961
const Slice<int32_t>& tokens_ids,
6062
const Slice<Block>& existed_shared_blocks = {}) = 0;
6163

62-
virtual void cache(const Slice<int32_t>& token_ids,
64+
virtual void cache(Sequence* sequence,
65+
const Slice<int32_t>& token_ids,
6366
std::vector<Block>& blocks,
6467
size_t existed_shared_blocks_num = 0) = 0;
6568
virtual void cache(const std::vector<Block>& blocks) = 0;

xllm/core/framework/block/block_manager_impl.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,11 @@ BlockManagerImpl::BlockManagerImpl(const Options& options)
2626
CHECK_GT(options.num_blocks(), 0) << "No blocks to allocate";
2727
CHECK_GT(options.block_size(), 0) << "Block size must be positive";
2828
if (options_.enable_prefix_cache()) {
29-
prefix_cache_ = create_prefix_cache(options.block_size(),
30-
options.enable_cache_upload());
29+
PrefixCache::Options prefix_cache_options;
30+
prefix_cache_options.block_size(options.block_size())
31+
.enable_cache_upload(options.enable_cache_upload())
32+
.enable_mm_prefix_cache(options.enable_mm_prefix_cache());
33+
prefix_cache_ = create_prefix_cache(prefix_cache_options);
3134
CHECK(prefix_cache_) << "Failed to create prefix cache!";
3235
}
3336

@@ -122,14 +125,15 @@ bool BlockManagerImpl::has_enough_blocks(uint32_t num_blocks) {
122125
}
123126

124127
std::vector<Block> BlockManagerImpl::allocate_shared(
128+
Sequence* sequence,
125129
const Slice<int32_t>& tokens_ids,
126130
const Slice<Block>& existed_shared_blocks) {
127131
// only allocate shared blocks for prefill sequences
128132
if (options_.enable_prefix_cache()) {
129133
AUTO_COUNTER(prefix_cache_latency_seconds_match);
130134

131135
std::vector<Block> shared_blocks =
132-
prefix_cache_->match(tokens_ids, existed_shared_blocks);
136+
prefix_cache_->match(sequence, tokens_ids, existed_shared_blocks);
133137

134138
const size_t prefix_length =
135139
shared_blocks.empty() ? 0
@@ -148,13 +152,17 @@ std::vector<Block> BlockManagerImpl::allocate_shared(
148152
return {};
149153
}
150154

151-
void BlockManagerImpl::cache(const Slice<int32_t>& token_ids,
155+
void BlockManagerImpl::cache(Sequence* sequence,
156+
const Slice<int32_t>& token_ids,
152157
std::vector<Block>& blocks,
153158
size_t existed_shared_blocks_num) {
154159
if (options_.enable_prefix_cache()) {
155160
AUTO_COUNTER(prefix_cache_latency_seconds_insert);
156161
// Add the kv cache to the prefix cache
157-
prefix_cache_->insert(token_ids, blocks, existed_shared_blocks_num);
162+
prefix_cache_->insert(sequence,
163+
token_ids,
164+
blocks,
165+
existed_shared_blocks_num);
158166
}
159167
}
160168

xllm/core/framework/block/block_manager_impl.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,13 @@ class BlockManagerImpl : public BlockManager {
3737

3838
// allocate shared blocks when enable prefix cache
3939
std::vector<Block> allocate_shared(
40+
Sequence* sequence,
4041
const Slice<int32_t>& tokens_ids,
4142
const Slice<Block>& existed_shared_blocks = {}) override;
4243

4344
// cache blocks when enable prefix cache
44-
void cache(const Slice<int32_t>& token_ids,
45+
void cache(Sequence* sequence,
46+
const Slice<int32_t>& token_ids,
4547
std::vector<Block>& blocks,
4648
size_t existed_shared_blocks_num = 0) override;
4749
void cache(const std::vector<Block>& blocks) override;

0 commit comments

Comments
 (0)