jd-opensource
diff --git a/‎xllm/core/framework/model/model_args.h‎
Lines changed: 2 additions & 0 deletions b/‎xllm/core/framework/model/model_args.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xllm/core/framework/model/model_input_params.h‎
Lines changed: 11 additions & 0 deletions b/‎xllm/core/framework/model/model_input_params.h‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎xllm/core/runtime/speculative_worker_impl.cpp‎
Lines changed: 154 additions & 20 deletions b/‎xllm/core/runtime/speculative_worker_impl.cpp‎
Lines changed: 154 additions & 20 deletions
diff --git a/‎xllm/core/runtime/worker_impl.cpp‎
Lines changed: 17 additions & 0 deletions b/‎xllm/core/runtime/worker_impl.cpp‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎xllm/models/llm/llm_model_base.h‎
Lines changed: 3 additions & 2 deletions b/‎xllm/models/llm/llm_model_base.h‎
Lines changed: 3 additions & 2 deletions
@@ -124,6 +124,8 @@ struct ModelArgs {
   PROPERTY(int32_t, v_head_dim) = 0;
   PROPERTY(int32_t, q_lora_rank) = 0;
   PROPERTY(int32_t, kv_lora_rank) = 0;
+  // deepseek v3/v3.2 MTP
+  PROPERTY(int32_t, num_nextn_predict_layers) = 0;
 
   // deepseek v3.2 indexer
   PROPERTY(int32_t, index_head_dim) = 0;
 
@@ -161,6 +161,17 @@ struct ModelInputParams {
     LOG(INFO) << "ModelInputParams: dp_global_token_nums is "
               << dp_global_token_nums;
   }
+
+  int32_t get_q_seq_len(int32_t seq_idx) const {
+#if defined(USE_NPU)
+    CHECK(seq_idx < q_seq_lens_vec.size()) << "seq_idx out of range";
+    return q_seq_lens_vec[seq_idx];
+#else
+    CHECK(seq_idx < q_seq_lens_vec.size() - 1) << "seq_idx out of range";
+    return q_seq_lens_vec[seq_idx + 1] - q_seq_lens_vec[seq_idx];
+#endif
+  }
+
   // whether the kv-cache is empty for all sequences.
   bool empty_kv_cache = true;
   BatchForwardType batch_forward_type;
 
@@ -58,6 +58,98 @@ int32_t kv_cache_slot_id(int32_t position,
   return block_id * block_size + block_offset;
 }
 
+// Convert tensor to int64 for MLU platform (temp workaround)
+// MLU will support int32 for masked_scatter in the future
+torch::Tensor ensure_int64_for_certain_platform(torch::Tensor tensor) {
+#if defined(USE_MLU)
+  return tensor.to(torch::kInt64);
+#else
+  return tensor;
+#endif
+}
+
+// Push cumulative sum to vector (used for cumulative format)
+void push_cumsum(std::vector<int32_t>& vec, int32_t len) {
+  if (vec.empty()) {
+    vec.emplace_back(0);
+  }
+  vec.emplace_back(vec.back() + len);
+}
+
+// Calculate actual kv_len based on platform type
+// For NPU: direct format - returns kv_seq_lens_slice[seq_id] + offset
+// For MLU/CUDA: cumulative format - returns the actual length increment
+int32_t calculate_kv_len(const Slice<int32_t>& kv_seq_lens_slice,
+                         int32_t seq_id,
+                         int32_t offset) {
+#if defined(USE_NPU)
+  return kv_seq_lens_slice[seq_id] + offset;
+#elif defined(USE_MLU) || defined(USE_CUDA)
+  return kv_seq_lens_slice[seq_id + 1] - kv_seq_lens_slice[seq_id] + offset;
+#endif
+}
+
+// Append sequence length to vector based on platform type
+// For NPU: directly add the len value
+// For MLU/CUDA: add using cumulative format
+void append_seq_len(std::vector<int32_t>& vec, int32_t len) {
+#if defined(USE_NPU)
+  vec.emplace_back(len);
+#elif defined(USE_MLU) || defined(USE_CUDA)
+  push_cumsum(vec, len);
+#endif
+}
+
+// Update kv_seq_lens_vec and kv_max_seq_len
+void update_kv_seq_lens_and_max(std::vector<int32_t>& kv_seq_lens_vec,
+                                int32_t kv_len,
+                                int32_t& kv_max_seq_len) {
+  // Update max (same logic for both platforms)
+  if (kv_len > kv_max_seq_len) {
+    kv_max_seq_len = kv_len;
+  }
+  // Update kv_seq_lens_vec
+  append_seq_len(kv_seq_lens_vec, kv_len);
+}
+
+// Batch expansion strategy for validation
+void batch_expansion_process_seq_lens(
+    std::vector<int32_t>& kv_seq_lens_vec,
+    std::vector<int32_t>& q_seq_lens_vec,
+    std::vector<std::vector<int32_t>>& block_tables_vec,
+    int32_t& kv_max_seq_len,
+    const Slice<int32_t>& kv_seq_lens_slice,
+    const Slice<int32_t>& block_table_slice,
+    int32_t seq_id,
+    int32_t position_offset,
+    int32_t num_val_tokens) {
+  for (int32_t offset = position_offset;
+       offset < num_val_tokens + position_offset;
+       ++offset) {
+    // Calculate kv length and update kv_seq_lens_vec and kv_max_seq_len
+    int32_t kv_len = calculate_kv_len(kv_seq_lens_slice, seq_id, offset);
+    update_kv_seq_lens_and_max(kv_seq_lens_vec, kv_len, kv_max_seq_len);
+    // Append sequence length of 1 to q_seq_lens_vec
+    //  for batch expansion strategy for validation
+    append_seq_len(q_seq_lens_vec, 1);
+    // Append block table to block_tables_vec
+    block_tables_vec.emplace_back(block_table_slice);
+  }
+}
+
+// Update kv_seq_lens_vec based on platform type
+// For NPU: directly add kv_seq_lens_slice[seq_id] + offset
+// For others: build cumulative format
+// Also updates kv_max_seq_len to track the maximum sequence length
+void update_kv_seq_lens_vec(std::vector<int32_t>& kv_seq_lens_vec,
+                            const Slice<int32_t>& kv_seq_lens_slice,
+                            int32_t seq_id,
+                            int32_t offset,
+                            int32_t& kv_max_seq_len) {
+  int32_t kv_len = calculate_kv_len(kv_seq_lens_slice, seq_id, offset);
+  update_kv_seq_lens_and_max(kv_seq_lens_vec, kv_len, kv_max_seq_len);
+}
+
 }  // namespace
 
 SpeculativeWorkerImpl::SpeculativeWorkerImpl(const ParallelArgs& parallel_args,
@@ -68,6 +160,11 @@ SpeculativeWorkerImpl::SpeculativeWorkerImpl(const ParallelArgs& parallel_args,
   runtime_options.enable_schedule_overlap(false);
   impl_ =
       std::make_unique<LLMWorkerImpl>(parallel_args, device, runtime_options);
+  // here we specify num speculative tokens to 0 to pass the indication of
+  //  draft model to worker when enable_speculative_decode.
+  // NOTE: If you want to modify this part, make sure you also check the usage
+  // of
+  //  num_speculative_tokens in draft model.
   runtime_options.num_decoding_tokens(1).num_speculative_tokens(0);
   draft_impl_ =
       std::make_unique<LLMWorkerImpl>(parallel_args, device, runtime_options);
@@ -196,13 +293,15 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_prefill(
 
   // prepare input for draft model
   auto& embeddings = output.sample_output.embeddings;
-  auto next_tokens = safe_to(output.sample_output.next_tokens, torch::kInt);
+  auto next_tokens = ensure_int64_for_certain_platform(
+      safe_to(output.sample_output.next_tokens, torch::kInt));
 
   if (embeddings.defined()) {
     prefill_input.input_params.input_embedding = embeddings.clone();
   }
   if (next_tokens.defined()) {
     auto& token_ids = prefill_input.token_ids;
+    token_ids = ensure_int64_for_certain_platform(token_ids);
     auto mask = (token_ids == -1);
     token_ids.masked_scatter_(mask, next_tokens);
   }
@@ -259,7 +358,7 @@ void SpeculativeWorkerImpl::prepare_prefill_inputs(
   new_token_ids.reserve(input.token_ids.numel());
   for (size_t i = 0; i < input_params.num_sequences; ++i) {
     int32_t q_len = 0;
-    q_len = input_params.q_seq_lens_vec[i];
+    q_len = input_params.get_q_seq_len(i);
     Slice<int32_t> tokens_ids_slice_i =
         tokens_ids_slice.slice(start_idx + 1, start_idx + q_len);
     start_idx += q_len;
@@ -316,9 +415,10 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
 
   for (int i = 0; i < options_.num_speculative_tokens(); ++i) {
     ForwardOutput draft_output = draft_outputs[i];
-    auto next_tokens =
-        safe_to(draft_output.sample_output.next_tokens, torch::kInt);
+    auto next_tokens = ensure_int64_for_certain_platform(
+        safe_to(draft_output.sample_output.next_tokens, torch::kInt));
     auto& token_ids = validate_input.token_ids;
+    token_ids = ensure_int64_for_certain_platform(token_ids);
     auto mask = (token_ids == -1 * (i + 1));
     token_ids.masked_scatter_(mask, next_tokens);
   }
@@ -381,9 +481,13 @@ void SpeculativeWorkerImpl::prepare_draft_inputs(const ForwardInput& input,
   Slice<int32_t> kv_seq_lens_slice = input_params.kv_seq_lens_vec;
   torch::Tensor block_tables = safe_to(input_params.block_tables, torch::kCPU);
 
+  // Initialize kv_max_seq_len to 0
+  int32_t kv_max_seq_len = 0;
+
   for (int32_t seq_id = 0; seq_id < num_sequences; ++seq_id) {
     new_positions.emplace_back(positions_slice[seq_id] + offset);
-    kv_seq_lens_vec.emplace_back(kv_seq_lens_slice[seq_id] + offset);
+    update_kv_seq_lens_vec(
+        kv_seq_lens_vec, kv_seq_lens_slice, seq_id, offset, kv_max_seq_len);
     torch::Tensor block_table = block_tables[seq_id];
     Slice<int32_t> block_table_slice = {block_table.data_ptr<int32_t>(),
                                         block_table.numel()};
@@ -394,7 +498,7 @@ void SpeculativeWorkerImpl::prepare_draft_inputs(const ForwardInput& input,
 
   draft_input.positions = torch::tensor(new_positions, int_options);
   // update the input_params
-  input_params.kv_max_seq_len = input_params.kv_max_seq_len + offset;
+  input_params.kv_max_seq_len = kv_max_seq_len;
   input_params.kv_seq_lens_vec = kv_seq_lens_vec;
   input_params.kv_seq_lens = torch::tensor(kv_seq_lens_vec, int_options);
   input_params.new_cache_slots = torch::tensor(new_token_slot_ids, int_options);
@@ -438,6 +542,7 @@ void SpeculativeWorkerImpl::prepare_validate_inputs(
   std::vector<int32_t> new_token_slot_ids;
   std::vector<std::vector<int32_t>> block_tables_vec;
 
+  int32_t kv_max_seq_len = 0;
   for (int32_t seq_id = 0; seq_id < num_sequences; ++seq_id) {
     new_token_ids.emplace_back(tokens_ids_slice[seq_id]);
     new_positions.emplace_back(positions_slice[seq_id] + position_offset);
@@ -453,17 +558,27 @@ void SpeculativeWorkerImpl::prepare_validate_inputs(
 
     // process kv length and q length
     if (FLAGS_enable_atb_spec_kernel) {
+      // expand the num of decode tokens for each batch in the batch for
+      // validation
       kv_seq_lens_vec.emplace_back(kv_seq_lens_slice[seq_id] +
                                    num_speculative_tokens + position_offset);
       q_seq_lens_vec.emplace_back(num_val_tokens);
-    } else {
-      for (int32_t offset = position_offset;
-           offset < num_val_tokens + position_offset;
-           ++offset) {
-        q_seq_lens_vec.emplace_back(1);
-        kv_seq_lens_vec.emplace_back(kv_seq_lens_slice[seq_id] + offset);
-        block_tables_vec.emplace_back(block_table_slice);
+      // update max for NPU: direct format, compare with new value
+      if (kv_seq_lens_vec.back() > kv_max_seq_len) {
+        kv_max_seq_len = kv_seq_lens_vec.back();
       }
+    } else {
+      // expand the batch sizes for validation
+      //  and update max for MLU/CUDA: cumulative format, compare with new value
+      batch_expansion_process_seq_lens(kv_seq_lens_vec,
+                                       q_seq_lens_vec,
+                                       block_tables_vec,
+                                       kv_max_seq_len,
+                                       kv_seq_lens_slice,
+                                       block_table_slice,
+                                       seq_id,
+                                       position_offset,
+                                       num_val_tokens);
     }
 
     // process slot id
@@ -490,8 +605,7 @@ void SpeculativeWorkerImpl::prepare_validate_inputs(
   input_params.q_seq_lens_vec = std::move(q_seq_lens_vec);
   input_params.q_seq_lens =
       torch::tensor(input_params.q_seq_lens_vec, int_options);
-  input_params.kv_max_seq_len =
-      *std::max_element(kv_seq_lens_vec.begin(), kv_seq_lens_vec.end());
+  input_params.kv_max_seq_len = kv_max_seq_len;
   input_params.kv_seq_lens_vec = std::move(kv_seq_lens_vec);
   input_params.kv_seq_lens =
       torch::tensor(input_params.kv_seq_lens_vec, int_options);
@@ -573,6 +687,7 @@ SampleOutput SpeculativeWorkerImpl::validate(
   size_t num_draft_tokens = num_target_tokens - batch_size;
   COUNTER_ADD(speculative_num_draft_tokens_total, num_draft_tokens);
   COUNTER_ADD(speculative_num_accepted_tokens_total, num_draft_tokens - count);
+
   return sample_output;
 }
 
@@ -591,11 +706,14 @@ ForwardInput SpeculativeWorkerImpl::update_input_by_last_step_output(
   torch::Tensor positions = safe_to(inputs.positions, torch::kCPU);
   Slice<int32_t> positions_slice = {positions.data_ptr<int32_t>(),
                                     positions.numel()};
+  // Get the tokens generated in the last step (flattened for easier indexing)
   torch::Tensor last_token_ids = safe_to(
       last_step_output_.sample_output.next_tokens.flatten(), torch::kCPU);
   Slice<int64_t> last_tokens_ids_slice = {last_token_ids.data_ptr<int64_t>(),
                                           last_token_ids.numel()};
 
+  // Determine how many tokens were decoded in the last step
+  // If the output is 2D, it means multiple tokens were generated per sequence
   int32_t last_step_decode_num = 1;
   if (last_step_output_.sample_output.next_tokens.dim() == 2) {
     last_step_decode_num = last_step_output_.sample_output.next_tokens.size(1);
@@ -613,13 +731,23 @@ ForwardInput SpeculativeWorkerImpl::update_input_by_last_step_output(
   kv_seq_lens_vec.reserve(num_sequences);
   new_token_slot_ids.reserve(num_sequences);
 
-  // get right token id and position
+  // Initialize kv_max_seq_len to 0
+  int32_t kv_max_seq_len = 0;
+
+  // Process each sequence to get the correct token ID and position for the next
+  // step
   for (int32_t seq_id = 0; seq_id < num_sequences; ++seq_id) {
     int32_t postion_offset = 0;
     int32_t last_step_token_id = 0;
+
+    // If the token ID is non-negative, it's a direct token ID (not a
+    // placeholder)
     if (tokens_ids_slice[seq_id] >= 0) {
       last_step_token_id = tokens_ids_slice[seq_id];
     } else {
+      // Negative token IDs are placeholders that need to be resolved from
+      // last_step_output_ The absolute value minus 1 gives the index into the
+      // last step's output
       int32_t last_step_index = -1 * tokens_ids_slice[seq_id] - 1;
       last_step_index = last_step_index * last_step_decode_num;
       postion_offset = -1;
@@ -634,8 +762,14 @@ ForwardInput SpeculativeWorkerImpl::update_input_by_last_step_output(
 
     new_token_ids.emplace_back(last_step_token_id);
     new_positions.emplace_back(positions_slice[seq_id] + postion_offset);
-    kv_seq_lens_vec.emplace_back(kv_seq_lens_slice[seq_id] + postion_offset);
-
+    update_kv_seq_lens_vec(kv_seq_lens_vec,
+                           kv_seq_lens_slice,
+                           seq_id,
+                           postion_offset,
+                           kv_max_seq_len);
+
+    // Calculate the new cache slot ID based on the position offset
+    // This handles cases where we need to move to a different block
     torch::Tensor block_table = block_tables[seq_id];
     Slice<int32_t> block_table_slice = {block_table.data_ptr<int32_t>(),
                                         block_table.numel()};
@@ -644,12 +778,12 @@ ForwardInput SpeculativeWorkerImpl::update_input_by_last_step_output(
     new_token_slot_ids.emplace_back(slot_id);
   }
 
+  // Create new tensors with updated values
   torch::TensorOptions int_options = inputs.token_ids.options();
   new_inputs.token_ids = torch::tensor(new_token_ids, int_options);
   new_inputs.positions = torch::tensor(new_positions, int_options);
   // update the input_params
-  input_params.kv_max_seq_len =
-      *std::max_element(kv_seq_lens_vec.begin(), kv_seq_lens_vec.end());
+  input_params.kv_max_seq_len = kv_max_seq_len;
   input_params.kv_seq_lens_vec = std::move(kv_seq_lens_vec);
   input_params.kv_seq_lens =
       torch::tensor(input_params.kv_seq_lens_vec, int_options);
 
@@ -607,9 +607,26 @@ bool WorkerImpl::init_model(const std::string& model_weights_path,
     }
   }
 
+#if defined(USE_NPU)
   if (options_.enable_speculative_decode() && FLAGS_enable_atb_spec_kernel) {
     args.num_speculative_tokens(options_.num_speculative_tokens());
   }
+#else
+  if (options_.enable_speculative_decode()) {
+    args.num_speculative_tokens(options_.num_speculative_tokens());
+    // When running speculative decoding, the draft worker reuses the same
+    // checkpoint as the target DeepSeek V3/V32 model. The draft worker needs to
+    // instantiate the MTP variant, so override the model_type here without
+    // mutating the original config.
+    if (options_.num_speculative_tokens() == 0 &&
+        (args.model_type() == "deepseek_v3" ||
+         args.model_type() == "deepseek_v32")) {
+      LOG(INFO) << "Overriding draft model_type from " << args.model_type()
+                << " to deepseek_mtp for speculative decoding";
+      args.model_type("deepseek_mtp");
+    }
+  }
+#endif
 
   // create model context
   dtype_ = dtype;
 
@@ -393,8 +393,9 @@ class LlmForCausalLMImplBase : public torch::nn::Module {
 #endif
   }
 
-  void load_model(std::unique_ptr<ModelLoader> loader,
-                  std::string prefix = "model." /*llm model weight prefix*/) {
+  virtual void load_model(
+      std::unique_ptr<ModelLoader> loader,
+      std::string prefix = "model." /*llm model weight prefix*/) {
     for (const auto& state_dict : loader->get_state_dicts()) {
       model_->load_state_dict(state_dict->get_dict_with_prefix(prefix));
       if (tie_word_embeddings) {