
Commit 5de436f

DragonFive authored and maxiaolong.maxwell committed
feat: add rec_type and onerec batch input builder.
1 parent 7f93e56 commit 5de436f

Showing 20 changed files with 1,652 additions and 42 deletions.

xllm/api_service/rec_completion_service_impl.h

Lines changed: 1 addition & 1 deletion
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "api_service_impl.h"
 #include "completion.pb.h"
-#include "rec.pb.h"
 #include "core/distributed_runtime/rec_master.h"
+#include "rec.pb.h"
 #include "stream_call.h"
 
 namespace xllm {

xllm/core/distributed_runtime/rec_master.cpp

Lines changed: 42 additions & 6 deletions
@@ -15,12 +15,11 @@ limitations under the License.
 
 #include "rec_master.h"
 
+#include <absl/time/time.h>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <pybind11/pybind11.h>
 
-#include <absl/time/time.h>
-
 #include "common/macros.h"
 #include "common/metrics.h"
 #include "models/model_registry.h"
@@ -33,13 +32,32 @@ limitations under the License.
 
 namespace xllm {
 
+namespace {
+
+RecType InferRecTypeFromModelArgs(const ModelArgs& model_args) {
+  const auto& model_type = model_args.model_type();
+  if (model_type == "onerec") {
+    return RecType::kOneRec;
+  }
+  if (model_type == "qwen3rec") {
+    return RecType::kLlmRec;
+  }
+  return RecType::kNone;
+}
+
+} // namespace
+
 RecMaster::RecMaster(const Options& options)
     : Master(options, EngineType::REC) {
   // Initialize with Rec engine type
   // The rest of the initialization follows the same pattern as LLMMaster
   CHECK(engine_->init());
 
   model_args_ = engine_->model_args();
+  rec_type_ = InferRecTypeFromModelArgs(model_args_);
+  if (rec_type_ == RecType::kNone) {
+    LOG(ERROR) << "Unsupported rec model_type: " << model_args_.model_type();
+  }
 
   bool enable_decode_response_to_service = false;
   if (options_.enable_service_routing()) {
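The RecType enum used here lives in the new header framework/request/rec_type.h, which is among the 20 changed files but is not shown in this excerpt. A minimal sketch of what it plausibly contains, inferred from the three values referenced in this file; the exact layout and comments are assumptions:

// framework/request/rec_type.h -- sketch only, inferred from usage;
// not the committed file.
#pragma once

namespace xllm {

enum class RecType {
  kNone,    // model_type is not a recognized rec model
  kOneRec,  // model_type == "onerec"
  kLlmRec,  // model_type == "qwen3rec" (LLM-style rec model)
};

}  // namespace xllm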
@@ -72,7 +90,6 @@ RecMaster::RecMaster(const Options& options)
           .enable_decode_response_to_service(enable_decode_response_to_service);
   scheduler_ = create_fixed_steps_scheduler(engine_.get(), scheduler_options);
 
-  // OmniRec model does not have a tokenizer
   chat_template_ = nullptr;
   tokenizer_ = nullptr;
   threadpool_ =
@@ -163,6 +180,26 @@ std::shared_ptr<Request> RecMaster::generate_request(
   // contain the actual data Skip prompt empty check as mentioned in
   // requirements
 
+  if (rec_type_ == RecType::kNone) {
+    LOG(ERROR) << "Unsupported rec model_type: " << model_args_.model_type();
+    CALLBACK_WITH_ERROR(
+        StatusCode::INVALID_ARGUMENT,
+        std::string("Unsupported rec model_type: ") + model_args_.model_type());
+    return nullptr;
+  }
+
+  // qwen3rec requires tokenizer and RecBatchInputBuilder support. This PR keeps
+  // the extension point but rejects requests early to avoid LOG(FATAL) in the
+  // batch builder path.
+  if (rec_type_ == RecType::kLlmRec) {
+    LOG(ERROR) << "Rec model_type is not supported yet: "
+               << model_args_.model_type();
+    CALLBACK_WITH_ERROR(StatusCode::INVALID_ARGUMENT,
+                        std::string("Rec model_type is not supported yet: ") +
+                            model_args_.model_type());
+    return nullptr;
+  }
+
   Timer timer;
   std::vector<int> local_prompt_tokens;
 
@@ -258,9 +295,8 @@ std::shared_ptr<Request> RecMaster::generate_request(
                            callback,
                            nullptr,
                            sp.decode_address);
-  // TODO. add following when next pr (add is_rec_model and bos_token_id to
-  // RequestState). req_state.is_rec_model = true; req_state.bos_token_id =
-  // model_args_.bos_token_id();
+  req_state.rec_type = rec_type_;
+  req_state.bos_token_id = model_args_.bos_token_id();
   auto request = std::make_shared<Request>(sp.request_id,
                                            sp.x_request_id,
                                            sp.x_request_time,
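The two req_state assignments above imply that RequestState gains rec_type and bos_token_id members in this commit (the removed TODO had planned an is_rec_model flag; the commit evidently settled on the richer RecType instead). A sketch of the assumed additions, with defaults guessed:

// framework/request/request_state.h -- assumed additions only (sketch).
#include "framework/request/rec_type.h"

struct RequestState {
  // ... existing members ...
  RecType rec_type = RecType::kNone;  // set by RecMaster::generate_request
  int32_t bos_token_id = 0;           // copied from model_args_.bos_token_id()
};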

xllm/core/distributed_runtime/rec_master.h

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "framework/chat_template/jinja_chat_template.h"
 #include "framework/model/model_args.h"
+#include "framework/request/rec_type.h"
 #include "master.h"
 #include "rec_engine.h"
 #include "scheduler/continuous_scheduler.h"

@@ -55,6 +56,7 @@ class RecMaster : public Master {
   std::unique_ptr<FixedStepsScheduler> scheduler_;
   // model args
   ModelArgs model_args_;
+  RecType rec_type_ = RecType::kNone;
   std::unique_ptr<ThreadPool> threadpool_;
   std::unique_ptr<Tokenizer> tokenizer_;
   // chat template instance

xllm/core/framework/batch/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -10,12 +10,16 @@ cc_library(
     batch.h
     batch_factory.h
     batch_input_builder.h
+    rec_batch_input_builder.h
+    onerec_batch_input_builder.h
     mposition.h
   SRCS
     dit_batch.cpp
     batch.cpp
     batch_factory.cpp
     batch_input_builder.cpp
+    rec_batch_input_builder.cpp
+    onerec_batch_input_builder.cpp
     mposition.cpp
     beam_search.h
   DEPS
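rec_batch_input_builder.h/.cpp and onerec_batch_input_builder.h/.cpp are new in this commit, but their contents are not part of this excerpt. From the call site in Batch::prepare_rec_forward_input below, the base interface is plausibly shaped as follows; the parameter types are inferred from the Batch members passed in, and every name other than Create and build_rec_forward_input is an assumption:

// rec_batch_input_builder.h -- interface sketch inferred from the call site.
class RecBatchInputBuilder {
 public:
  // Factory: picks the concrete builder (e.g. a OneRecBatchInputBuilder for
  // RecType::kOneRec). Trailing parameters elided; types are assumptions.
  static std::unique_ptr<RecBatchInputBuilder> Create(
      RecType rec_type,
      const std::vector<SequencesGroup*>& sequence_groups,
      const std::vector<uint32_t>& allowed_max_tokens,
      /* input embeddings, mm data, swap infos, batch id, args, thread pool */);

  virtual ~RecBatchInputBuilder() = default;

  virtual ForwardInput build_rec_forward_input(
      uint32_t num_decoding_tokens, uint32_t min_decoding_batch_size) = 0;
};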

xllm/core/framework/batch/batch.cpp

Lines changed: 50 additions & 4 deletions
@@ -29,6 +29,7 @@ limitations under the License.
 #include "framework/model/model_input_params.h"
 #include "framework/request/sequence.h"
 #include "framework/sampling/sampling_params.h"
+#include "rec_batch_input_builder.h"
 #include "runtime/params_utils.h"
 #include "util/slice.h"
 #include "util/tensor_helper.h"
@@ -96,6 +97,10 @@ void Batch::add(const std::vector<Sequence*>& sequences) {
 ForwardInput Batch::prepare_forward_input(uint32_t num_decoding_tokens,
                                           uint32_t min_decoding_batch_size,
                                           const ModelArgs& args) {
+  if (sequences_.empty() && !sequence_groups_.empty()) {
+    return prepare_rec_forward_input(
+        num_decoding_tokens, min_decoding_batch_size, args);
+  }
   BatchInputBuilder builder(sequences_,
                             allowed_max_tokens_,
                             input_embeddings_vec_,
@@ -108,6 +113,43 @@ ForwardInput Batch::prepare_forward_input(uint32_t num_decoding_tokens,
                                      min_decoding_batch_size);
 }
 
+ForwardInput Batch::prepare_rec_forward_input(uint32_t num_decoding_tokens,
+                                              uint32_t min_decoding_batch_size,
+                                              const ModelArgs& args,
+                                              ThreadPool* thread_pool) {
+  RecType rec_type = RecType::kNone;
+  if (!sequence_groups_.empty() && !sequence_groups_[0]->sequences().empty()) {
+    rec_type = sequence_groups_[0]->sequences()[0]->rec_type();
+  }
+
+  auto builder = RecBatchInputBuilder::Create(rec_type,
+                                              sequence_groups_,
+                                              allowed_max_tokens_,
+                                              input_embeddings_vec_,
+                                              mm_data_vec_,
+                                              swap_block_transfer_infos_,
+                                              batch_id_,
+                                              &args,
+                                              thread_pool);
+  return builder->build_rec_forward_input(num_decoding_tokens,
+                                          min_decoding_batch_size);
+}
+
+std::vector<Sequence*> Batch::get_sequences() const {
+  if (!sequences_.empty()) {
+    return sequences_;
+  }
+
+  std::vector<Sequence*> result;
+  for (const auto* seq_group : sequence_groups_) {
+    const auto& sequences = seq_group->sequences();
+    for (const auto& seq_ptr : sequences) {
+      result.push_back(seq_ptr.get());
+    }
+  }
+  return result;
+}
+
 void Batch::dp_balance_shuffle_seqs() {
   // this shuffle operation is mainly used for npu with 24 cores
   // and specific mla op implementation
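A plausible shape for the Create dispatch in rec_batch_input_builder.cpp, consistent with the early-rejection comment in rec_master.cpp ("avoid LOG(FATAL) in the batch builder path"); this is a sketch, and only the kOneRec branch has a concrete builder in this commit:

// rec_batch_input_builder.cpp -- dispatch sketch, not the committed code.
std::unique_ptr<RecBatchInputBuilder> RecBatchInputBuilder::Create(
    RecType rec_type /* , forwarded builder arguments ... */) {
  switch (rec_type) {
    case RecType::kOneRec:
      // The only builder implemented by this commit.
      return std::make_unique<OneRecBatchInputBuilder>(/* ... */);
    case RecType::kLlmRec:  // rejected earlier in RecMaster::generate_request
    case RecType::kNone:
    default:
      LOG(FATAL) << "Unsupported rec type for batch input building";
      return nullptr;
  }
}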
@@ -217,7 +259,8 @@ void Batch::process_sample_output(const RawForwardOutput& raw_output,
   // this means all sequences are in prefill stage status.
   const int64_t num_seqs = raw_output.outputs.size();
   int64_t output_idx = 0;
-  for (auto* seq : sequences_) {
+  const auto sequences = get_sequences();
+  for (auto* seq : sequences) {
     if (seq->finished()) {
       output_idx++;
       continue;
@@ -264,7 +307,8 @@ void Batch::process_sample_output(const SampleOutput& sample_output,
   if (sample_output.embeddings.defined()) {
     const int64_t num_seqs = sample_output.embeddings.size(0);
     int64_t output_idx = 0;
-    for (auto* seq : sequences_) {
+    const auto sequences = get_sequences();
+    for (auto* seq : sequences) {
       CHECK_LT(output_idx, num_seqs);
       auto cur_seq_embed =
           safe_to(sample_output.embeddings[output_idx++], torch::kFloat32);
@@ -277,7 +321,8 @@ void Batch::process_sample_output(const SampleOutput& sample_output,
     // this means all sequences are in prefill stage status.
     const int64_t num_seqs = sample_output.next_tokens.size(0);
     int64_t output_idx = 0;
-    for (auto* seq : sequences_) {
+    const auto sequences = get_sequences();
+    for (auto* seq : sequences) {
       if (seq->finished()) {
         output_idx++;
         continue;
@@ -352,7 +397,8 @@ void Batch::process_embedding_output(const torch::Tensor& output_embedding) {
   Token token(0);
   if (output_embedding.defined()) {
     int32_t slice_img_index = 0;
-    for (auto* seq : sequences_) { // TODO
+    const auto sequences = get_sequences();
+    for (auto* seq : sequences) {
       const auto& mm_data = seq->get_mm_data();
 
       auto pixel_values = mm_data.get_tensor_vec("pixel_values");

xllm/core/framework/batch/batch.h

Lines changed: 10 additions & 2 deletions
@@ -75,7 +75,7 @@ class Batch {
 
   // get the number of sequences in the batch
   size_t size() const { return sequences_.size(); }
-  bool empty() const { return sequences_.empty(); }
+  bool empty() const { return sequences_.empty() && sequence_groups_.empty(); }
 
   Sequence* operator[](size_t i) { return sequences_[i]; }
 
@@ -84,6 +84,11 @@
                                     uint32_t min_decoding_bach_size,
                                     const ModelArgs& args);
 
+  ForwardInput prepare_rec_forward_input(uint32_t num_decoding_tokens,
+                                         uint32_t min_decoding_batch_size,
+                                         const ModelArgs& args,
+                                         ThreadPool* thread_pool = nullptr);
+
   // Convert Batch to pb type, which will be pass to remote worker.
   RawForwardInput prepare_forward_input(const ModelArgs& args,
                                         ThreadPool* thread_pool);
@@ -110,7 +115,8 @@
   // process the accepted output embedding
   void process_embedding_output(const torch::Tensor& embedding);
 
-  // mark all sequence groups as finished (used by rec model multi-round decoding)
+  // mark all sequence groups as finished (used by rec model multi-round
+  // decoding)
   void finish();
 
   const std::vector<uint32_t>& get_allowed_max_tokens() const {
@@ -137,6 +143,8 @@
 
   void dp_balance_shuffle_seqs();
 
+  std::vector<Sequence*> get_sequences() const;
+
   std::vector<Sequence*> sequences_;
   std::vector<SequencesGroup*> sequence_groups_;
   std::vector<BlockTransferInfo>* swap_block_transfer_infos_ = nullptr;
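Taken together, the Batch changes let a group-only rec batch flow through the existing entry point. A minimal driver sketch, assuming seq_group holds onerec sequences and using illustrative argument values:

// Sketch: a rec batch is built from a SequencesGroup, not flat sequences.
Batch batch;
batch.add(seq_group);   // fills sequence_groups_; sequences_ stays empty
CHECK(!batch.empty());  // empty() now also checks sequence_groups_

// prepare_forward_input() sees sequences_ empty but sequence_groups_ set,
// and forwards to prepare_rec_forward_input(), which resolves the RecType
// from the first sequence and builds via RecBatchInputBuilder::Create().
ForwardInput input = batch.prepare_forward_input(
    /*num_decoding_tokens=*/1, /*min_decoding_batch_size=*/1, args);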

xllm/core/framework/batch/batch_factory.cpp

Lines changed: 58 additions & 0 deletions
@@ -92,4 +92,62 @@ std::vector<Batch> BatchFactory::create_batches(
   return batches;
 }
 
+std::vector<Batch> BatchFactory::create_rec_batches(
+    const std::vector<std::shared_ptr<Request>>& running_requests,
+    const std::vector<Sequence*>& running_sequences,
+    const std::vector<size_t>& running_sequences_budgets,
+    std::vector<std::vector<BlockTransferInfo>>* swap_block_transfer_infos) {
+  size_t num_prompt_tokens = 0;
+  size_t num_generated_tokens = 0;
+  std::vector<Batch> batches(dp_size_);
+  for (size_t i = 0; i < running_sequences.size(); ++i) {
+    auto* sequence = running_sequences[i];
+    const size_t token_budget = running_sequences_budgets[i];
+
+    const size_t remaining_prompt_tokens =
+        sequence->num_prompt_tokens() >
+                sequence->kv_state().kv_cache_tokens_num()
+            ? sequence->num_prompt_tokens() -
+                  sequence->kv_state().kv_cache_tokens_num()
+            : 0;
+    const size_t prompt_tokens =
+        std::min(remaining_prompt_tokens, token_budget);
+    const size_t generated_tokens = token_budget - prompt_tokens;
+    num_prompt_tokens += prompt_tokens;
+    num_generated_tokens += generated_tokens;
+
+    batches[sequence->dp_rank()].set_batch_id();
+  }
+
+  for (const auto& request : running_requests) {
+    auto seq_group = request->sequence_group();
+    int32_t dp_rank = seq_group->dp_rank();
+    batches[dp_rank].add(seq_group);
+  }
+
+  for (int i = 0; i < dp_size_; i++) {
+    if (!batches[i].empty()) {
+      if (swap_block_transfer_infos != nullptr &&
+          swap_block_transfer_infos->size() == dp_size_) {
+        batches[i].set_swap_block_transfer_infos(
+            &(swap_block_transfer_infos->at(i)));
+      }
+    }
+  }
+
+  COUNTER_ADD(num_processing_tokens_total_prompt, num_prompt_tokens);
+  COUNTER_ADD(num_processing_tokens_total_generated, num_generated_tokens);
+
+  if (running_sequences.size() > 0) {
+    HISTOGRAM_OBSERVE(
+        num_prompt_tokens_per_request,
+        static_cast<int64_t>(num_prompt_tokens / running_sequences.size()));
+    HISTOGRAM_OBSERVE(
+        num_generated_tokens_per_request,
+        static_cast<int64_t>(num_generated_tokens / running_sequences.size()));
+  }
+
+  return batches;
+}
+
 } // namespace xllm
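The prompt/generated accounting in create_rec_batches is saturating arithmetic over each sequence's token budget; a worked example with illustrative numbers:

// Suppose num_prompt_tokens() = 100, kv_cache_tokens_num() = 60, and
// token_budget = 50 for one sequence:
//   remaining_prompt_tokens = 100 - 60 = 40  // prompt not fully prefilled
//   prompt_tokens           = min(40, 50) = 40
//   generated_tokens        = 50 - 40 = 10   // leftover budget decodes
// A fully prefilled sequence (kv cache >= prompt length) contributes its
// whole budget to generated_tokens instead.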

xllm/core/framework/batch/batch_factory.h

Lines changed: 7 additions & 0 deletions
@@ -35,6 +35,13 @@ class BatchFactory {
       std::vector<std::vector<BlockTransferInfo>>* swap_block_transfer_infos =
           nullptr);
 
+  std::vector<Batch> create_rec_batches(
+      const std::vector<std::shared_ptr<Request>>& running_requests,
+      const std::vector<Sequence*>& running_sequences,
+      const std::vector<size_t>& running_sequences_budgets,
+      std::vector<std::vector<BlockTransferInfo>>* swap_block_transfer_infos =
+          nullptr);
+
  private:
   BatchFactory(int32_t dp_size) : dp_size_(dp_size) {}
   ~BatchFactory() = default;
