jd-opensource
diff --git a/‎xllm/api_service/api_service.cpp‎
Lines changed: 1 addition & 4 deletions b/‎xllm/api_service/api_service.cpp‎
Lines changed: 1 addition & 4 deletions
diff --git a/‎xllm/api_service/rec_completion_service_impl.cpp‎
Lines changed: 3 additions & 9 deletions b/‎xllm/api_service/rec_completion_service_impl.cpp‎
Lines changed: 3 additions & 9 deletions
diff --git a/‎xllm/api_service/rec_completion_service_impl.h‎
Lines changed: 2 additions & 5 deletions b/‎xllm/api_service/rec_completion_service_impl.h‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎xllm/core/common/global_flags.cpp‎
Lines changed: 5 additions & 0 deletions b/‎xllm/core/common/global_flags.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎xllm/core/common/global_flags.h‎
Lines changed: 2 additions & 0 deletions b/‎xllm/core/common/global_flags.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xllm/core/distributed_runtime/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎xllm/core/distributed_runtime/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎xllm/core/distributed_runtime/master.cpp‎
Lines changed: 34 additions & 0 deletions b/‎xllm/core/distributed_runtime/master.cpp‎
Lines changed: 34 additions & 0 deletions
@@ -27,8 +27,7 @@ limitations under the License.
 #include "core/common/metrics.h"
 #include "core/distributed_runtime/dit_master.h"
 #include "core/distributed_runtime/llm_master.h"
-// TODO. add following when next pr.
-// #include "core/runtime/rec_master.h"
+#include "core/distributed_runtime/rec_master.h"
 #include "core/distributed_runtime/vlm_master.h"
 #include "core/util/closure_guard.h"
 #include "embedding.pb.h"
@@ -73,8 +72,6 @@ APIService::APIService(Master* master,
         std::make_unique<ImageGenerationServiceImpl>(
             dynamic_cast<DiTMaster*>(master), model_names);
   } else if (FLAGS_backend == "rec") {
-    // TODO. delete this when next pr.
-    using RecMaster = LLMMaster;
     rec_completion_service_impl_ = std::make_unique<RecCompletionServiceImpl>(
         dynamic_cast<RecMaster*>(master), model_names);
   }
 
@@ -27,10 +27,9 @@ limitations under the License.
 #include "common/instance_name.h"
 #include "completion.pb.h"
 #include "core/distributed_runtime/llm_master.h"
+#include "core/distributed_runtime/rec_master.h"
 #include "core/framework/request/mm_data.h"
 #include "core/framework/request/request_output.h"
-// TODO. add following when next pr.
-// #include "core/runtime/rec_master.h"
 #include "core/util/utils.h"
 
 #define likely(x) __builtin_expect(!!(x), 1)
@@ -89,9 +88,7 @@ bool send_result_to_client_brpc_rec(std::shared_ptr<CompletionCall> call,
   // Add rec specific output tensors
   auto output_tensor = response.mutable_output_tensors()->Add();
   output_tensor->set_name("rec_result");
-  // TODO: add following when next pr.
-  // if (FLAGS_enable_constrained_decoding) {
-  if (true) {
+  if (FLAGS_enable_constrained_decoding) {
     output_tensor->set_datatype(proto::DataType::INT64);
     output_tensor->mutable_shape()->Add(req_output.outputs.size());
     output_tensor->mutable_shape()->Add(1);  // Single item per output
@@ -190,11 +187,8 @@ void RecCompletionServiceImpl::process_async_impl(
   master_->handle_request(
       std::move(rpc_request_ref.prompt()),
       std::move(prompt_tokens),
-      // TODO. add following when next pr.
-      // std::move(mm_data),
+      std::move(mm_data),
       std::move(request_params),
-      // TODO. delete this when next pr.
-      call.get(),
       [call,
        model,
        master = master_,
 
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "api_service_impl.h"
 #include "completion.pb.h"
+#include "core/distributed_runtime/rec_master.h"
 #include "rec.pb.h"
 #include "stream_call.h"
 
@@ -27,10 +28,6 @@ namespace xllm {
 using CompletionCall =
     StreamCall<proto::CompletionRequest, proto::CompletionResponse>;
 
-// TODO. add following when next pr.
-// class RecMaster;
-using RecMaster = LLMMaster;
-
 // a class to handle completion requests
 class RecCompletionServiceImpl final : public APIServiceImpl<CompletionCall> {
  public:
@@ -45,4 +42,4 @@ class RecCompletionServiceImpl final : public APIServiceImpl<CompletionCall> {
   RecMaster* master_ = nullptr;
 };
 
-}  // namespace xllm
+}  // namespace xllm
@@ -164,6 +164,11 @@ DEFINE_int32(
     256,
     "Max decode token per sequence which used for ZeroEvictionScheduler.");
 
+// For the rec backend, a smaller value such as 100 is recommended.
+DEFINE_int32(request_queue_size,
+             100000,
+             "The request queue size of the scheduler");
+
 // --- parallel config ---
 
 DEFINE_int32(dp_size, 1, "Data parallel size for MLA attention.");
 
@@ -187,6 +187,8 @@ DECLARE_bool(enable_latency_aware_schedule);
 
 DECLARE_int32(profile_max_prompt_length);
 
+DECLARE_int32(request_queue_size);
+
 DECLARE_bool(enable_profile_kv_blocks);
 
 DECLARE_bool(disable_ttft_profiling);
 
@@ -21,6 +21,8 @@ cc_library(
     vlm_engine.h
     vlm_master.h
     speculative_engine.h
+    rec_engine.h
+    rec_master.h
     disagg_pd_service.h
     disagg_pd_service_impl.h
     pd_ooc_service.h
@@ -40,6 +42,8 @@ cc_library(
     vlm_engine.cpp
     vlm_master.cpp
     speculative_engine.cpp
+    rec_engine.cpp
+    rec_master.cpp
     disagg_pd_service.cpp
     disagg_pd_service_impl.cpp
     pd_ooc_service.cpp
 
@@ -34,6 +34,8 @@ limitations under the License.
 #include "llm_engine.h"
 #include "llm_master.h"
 #include "models/model_registry.h"
+#include "rec_engine.h"
+#include "rec_master.h"
 #include "speculative_engine.h"
 #include "util/device_name_utils.h"
 #include "util/scope_guard.h"
@@ -231,6 +233,35 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
       eng_options.device_ip(options_.device_ip().value());
     }
     engine_ = std::make_unique<LLMEngine>(eng_options);
+  } else if (type == EngineType::REC) {
+    options_.enable_schedule_overlap(false);
+    LOG(WARNING) << "Force to disable schedule overlap for REC model, not "
+                    "supported yet.";
+    runtime::Options eng_options;
+    eng_options.model_path(options_.model_path())
+        .devices(devices)
+        .backend(options_.backend())
+        .block_size(options_.block_size())
+        .max_cache_size(options_.max_cache_size())
+        .max_memory_utilization(options_.max_memory_utilization())
+        .enable_prefix_cache(options_.enable_prefix_cache())
+        .task_type(options_.task_type())
+        .enable_chunked_prefill(options_.enable_chunked_prefill())
+        .enable_offline_inference(options_.enable_offline_inference())
+        .spawn_worker_path(options_.spawn_worker_path())
+        .enable_shm(options_.enable_shm())
+        .is_local(options_.is_local())
+        .enable_schedule_overlap(options_.enable_schedule_overlap())
+        .master_node_addr(options_.master_node_addr())
+        .nnodes(options_.nnodes())
+        .node_rank(options_.node_rank())
+        .dp_size(options_.dp_size())
+        .ep_size(options_.ep_size())
+        .max_seqs_per_batch(options_.max_seqs_per_batch())
+        .max_tokens_per_chunk_for_prefill(
+            options_.max_tokens_per_chunk_for_prefill());
+
+    engine_ = std::make_unique<RecEngine>(eng_options);
   } else {
     LOG(WARNING) << "Not supported llm engine type: "
                  << static_cast<size_t>(type);
@@ -246,6 +277,9 @@ std::unique_ptr<Master> create_master(const std::string& backend,
   } else if (backend == "dit") {
     LOG(INFO) << "creating dit master";
     return std::make_unique<DiTMaster>(options);
+  } else if (backend == "rec") {
+    LOG(INFO) << "creating rec master";
+    return std::make_unique<RecMaster>(options);
   } else {
     LOG(FATAL) << "Failed to create master, backend is" << backend;
     return nullptr;