jd-opensource
diff --git a/‎xllm/core/runtime/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎xllm/core/runtime/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎xllm/core/runtime/llmrec_worker_impl.cpp‎
Lines changed: 131 additions & 0 deletions b/‎xllm/core/runtime/llmrec_worker_impl.cpp‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎xllm/core/runtime/llmrec_worker_impl.h‎
Lines changed: 44 additions & 0 deletions b/‎xllm/core/runtime/llmrec_worker_impl.h‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎xllm/core/runtime/onerec_worker_impl.cpp‎
Lines changed: 132 additions & 0 deletions b/‎xllm/core/runtime/onerec_worker_impl.cpp‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎xllm/core/runtime/onerec_worker_impl.h‎
Lines changed: 37 additions & 0 deletions b/‎xllm/core/runtime/onerec_worker_impl.h‎
Lines changed: 37 additions & 0 deletions
@@ -22,6 +22,9 @@ cc_library(
     dit_worker.h
     embed_worker_impl.h
     embed_vlm_worker_impl.h
+    rec_worker_impl.h
+    llmrec_worker_impl.h
+    onerec_worker_impl.h
     worker_client.h
     xservice_client.h
     speculative_worker_impl.h
@@ -38,6 +41,9 @@ cc_library(
     dit_worker.cpp
     embed_worker_impl.cpp
     embed_vlm_worker_impl.cpp
+    rec_worker_impl.cpp
+    llmrec_worker_impl.cpp
+    onerec_worker_impl.cpp
     worker_client.cpp
     xservice_client.cpp
     params_utils.cpp
 
@@ -0,0 +1,131 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "llmrec_worker_impl.h"
+
+#include <glog/logging.h>
+#include <torch/torch.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "common/types.h"
+#include "core/layers/word_embedding.h"
+
+namespace xllm {
+
+// Constructs the LLM-rec worker. All three arguments are forwarded unchanged
+// to the RecWorkerImpl base constructor; no additional state is set up here.
+LlmRecWorkerImpl::LlmRecWorkerImpl(const ParallelArgs& parallel_args,
+                                   const torch::Device& device,
+                                   const runtime::Options& options)
+    : RecWorkerImpl(parallel_args, device, options) {}
+
+// Builds processed_inputs.input_params.input_embedding for an LLM-rec step.
+//
+// After running WorkerImpl::prepare_work_before_execute(), up to three
+// tensors are looked up in the multimodal data:
+//   - LLM_REC_INPUT_TOKENS    (read from processed_inputs)
+//   - LLM_REC_INPUT_INDICES   (read from the original inputs)
+//   - LLM_REC_INPUT_EMBEDDING (read from processed_inputs, cast to dtype())
+// and the input embedding is chosen as follows:
+//   - indices and embedding present: token embeddings and the supplied
+//     embedding are interleaved by merge_embeddings_by_indices();
+//   - indices only: the token embeddings are used as-is;
+//   - embedding only: the supplied embedding is used as-is;
+//   - neither: input_embedding is left untouched.
+// Nothing beyond the base-class call happens when inputs.input_params.mm_data
+// is not valid.
+//
+// @param inputs            original forward input.
+// @param processed_inputs  forward input updated in place.
+void LlmRecWorkerImpl::prepare_work_before_execute(
+    const ForwardInput& inputs,
+    ForwardInput& processed_inputs) {
+  WorkerImpl::prepare_work_before_execute(inputs, processed_inputs);
+
+  if (!inputs.input_params.mm_data.valid()) {
+    return;
+  }
+
+  const auto& mm_data = inputs.input_params.mm_data;
+  const auto& processed_mm_data = processed_inputs.input_params.mm_data;
+
+  torch::Tensor input_tokens_tensor;
+  if (auto res =
+          processed_mm_data.get<torch::Tensor>(LLM_REC_INPUT_TOKENS)) {
+    input_tokens_tensor = res.value();
+  }
+
+  // The position indices are consumed on the host side (they are copied into
+  // a std::vector below), so they are read from the original inputs.
+  torch::Tensor input_indices_tensor;
+  if (auto res = mm_data.get<torch::Tensor>(LLM_REC_INPUT_INDICES)) {
+    input_indices_tensor = res.value();
+  }
+
+  torch::Tensor input_embedding;
+  if (auto res =
+          processed_mm_data.get<torch::Tensor>(LLM_REC_INPUT_EMBEDDING)) {
+    input_embedding = res.value();
+  }
+  if (input_embedding.defined()) {
+    input_embedding = input_embedding.to(dtype());
+  }
+
+  if (!input_indices_tensor.defined()) {
+    if (input_embedding.defined()) {
+      processed_inputs.input_params.input_embedding = input_embedding;
+    }
+    return;
+  }
+
+  // Indices describe where the token embeddings go, so tokens are mandatory
+  // from here on; fail loudly instead of embedding an undefined tensor.
+  CHECK(input_tokens_tensor.defined())
+      << "LLM_REC_INPUT_INDICES was provided without LLM_REC_INPUT_TOKENS.";
+
+  layer::WordEmbedding word_embedding = get_word_embedding();
+  torch::Tensor input_tokens_embedding =
+      word_embedding(input_tokens_tensor, 0);
+
+  if (!input_embedding.defined()) {
+    processed_inputs.input_params.input_embedding = input_tokens_embedding;
+    return;
+  }
+
+  // data_ptr<int>() is only valid for a contiguous int32 tensor in host
+  // memory; normalize first (a no-op when the tensor already has that form).
+  const torch::Tensor host_indices =
+      input_indices_tensor.to(torch::kCPU, torch::kInt32).contiguous();
+  const int* indices_begin = host_indices.data_ptr<int>();
+  const std::vector<int> input_indices(indices_begin,
+                                       indices_begin + host_indices.numel());
+
+  processed_inputs.input_params.input_embedding = merge_embeddings_by_indices(
+      input_tokens_embedding, input_embedding, input_indices);
+}
+
+// Interleaves token embeddings and externally supplied embeddings into one
+// tensor of shape [token_rows + embedding_rows, cols].
+//
+// Row input_indices[k] of the result receives row k of
+// input_tokens_embedding. The remaining rows, taken in ascending order,
+// receive the rows of input_embedding in order.
+//
+// Preconditions (violations abort via CHECK):
+//   - both tensors are 2-D with equal column count, dtype and device;
+//   - input_indices holds exactly one entry per row of
+//     input_tokens_embedding, each unique and within [0, total rows).
+//
+// @param input_tokens_embedding  embeddings looked up from token ids.
+// @param input_embedding         externally supplied embeddings.
+// @param input_indices           destination row of each token embedding.
+// @return the merged tensor, on the device and with the dtype of
+//         input_embedding.
+torch::Tensor LlmRecWorkerImpl::merge_embeddings_by_indices(
+    const torch::Tensor& input_tokens_embedding,
+    const torch::Tensor& input_embedding,
+    const std::vector<int>& input_indices) {
+  CHECK_EQ(input_embedding.dim(), 2);
+  CHECK_EQ(input_tokens_embedding.dim(), 2);
+  CHECK_EQ(input_tokens_embedding.size(1), input_embedding.size(1));
+  CHECK_EQ(input_tokens_embedding.dtype(), input_embedding.dtype());
+  CHECK_EQ(input_tokens_embedding.device(), input_embedding.device());
+
+  const int64_t token_rows = input_tokens_embedding.size(0);
+  const int64_t embedding_rows = input_embedding.size(0);
+  const int64_t total_rows = token_rows + embedding_rows;
+  const int64_t cols = input_embedding.size(1);
+
+  CHECK_EQ(static_cast<int64_t>(input_indices.size()), token_rows)
+      << "input_indices must contain one entry per token embedding row";
+
+  // Mark the rows claimed by token embeddings in a single pass so that the
+  // complement can be collected in O(total_rows) instead of searching
+  // input_indices once per row.
+  std::vector<char> is_token_row(static_cast<size_t>(total_rows), 0);
+  for (const int index : input_indices) {
+    CHECK(index >= 0 && index < total_rows)
+        << "input index " << index << " is outside [0, " << total_rows << ")";
+    CHECK(!is_token_row[index]) << "duplicate input index " << index;
+    is_token_row[index] = 1;
+  }
+
+  std::vector<int> input_embedding_indices;
+  input_embedding_indices.reserve(static_cast<size_t>(embedding_rows));
+  for (int64_t row = 0; row < total_rows; ++row) {
+    if (!is_token_row[row]) {
+      input_embedding_indices.push_back(static_cast<int>(row));
+    }
+  }
+
+  CHECK_EQ(static_cast<int64_t>(input_embedding_indices.size()),
+           embedding_rows);
+
+  // torch::empty leaves the rows uninitialized; the two index sets partition
+  // [0, total_rows), so every row is written by one of the index_put_ calls.
+  torch::Device device = input_embedding.device();
+  torch::Tensor merged = torch::empty(
+      {total_rows, cols}, torch::dtype(input_embedding.dtype()).device(device));
+
+  torch::Tensor input_embedding_indices_tensor =
+      torch::tensor(input_embedding_indices, torch::kInt64).to(device);
+  merged.index_put_({input_embedding_indices_tensor, torch::indexing::Ellipsis},
+                    input_embedding);
+
+  torch::Tensor input_indices_tensor =
+      torch::tensor(input_indices, torch::kInt64).to(device);
+  merged.index_put_({input_indices_tensor, torch::indexing::Ellipsis},
+                    input_tokens_embedding);
+
+  return merged;
+}
+
+}  // namespace xllm
@@ -0,0 +1,44 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <torch/torch.h>
+
+#include <vector>
+
+#include "runtime/rec_worker_impl.h"
+
+namespace xllm {
+
+// RecWorkerImpl specialization that assembles the model's input embedding in
+// prepare_work_before_execute() from token ids and/or a supplied embedding
+// carried in the multimodal data.
+class LlmRecWorkerImpl final : public RecWorkerImpl {
+ public:
+  // Forwards all arguments to the RecWorkerImpl constructor.
+  LlmRecWorkerImpl(const ParallelArgs& parallel_args,
+                   const torch::Device& device,
+                   const runtime::Options& options);
+
+  ~LlmRecWorkerImpl() override = default;
+
+  // Runs the base preparation and then fills
+  // processed_inputs.input_params.input_embedding when the multimodal data
+  // carries LLM-rec tokens, indices and/or an embedding.
+  void prepare_work_before_execute(const ForwardInput& inputs,
+                                   ForwardInput& processed_inputs) override;
+
+ private:
+  // Returns a tensor with input_tokens_embedding.size(0) +
+  // input_embedding.size(0) rows in which the rows listed in input_indices
+  // hold the token embeddings and all remaining rows hold input_embedding.
+  torch::Tensor merge_embeddings_by_indices(
+      const torch::Tensor& input_tokens_embedding,
+      const torch::Tensor& input_embedding,
+      const std::vector<int>& input_indices);
+};
+
+}  // namespace xllm
@@ -0,0 +1,132 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "onerec_worker_impl.h"
+
+#include <glog/logging.h>
+
+#include <optional>
+
+#include "common/device_monitor.h"
+#include "common/metrics.h"
+#include "framework/model/model_input_params.h"
+#include "util/timer.h"
+
+namespace xllm {
+
+// Constructs the OneRec worker. All three arguments are forwarded unchanged
+// to the RecWorkerImpl base constructor; no additional state is set up here.
+OneRecWorkerImpl::OneRecWorkerImpl(const ParallelArgs& parallel_args,
+                                   const torch::Device& device,
+                                   const runtime::Options& options)
+    : RecWorkerImpl(parallel_args, device, options) {}
+
+// Executes one OneRec forward step.
+//
+// On the first prefill an encoder pass is run first (its result is
+// discarded), using the sparse embedding when it is defined and the encoder
+// token ids otherwise. A decoder pass over input.token_ids always follows
+// and produces the hidden states.
+//
+// Returns std::nullopt when:
+//   - input.input_params.rec_params is not set (logged as an error);
+//   - the first prefill has neither a sparse embedding nor encoder token ids
+//     with positions (logged as an error);
+//   - the decoder pass yields an undefined tensor;
+//   - schedule overlap, driver_, dp_driver_ and speculative decode are all
+//     off (the stream is still synchronized and metrics are updated).
+// Otherwise returns a ForwardOutput whose logits and sampling fields are
+// filled only when sampling_params.selected_token_idxes is defined.
+std::optional<ForwardOutput> OneRecWorkerImpl::step(const ForwardInput& input) {
+  Timer timer;
+  device_.set_device();
+
+  const auto& sampling_params = input.sampling_params;
+  const auto& input_params = input.input_params;
+
+  if (!input_params.rec_params.has_value()) {
+    LOG(ERROR) << "OneRecWorkerImpl requires rec_params.";
+    return std::nullopt;
+  }
+  const auto& rec_params = input_params.rec_params.value();
+
+  // Only the very first prefill needs the encoder; every other call goes
+  // straight to the decoder.
+  const bool needs_encoder_pass =
+      rec_params.rec_stage == RecModelInputParams::RecStage::PREFILL &&
+      rec_params.is_first_prefill;
+
+  if (needs_encoder_pass) {
+    const bool has_sparse_embedding =
+        rec_params.encoder_sparse_embedding.defined();
+    const bool has_encoder_tokens = rec_params.encoder_token_ids.defined() &&
+                                    rec_params.encoder_positions.defined();
+
+    if (!(has_sparse_embedding || has_encoder_tokens)) {
+      LOG(ERROR) << "OneRecWorkerImpl first prefill requires encoder inputs.";
+      return std::nullopt;
+    }
+
+    ModelInputParams encoder_params = input_params;
+    encoder_params.rec_params->is_encoder_forward = true;
+
+    torch::Tensor encoder_tokens;
+    if (has_sparse_embedding) {
+      encoder_params.rec_params->is_hybrid_mode = true;
+      encoder_tokens = rec_params.encoder_sparse_embedding;
+    } else {
+      encoder_tokens = rec_params.encoder_token_ids;
+    }
+
+    // The encoder output is not used here; the call is made for its effect
+    // on the state shared with the decoder pass below.
+    model_executor_->forward(encoder_tokens,
+                             rec_params.encoder_positions,
+                             kv_caches_,
+                             encoder_params);
+  }
+
+  ModelInputParams decoder_params = input_params;
+  decoder_params.rec_params->is_encoder_forward = false;
+  torch::Tensor hidden_states = model_executor_->forward(
+      input.token_ids, input.positions, kv_caches_, decoder_params);
+
+  if (!hidden_states.defined()) {
+    return std::nullopt;
+  }
+
+  // Shared epilogue: wait for the device, then record latency and memory.
+  const auto finish_step = [this, &timer]() {
+    device_.synchronize_default_stream();
+    COUNTER_ADD(execution_latency_seconds_model, timer.elapsed_seconds());
+    DeviceMonitor::get_instance().update_active_activation_memory(
+        device_.index());
+  };
+
+  const bool must_return_output = enable_schedule_overlap() || driver_ ||
+                                  dp_driver_ ||
+                                  options_.enable_speculative_decode();
+  if (!must_return_output) {
+    finish_step();
+    return std::nullopt;
+  }
+
+  ForwardOutput output;
+  if (sampling_params.selected_token_idxes.defined()) {
+    torch::Tensor logits =
+        model_->logits(hidden_states, sampling_params.selected_token_idxes);
+    auto sample_output = sampler_->forward(logits, sampling_params);
+    output.logits = logits;
+    output.sample_output = sample_output;
+    output.do_sample = sampling_params.do_sample;
+    output.logprobs = sampling_params.logprobs;
+    output.max_top_logprobs = sampling_params.max_top_logprobs;
+  }
+
+  finish_step();
+  return output;
+}
+
+}  // namespace xllm
@@ -0,0 +1,37 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <torch/torch.h>
+
+#include <optional>
+
+#include "runtime/rec_worker_impl.h"
+
+namespace xllm {
+
+// RecWorkerImpl specialization whose step() runs an encoder pass on the first
+// prefill and a decoder pass on every call.
+class OneRecWorkerImpl final : public RecWorkerImpl {
+ public:
+  // Forwards all arguments to the RecWorkerImpl constructor.
+  OneRecWorkerImpl(const ParallelArgs& parallel_args,
+                   const torch::Device& device,
+                   const runtime::Options& options);
+
+  ~OneRecWorkerImpl() override = default;
+
+  // Runs one forward step; returns std::nullopt when the input lacks
+  // rec_params or required encoder inputs, when no hidden states are
+  // produced, or when this worker is not expected to return an output.
+  std::optional<ForwardOutput> step(const ForwardInput& input) override;
+};
+
+}  // namespace xllm