
Commit 6b246e2

feat: add moe all2all kernels and deep ep layer. (#497)
Co-authored-by: phantomlei3 <phantomlei3@gmail.com>
1 parent cb2443e commit 6b246e2

File tree: 11 files changed, +1205 -11 lines

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "mlu_ops_api.h"
+
+namespace xllm::kernel::mlu {
+
+void gather_split(const torch::Tensor& input,
+                  const torch::Tensor& gather_index,
+                  const torch::Tensor& valid_token_num,
+                  const torch::Tensor& output_head,
+                  const torch::Tensor& output_tail) {
+  tmo::torch_api::gather_split(
+      output_head, output_tail, input, gather_index, valid_token_num);
+}
+
+}  // namespace xllm::kernel::mlu
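
For orientation, here is a minimal caller-side sketch of this wrapper. Everything below is an illustrative assumption: the device, the shapes, and the 64/64 head/tail capacities; the actual split semantics live in the underlying tmo::torch_api::gather_split kernel, which this file only forwards to.

// Hypothetical driver for xllm::kernel::mlu::gather_split; shapes, dtypes,
// and the head/tail buffer sizes are illustrative assumptions.
#include <torch/torch.h>
#include "mlu_ops_api.h"

void gather_split_example(const torch::Device& dev) {
  auto int_opts = torch::TensorOptions().dtype(torch::kInt).device(dev);
  auto input = torch::randn({128, 512}, torch::TensorOptions().device(dev));
  auto gather_index = torch::randperm(128, int_opts);    // row permutation
  auto valid_token_num = torch::tensor({96}, int_opts);  // tokens actually used
  // Outputs are preallocated by the caller; the kernel writes into them.
  auto output_head = torch::empty({64, 512}, input.options());
  auto output_tail = torch::empty({64, 512}, input.options());
  xllm::kernel::mlu::gather_split(
      input, gather_index, valid_token_num, output_head, output_tail);
}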

xllm/core/kernels/mlu/mlu_ops_api.h

Lines changed: 46 additions & 0 deletions
@@ -182,6 +182,46 @@ torch::Tensor moe_combine_result(
     const int64_t expert_size,
     const std::optional<torch::Tensor>& bias);
 
+torch::Tensor moe_all2all_gen_send_layout(const torch::Tensor& token_count,
+                                          int64_t nrank);
+
+std::vector<torch::Tensor> moe_all2all_gen_gather_index(
+    const torch::Tensor& token_num,
+    int64_t pad_num,
+    bool return_cusum_token_count);
+
+std::vector<torch::Tensor> moe_all2all_create(int64_t dispatch_token_byte,
+                                              int64_t combine_token_byte,
+                                              int64_t max_expert_num,
+                                              int64_t max_token_num,
+                                              int64_t rank,
+                                              int64_t nrank,
+                                              const torch::Device& device);
+
+void moe_all2all_init(int64_t handle,
+                      const torch::Tensor& all_exchange_info,
+                      const torch::Device& device);
+
+void moe_all2all_dispatch(int64_t handle,
+                          int64_t token_byte,
+                          int64_t token_num,
+                          const torch::Tensor& send_layout,
+                          const torch::Tensor& send_token_num,
+                          const torch::Tensor& recv_layout,
+                          const torch::Tensor& recv_token_num,
+                          const std::optional<torch::Tensor>& send_token,
+                          const std::optional<torch::Tensor>& recv_token);
+
+void moe_all2all_combine(int64_t handle,
+                         int64_t token_byte,
+                         int64_t token_num,
+                         const torch::Tensor& send_src_layout,
+                         const torch::Tensor& send_dst_layout,
+                         const std::optional<torch::Tensor>& send_token,
+                         const std::optional<torch::Tensor>& recv_token);
+
+void moe_all2all_destroy(int64_t handle, const torch::Device& device);
+
 std::tuple<torch::Tensor, torch::Tensor> scaled_quantize(
     const torch::Tensor& x,
     const torch::Tensor& smooth,
@@ -222,4 +262,10 @@ torch::Tensor apply_top_k_top_p(const torch::Tensor& logits,
 
 torch::Tensor random_sample(const torch::Tensor& probs);
 
+void gather_split(const torch::Tensor& input,
+                  const torch::Tensor& gather_index,
+                  const torch::Tensor& valid_token_num,
+                  const torch::Tensor& output_head,
+                  const torch::Tensor& output_tail);
+
 }  // namespace xllm::kernel::mlu
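
Read together, these declarations imply a create, init, dispatch, combine, destroy lifecycle around an opaque int64_t handle. The sketch below is pieced together from the signatures alone: the commit shows no caller, so the handle encoding in moe_all2all_create's outputs, the byte sizes, and the cross-rank exchange of all_exchange_info are all hypothetical.

// Hypothetical single-rank lifecycle; numeric sizes, the handle living in
// outputs[0], and the allgather step are assumptions, not from the commit.
void moe_all2all_lifecycle_sketch(const torch::Device& dev,
                                  int64_t rank,
                                  int64_t nrank) {
  using namespace xllm::kernel::mlu;
  auto outputs = moe_all2all_create(/*dispatch_token_byte=*/14336,
                                    /*combine_token_byte=*/14336,
                                    /*max_expert_num=*/256,
                                    /*max_token_num=*/4096,
                                    rank, nrank, dev);
  int64_t handle = outputs[0].item<int64_t>();  // assumed: handle in outputs[0]

  // Every rank must see every peer's exchange info before init; a real
  // caller would allgather outputs[1] across ranks first.
  torch::Tensor all_exchange_info = outputs[1];
  moe_all2all_init(handle, all_exchange_info, dev);

  // Steady state per MoE layer: generate layouts, dispatch tokens to the
  // expert-owning ranks, run the experts, combine results back:
  //   moe_all2all_gen_send_layout(...), moe_all2all_dispatch(...),
  //   <expert GEMMs>, moe_all2all_combine(...)

  moe_all2all_destroy(handle, dev);
}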
Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "mlu_ops_api.h"
+
+namespace xllm::kernel::mlu {
+
+torch::Tensor moe_all2all_gen_send_layout(const torch::Tensor& token_count,
+                                          int64_t nrank) {
+  return tmo::torch_api::moe_all2all_gen_send_layout(token_count, nrank);
+}
+
+std::vector<torch::Tensor> moe_all2all_gen_gather_index(
+    const torch::Tensor& token_num,
+    int64_t pad_num,
+    bool return_cusum_token_count) {
+  // get dimension information
+  int32_t rank_num = token_num.size(0);
+  int32_t expert_num = token_num.size(1);
+
+  // prepare tensor options (keep same device as input, enforce int32)
+  auto options =
+      torch::TensorOptions().dtype(torch::kInt).device(token_num.device());
+
+  // output tensors
+  torch::Tensor gather_by_expert_index =
+      torch::empty({rank_num * pad_num}, options);
+  torch::Tensor gather_by_rank_index =
+      torch::empty({rank_num * pad_num}, options);
+  torch::Tensor token_count = torch::empty({expert_num}, options);
+  torch::Tensor token_sum = torch::empty({1}, options);
+
+  // handle optional tensor allocation
+  torch::Tensor cusum_token_count;
+  if (return_cusum_token_count) {
+    cusum_token_count = torch::empty({expert_num + 1}, options);
+  }
+
+  tmo::torch_api::moe_all2all_gen_gather_index(gather_by_expert_index,
+                                               gather_by_rank_index,
+                                               token_count,
+                                               cusum_token_count,
+                                               token_sum,
+                                               token_num,
+                                               pad_num);
+
+  // pack and return results using std::vector
+  std::vector<torch::Tensor> results;
+  results.reserve(return_cusum_token_count ? 5 : 4);
+
+  results.push_back(gather_by_expert_index);
+  results.push_back(gather_by_rank_index);
+  results.push_back(token_count);
+  results.push_back(token_sum);
+
+  if (return_cusum_token_count) {
+    results.push_back(cusum_token_count);
+  }
+
+  return results;
+}
+
+std::vector<torch::Tensor> moe_all2all_create(int64_t dispatch_token_byte,
+                                              int64_t combine_token_byte,
+                                              int64_t max_expert_num,
+                                              int64_t max_token_num,
+                                              int64_t rank,
+                                              int64_t nrank,
+                                              const torch::Device& device) {
+  // Create placeholder tensor on the specified device
+  auto options = torch::TensorOptions().device(device);
+  torch::Tensor place_holder = torch::empty({0}, options);
+
+  // Call the underlying operator
+  // Since the return type is explicitly std::vector<torch::Tensor>, we capture
+  // it directly.
+  std::vector<torch::Tensor> outputs =
+      tmo::torch_api::moe_all2all_create(dispatch_token_byte,
+                                         combine_token_byte,
+                                         max_expert_num,
+                                         max_token_num,
+                                         rank,
+                                         nrank,
+                                         place_holder);
+  // Return all 6 tensors
+  // Construct a new vector from the iterator range
+  return std::vector<torch::Tensor>(outputs.begin(), outputs.end());
+}
+
+void moe_all2all_init(int64_t handle,
+                      const torch::Tensor& all_exchange_info,
+                      const torch::Device& device) {
+  auto options = torch::TensorOptions().device(device);
+  torch::Tensor place_holder = torch::empty({0}, options);
+  tmo::torch_api::moe_all2all_init(handle, all_exchange_info, place_holder);
+}
+
+void moe_all2all_dispatch(int64_t handle,
+                          int64_t token_byte,
+                          int64_t token_num,
+                          const torch::Tensor& send_layout,
+                          const torch::Tensor& send_token_num,
+                          const torch::Tensor& recv_layout,
+                          const torch::Tensor& recv_token_num,
+                          const std::optional<torch::Tensor>& send_token,
+                          const std::optional<torch::Tensor>& recv_token) {
+  tmo::torch_api::moe_all2all_dispatch(handle,
+                                       token_byte,
+                                       token_num,
+                                       send_layout,
+                                       send_token_num,
+                                       recv_layout,
+                                       recv_token_num,
+                                       send_token,
+                                       recv_token);
+}
+
+void moe_all2all_combine(int64_t handle,
+                         int64_t token_byte,
+                         int64_t token_num,
+                         const torch::Tensor& send_src_layout,
+                         const torch::Tensor& send_dst_layout,
+                         const std::optional<torch::Tensor>& send_token,
+                         const std::optional<torch::Tensor>& recv_token) {
+  tmo::torch_api::moe_all2all_combine(handle,
+                                      token_byte,
+                                      token_num,
+                                      send_src_layout,
+                                      send_dst_layout,
+                                      send_token,
+                                      recv_token);
+}
+
+void moe_all2all_destroy(int64_t handle, const torch::Device& device) {
+  auto options = torch::TensorOptions().device(device);
+  torch::Tensor place_holder = torch::empty({0}, options);
+  tmo::torch_api::moe_all2all_destroy(handle, place_holder);
+}
+
+}  // namespace xllm::kernel::mlu
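
Two details worth noting. First, create/init/destroy pass an empty place_holder tensor to the tmo op, presumably so the underlying operator can resolve the target device (and its queue) from a tensor argument even when it has no real data to touch. Second, moe_all2all_gen_gather_index returns four or five tensors depending on return_cusum_token_count, so callers must unpack positionally. A hedged unpacking sketch, with invented token counts:

// Hypothetical caller-side unpacking; the [rank_num, expert_num] layout of
// token_num mirrors the .size(0)/.size(1) reads in the wrapper above.
torch::Device dev(torch::kCPU);  // stand-in; real use targets the MLU device
auto token_num = torch::randint(
    0, 16, {/*rank_num=*/4, /*expert_num=*/8},
    torch::TensorOptions().dtype(torch::kInt).device(dev));
auto outs = xllm::kernel::mlu::moe_all2all_gen_gather_index(
    token_num, /*pad_num=*/64, /*return_cusum_token_count=*/true);
auto gather_by_expert_index = outs[0];  // [rank_num * pad_num]
auto gather_by_rank_index   = outs[1];  // [rank_num * pad_num]
auto token_count            = outs[2];  // [expert_num]
auto token_sum              = outs[3];  // [1]
auto cusum_token_count      = outs[4];  // [expert_num + 1], only if requested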

xllm/core/kernels/ops_api.cpp

Lines changed: 91 additions & 10 deletions
@@ -303,8 +303,6 @@ torch::Tensor group_gemm(GroupGemmParams& params) {
                          params.trans_a,
                          params.trans_b,
                          params.a_quant_bit);
-#elif defined(USE_CUDA)
-  LOG(FATAL) << "group_gemm for cuda not implemented";
 #else
   LOG(FATAL) << "group_gemm not implemented";
 #endif
@@ -323,8 +321,6 @@ std::tuple<torch::Tensor, torch::Tensor> moe_active_topk(
                               params.scoring_func,
                               params.route_scale,
                               params.e_score_correction_bias);
-#elif defined(USE_CUDA)
-  LOG(FATAL) << "moe_active_topk for cuda not implemented";
 #else
   LOG(FATAL) << "moe_active_topk not implemented";
 #endif
@@ -333,8 +329,6 @@ std::tuple<torch::Tensor, torch::Tensor> moe_active_topk(
 std::vector<torch::Tensor> moe_gen_idx(MoeGenIdxParams& params) {
 #if defined(USE_MLU)
   return mlu::moe_gen_idx(params.expert_id, params.expert_num);
-#elif defined(USE_CUDA)
-  LOG(FATAL) << "moe_gen_idx for cuda not implemented";
 #else
   LOG(FATAL) << "moe_gen_idx not implemented";
 #endif
@@ -347,8 +341,6 @@ torch::Tensor moe_expand_input(MoeExpandInputParams& params) {
                                params.cusum_token_count,
                                params.start_expert_id,
                                params.expert_size);
-#elif defined(USE_CUDA)
-  LOG(FATAL) << "moe_expand_input for cuda not implemented";
 #else
   LOG(FATAL) << "moe_expand_input not implemented";
 #endif
@@ -364,13 +356,90 @@ torch::Tensor moe_combine_result(MoeCombineResultParams& params) {
                                  params.start_expert_id,
                                  params.expert_size,
                                  params.bias);
-#elif defined(USE_CUDA)
-  LOG(FATAL) << "moe_combine_result for cuda not implemented";
 #else
   LOG(FATAL) << "moe_combine_result not implemented";
 #endif
 }
 
+torch::Tensor moe_all2all_gen_send_layout(
+    MoeAll2AllGenSendLayoutParams& params) {
+#if defined(USE_MLU)
+  return mlu::moe_all2all_gen_send_layout(params.token_count, params.nrank);
+#else
+  LOG(FATAL) << "moe_all2all_gen_send_layout not implemented";
+#endif
+}
+
+std::vector<torch::Tensor> moe_all2all_gen_gather_index(
+    MoeAll2AllGenGatherIndexParams& params) {
+#if defined(USE_MLU)
+  return mlu::moe_all2all_gen_gather_index(
+      params.token_num, params.pad_num, params.return_cusum_token_count);
+#else
+  LOG(FATAL) << "moe_all2all_gen_gather_index not implemented";
+#endif
+}
+
+std::vector<torch::Tensor> moe_all2all_create(MoeAll2AllCreateParams& params) {
+#if defined(USE_MLU)
+  return mlu::moe_all2all_create(params.dispatch_token_byte,
+                                 params.combine_token_byte,
+                                 params.max_expert_num,
+                                 params.max_token_num,
+                                 params.rank,
+                                 params.nrank,
+                                 params.device);
+#else
+  LOG(FATAL) << "moe_all2all_create not implemented";
+#endif
+}
+
+void moe_all2all_init(MoeAll2AllInitParams& params) {
+#if defined(USE_MLU)
+  mlu::moe_all2all_init(params.handle, params.all_exchange_info, params.device);
+#else
+  LOG(FATAL) << "moe_all2all_init not implemented";
+#endif
+}
+
+void moe_all2all_dispatch(MoeAll2AllDispatchParams& params) {
+#if defined(USE_MLU)
+  mlu::moe_all2all_dispatch(params.handle,
+                            params.token_byte,
+                            params.token_num,
+                            params.send_layout,
+                            params.send_token_num,
+                            params.recv_layout,
+                            params.recv_token_num,
+                            params.send_token,
+                            params.recv_token);
+#else
+  LOG(FATAL) << "moe_all2all_dispatch not implemented";
+#endif
+}
+
+void moe_all2all_combine(MoeAll2AllCombineParams& params) {
+#if defined(USE_MLU)
+  mlu::moe_all2all_combine(params.handle,
+                           params.token_byte,
+                           params.token_num,
+                           params.send_src_layout,
+                           params.send_dst_layout,
+                           params.send_token,
+                           params.recv_token);
+#else
+  LOG(FATAL) << "moe_all2all_combine not implemented";
+#endif
+}
+
+void moe_all2all_destroy(MoeAll2AllDestroyParams& params) {
+#if defined(USE_MLU)
+  mlu::moe_all2all_destroy(params.handle, params.device);
+#else
+  LOG(FATAL) << "moe_all2all_destroy not implemented";
+#endif
+}
+
 std::tuple<torch::Tensor, torch::Tensor> scaled_quantize(
     ScaledQuantizeParams& params) {
 #if defined(USE_MLU)
@@ -455,4 +524,16 @@ void masked_indexer_select_paged_kv(MaskedIndexerSelectPagedKVParams& params) {
 #endif
 }
 
+void gather_split(GatherSplitParams& params) {
+#if defined(USE_MLU)
+  mlu::gather_split(params.input,
+                    params.gather_index,
+                    params.valid_token_num,
+                    params.output_head,
+                    params.output_tail);
+#else
+  LOG(FATAL) << "gather_split not implemented";
+#endif
+}
+
 }  // namespace xllm::kernel
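
At this layer callers pass a params struct instead of positional arguments, and the backend is selected at compile time. The struct definitions are not part of this diff, so the field-by-field construction below is inferred from the params.* accesses above and should be read as an assumption:

// Hypothetical call through the backend-agnostic entry point; the fields of
// GatherSplitParams are inferred from the member accesses, not shown here.
xllm::kernel::GatherSplitParams p;
p.input = input;                      // tensors prepared as in the earlier
p.gather_index = gather_index;        // gather_split sketch
p.valid_token_num = valid_token_num;
p.output_head = output_head;
p.output_tail = output_tail;
xllm::kernel::gather_split(p);  // routes to mlu::gather_split under USE_MLU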
