Commit 1da759f

feat: add wrappers for ATB and ACLNN fused operators.
1 parent 7ec13b0

38 files changed: +1874 −3107 lines
Lines changed: 20 additions & 8 deletions
@@ -1,17 +1,29 @@
 include(cc_library)
 
-add_subdirectory(impl)
 add_subdirectory(xllm_ops)
 
+file(GLOB_RECURSE XLLM_CORE_KERNELS_NPU_HEADER
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.h"
+  "${CMAKE_CURRENT_LIST_DIR}/*.h"
+)
+
+file(GLOB_RECURSE XLLM_CORE_KERNELS_NPU_SRCS
+  "${CMAKE_CURRENT_LIST_DIR}/custom_functions_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/ops_npu/*.cpp"
+  "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
+)
+
 cc_library(
   NAME
     npu_kernels
   HDRS
-    linear.h
-    split.h
-    rms_norm.h
-    rope.h
+    ${XLLM_CORE_KERNELS_NPU_HEADER}
+  SRCS
+    ${XLLM_CORE_KERNELS_NPU_SRCS}
   DEPS
-    :model_context
-    # spdlog::spdlog
-)
+    :model_context
+    glog::glog
+    torch
+    torch_npu
+)
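
With the glob rules in place, the npu_kernels target now picks up every header and source under custom_functions_npu/, ops_npu/, and the directory root automatically, so the operator wrappers added in this commit build without further CMakeLists edits; the hand-maintained HDRS list and the :npu_kernels_impl dependency give way to direct :model_context, glog, torch, and torch_npu dependencies.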

xllm/core/kernels/npu/rms_norm.h renamed to xllm/core/kernels/npu/active.cpp

Lines changed: 12 additions & 14 deletions
@@ -13,20 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#pragma once
-#include "impl/npu_rms_norm_impl.h"
+#include <torch_npu/csrc/aten/CustomFunctions.h>
 
-namespace xllm {
-namespace kernel {
+#include "npu_ops_api.h"
+#include "ops_npu/npu_ops.h"
 
-class RmsNorm : public torch::nn::ModuleHolder<NpuRmsNormImpl> {
- public:
-  using torch::nn::ModuleHolder<NpuRmsNormImpl>::ModuleHolder;
-  using Impl __attribute__((__unused__)) = NpuRmsNormImpl;
+namespace xllm::kernel::npu {
 
-  RmsNorm(const ModelContext& context)
-      : ModuleHolder(std::make_shared<NpuRmsNormImpl>(context)) {}
-};
-
-} // namespace kernel
-} // namespace xllm
+torch::Tensor active(const torch::Tensor& input, const std::string& act_mode) {
+  if (act_mode != "silu" && act_mode != "swiglu") {
+    throw std::runtime_error(
+        "Only silu and swiglu activations are supported in NPU active");
+  }
+  return at_npu::native::custom_ops::npu_swiglu(input);
+}
+} // namespace xllm::kernel::npu
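
For orientation, a minimal call-site sketch for the new wrapper (hypothetical helper; assumes a torch_npu build and an input whose gate and up projections are packed along the last dimension, the layout npu_swiglu consumes):

#include <torch/torch.h>

#include "npu_ops_api.h"  // declares xllm::kernel::npu::active

// Hypothetical MLP step: gate_up is [num_tokens, 2 * intermediate_size];
// the fused swiglu halves the last dimension to [num_tokens, intermediate_size].
torch::Tensor mlp_activate(const torch::Tensor& gate_up) {
  return xllm::kernel::npu::active(gate_up, "swiglu");
}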
Lines changed: 65 additions & 0 deletions (new file)

/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "npu_ops_api.h"
#include "ops_npu/npu_ops.h"

namespace xllm::kernel::npu {

void reshape_paged_cache(torch::Tensor& key,
                         std::optional<torch::Tensor>& value,
                         torch::Tensor& k_cache,
                         std::optional<torch::Tensor>& v_cache,
                         const torch::Tensor& slot_mapping) {
  atb::_npu_reshape_and_cache(
      key, value.value(), k_cache, v_cache.value(), slot_mapping);
}

void batch_prefill(const torch::Tensor& query,
                   const torch::Tensor& key,
                   const torch::Tensor& value,
                   const torch::Tensor& mask,
                   const torch::Tensor& seq_len,
                   float scale,
                   torch::Tensor& output) {
  auto num_heads = query.size(-2);
  auto num_kv_heads = key.size(-2);
  atb::_npu_flash_attention(
      query, key, value, mask, seq_len, scale, num_heads, num_kv_heads, output);
}

void batch_decode(const torch::Tensor& query,
                  const torch::Tensor& k_cache,
                  const torch::Tensor& v_cache,
                  float scale,
                  const torch::Tensor& block_table,
                  const torch::Tensor& seq_lens,
                  torch::Tensor& output) {
  auto head_size = query.size(-1);
  auto num_heads = query.size(-2);
  auto num_kv_heads = k_cache.size(-2);
  auto q = query.view({-1, num_heads, head_size});
  auto o = output.view({-1, num_heads, head_size});
  atb::_npu_paged_attention(q,
                            k_cache,
                            v_cache,
                            num_kv_heads,
                            num_heads,
                            scale,
                            block_table,
                            seq_lens,
                            o);
}

} // namespace xllm::kernel::npu
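
Two details in this wrapper layer are worth noting: reshape_paged_cache dereferences value and v_cache unconditionally, so callers must supply both optionals; and batch_decode collapses any leading batch layout into a flat token dimension before calling the ATB paged-attention kernel. A standalone shape sketch of that collapse (hypothetical sizes, plain libtorch, runs on CPU):

#include <iostream>

#include <torch/torch.h>

int main() {
  // Hypothetical decode batch: 4 sequences of 1 token each,
  // 8 query heads, head_size 128.
  auto query = torch::randn({4, 1, 8, 128});
  auto head_size = query.size(-1);
  auto num_heads = query.size(-2);

  // The same view batch_decode applies to query and output:
  // [4, 1, 8, 128] -> [4, 8, 128], one row per token.
  auto q = query.view({-1, num_heads, head_size});
  std::cout << q.sizes() << std::endl;  // prints [4, 8, 128]
  return 0;
}
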
Lines changed: 175 additions & 0 deletions (new file)

/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "atb_common.h"

namespace atb {

atb::Tensor at_tensor_to_atb_tensor(const at::Tensor at_tensor) {
  static std::map<at::ScalarType, aclDataType> dtype_map = {
      {at::ScalarType::Bool, ACL_BOOL},
      {at::ScalarType::Byte, ACL_UINT8},
      {at::ScalarType::Char, ACL_INT8},
      {at::ScalarType::Half, ACL_FLOAT16},
      {at::ScalarType::Float, ACL_FLOAT},
      {at::ScalarType::Int, ACL_INT32},
      {at::ScalarType::Long, ACL_INT64},
      {at::ScalarType::BFloat16, ACL_BF16},
      {at::ScalarType::Double, ACL_DOUBLE},
      {at::ScalarType::Short, ACL_INT16},
      {at::ScalarType::ComplexHalf, ACL_COMPLEX32},
      {at::ScalarType::ComplexFloat, ACL_COMPLEX64},
      {at::ScalarType::ComplexDouble, ACL_COMPLEX128},
  };

  TORCH_CHECK(at_tensor.is_contiguous(), "at_tensor is not contiguous");
  atb::Tensor tensor;
  tensor.desc.format = atb::utils::get_format_for_atb(at_tensor);
  if (at_tensor.device().type() == at::kCPU) {
    tensor.hostData = at_tensor.data_ptr();
  } else {
    tensor.deviceData = at_tensor.data_ptr();
  }

  tensor.desc.shape.dimNum = at_tensor.sizes().size();
  for (uint64_t i = 0; i < at_tensor.sizes().size(); i++) {
    tensor.desc.shape.dims[i] = at_tensor.sizes()[i];
  }

  auto dtype_iterator = dtype_map.find(at_tensor.scalar_type());
  TORCH_CHECK(dtype_iterator != dtype_map.end(),
              "unsupported dtype: ",
              at_tensor.scalar_type());
  tensor.desc.dtype = dtype_iterator->second;

  tensor.dataSize = atb::Utils::GetTensorSize(tensor);

  return tensor;
}

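// Two execution paths follow. run_atb_cmd_v1 runs Setup and allocates the
// workspace eagerly on the calling thread, then destroys the op after
// Execute; run_atb_cmd_v2 defers that work into the dispatched task.
// run_atb_cmd below chooses between them based on stream-capture status.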
void run_atb_cmd_v1(atb::Operation* op,
                    const ParamSetter& paramsetter,
                    const std::string& name) {
  aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false);
  auto context_ptr = atb::utils::get_context(stream);
  atb::VariantPack variant_pack = paramsetter.variant_pack_;
  uint64_t workspace_size = operation_setup(variant_pack, op, context_ptr);
  at::Tensor workspace_tensor;
  void* workspace_ptr = nullptr;
  if (workspace_size != 0) {
    at::TensorOptions options = at::TensorOptions(c10::DeviceType::PrivateUse1);
    workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte));
    workspace_ptr = const_cast<void*>(workspace_tensor.storage().data());
  }
  const c10::SmallVector<at::Tensor, N>& cpu_tensors =
      paramsetter.tensor_maintainer_.cpu_tensors;
  auto acl_call = [variant_pack,
                   workspace_ptr,
                   workspace_size,
                   context_ptr,
                   op,
                   cpu_tensors]() -> int {
    auto st = op->Execute(
        variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr);
    DestroyOperation(op);
    return st;
  };
  at_npu::native::OpCommand::RunOpApiV2(name, acl_call);
}

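// Deferred variant: the VariantPack and stream are captured by value, and
// Setup plus workspace allocation happen inside the dispatched task, using
// the torch_npu stream-tied workspace allocator; the op is not destroyed here.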
void run_atb_cmd_v2(atb::Operation* op,
                    const ParamSetter& paramsetter,
                    const std::string& name) {
  aclrtStream stream = c10_npu::getCurrentNPUStream().stream(false);
  atb::VariantPack variant_pack = paramsetter.variant_pack_;
  const c10::SmallVector<at::Tensor, N>& cpu_tensors =
      paramsetter.tensor_maintainer_.cpu_tensors;
  auto acl_call = [op, variant_pack, stream, cpu_tensors]() -> int {
    auto context_ptr = atb::utils::get_context(stream);
    uint64_t workspace_size = operation_setup(variant_pack, op, context_ptr);
    at::Tensor workspace_tensor;
    void* workspace_ptr = nullptr;
    if (workspace_size != 0) {
      workspace_tensor =
          at_npu::native::allocate_workspace(workspace_size, stream);
      workspace_ptr = const_cast<void*>(workspace_tensor.storage().data());
    }
    auto st = op->Execute(
        variant_pack, (uint8_t*)workspace_ptr, workspace_size, context_ptr);
    return st;
  };
  at_npu::native::OpCommand::RunOpApiV2(name, acl_call);
}

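// Dispatch on capture status: during NPU graph capture the eager path (v1)
// is taken, presumably so the workspace already exists when the captured
// task replays; otherwise the deferred path (v2) is used. (Rationale
// inferred from the capture check, not stated in the commit.)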
void run_atb_cmd(atb::Operation* op,
                 const ParamSetter& paramsetter,
                 const std::string& name) {
  const auto is_capturing =
      static_cast<int>(c10_npu::currentStreamCaptureStatusMayInitCtx());
  if (is_capturing) {
    run_atb_cmd_v1(op, paramsetter, name);
  } else {
    run_atb_cmd_v2(op, paramsetter, name);
  }
}

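// ParamSetter packs at::Tensors into the ATB VariantPack. CPU inputs are
// cloned and the clones kept in tensor_maintainer_.cpu_tensors, which the
// dispatch lambdas above capture so host buffers outlive the async call;
// device inputs are made contiguous (and format-converted on request) and
// held in tensor_maintainer_.contiguous_tensors for the setter's lifetime.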
ParamSetter& ParamSetter::Input(const at::Tensor& tensor,
                                const bool& format_trans) {
  if (!tensor.defined()) {
    variant_pack_.inTensors.push_back(atb::Tensor());
    return *this;
  }
  at::Tensor new_tensor = tensor.contiguous();
  if (format_trans) {
    new_tensor = atb::utils::format_trans(new_tensor);
  }
  atb::Tensor atb_tensor;
  if (new_tensor.device().type() == at::kCPU) {
    auto tensor_clone = new_tensor.clone();
    atb_tensor = at_tensor_to_atb_tensor(tensor_clone);
    tensor_maintainer_.cpu_tensors.emplace_back(std::move(tensor_clone));
  } else {
    atb_tensor = at_tensor_to_atb_tensor(new_tensor);
    tensor_maintainer_.contiguous_tensors.emplace_back(std::move(new_tensor));
  }
  variant_pack_.inTensors.push_back(atb_tensor);
  return *this;
}

ParamSetter& ParamSetter::Input(const c10::optional<at::Tensor>& tensor,
                                const bool& format_trans) {
  if (!tensor.has_value()) {
    variant_pack_.inTensors.push_back(atb::Tensor());
    return *this;
  }
  return Input(tensor.value(), format_trans);
}

ParamSetter& ParamSetter::Output(at::Tensor& output) {
  auto atb_tensor = at_tensor_to_atb_tensor(output);
  variant_pack_.outTensors.push_back(atb_tensor);
  return *this;
}

uint64_t operation_setup(atb::VariantPack variant_pack,
                         atb::Operation* operation,
                         atb::Context* context_ptr) {
  uint64_t workspace_size = 0;
  atb::Status status =
      operation->Setup(variant_pack, workspace_size, context_ptr);
  TORCH_CHECK(status == 0, operation->GetName(), " setup failed!");
  return workspace_size;
}

} // namespace atb
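
To show how these pieces compose, here is a minimal call-site sketch. It is illustrative only: it assumes ATB's ElewiseParam add operator from atb/infer_op_params.h, and that ParamSetter is default-constructible as the definitions above suggest.

#include <atb/infer_op_params.h>

#include "atb_common.h"

// Sketch: run a fused elementwise add through the helpers above.
void atb_add(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) {
  atb::infer::ElewiseParam param;
  param.elewiseType = atb::infer::ElewiseParam::ElewiseType::ELEWISE_ADD;

  atb::Operation* op = nullptr;
  TORCH_CHECK(atb::CreateOperation(param, &op) == 0, "CreateOperation failed");

  // Chain inputs and outputs into the VariantPack, then dispatch; the
  // capture-aware run_atb_cmd picks the eager or deferred path.
  atb::ParamSetter setter;
  setter.Input(a, false).Input(b, false).Output(out);
  atb::run_atb_cmd(op, setter, "ElewiseAdd");
}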
