4 files changed: +19 −15 lines

File 1 of 4 (kernel op implementations):

@@ -56,6 +56,12 @@ void apply_rotary(RotaryParams& params) {
 
 void active(ActivationParams& params) {
 #if defined(USE_MLU)
+  // Note: the intermediate size cannot be reliably derived from the input
+  // shape, so it is taken from the explicit parameters.
+  params.output = torch::empty(
+      {params.input.sizes()[0], params.intermediate_size / params.world_size},
+      params.input.options());
+
   mlu::active(params.input,
               params.output,
               params.bias,
@@ -65,20 +71,17 @@ void active(ActivationParams& params) {
               params.start_expert_id,
               params.expert_size);
 #elif defined(USE_CUDA)
+  params.output = torch::empty(
+      {params.input.sizes()[0], params.intermediate_size / params.world_size},
+      params.input.options());
   cuda::act_and_mul(params.output, params.input, params.act_mode);
+#elif defined(USE_NPU)
+  params.output = npu::active(params.input, params.act_mode);
 #else
   LOG(FATAL) << "active not implemented";
 #endif
 }
7384
74- torch::Tensor active_tensor (ActivationParams& params) {
75- #if defined(USE_NPU)
76- return npu::active (params.input , params.act_mode );
77- #else
78- LOG (FATAL) << " active_tensor not implemented" ;
79- #endif
80- }
81-
 void reshape_paged_cache(ReshapePagedCacheParams& params) {
 #if defined(USE_MLU)
   mlu::reshape_paged_cache(params.key,
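
With this change, every backend branch of active() allocates or produces params.output itself instead of relying on the caller to pre-size it: the MLU and CUDA branches size it as {num_tokens, intermediate_size / world_size}, while the NPU branch takes whatever npu::active() returns. A minimal standalone sketch of that shape arithmetic (not part of the diff; the concrete sizes and the gated-input width are illustrative assumptions):

#include <torch/torch.h>
#include <iostream>

int main() {
  const int64_t intermediate_size = 4096;  // full intermediate width
  const int64_t world_size = 2;            // tensor-parallel group size
  // A gated activation reads an input twice the local width.
  torch::Tensor input = torch::randn({8, 2 * intermediate_size / world_size});
  // Mirrors the allocation added to active(): one row per input token,
  // one column per local intermediate channel.
  torch::Tensor output = torch::empty(
      {input.sizes()[0], intermediate_size / world_size}, input.options());
  std::cout << output.sizes() << '\n';  // prints [8, 2048]
  return 0;
}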

File 2 of 4 (kernel op declarations):

@@ -28,8 +28,6 @@ void apply_rotary(RotaryParams& params);
 
 void active(ActivationParams& params);
 
-torch::Tensor active_tensor(ActivationParams& params);
-
 void reshape_paged_cache(ReshapePagedCacheParams& params);
 
 void batch_prefill(AttentionParams& params);

File 3 of 4 (kernel parameter structs):

@@ -109,6 +109,11 @@ struct ActivationParams {
   // Expert size for MoE activation. Used when bias is provided.
   // Bias tensor shape must be [expert_size, in_channel].
   int64_t expert_size = 0;
+
+  // Note: the intermediate size cannot be reliably derived from the input
+  // shape, so it and the TP world size are provided explicitly.
+  int64_t intermediate_size = 0;
+  int64_t world_size = 0;
 };
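
Both new fields default to 0, so a caller that forgets to set them would make the division in active() divide by zero. A defensive guard of this kind could run before the allocation (a sketch, not part of the diff; it assumes the glog-style CHECK macros implied by the existing LOG(FATAL) calls are available):

  // Sketch: validate the explicit sizing fields before dividing by them.
  CHECK_GT(params.world_size, 0) << "ActivationParams::world_size not set";
  CHECK_GT(params.intermediate_size, 0)
      << "ActivationParams::intermediate_size not set";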
 
 // Reshape paged cache parameters

File 4 of 4 (DenseMLP forward):

@@ -89,17 +89,15 @@ torch::Tensor DenseMLPImpl::forward(const torch::Tensor& hidden_states) {
     // For w8a8 quantization, the active operation is fused with the down_proj
     return down_proj_->forward(gate_up);
   } else {
-    int64_t batch_size = gate_up.sizes()[0];
-    auto output = torch::empty(
-        {batch_size,
-         intermediate_size_ / parallel_args_.tp_group_->world_size()},
-        gate_up.options());
+    torch::Tensor output;
 
     xllm::kernel::ActivationParams activation_params;
     activation_params.input = gate_up;
     activation_params.output = output;
     activation_params.act_mode = hidden_act_;
     activation_params.is_gated = is_gated_;
+    activation_params.intermediate_size = intermediate_size_;
+    activation_params.world_size = parallel_args_.tp_group_->world_size();
     xllm::kernel::active(activation_params);
 
     return down_proj_->forward(output);
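
One caveat in this hunk: if ActivationParams holds its tensors by value, the reassignment of params.output inside active() never reaches the caller's local output, so down_proj_->forward(output) would still see a default-constructed tensor. A caller-side sketch that reads the result back after the kernel call (an assumption about the intended usage, not something the diff itself shows):

    xllm::kernel::active(activation_params);
    // Read back the tensor that active() allocated into the params struct.
    output = activation_params.output;
    return down_proj_->forward(output);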