[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 4881d1ba219c · 2026-02-11T17:17:52.000Z
for more information, see https://pre-commit.ci
diff --git a/tests/cpp/operator/test_cast_mxfp8_grouped.cu b/tests/cpp/operator/test_cast_mxfp8_grouped.cu
@@ -288,7 +288,7 @@ void performTest(const ProcessingMethod processing_method,
         rowwise_sfs_num += rowwise_sfs;
         colwise_sfs_num += colwise_sfs;
         sum_of_last_dims += K;
-        
+
         rowwise_scales_offset[t+1] = rowwise_sfs_num;
         colwise_scales_offset[t+1] = colwise_sfs_num;
         dbias_offsets[t+1] = sum_of_last_dims;
@@ -370,7 +370,7 @@ void performTest(const ProcessingMethod processing_method,
     cudaMemcpy(offsets_d, offsets_h.data(), offsets_size, cudaMemcpyHostToDevice);
 
     NVTEShape logical_shape_ = nvte_make_shape(logical_shape_vec.data(), logical_shape_vec.size());
-    
+
     std::vector<size_t> dbias_logical_shape_vec= {num_tensors, cols};
     NVTEShape dbias_logical_shape_ = nvte_make_shape(dbias_logical_shape_vec.data(),
                                                      dbias_logical_shape_vec.size());
diff --git a/transformer_engine/common/cast/core/common.cuh b/transformer_engine/common/cast/core/common.cuh
@@ -89,30 +89,25 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
 
 template <int nvec, typename OType>
 __global__ void __launch_bounds__(THREADS_PER_BLOCK)
-    group_reduce_dbias_kernel(const ShapeRepresentation shape_rep,
-                              const size_t num_tensors,
-                              const size_t first_logical_dim,
-                              const size_t last_logical_dim,
-                              const int64_t *const offsets_ptr,
-                              const int64_t *const first_dims_ptr,
-                              const int64_t *const last_dims_ptr,
-                              OType *const dbias_output,
-                              const float *dbias_partial,
-                              const size_t chunk_dim_Y) {
+    group_reduce_dbias_kernel(const ShapeRepresentation shape_rep, const size_t num_tensors,
+                              const size_t first_logical_dim, const size_t last_logical_dim,
+                              const int64_t *const offsets_ptr, const int64_t *const first_dims_ptr,
+                              const int64_t *const last_dims_ptr, OType *const dbias_output,
+                              const float *dbias_partial, const size_t chunk_dim_Y) {
   using ComputeVec = Vec<float, nvec>;
   using OutputVec = Vec<OType, nvec>;
 
   const size_t tensor_id = blockIdx.y;
   const size_t tensor_rows = (shape_rep == ShapeRepresentation::SAME_BOTH_DIMS)
-                             ? (first_logical_dim / num_tensors)
-                             : first_dims_ptr[tensor_id];
-  
+                                 ? (first_logical_dim / num_tensors)
+                                 : first_dims_ptr[tensor_id];
+
   const size_t rows = tensor_rows / chunk_dim_Y;
   const size_t cols = last_logical_dim;
 
   const size_t dbias_in_offset_Y = (shape_rep == ShapeRepresentation::SAME_BOTH_DIMS)
-                                   ? (tensor_id * (tensor_rows / chunk_dim_Y))
-                                   : (offsets_ptr[tensor_id] / cols / chunk_dim_Y);
+                                       ? (tensor_id * (tensor_rows / chunk_dim_Y))
+                                       : (offsets_ptr[tensor_id] / cols / chunk_dim_Y);
 
   const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -160,16 +155,12 @@ void reduce_dbias(const float *workspace_ptr, Tensor *dbias, const size_t rows,
 }
 
 template <typename IType>
-void grouped_reduce_dbias(const ShapeRepresentation shape_rep,
-                          const size_t num_tensors,
-                          const size_t first_logical_dim,
-                          const size_t last_logical_dim,
+void grouped_reduce_dbias(const ShapeRepresentation shape_rep, const size_t num_tensors,
+                          const size_t first_logical_dim, const size_t last_logical_dim,
                           const int64_t *const data_tensor_offsets_ptr,
                           const int64_t *const data_tensor_first_dims_ptr,
-                          const int64_t *const data_tensor_last_dims_ptr,
-                          GroupedTensor *dbias,
-                          const float *workspace_ptr,
-                          const size_t chunk_dim_Y,
+                          const int64_t *const data_tensor_last_dims_ptr, GroupedTensor *dbias,
+                          const float *workspace_ptr, const size_t chunk_dim_Y,
                           cudaStream_t stream) {
   using namespace kernel;
   constexpr size_t reduce_dbias_store_bytes = 8;  // stg.64
@@ -181,11 +172,10 @@ void grouped_reduce_dbias(const ShapeRepresentation shape_rep,
   const size_t blocks_Y = num_tensors;
   const dim3 grid(blocks_X, blocks_Y);
 
-  group_reduce_dbias_kernel<reduce_dbias_nvec, IType>
-      <<<grid, THREADS_PER_BLOCK, 0, stream>>>(
-         shape_rep, num_tensors, first_logical_dim, last_logical_dim,
-         data_tensor_offsets_ptr, data_tensor_first_dims_ptr, data_tensor_last_dims_ptr,
-         reinterpret_cast<IType *>(dbias->data.dptr), workspace_ptr, chunk_dim_Y);
+  group_reduce_dbias_kernel<reduce_dbias_nvec, IType><<<grid, THREADS_PER_BLOCK, 0, stream>>>(
+      shape_rep, num_tensors, first_logical_dim, last_logical_dim, data_tensor_offsets_ptr,
+      data_tensor_first_dims_ptr, data_tensor_last_dims_ptr,
+      reinterpret_cast<IType *>(dbias->data.dptr), workspace_ptr, chunk_dim_Y);
 
   NVTE_CHECK_CUDA(cudaGetLastError());
 }
diff --git a/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
@@ -144,7 +144,8 @@ __device__ __forceinline__ void modify_base_tensor_map(const CUtensorMap base_te
       NVTE_DEVICE_ERROR("Tensor data pointer must be 16B aligned");
     }
     if (global_dim_X % CHUNK_DIM_X != 0) {
-      NVTE_DEVICE_ERROR("The grouped tensor must be divisible by 128x128 tiles without a tail tile.");
+      NVTE_DEVICE_ERROR(
+          "The grouped tensor must be divisible by 128x128 tiles without a tail tile.");
     }
 
     asm volatile(
@@ -941,9 +942,8 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
 
           if constexpr (IS_DBIAS) {
             common::grouped_reduce_dbias<IType>(
-              shape_rep, num_tensors, first_logical_dim, last_logical_dim,
-              offsets_ptr, first_dims_ptr, last_dims_ptr,
-              dbias, workspace_ptr, CHUNK_DIM_Y, stream);
+                shape_rep, num_tensors, first_logical_dim, last_logical_dim, offsets_ptr,
+                first_dims_ptr, last_dims_ptr, dbias, workspace_ptr, CHUNK_DIM_Y, stream);
           }
 
           NVTE_CHECK_CUDA(cudaGetLastError()););  // NOLINT(*)
diff --git a/transformer_engine/common/include/transformer_engine/cast.h b/transformer_engine/common/include/transformer_engine/cast.h
@@ -207,7 +207,8 @@ void nvte_quantize_dbias_dgelu(const NVTETensor input, const NVTETensor act_inpu
  */
 void nvte_group_quantize_dbias_dgelu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor act_input, NVTEGroupedTensor output,
-                                     NVTEGroupedTensor dbias, NVTETensor workspace, cudaStream_t stream);
+                                     NVTEGroupedTensor dbias, NVTETensor workspace,
+                                     cudaStream_t stream);
 
 /*! \brief Computes backward of SiLU operation on the input, then casts to FP8/MXFP8.
  *         Additionally, reduces the result of the SiLU backward along columns.
@@ -253,7 +254,8 @@ void nvte_quantize_dbias_dsilu(const NVTETensor input, const NVTETensor act_inpu
  */
 void nvte_group_quantize_dbias_dsilu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor act_input, NVTEGroupedTensor output,
-                                     NVTEGroupedTensor dbias, NVTETensor workspace, cudaStream_t stream);
+                                     NVTEGroupedTensor dbias, NVTETensor workspace,
+                                     cudaStream_t stream);
 
 /*! \brief Computes backward of ReLU operation on the input, then casts to FP8/MXFP8.
  *         Additionally, reduces the result of the ReLU backward along columns.
@@ -299,7 +301,8 @@ void nvte_quantize_dbias_drelu(const NVTETensor input, const NVTETensor act_inpu
  */
 void nvte_group_quantize_dbias_drelu(const NVTEGroupedTensor input,
                                      const NVTEGroupedTensor act_input, NVTEGroupedTensor output,
-                                     NVTEGroupedTensor dbias, NVTETensor workspace, cudaStream_t stream);
+                                     NVTEGroupedTensor dbias, NVTETensor workspace,
+                                     cudaStream_t stream);
 
 /*! \brief Computes backward of Quick GeLU operation on the input, then casts to FP8/MXFP8.
  *         Additionally, reduces the result of the Quick GeLU backward along columns.
@@ -345,7 +348,8 @@ void nvte_quantize_dbias_dqgelu(const NVTETensor input, const NVTETensor act_inp
  */
 void nvte_group_quantize_dbias_dqgelu(const NVTEGroupedTensor input,
                                       const NVTEGroupedTensor act_input, NVTEGroupedTensor output,
-                                      NVTEGroupedTensor dbias, NVTETensor workspace, cudaStream_t stream);
+                                      NVTEGroupedTensor dbias, NVTETensor workspace,
+                                      cudaStream_t stream);
 
 /*! \brief Computes backward of Squared ReLU operation on the input, then casts to FP8/MXFP8.
  *         Additionally, reduces the result of the Squared ReLU backward along columns.
@@ -391,7 +395,8 @@ void nvte_quantize_dbias_dsrelu(const NVTETensor input, const NVTETensor act_inp
  */
 void nvte_group_quantize_dbias_dsrelu(const NVTEGroupedTensor input,
                                       const NVTEGroupedTensor act_input, NVTEGroupedTensor output,
-                                      NVTEGroupedTensor dbias, NVTETensor workspace, cudaStream_t stream);
+                                      NVTEGroupedTensor dbias, NVTETensor workspace,
+                                      cudaStream_t stream);
 
 /*! \brief Casts input tensor from reduced to higher precision.
  *         If the scaling mode of the input tensor is set to NVTE_MXFP8_1D_SCALING,

Original file line number	Diff line number	Diff line change
`@@ -144,7 +144,8 @@ __device__ __forceinline__ void modify_base_tensor_map(const CUtensorMap base_te`
`144`	`144`	`NVTE_DEVICE_ERROR("Tensor data pointer must be 16B aligned");`
`145`	`145`	`}`
`146`	`146`	`if (global_dim_X % CHUNK_DIM_X != 0) {`
`147`		`- NVTE_DEVICE_ERROR("The grouped tensor must be divisible by 128x128 tiles without a tail tile.");`
	`147`	`+ NVTE_DEVICE_ERROR(`
	`148`	`+ "The grouped tensor must be divisible by 128x128 tiles without a tail tile.");`
`148`	`149`	`}`
`149`	`150`
`150`	`151`	`asm volatile(`
`@@ -941,9 +942,8 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations`
`941`	`942`
`942`	`943`	`if constexpr (IS_DBIAS) {`
`943`	`944`	`common::grouped_reduce_dbias<IType>(`
`944`		`- shape_rep, num_tensors, first_logical_dim, last_logical_dim,`
`945`		`- offsets_ptr, first_dims_ptr, last_dims_ptr,`
`946`		`- dbias, workspace_ptr, CHUNK_DIM_Y, stream);`
	`945`	`+ shape_rep, num_tensors, first_logical_dim, last_logical_dim, offsets_ptr,`
	`946`	`+ first_dims_ptr, last_dims_ptr, dbias, workspace_ptr, CHUNK_DIM_Y, stream);`
`947`	`947`	`}`
`948`	`948`
`949`	`949`	`NVTE_CHECK_CUDA(cudaGetLastError());); // NOLINT(*)`