Skip to content

Commit 325181b

Browse files
Fixes per the review
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
1 parent 219e925 commit 325181b

3 files changed

Lines changed: 9 additions & 4 deletions

File tree

tests/cpp/operator/test_cast_mxfp8_grouped.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ void performTest(const ProcessingMethod processing_method,
371371

372372
NVTEShape logical_shape_ = nvte_make_shape(logical_shape_vec.data(), logical_shape_vec.size());
373373

374-
std::vector<size_t> dbias_logical_shape_vec= {num_tensors, cols};
374+
std::vector<size_t> dbias_logical_shape_vec = {num_tensors, cols};
375375
NVTEShape dbias_logical_shape_ = nvte_make_shape(dbias_logical_shape_vec.data(),
376376
dbias_logical_shape_vec.size());
377377

transformer_engine/common/cast/core/common.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,14 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
100100
const size_t tensor_id = blockIdx.y;
101101
const size_t tensor_rows = (shape_rep == ShapeRepresentation::SAME_BOTH_DIMS)
102102
? (first_logical_dim / num_tensors)
103-
: first_dims_ptr[tensor_id];
103+
: static_cast<size_t>(first_dims_ptr[tensor_id]);
104104

105105
const size_t rows = tensor_rows / chunk_dim_Y;
106106
const size_t cols = last_logical_dim;
107107

108108
const size_t dbias_in_offset_Y = (shape_rep == ShapeRepresentation::SAME_BOTH_DIMS)
109109
? (tensor_id * (tensor_rows / chunk_dim_Y))
110-
: (offsets_ptr[tensor_id] / cols / chunk_dim_Y);
110+
: (static_cast<size_t>(offsets_ptr[tensor_id]) / cols / chunk_dim_Y);
111111

112112
const size_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
113113

transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ __device__ __forceinline__ size_t get_tensor_cols_num(
142142
case ShapeRepresentation::VARYING_LAST_DIM:
143143
case ShapeRepresentation::VARYING_BOTH_DIMS:
144144
cols_num = static_cast<size_t>(last_dims_ptr[tensor_id]);
145+
if (cols_num % 128 != 0) {
146+
NVTE_DEVICE_ERROR("For non-single tensors, the last dimension of each tensor in a group "
147+
"must be divisible by 128.");
148+
}
145149
break;
146150
}
147151
return cols_num;
@@ -215,7 +219,8 @@ decode_block(const JobDescriptor &job, const bool is_single_tensor,
215219
const int64_t *const __restrict__ offsets_ptr) {
216220
BlockDescriptor block{};
217221
block.tensor_base = is_single_tensor ? 0 : static_cast<size_t>(offsets_ptr[job.tensor_id]);
218-
const size_t blocks_X_num_in_current_tensor = DIVUP(job.cols, static_cast<size_t>(128));
222+
const size_t CHUNK_DIM_X_ = CHUNK_DIM_X;
223+
const size_t blocks_X_num_in_current_tensor = DIVUP(job.cols, CHUNK_DIM_X_);
219224
block.block_id_in_current_tensor =
220225
is_single_tensor ? job.block_id : (job.block_id - block.tensor_base / ELTS_PER_CHUNK);
221226
block.block_id_Y = block.block_id_in_current_tensor / blocks_X_num_in_current_tensor;

0 commit comments

Comments (0)