Fixes per the review

Oleg-Goncharov · Oleg-Goncharov · commit 957ef986e3f2 · 2026-02-27T14:03:46.000Z
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
diff --git a/tests/cpp/operator/test_cast_mxfp8_grouped.cu b/tests/cpp/operator/test_cast_mxfp8_grouped.cu
@@ -346,16 +346,16 @@ void performTest(const ProcessingMethod processing_method,
     const size_t last_dims_size = num_tensors * sizeof(size_t);
     const size_t offsets_size = (num_tensors + 1) * sizeof(size_t);
 
-    InputType* grad_data_d;
-    InputType* in_data_d;
-    InputType* dbias_out_data_d;
-    OutputType* out_data_rowwise_d;
-    OutputType* out_data_colwise_d;
-    fp8e8m0* out_scales_rowwise_d;
-    fp8e8m0* out_scales_colwise_d;
-    size_t* first_dims_d;
-    size_t* last_dims_d;
-    size_t* offsets_d;
+    InputType* grad_data_d = nullptr;
+    InputType* in_data_d = nullptr;
+    InputType* dbias_out_data_d = nullptr;
+    OutputType* out_data_rowwise_d = nullptr;
+    OutputType* out_data_colwise_d = nullptr;
+    fp8e8m0* out_scales_rowwise_d = nullptr;
+    fp8e8m0* out_scales_colwise_d = nullptr;
+    size_t* first_dims_d = nullptr;
+    size_t* last_dims_d = nullptr;
+    size_t* offsets_d = nullptr;
 
     cudaMalloc((void**)&grad_data_d, in_data_size);
     cudaMalloc((void**)&in_data_d, in_data_size);
diff --git a/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
@@ -808,8 +808,6 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
     NVTE_CHECK(num_tensors <= MAX_SUPPORTED_TENSOR_DESCRIPTORS,
                "Number of tensors in a group is larger than "
                "the MAX number of supported descriptors (64).");
-    // Only full tiles supported
-    NVTE_CHECK(elts_total % ELTS_PER_CHUNK == 0, "Only full-tile grouped tensors supported.");
     blocks = DIVUP(elts_total, CHUNK_DIM_Y * CHUNK_DIM_X);
   }
   const size_t block_size = THREADS_PER_CHUNK;

Original file line number	Diff line number	Diff line change
`@@ -808,8 +808,6 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations`
`808`	`808`	`NVTE_CHECK(num_tensors <= MAX_SUPPORTED_TENSOR_DESCRIPTORS,`
`809`	`809`	`"Number of tensors in a group is larger than "`
`810`	`810`	`"the MAX number of supported descriptors (64).");`
`811`		`- // Only full tiles supported`
`812`		`- NVTE_CHECK(elts_total % ELTS_PER_CHUNK == 0, "Only full-tile grouped tensors supported.");`
`813`	`811`	`blocks = DIVUP(elts_total, CHUNK_DIM_Y * CHUNK_DIM_X);`
`814`	`812`	`}`
`815`	`813`	`const size_t block_size = THREADS_PER_CHUNK;`