[PyTorch][MOE] Tentative fix: replace from_blob with empty for experts receiving zero tokens (NVIDIA#2134)

zhongbozhu · ksivaman · web-flow · commit c449c6cfbd30 · 2025-08-28T15:13:16.000-07:00
Use at::empty instead of at::from_blob when the requested view has an empty shape (zero elements).

Signed-off-by: zhongboz <zhongboz@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
diff --git a/transformer_engine/pytorch/csrc/extensions/cast.cpp b/transformer_engine/pytorch/csrc/extensions/cast.cpp
@@ -205,11 +205,8 @@ std::tuple<std::vector<py::object>, std::vector<TensorWrapper>> bulk_allocate_fp
   auto make_torch_view = [](std::shared_ptr<at::Tensor> &buffer, const std::vector<size_t> &shape,
                             size_t offset, at::ScalarType dtype) -> at::Tensor {
     std::vector<int64_t> shape_int64(shape.begin(), shape.end());
-    // in the case where full buffer is empty because local rank receives no tokens for all the experts
-    // then the data_ptr is nullptr, we need to return an empty tensor instead of calling from_blob
-    // but in the case where some experts receive tokens, some not, we want to leverage from_blob
-    // as much as possible to avoid CPU overhead
-    if (buffer->data_ptr<uint8_t>() == nullptr) {
+    bool is_empty_shape = product(shape) == 0;
+    if (buffer->data_ptr<uint8_t>() == nullptr || is_empty_shape) {
       return at::empty(shape_int64, at::device(at::kCUDA).dtype(dtype));
     }
     return at::from_blob(
@@ -359,11 +356,8 @@ std::tuple<std::vector<py::object>, std::vector<TensorWrapper>> bulk_allocate_mx
   auto make_torch_view = [](std::shared_ptr<at::Tensor> &buffer, const std::vector<size_t> &shape,
                             size_t offset, at::ScalarType dtype) -> at::Tensor {
     std::vector<int64_t> shape_int64(shape.begin(), shape.end());
-    // in the case where full buffer is empty because local rank receives no tokens for all the experts
-    // then the data_ptr is nullptr, we need to return an empty tensor instead of calling from_blob
-    // but in the case where some experts receive tokens, some not, we want to leverage from_blob
-    // as much as possible to avoid CPU overhead
-    if (buffer->data_ptr<uint8_t>() == nullptr) {
+    bool is_empty_shape = product(shape) == 0;
+    if (buffer->data_ptr<uint8_t>() == nullptr || is_empty_shape) {
       return at::empty(shape_int64, at::device(at::kCUDA).dtype(dtype));
     }
     return at::from_blob(