From 2aa595c89bb0801b9bfb2e8e18a7b0c218274941 Mon Sep 17 00:00:00 2001
From: "Gao, Xiang"
Date: Wed, 17 Dec 2025 16:50:02 -0800
Subject: [PATCH] Unconditionally return 1 for ParallelType::Stream for unshardedSizes

---
 csrc/multidevice/execution_utils.cpp | 24 +----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/csrc/multidevice/execution_utils.cpp b/csrc/multidevice/execution_utils.cpp
index a7a7da703e2..83ddb761221 100644
--- a/csrc/multidevice/execution_utils.cpp
+++ b/csrc/multidevice/execution_utils.cpp
@@ -70,29 +70,7 @@ std::vector<int64_t> unshardedSizes(
 
     auto multiplier = [&]() -> int64_t {
       if (parallel_type == ParallelType::Stream) {
-        // TODO(#5525): hack for MultiDeviceExecutor. MultiDeviceExecutor looks
-        // for ParallelType::Stream only in logical domains and assumes a
-        // stream-parallelized dimension is always fully allocated. So we set
-        // the multiplier to 1 when `sharded_id` is a logical IterDomain. This
-        // will have to change when FusionExecutorCache requires a logical
-        // dimension to be stream-parallelized, both loop and allocation. Refer
-        // to
-        // https://github.com/NVIDIA/Fuser/blob/f8e84e52296cdecd318dd2ce904139616d7bd434/tests/cpp/test_overlap.cpp#L155
-        // for an example. An alternative to consider is to create a new
-        // ParallelType for stream parallelization and use it in
-        // FusionExecutorCache.
-        if (std::find(
-                tv->getLogicalDomain().begin(),
-                tv->getLogicalDomain().end(),
-                sharded_id) != tv->getLogicalDomain().end()) {
-          return 1;
-        }
-
-        NVF_ERROR(
-            sharded_id->extent()->isConstInt(),
-            "DIDs/Stream extent is expected to be constant: ",
-            sharded_id);
-        return sharded_id->extent()->evaluate().as<int64_t>();
+        return 1;
       }
 
       if (isParallelTypeDeviceDim(parallel_type)) {