Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 62 additions & 16 deletions csrc/evaluator_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ void PrecomputedValues::bindParallelExtents(
auto raw_val = launch_constraint.getRawVal(it.first);
if (raw_val > 0) {
for (auto extent : it.second) {
bindValue(extent->evaluatorIndex(), raw_val);
bindValue(extent->evaluatorIndex(), raw_val, extent);
}
}
}
Expand All @@ -198,13 +198,17 @@ void PrecomputedValues::bindConcreteParallelTypeValue(
auto index_list_it = thread_dim_value_indices_.find(pt);
if (index_list_it != thread_dim_value_indices_.end()) {
for (auto index : *(index_list_it->second)) {
bindValue(index, value);
const Val* ir_node = (index >= 0 && index < (int)symbols_.size())
? symbols_[index]
: nullptr;
bindValue(index, value, ir_node);
}
}
}

void PrecomputedValues::bindInputs(const KernelArgumentHolder& args) {
FUSER_PERF_SCOPE("PrecomputedValues::bindInputs");
debug() << "[DEBUG] PrecomputedValues::bindInputs called" << std::endl;
if (hasValidValues()) {
invalidate();
}
Expand All @@ -219,6 +223,9 @@ void PrecomputedValues::bindValues(
std::ssize(inputs),
"kernel inputs size does not match args");

debug() << "[DEBUG] PrecomputedValues::bindValues called with "
<< inputs.size() << " inputs" << std::endl;

for (const auto i : arange((int64_t)inputs.size())) {
const auto input = inputs[i];
NVF_ERROR(input != nullptr);
Expand All @@ -228,7 +235,7 @@ void PrecomputedValues::bindValues(
bindTensorMetaData(tv, tensor);
}
} else {
bindValue(input->evaluatorIndex(), args[i]);
bindValue(input->evaluatorIndex(), args[i], input);
}
}
}
Expand Down Expand Up @@ -360,15 +367,34 @@ void PrecomputedValues::initializeNamedScalars() {
void PrecomputedValues::validate() {
FUSER_PERF_SCOPE("PrecomputedValuess::Validate");
using namespace PolymorphicValue_functions;
for (const auto& it : binding_log_) {
NVF_ERROR(
isSame(values_[it.first], it.second),
"Precomputed values failed to validate.",
"\nSomething unexpected changed between the compilation and "
"execution.\n",
values_[it.first],
" != ",
it.second);
for (const auto& [index, expected_value, ir_node] : binding_log_) {
if (!isSame(values_[index], expected_value)) {
std::stringstream error_msg;
error_msg << "Precomputed values failed to validate.\n"
<< "Something unexpected changed between the compilation and "
"execution.\n";
if (ir_node != nullptr) {
error_msg << "IR node: " << ir_node->toString() << "\n";
}
error_msg << "Computed value: " << toString(values_[index]) << "\n"
<< "Expected value: " << toString(expected_value);

// Debug: Show binding history for this index
debug() << "[DEBUG] ===== VALIDATION FAILED =====" << std::endl;
debug() << "[DEBUG] Binding history for index " << index << ":" << std::endl;
for (const auto& [idx, val, node] : binding_log_) {
if (idx == index) {
debug() << "[DEBUG] Bound to: " << toString(val);
if (node != nullptr) {
debug() << " (node: " << node->toString() << ")";
}
debug() << std::endl;
}
}
debug() << "[DEBUG] ================================" << std::endl;

NVF_ERROR(false, error_msg.str());
}
}
has_valid_values_ = true;
}
Expand All @@ -383,6 +409,21 @@ void PrecomputedValues::bindTensorMetaData(
"Something went wrong configuring launch. Inputs do not match.");

std::vector<int64_t> logical_sizes = unshardedSizes(tv, tensor.sizes());

debug() << "[DEBUG] bindTensorMetaData for TV: " << tv->toString() << std::endl;
debug() << "[DEBUG] Actual tensor.sizes(): [";
for (size_t i = 0; i < tensor.sizes().size(); ++i) {
if (i > 0) debug() << ", ";
debug() << tensor.sizes()[i];
}
debug() << "]" << std::endl;
debug() << "[DEBUG] Unsharded logical_sizes: [";
for (size_t i = 0; i < logical_sizes.size(); ++i) {
if (i > 0) debug() << ", ";
debug() << logical_sizes[i];
}
debug() << "]" << std::endl;

adjustEvaluatorSizes(tv, logical_sizes);

for (const auto dim : arange(static_cast<int64_t>(logical_domain.size()))) {
Expand All @@ -391,12 +432,17 @@ void PrecomputedValues::bindTensorMetaData(
if (id->isBroadcast()) {
// DIDs are ignored for broadcast. See MultideviceShardingTest.Broadcast
// and .ExpandedBroadcast.
bindValue(id->extent()->evaluatorIndex(), 1L);
bindValue(id->extent()->evaluatorIndex(), 1L, id->extent());
if (id->hasExpandedExtent()) {
bindValue(id->expandedExtent()->evaluatorIndex(), dim_size);
bindValue(
id->expandedExtent()->evaluatorIndex(),
dim_size,
id->expandedExtent());
}
} else {
bindValue(id->extent()->evaluatorIndex(), dim_size);
debug() << "[DEBUG] Binding " << id->extent()->toString()
<< " = " << dim_size << std::endl;
bindValue(id->extent()->evaluatorIndex(), dim_size, id->extent());
}
}

Expand Down Expand Up @@ -424,7 +470,7 @@ void PrecomputedValues::bindTensorMetaData(
tv->toString(),
" with input tensor ",
tensor);
bindValue(metadata_val->evaluatorIndex(), metadata);
bindValue(metadata_val->evaluatorIndex(), metadata, metadata_val);
}

NaiveValueMachine::NaiveValueMachine(PrecomputedValues& precomputed_values)
Expand Down
25 changes: 20 additions & 5 deletions csrc/evaluator_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,17 +211,32 @@ class PrecomputedValues {

//! Bind concrete value to the given index
//! if the index is valid.
void bindValue_(int index, const PolymorphicValue& value) {
void bindValue_(
int index,
const PolymorphicValue& value,
const Val* ir_node = nullptr) {
if (index < 0 || is_constant_[index]) {
return;
}

// Debug: show if we're rebinding a value
if (defined_[index]) {
debug() << "[DEBUG] REBINDING index " << index;
if (ir_node != nullptr) {
debug() << " (node: " << ir_node->toString() << ")";
}
debug() << " from " << PolymorphicValue_functions::toString(values_[index])
<< " to " << PolymorphicValue_functions::toString(value) << std::endl;
}

defined_[index] = true;
values_[index] = value;
binding_log_.emplace_back(index, value);
binding_log_.emplace_back(index, value, ir_node);
validate();
}
template <typename T>
void bindValue(int index, const T& value) {
bindValue_(index, PolymorphicValue(value));
void bindValue(int index, const T& value, const Val* ir_node = nullptr) {
bindValue_(index, PolymorphicValue(value), ir_node);
}

//! Invalidate all computed values in the workspace.
Expand Down Expand Up @@ -292,7 +307,7 @@ class PrecomputedValues {
//! An internal log to keep track of all the bindings
//! used in each evaluation cycle. To be used for
//! consistency check.
std::vector<std::pair<int, PolymorphicValue>> binding_log_;
std::vector<std::tuple<int, PolymorphicValue, const Val*>> binding_log_;

//! Integer runtime for realizing the values computations.
std::unique_ptr<NaiveValueMachine> value_machine_;
Expand Down
6 changes: 6 additions & 0 deletions csrc/multidevice/execution_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ std::vector<int64_t> unshardedSizes(
sharded_id) != tv->getLogicalDomain().end()) {
return 1;
}
if (std::find(
tv->getMaybeAllocationDomain().begin(),
tv->getMaybeAllocationDomain().end(),
sharded_id) != tv->getMaybeAllocationDomain().end()) {
return 1;
}

NVF_ERROR(
sharded_id->extent()->isConstInt(),
Expand Down
6 changes: 6 additions & 0 deletions csrc/runtime/fusion_executor_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ KernelArgumentHolder FusionExecutorCache::runFusionWithInputs(
std::optional<int8_t> selected_device) {
FUSER_PERF_SCOPE("FusionExecutorCache::runFusionWithInputs");

// Print fusion IR every run
debug() << "Fusion IR in FusionExecutorCache::runFusionWithInputs:" << std::endl;
fusion_->print();

if (isProfilerEnabled()) {
FusionProfiler::start(!isProfilerEnabledWithCupti());
}
Expand All @@ -63,6 +67,7 @@ KernelArgumentHolder FusionExecutorCache::runFusionWithInputs(
}

if (!kernel_runtime->isCompiled()) {
debug() << "[DEBUG] ===== COMPILING KERNEL =====" << std::endl;
kernel_runtime->compileFusionParallel(args);
}

Expand All @@ -80,6 +85,7 @@ KernelArgumentHolder FusionExecutorCache::runFusionWithInputs(
" failed.");
}

debug() << "[DEBUG] ===== EXECUTING KERNEL =====" << std::endl;
auto outputs = kernel_runtime->runWithInputs(args);

// Kernel time measurement is off by default
Expand Down
3 changes: 3 additions & 0 deletions csrc/runtime/fusion_kernel_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,11 +578,14 @@ std::optional<std::unique_ptr<HeuristicParamsList>> FusionKernelRuntime::
{
FUSER_PERF_SCOPE(
"FusionKernelRuntime::getMaybeHeuristicsFor::PrecomputedValues");
debug() << "[DEBUG] compileFusionParallel: Creating PrecomputedValues and binding inputs" << std::endl;
evaluator_precomputed_values =
std::make_unique<PrecomputedValues>(fusion_to_run);
debug() << "[DEBUG] compileFusionParallel: Calling bindInputs (group_runtime_inputs)" << std::endl;
evaluator_precomputed_values->bindInputs(group_runtime_inputs);
// TODO Remove binding the original fusion inputs when creating
// heuristics for fusion segment.
debug() << "[DEBUG] compileFusionParallel: Calling bindValues (complete fusion inputs)" << std::endl;
evaluator_precomputed_values->bindValues(
group_to_run->getCompleteFusionInputs(), args);
evaluator_precomputed_values->evaluate();
Expand Down
94 changes: 94 additions & 0 deletions tests/cpp/test_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,98 @@ TEST_F(StreamTest, ReplicatedAllocation) {
}
}

// Stream-parallelized single matmul: the output's column dimension is
// outer-split by a factor of `c` and the resulting outer axis is
// parallelized with ParallelType::Stream, so the `c` column chunks are
// presumably dispatched on separate CUDA streams — confirm against the
// host IR dump reproduced below.
TEST_F(StreamTest, Matmul) {
// Number of stream-parallel chunks; chosen so w's column count (c * 2)
// divides evenly under outer_split.
constexpr int64_t c = 3;

auto fusion = std::make_unique<Fusion>();
{
FusionGuard fg(fusion.get());
// out = matmul(in, w) with two symbolic 2-D inputs.
TensorView* in = makeSymbolicTensor(2);
TensorView* w = makeSymbolicTensor(2);
TensorView* out = matmul(in, w);
fusion->addInput(in);
fusion->addInput(w);
fusion->addOutput(out);

// Split the output's column axis (axis 1) into [c, cols/c] and mark the
// outer factor as Stream-parallel.
out->outer_split(1, c);
out->axis(1)->parallelize(ParallelType::Stream);
}

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
at::Tensor in_tensor = at::randn({5, 7}, options);
// c * 2 columns so the split by c is exact (no remainder chunk).
at::Tensor w_tensor = at::randn({7, c * 2}, options);

// With NVFUSER_DUMP=host_ir, you'll see the host IR container like the
// following:
// clang-format off
// %HostIrContainer { (T0_g_float[iS0{i0}, iS1{i2}], T1_g_float[istreamIdx7{3}, iS11{i2}, iS8{( ceilDiv(i4, 3) )}]) -> (T2_g_float[istreamIdx9{3}, iS4{i0}, iS10{( ceilDiv(i4, 3) )}, rS6{i2}]) :
//   FOR i18 from 0 to 3:
//     T2_g_float[istreamIdx9{3}, iS4{i0}, iS10{( ceilDiv(i4, 3) )}, rS6{i2}]
//        = matmul(T0_g_float[iS0{i0}, iS1{i2}],
//                 T1_g_float[istreamIdx7{3}, iS11{i2}, iS8{( ceilDiv(i4, 3) )}])
// } // %HostIrContainer
// clang-format on
FusionExecutorCache executor_cache(std::move(fusion));
// Run end-to-end through the executor cache; result is the first output.
auto out_tensor = executor_cache.runFusionWithInputs({in_tensor, w_tensor})[0]
.as<at::Tensor>();

// Validate the streamed result against the unscheduled fusion's
// reference evaluation.
testValidate(
executor_cache.fusion(),
{out_tensor},
{in_tensor, w_tensor},
__LINE__,
__FILE__);
}

// Two chained matmuls with the *input*'s row dimension stream-parallelized.
// Unlike the single-matmul test, the Stream axis is placed on `in` (axis 0),
// so the split propagates through both matmuls; the host IR dump below shows
// ShardByStream ops slicing producer tensors per stream iteration.
TEST_F(StreamTest, TwoMatmuls) {
// Number of stream-parallel chunks; in's row count (c * 2) divides evenly.
constexpr int64_t c = 3;

auto fusion = std::make_unique<Fusion>();
{
FusionGuard fg(fusion.get());
// out = (in x w1) x w2, all symbolic 2-D tensors.
TensorView* in = makeSymbolicTensor(2);
TensorView* w1 = makeSymbolicTensor(2);
TensorView* w2 = makeSymbolicTensor(2);
TensorView* out = matmul(in, w1);
out = matmul(out, w2);
fusion->addInput(in);
fusion->addInput(w1);
fusion->addInput(w2);
fusion->addOutput(out);

// Split in's row axis (axis 0) into [c, rows/c]; the outer factor is
// Stream-parallel and is expected to propagate to both matmul outputs.
in->outer_split(0, c);
in->axis(0)->parallelize(ParallelType::Stream);
}

{
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
// Shapes chosen so in x w1 -> (c*2, 5) and (in x w1) x w2 -> (c*2, 3).
at::Tensor in = at::randn({c * 2, 3}, options);
at::Tensor w1 = at::randn({3, 5}, options);
at::Tensor w2 = at::randn({5, 3}, options);

// With NVFUSER_DUMP=host_ir, you'll see the host IR container like the
// following:
// clang-format off
// %HostIrContainer { (T0_g_float[istreamIdx12{3}, iS13{( ceilDiv(i0, 3) )}, iS1{i2}], T1_g_float[iS14{i2}, iS3{i4}], T2_g_float[iS15{i4}, iS5{i6}]) -> (T4_g_float[istreamIdx18{3}, iS19{( ceilDiv(i0, 3) )}, iS10{i6}, rS11{i4}]) :
//   T4_g_float[istreamIdx18{3}, iS19{( ceilDiv(i0, 3) )}, iS10{i6}, rS11{i4}] = ALLOCATE(buffer=T4_g_float[istreamIdx18{3}, iS19{( ceilDiv(i0, 3) )}, iS10{i6}, rS11{i4}], mem_type=global, size=( i0 * i6 ), zero_init=false, resets_to_zero=false)
//   FOR i99 from 0 to 3:
//     T5_l_float[istreamIdx22{3}, iS23{( ceilDiv(i0, 3) )}, iS21{i2}] = ShardByStream(T0_g_float[istreamIdx12{3}, iS13{( ceilDiv(i0, 3) )}, iS1{i2}], stream_index = i99)
//     T3_g_float[istreamIdx16{3}, iS17{( ceilDiv(i0, 3) )}, iS7{i4}, rS8{i2}]
//        = matmul(T5_l_float[istreamIdx22{3}, iS23{( ceilDiv(i0, 3) )}, iS21{i2}],
//                 T1_g_float[iS14{i2}, iS3{i4}])
//     T6_l_float[istreamIdx26{3}, iS27{( ceilDiv(i0, 3) )}, iS25{i6}] = ShardByStream(T4_g_float[istreamIdx18{3}, iS19{( ceilDiv(i0, 3) )}, iS10{i6}, rS11{i4}], stream_index = i99)
//     T6_l_float[istreamIdx26{3}, iS27{( ceilDiv(i0, 3) )}, iS25{i6}]
//        = matmul(T3_g_float[istreamIdx16{3}, iS17{( ceilDiv(i0, 3) )}, iS7{i4}, rS8{i2}],
//                 T2_g_float[iS15{i4}, iS5{i6}])
// } // %HostIrContainer
// clang-format on
FusionExecutorCache executor_cache(std::move(fusion));
// Run end-to-end; result is the first (and only) fusion output.
auto out =
executor_cache.runFusionWithInputs({in, w1, w2})[0].as<at::Tensor>();

// Validate the streamed result against the unscheduled fusion's
// reference evaluation.
testValidate(
executor_cache.fusion(), {out}, {in, w1, w2}, __LINE__, __FILE__);
}
}

} // namespace nvfuser
Loading