From d7d691defdd0f3bb725607889c36afc17644d47d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 20:28:15 +0000 Subject: [PATCH 1/2] Initial plan From e1458f4b94ccf0c2d9d3bcf7016d6580dbbafb9b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 20:32:24 +0000 Subject: [PATCH 2/2] Fix hint parameter: use (1, BLOCK_N) instead of (BLOCK_M, BLOCK_N) in all store calls Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/ccl/all_gather.py | 4 ++-- iris/ccl/all_reduce.py | 8 ++++---- iris/ccl/all_to_all.py | 4 ++-- iris/x/all_reduce.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/iris/ccl/all_gather.py b/iris/ccl/all_gather.py index 70118508..190c9607 100644 --- a/iris/ccl/all_gather.py +++ b/iris/ccl/all_gather.py @@ -137,7 +137,7 @@ def persistent_all_gather( target_rank, heap_bases, mask=combined_mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) @@ -275,7 +275,7 @@ def persistent_all_gather_partitioned( target_rank, heap_bases, mask=combined_mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) diff --git a/iris/ccl/all_reduce.py b/iris/ccl/all_reduce.py index 124d663e..8503907a 100644 --- a/iris/ccl/all_reduce.py +++ b/iris/ccl/all_reduce.py @@ -334,7 +334,7 @@ def persistent_all_reduce_spinlock( dest_rank, heap_bases, mask=mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) # Release lock for this tile at dest_rank @@ -540,7 +540,7 @@ def persistent_all_reduce_ring( next_rank, heap_bases, mask=mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) tl.debug_barrier() iris.atomic_xchg( @@ -670,7 +670,7 @@ def persistent_all_reduce_two_shot( remote_rank_idx = (start_rank_idx + i) % world_size remote_rank = rank_start + remote_rank_idx * rank_stride if remote_rank_idx != group_rank: - iris.store(out_ptr, reduced, iris_rank, remote_rank, heap_bases, hint=(BLOCK_SIZE_M, BLOCK_SIZE_N)) + iris.store(out_ptr, reduced, iris_rank, remote_rank, heap_bases, hint=(1, BLOCK_SIZE_N)) # Slow path: MASKED (only boundary tiles land here) # This path handles tiles at tensor boundaries where not all elements are valid. @@ -700,7 +700,7 @@ def persistent_all_reduce_two_shot( remote_rank, heap_bases, mask=mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) diff --git a/iris/ccl/all_to_all.py b/iris/ccl/all_to_all.py index 3cb090cf..9ff16a1b 100644 --- a/iris/ccl/all_to_all.py +++ b/iris/ccl/all_to_all.py @@ -144,7 +144,7 @@ def persistent_all_to_all( iris_rank, target_rank, heap_bases, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) # Slow path: MASKED (only boundary tiles land here) @@ -184,7 +184,7 @@ def persistent_all_to_all( target_rank, heap_bases, mask=mask, - hint=(BLOCK_SIZE_M, BLOCK_SIZE_N), + hint=(1, BLOCK_SIZE_N), ) diff --git a/iris/x/all_reduce.py b/iris/x/all_reduce.py index ad8afca9..901f5adb 100644 --- a/iris/x/all_reduce.py +++ b/iris/x/all_reduce.py @@ -313,7 +313,7 @@ def all_reduce_two_shot( dest_rank, # to_rank (destination rank) ctx.heap_bases, mask=mask, - hint=(tile.block_m, tile.block_n), + hint=(1, tile.block_n), )