diff --git a/iron/operators/axpy/design.py b/iron/operators/axpy/design.py index af58eb55..cde40a63 100644 --- a/iron/operators/axpy/design.py +++ b/iron/operators/axpy/design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -117,4 +116,4 @@ def core_body(of_in1, of_in2, of_out, axpy): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/binary_elementwise_design.py b/iron/operators/binary_elementwise_design.py index 5263fbee..9155ffb6 100644 --- a/iron/operators/binary_elementwise_design.py +++ b/iron/operators/binary_elementwise_design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -115,4 +114,4 @@ def core_body(of_in1, of_in2, of_out, eltwise_fn): rt.finish_task_group(tg) # Place program components and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/channeled_unary_design.py b/iron/operators/channeled_unary_design.py index 693d2baa..26df29aa 100644 --- a/iron/operators/channeled_unary_design.py +++ b/iron/operators/channeled_unary_design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -129,4 +128,4 @@ def core_fn(of_in, of_out, kernel_line): rt.finish_task_group(tg) # Place components and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/dequant/design.py b/iron/operators/dequant/design.py index 65a92ffa..e613e08f 100644 --- a/iron/operators/dequant/design.py +++ b/iron/operators/dequant/design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -150,4 +149,4 @@ def core_body(of_in1, of_out, dequant_kernel): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/gemm/design.py b/iron/operators/gemm/design.py index a8ed8ad3..3d42e270 100644 --- a/iron/operators/gemm/design.py +++ b/iron/operators/gemm/design.py @@ -18,7 +18,6 @@ WorkerRuntimeBarrier, str_to_dtype, ) -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1Col1, NPU1Col2, NPU1, NPU2, Tile from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D, TensorAccessPattern from aie.iron.controlflow import range_ @@ -381,7 +380,7 @@ def my_matmul( obj_types=[A_l1_ty] * (stop_row - start_row), names=[f"A_L2L1_{row}" for row in range(start_row, stop_row)], dims_to_stream=dims_to_stream, - placement=Tile( + tile=Tile( 2 * i if n_aie_cols == 8 else i, 1 ), # alternate columns in full 4x8 NPU2 case ) @@ -404,7 +403,7 @@ def my_matmul( obj_type=B_l1_ty, name=f"B_L2L1_{col}", dims_to_stream=dims_to_stream, - placement=Tile(col, 1), + tile=Tile(col, 1), ) ) @@ -430,7 +429,7 @@ def my_matmul( obj_types=[C_l1_ty] * n_aie_rows, names=[f"C_L1L2_{col}_{row}" for row in range(n_aie_rows)], depths=[fifo_depth_out] * n_aie_rows, - placement=Tile(col, 1), + tile=Tile(col, 1), ) ) for j in range(n_aie_rows): @@ -498,7 +497,7 @@ def core_fn( workerBarriers[row][col], acc_buffer, ], - placement=Tile(tile_col, tile_row), + tile=Tile(tile_col, tile_row), stack_size=0xD00, ) ) @@ -629,7 +628,7 @@ def set_rtps(*args): tap=C_tile, wait=True, task_group=tg, - placement=Tile(col, 0), + tile=Tile(col, 0), ) for tile_row in range(current_tb_n_rows): @@ -684,7 +683,7 @@ def set_rtps(*args): tap=C_tile, wait=True, task_group=tg, - placement=Tile(col, 0), + tile=Tile(col, 0), ) # This line does not change MLIR output at all - it's just for recording data movement C_taps.append(C_tile) @@ -718,7 +717,7 @@ def set_rtps(*args): A, tap=A_tiles[tile_offset], task_group=tg, - placement=Tile( + tile=Tile( 2 * col if n_aie_cols == 8 else col, 0 ), # alternate columns in full 4x8 NPU2 case ) @@ -749,7 +748,7 @@ def set_rtps(*args): B, tap=B_tiles[col], task_group=tg, - placement=Tile(col, 0), + tile=Tile(col, 0), ) # These lines do not change MLIR output at all - they are just for recording data movement @@ -773,7 +772,7 @@ def set_rtps(*args): my_program = Program(dev_ty, rt) # Place components (assign them resources on the device) and generate an MLIR module - module = my_program.resolve_program(SequentialPlacer()) + module = my_program.resolve_program() return module diff --git a/iron/operators/gemv/design.py b/iron/operators/gemv/design.py index 5031ed33..654dc7f6 100644 --- a/iron/operators/gemv/design.py +++ b/iron/operators/gemv/design.py @@ -9,7 +9,6 @@ from aie.helpers.dialects.scf import _for as range_ from aie.helpers.taplib import TensorAccessPattern from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer """ Matrix-vector design @@ -190,4 +189,4 @@ def core_body(A_L3L1_fifo, B_L3L1_fifo, C_L1L3_fifo, matvec): rt.finish_task_group(tg_ac) rt.finish_task_group(tg_b) - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/leaky_relu/design.py b/iron/operators/leaky_relu/design.py index 278bd046..e4540f35 100644 --- a/iron/operators/leaky_relu/design.py +++ b/iron/operators/leaky_relu/design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -122,4 +121,4 @@ def core_fn(of_in, of_out, leaky_relu_line): rt.finish_task_group(tg) # Place components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/mem_copy/design.py b/iron/operators/mem_copy/design.py index adf4c198..4f7faeb4 100644 --- a/iron/operators/mem_copy/design.py +++ b/iron/operators/mem_copy/design.py @@ -15,7 +15,6 @@ Runtime, Worker, ) -from aie.iron.placers import SequentialPlacer from aie.iron.device import Tile, NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -222,7 +221,8 @@ def core_fn(of_in, of_out, mem_copy_line): of_in.release(1) of_out.release(1) - # Create a worker to perform the task + # Create a worker to perform the task. + # Place at most ``num_channels`` workers per column. my_workers = [ Worker( core_fn, @@ -231,6 +231,7 @@ def core_fn(of_in, of_out, mem_copy_line): of_outs[i].prod(), mem_copy_fcn, ], + tile=Tile(i // num_channels, 2 + (i % num_channels)), ) for i in range(num_cores) ] @@ -404,4 +405,4 @@ def core_fn(of_in, of_out, mem_copy_line): objfifo_idx += partial_config.num_cores_with_full_tiles # Place components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer(num_channels)) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/mha/design.py b/iron/operators/mha/design.py index 2e8610bd..d9ee167c 100644 --- a/iron/operators/mha/design.py +++ b/iron/operators/mha/design.py @@ -19,7 +19,6 @@ Buffer, WorkerRuntimeBarrier, ) -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU2, Tile from aie.iron.controlflow import range_ from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence, TensorAccessPattern @@ -277,7 +276,7 @@ def fused_mha( names=[f"memQ{i}" for i in range(number_of_pipelines_join_distribute)], dims_to_stream=[q_dims] * number_of_pipelines_join_distribute, depths=[of_depth] * number_of_pipelines_join_distribute, - placement=Tile(col=6, row=1), + tile=Tile(col=6, row=1), ) # Split between N pipelines if number_of_pipelines > 6: inQ2 = ObjectFifo( @@ -290,7 +289,7 @@ def fused_mha( names=[f"memQ2{i}" for i in range(number_of_pipelines_join_distribute)], dims_to_stream=[q_dims] * number_of_pipelines_join_distribute, depths=[of_depth] * number_of_pipelines_join_distribute, - placement=Tile(col=7, row=1), + tile=Tile(col=7, row=1), ) # Split between N pipelines # VJUNG: The SequentialPlacer will place all of these on the same MemTile if Placement is specified. We would need a list of placement in case of one-many or many-one. @@ -308,7 +307,7 @@ def fused_mha( memK = inK.cons().forward( name="memK", dims_to_stream=k_dims, - placement=Tile(col=3, row=1), + tile=Tile(col=3, row=1), depth=of_depth, ) # Broadcast, give this handle to N pipelines @@ -324,7 +323,7 @@ def fused_mha( memV = inV.cons().forward( name="memV", dims_to_stream=v_dims, - placement=Tile(col=4, row=1), + tile=Tile(col=4, row=1), depth=of_depth, ) # Broadcast, give this handle to N pipelines @@ -342,7 +341,7 @@ def fused_mha( name=f"outA{i}", dims_to_stream=a_dims, depth=of_depth, - # placement=Tile(col=i, row=1)) + # tile=Tile(col=i, row=1)) ) ) # Local to 1 pipeline @@ -357,7 +356,7 @@ def fused_mha( name=f"outP{i}", dims_to_stream=q_dims, depth=of_depth, - # placement=Tile(col=i, row=1) + # tile=Tile(col=i, row=1) ) ) # Local to 1 pipeline @@ -381,7 +380,7 @@ def fused_mha( obj_types=[q_ty] * number_of_pipelines_join_distribute, names=[f"outO{i}" for i in range(number_of_pipelines_join_distribute)], depths=[of_depth] * number_of_pipelines_join_distribute, - placement=Tile(col=6, row=1), + tile=Tile(col=6, row=1), ) # Join onto the output OF if number_of_pipelines > 6: memO2 = ObjectFifo( @@ -394,7 +393,7 @@ def fused_mha( obj_types=[q_ty] * number_of_pipelines_join_distribute, names=[f"outO2{i}" for i in range(number_of_pipelines_join_distribute)], depths=[of_depth] * number_of_pipelines_join_distribute, - placement=Tile(col=7, row=1), + tile=Tile(col=7, row=1), ) def batched_matmul_qk( @@ -654,7 +653,7 @@ def batched_matmul_pv( idx_buffer_qk, ], stack_size=0xD00, - placement=Tile(col=i, row=2), + tile=Tile(col=i, row=2), while_true=False, ) ) @@ -683,7 +682,7 @@ def batched_matmul_pv( scale_buffer_softmax, ], stack_size=0xD00, - placement=Tile(col=i, row=3), + tile=Tile(col=i, row=3), while_true=False, ) ) @@ -708,7 +707,7 @@ def batched_matmul_pv( idx_buffer_pv, ], stack_size=0xD00, - placement=Tile(col=i, row=4), + tile=Tile(col=i, row=4), while_true=False, ) ) @@ -813,7 +812,7 @@ def set_mha_rtps(): tap=Q_tiles[ 2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2 ], - placement=Tile(col=4, row=0), + tile=Tile(col=4, row=0), task_group=tg, ) rt.fill( @@ -824,7 +823,7 @@ def set_mha_rtps(): + q_block_idx * 2 + 1 ], - placement=Tile(col=4, row=0), + tile=Tile(col=4, row=0), task_group=tg, ) else: @@ -832,7 +831,7 @@ def set_mha_rtps(): inQ.prod(), Q, tap=Q_tiles[head_idx * num_q_block_per_pipeline + q_block_idx], - placement=Tile(col=4, row=0), + tile=Tile(col=4, row=0), task_group=tg, ) @@ -841,14 +840,14 @@ def set_mha_rtps(): inK.prod(), K, tap=K_tiles[kv_head_idx], - placement=Tile(col=5, row=0), + tile=Tile(col=5, row=0), task_group=tg, ) rt.fill( inV.prod(), V, tap=V_tiles[kv_head_idx], - placement=Tile(col=6, row=0), + tile=Tile(col=6, row=0), task_group=tg, ) @@ -860,7 +859,7 @@ def set_mha_rtps(): 2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2 ], wait=True, - placement=Tile(col=7, row=0), + tile=Tile(col=7, row=0), task_group=tg, ) rt.drain( @@ -872,7 +871,7 @@ def set_mha_rtps(): + 1 ], wait=True, - placement=Tile(col=7, row=0), + tile=Tile(col=7, row=0), task_group=tg, ) else: @@ -881,7 +880,7 @@ def set_mha_rtps(): O, tap=O_tiles[head_idx * num_q_block_per_pipeline + q_block_idx], wait=True, - placement=Tile(col=7, row=0), + tile=Tile(col=7, row=0), task_group=tg, ) @@ -892,5 +891,5 @@ def set_mha_rtps(): my_program = Program(dev_ty, rt) # Place components (assign them resources on the device) and generate an MLIR module - module = my_program.resolve_program(SequentialPlacer()) + module = my_program.resolve_program() return module diff --git a/iron/operators/repeat/design.py b/iron/operators/repeat/design.py index 36efa653..4e6f2ac1 100644 --- a/iron/operators/repeat/design.py +++ b/iron/operators/repeat/design.py @@ -9,7 +9,6 @@ from aie.dialects.aiex import TensorAccessPattern from aie.iron import ObjectFifo, Program, Runtime -from aie.iron.placers import SequentialPlacer def repeat(dev, dtype, rows, cols, repeat, transfer_size=None): @@ -69,4 +68,4 @@ def repeat(dev, dtype, rows, cols, repeat, transfer_size=None): rt.drain(fifo_out.cons(), out, output_tap, task_group=tg, wait=True) rt.finish_task_group(tg) - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/rms_norm/design.py b/iron/operators/rms_norm/design.py index 2db7ec2c..af96cac9 100644 --- a/iron/operators/rms_norm/design.py +++ b/iron/operators/rms_norm/design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -123,4 +122,4 @@ def core_body(of_in1, of_out, rms_norm_kernel): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/rms_norm/design_weighted.py b/iron/operators/rms_norm/design_weighted.py index ae5647a0..c0d77fe3 100644 --- a/iron/operators/rms_norm/design_weighted.py +++ b/iron/operators/rms_norm/design_weighted.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -178,4 +177,4 @@ def core_body_mul(of_in1, of_in2, of_out2, eltwise_mul): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/rope/design.py b/iron/operators/rope/design.py index 47b8a602..79de08e2 100644 --- a/iron/operators/rope/design.py +++ b/iron/operators/rope/design.py @@ -18,7 +18,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.helpers.dialects.scf import _for as range_ @@ -161,4 +160,4 @@ def core_body(of_in, of_lut, of_out, rope_kernel): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/softmax/design.py b/iron/operators/softmax/design.py index 5cb68c39..b9b64be3 100644 --- a/iron/operators/softmax/design.py +++ b/iron/operators/softmax/design.py @@ -13,7 +13,6 @@ Buffer, WorkerRuntimeBarrier, ) -from aie.iron.placers import SequentialPlacer from aie.iron.device import NPU1, NPU2 from aie.helpers.taplib.tap import TensorAccessPattern from aie.helpers.dialects.scf import _for as range_ @@ -176,4 +175,4 @@ def set_rtps(*args): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/strided_copy/design.py b/iron/operators/strided_copy/design.py index ea5b1d46..6f92f4ab 100644 --- a/iron/operators/strided_copy/design.py +++ b/iron/operators/strided_copy/design.py @@ -12,7 +12,6 @@ from aie.dialects.aiex import TensorAccessPattern from aie.iron import ObjectFifo, Program, Runtime -from aie.iron.placers import SequentialPlacer def strided_copy( @@ -133,4 +132,4 @@ def strided_copy( rt.drain(fifos_out[c].cons(), out, output_taps[c], task_group=tg, wait=True) rt.finish_task_group(tg) - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/iron/operators/transpose/design.py b/iron/operators/transpose/design.py index bec4382b..24c89414 100644 --- a/iron/operators/transpose/design.py +++ b/iron/operators/transpose/design.py @@ -5,7 +5,6 @@ import numpy as np from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker -from aie.iron.placers import SequentialPlacer from aie.helpers.taplib.tap import TensorAccessPattern from aie.iron.controlflow import range_ @@ -159,4 +158,4 @@ def core_body(of_in1, of_out, transpose_kernel): rt.finish_task_group(tg) # Place program components (assign them resources on the device) and generate an MLIR module - return Program(dev, rt).resolve_program(SequentialPlacer()) + return Program(dev, rt).resolve_program() diff --git a/requirements.txt b/requirements.txt index 93e621ab..40f4d69d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ --extra-index-url https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly --extra-index-url https://pypi.org/simple -mlir_aie==0.0.1.2026033104+e4f35d6 +mlir_aie==0.0.1.2026051105+7fd5c8e llvm-aie==21.0.0.2026050601+2c363ce black