Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions iron/operators/axpy/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import numpy as np

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_

Expand Down Expand Up @@ -117,4 +116,4 @@ def core_body(of_in1, of_in2, of_out, axpy):
rt.finish_task_group(tg)

# Place program components (assign them resources on the device) and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
3 changes: 1 addition & 2 deletions iron/operators/binary_elementwise_design.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import numpy as np

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_

Expand Down Expand Up @@ -115,4 +114,4 @@ def core_body(of_in1, of_in2, of_out, eltwise_fn):
rt.finish_task_group(tg)

# Place program components and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
3 changes: 1 addition & 2 deletions iron/operators/channeled_unary_design.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import numpy as np

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_

Expand Down Expand Up @@ -129,4 +128,4 @@ def core_fn(of_in, of_out, kernel_line):
rt.finish_task_group(tg)

# Place components and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
3 changes: 1 addition & 2 deletions iron/operators/dequant/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import numpy as np

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_

Expand Down Expand Up @@ -150,4 +149,4 @@ def core_body(of_in1, of_out, dequant_kernel):
rt.finish_task_group(tg)

# Place program components (assign them resources on the device) and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
19 changes: 9 additions & 10 deletions iron/operators/gemm/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
WorkerRuntimeBarrier,
str_to_dtype,
)
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU1Col2, NPU1, NPU2, Tile
from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D, TensorAccessPattern
from aie.iron.controlflow import range_
Expand Down Expand Up @@ -381,7 +380,7 @@ def my_matmul(
obj_types=[A_l1_ty] * (stop_row - start_row),
names=[f"A_L2L1_{row}" for row in range(start_row, stop_row)],
dims_to_stream=dims_to_stream,
placement=Tile(
tile=Tile(
2 * i if n_aie_cols == 8 else i, 1
), # alternate columns in full 4x8 NPU2 case
)
Expand All @@ -404,7 +403,7 @@ def my_matmul(
obj_type=B_l1_ty,
name=f"B_L2L1_{col}",
dims_to_stream=dims_to_stream,
placement=Tile(col, 1),
tile=Tile(col, 1),
)
)

Expand All @@ -430,7 +429,7 @@ def my_matmul(
obj_types=[C_l1_ty] * n_aie_rows,
names=[f"C_L1L2_{col}_{row}" for row in range(n_aie_rows)],
depths=[fifo_depth_out] * n_aie_rows,
placement=Tile(col, 1),
tile=Tile(col, 1),
)
)
for j in range(n_aie_rows):
Expand Down Expand Up @@ -498,7 +497,7 @@ def core_fn(
workerBarriers[row][col],
acc_buffer,
],
placement=Tile(tile_col, tile_row),
tile=Tile(tile_col, tile_row),
stack_size=0xD00,
)
)
Expand Down Expand Up @@ -629,7 +628,7 @@ def set_rtps(*args):
tap=C_tile,
wait=True,
task_group=tg,
placement=Tile(col, 0),
tile=Tile(col, 0),
)

for tile_row in range(current_tb_n_rows):
Expand Down Expand Up @@ -684,7 +683,7 @@ def set_rtps(*args):
tap=C_tile,
wait=True,
task_group=tg,
placement=Tile(col, 0),
tile=Tile(col, 0),
)
# This line does not change MLIR output at all - it's just for recording data movement
C_taps.append(C_tile)
Expand Down Expand Up @@ -718,7 +717,7 @@ def set_rtps(*args):
A,
tap=A_tiles[tile_offset],
task_group=tg,
placement=Tile(
tile=Tile(
2 * col if n_aie_cols == 8 else col, 0
), # alternate columns in full 4x8 NPU2 case
)
Expand Down Expand Up @@ -749,7 +748,7 @@ def set_rtps(*args):
B,
tap=B_tiles[col],
task_group=tg,
placement=Tile(col, 0),
tile=Tile(col, 0),
)

# These lines do not change MLIR output at all - they are just for recording data movement
Expand All @@ -773,7 +772,7 @@ def set_rtps(*args):
my_program = Program(dev_ty, rt)

# Place components (assign them resources on the device) and generate an MLIR module
module = my_program.resolve_program(SequentialPlacer())
module = my_program.resolve_program()
return module


Expand Down
3 changes: 1 addition & 2 deletions iron/operators/gemv/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from aie.helpers.dialects.scf import _for as range_
from aie.helpers.taplib import TensorAccessPattern
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer

"""
Matrix-vector design
Expand Down Expand Up @@ -190,4 +189,4 @@ def core_body(A_L3L1_fifo, B_L3L1_fifo, C_L1L3_fifo, matvec):
rt.finish_task_group(tg_ac)
rt.finish_task_group(tg_b)

return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
3 changes: 1 addition & 2 deletions iron/operators/leaky_relu/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import numpy as np

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_

Expand Down Expand Up @@ -122,4 +121,4 @@ def core_fn(of_in, of_out, leaky_relu_line):
rt.finish_task_group(tg)

# Place components (assign them resources on the device) and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer())
return Program(dev, rt).resolve_program()
7 changes: 4 additions & 3 deletions iron/operators/mem_copy/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
Runtime,
Worker,
)
from aie.iron.placers import SequentialPlacer
from aie.iron.device import Tile, NPU1, NPU2
from aie.helpers.taplib.tap import TensorAccessPattern
from aie.iron.controlflow import range_
Expand Down Expand Up @@ -222,7 +221,8 @@ def core_fn(of_in, of_out, mem_copy_line):
of_in.release(1)
of_out.release(1)

# Create a worker to perform the task
# Create a worker to perform the task.
# Place at most ``num_channels`` workers per column.
my_workers = [
Worker(
core_fn,
Expand All @@ -231,6 +231,7 @@ def core_fn(of_in, of_out, mem_copy_line):
of_outs[i].prod(),
mem_copy_fcn,
],
tile=Tile(i // num_channels, 2 + (i % num_channels)),
)
for i in range(num_cores)
]
Expand Down Expand Up @@ -404,4 +405,4 @@ def core_fn(of_in, of_out, mem_copy_line):
objfifo_idx += partial_config.num_cores_with_full_tiles

# Place components (assign them resources on the device) and generate an MLIR module
return Program(dev, rt).resolve_program(SequentialPlacer(num_channels))
return Program(dev, rt).resolve_program()
41 changes: 20 additions & 21 deletions iron/operators/mha/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
Buffer,
WorkerRuntimeBarrier,
)
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU2, Tile
from aie.iron.controlflow import range_
from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence, TensorAccessPattern
Expand Down Expand Up @@ -277,7 +276,7 @@ def fused_mha(
names=[f"memQ{i}" for i in range(number_of_pipelines_join_distribute)],
dims_to_stream=[q_dims] * number_of_pipelines_join_distribute,
depths=[of_depth] * number_of_pipelines_join_distribute,
placement=Tile(col=6, row=1),
tile=Tile(col=6, row=1),
) # Split between N pipelines
if number_of_pipelines > 6:
inQ2 = ObjectFifo(
Expand All @@ -290,7 +289,7 @@ def fused_mha(
names=[f"memQ2{i}" for i in range(number_of_pipelines_join_distribute)],
dims_to_stream=[q_dims] * number_of_pipelines_join_distribute,
depths=[of_depth] * number_of_pipelines_join_distribute,
placement=Tile(col=7, row=1),
tile=Tile(col=7, row=1),
) # Split between N pipelines

# VJUNG: The SequentialPlacer will place all of these on the same MemTile if Placement is specified. We would need a list of placements in the one-to-many or many-to-one case.
Expand All @@ -308,7 +307,7 @@ def fused_mha(
memK = inK.cons().forward(
name="memK",
dims_to_stream=k_dims,
placement=Tile(col=3, row=1),
tile=Tile(col=3, row=1),
depth=of_depth,
) # Broadcast, give this handle to N pipelines

Expand All @@ -324,7 +323,7 @@ def fused_mha(
memV = inV.cons().forward(
name="memV",
dims_to_stream=v_dims,
placement=Tile(col=4, row=1),
tile=Tile(col=4, row=1),
depth=of_depth,
) # Broadcast, give this handle to N pipelines

Expand All @@ -342,7 +341,7 @@ def fused_mha(
name=f"outA{i}",
dims_to_stream=a_dims,
depth=of_depth,
# placement=Tile(col=i, row=1))
# tile=Tile(col=i, row=1))
)
) # Local to 1 pipeline

Expand All @@ -357,7 +356,7 @@ def fused_mha(
name=f"outP{i}",
dims_to_stream=q_dims,
depth=of_depth,
# placement=Tile(col=i, row=1)
# tile=Tile(col=i, row=1)
)
) # Local to 1 pipeline

Expand All @@ -381,7 +380,7 @@ def fused_mha(
obj_types=[q_ty] * number_of_pipelines_join_distribute,
names=[f"outO{i}" for i in range(number_of_pipelines_join_distribute)],
depths=[of_depth] * number_of_pipelines_join_distribute,
placement=Tile(col=6, row=1),
tile=Tile(col=6, row=1),
) # Join onto the output OF
if number_of_pipelines > 6:
memO2 = ObjectFifo(
Expand All @@ -394,7 +393,7 @@ def fused_mha(
obj_types=[q_ty] * number_of_pipelines_join_distribute,
names=[f"outO2{i}" for i in range(number_of_pipelines_join_distribute)],
depths=[of_depth] * number_of_pipelines_join_distribute,
placement=Tile(col=7, row=1),
tile=Tile(col=7, row=1),
)

def batched_matmul_qk(
Expand Down Expand Up @@ -654,7 +653,7 @@ def batched_matmul_pv(
idx_buffer_qk,
],
stack_size=0xD00,
placement=Tile(col=i, row=2),
tile=Tile(col=i, row=2),
while_true=False,
)
)
Expand Down Expand Up @@ -683,7 +682,7 @@ def batched_matmul_pv(
scale_buffer_softmax,
],
stack_size=0xD00,
placement=Tile(col=i, row=3),
tile=Tile(col=i, row=3),
while_true=False,
)
)
Expand All @@ -708,7 +707,7 @@ def batched_matmul_pv(
idx_buffer_pv,
],
stack_size=0xD00,
placement=Tile(col=i, row=4),
tile=Tile(col=i, row=4),
while_true=False,
)
)
Expand Down Expand Up @@ -813,7 +812,7 @@ def set_mha_rtps():
tap=Q_tiles[
2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2
],
placement=Tile(col=4, row=0),
tile=Tile(col=4, row=0),
task_group=tg,
)
rt.fill(
Expand All @@ -824,15 +823,15 @@ def set_mha_rtps():
+ q_block_idx * 2
+ 1
],
placement=Tile(col=4, row=0),
tile=Tile(col=4, row=0),
task_group=tg,
)
else:
rt.fill(
inQ.prod(),
Q,
tap=Q_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
placement=Tile(col=4, row=0),
tile=Tile(col=4, row=0),
task_group=tg,
)

Expand All @@ -841,14 +840,14 @@ def set_mha_rtps():
inK.prod(),
K,
tap=K_tiles[kv_head_idx],
placement=Tile(col=5, row=0),
tile=Tile(col=5, row=0),
task_group=tg,
)
rt.fill(
inV.prod(),
V,
tap=V_tiles[kv_head_idx],
placement=Tile(col=6, row=0),
tile=Tile(col=6, row=0),
task_group=tg,
)

Expand All @@ -860,7 +859,7 @@ def set_mha_rtps():
2 * head_idx * num_q_block_per_pipeline + q_block_idx * 2
],
wait=True,
placement=Tile(col=7, row=0),
tile=Tile(col=7, row=0),
task_group=tg,
)
rt.drain(
Expand All @@ -872,7 +871,7 @@ def set_mha_rtps():
+ 1
],
wait=True,
placement=Tile(col=7, row=0),
tile=Tile(col=7, row=0),
task_group=tg,
)
else:
Expand All @@ -881,7 +880,7 @@ def set_mha_rtps():
O,
tap=O_tiles[head_idx * num_q_block_per_pipeline + q_block_idx],
wait=True,
placement=Tile(col=7, row=0),
tile=Tile(col=7, row=0),
task_group=tg,
)

Expand All @@ -892,5 +891,5 @@ def set_mha_rtps():
my_program = Program(dev_ty, rt)

# Place components (assign them resources on the device) and generate an MLIR module
module = my_program.resolve_program(SequentialPlacer())
module = my_program.resolve_program()
return module
Loading
Loading