Skip to content

Commit e840786

Browse files
committed
[Fix] Index_expr ops codegen issue
1 parent a13f37b commit e840786

6 files changed

Lines changed: 245 additions & 25 deletions

File tree

PyTorchSimFrontend/mlir/mlir_caller_codegen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def generate_args_define(self):
101101
bits = 8
102102
else:
103103
bits = torch.iinfo(arg_type).bits
104-
buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes
104+
buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) * 2 # Round up to 64 bytes + Add some padding for safety
105105
self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}')
106106
name_set.add(arg_name)
107107
self.writeline(self.newline)

PyTorchSimFrontend/mlir/mlir_codegen_backend.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,6 @@ def load(self, name: str, index: sympy.Expr):
470470
tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
471471
tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
472472
tile_stride = local_tile_desc.get_tile_stride()
473-
474473
# Compute vector unit size
475474
vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
476475
compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
@@ -697,7 +696,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
697696
self.reset("recompile")
698697
raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})")
699698

700-
tile_size = tile_desc.get_tile_size_per_lane()
699+
tile_size_per_lane = tile_desc.get_tile_size_per_lane()
701700
compute_vec_size = tile_desc.get_compute_vec_size()
702701
strides = tile_desc.get_tile_stride_per_lane()
703702

@@ -707,13 +706,13 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
707706

708707
# Create tile_dim index
709708
dim_list = []
710-
for idx in range(len(tile_size)):
709+
for idx in range(len(tile_size_per_lane)):
711710
# Prepare initial values
712711
offset = tile_desc.vmap.vlane_stride #* strides[idx]
713-
outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride
712+
outer_sz = tile_desc.get_tile_size()[idx] // tile_desc.vmap.vlane_stride
714713
with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
715714
div_coeff = self.get_const_cse(strides[idx], "index")
716-
mod_coeff = self.get_const_cse(tile_size[idx], "index")
715+
mod_coeff = self.get_const_cse(tile_size_per_lane[idx], "index")
717716
vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index")
718717
vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
719718
nr_vector_lane = self.get_const_cse(self.vector_lane, "index")

PyTorchSimFrontend/mlir/mlir_conv_common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def outer_func_render(self, kernel_name, input_args):
8585
options = dict(
8686
kernel=self.kernel,
8787
KERNEL_NAME=kernel_name,
88-
FUNC_NAME=self.function_name + f"_{len(input_args)}",
88+
FUNC_NAME="wrapper_" + kernel_name,
8989
INPUT=X,
9090
WEIGHT=W,
9191
BIAS=Bias,
@@ -96,7 +96,7 @@ def outer_func_render(self, kernel_name, input_args):
9696
input_reorder=self.input_reorder
9797
)
9898
code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options)
99-
return code, self.function_name + f"_{len(input_args)}"
99+
return code, "wrapper_" + kernel_name
100100

101101
def get_arg_attributes(self):
102102
arg_attributes = []

PyTorchSimFrontend/mlir/mlir_scheduling.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@ def can_fuse_horizontal(self, node1, node2):
154154
}
155155
# Buffers still required by the activation node (unmet) or read by it
156156
epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies }
157-
has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes)
158-
if not has_depedency:
157+
has_dependency = bool(template_writes) and epilogue_unmet.issubset(template_writes) and not bool(reads1 & writes2)
158+
if not has_dependency:
159159
return False
160160

161161
# Revert act_node.group : simplify_and_reorder() modified _body, _size, group

PyTorchSimFrontend/mlir/mlir_template.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def call_kernel(self, kernel_name):
403403
_, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
404404
# generate the code to call this
405405
wrapper.generate_kernel_call(
406-
kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
406+
kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args)
407407

408408
def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
409409
with self as kernel:

tests/Yolov5/test_yolov5.py

Lines changed: 235 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,39 +13,230 @@
1313
import os
1414
import shutil
1515

16-
16+
def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
17+
if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
18+
message = f"|{name} Test Passed|"
19+
print("-" * len(message))
20+
print(message)
21+
print("-" * len(message))
22+
else:
23+
message = f"|{name} Test Failed|"
24+
print("-" * len(message))
25+
print(message)
26+
print("-" * len(message))
27+
print("custom out: ", out.cpu())
28+
print("cpu out: ", cpu_out)
29+
exit(1)
1730

1831
def run_yolo(batch, config):
32+
import copy
33+
1934
device = torch.device("npu:0")
2035

2136
torch._dynamo.config.recompile_limit = 64
2237
torch._dynamo.config.cache_size_limit = 128
23-
38+
39+
# Load model and prepare input
2440
model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval()
2541
url = "https://ultralytics.com/images/zidane.jpg"
26-
42+
2743
response = requests.get(url)
2844
img = Image.open(BytesIO(response.content)).convert("RGB")
29-
45+
3046
imgsz = 64
3147
transform = transforms.Compose([
3248
transforms.Resize((imgsz, imgsz)),
3349
transforms.ToTensor(),
3450
])
35-
51+
3652
x = transform(img).unsqueeze(0) # [1, 3, H, W]
37-
x = x.to(device)
38-
39-
40-
model.to(device)
41-
x = x.to(device)
42-
43-
# Compile and run the model with PyTorchSim
44-
compiled_model = torch.compile(dynamic=False)(model)
45-
y = compiled_model(x)
53+
54+
# CPU version
55+
model_cpu = copy.deepcopy(model).cpu().eval()
56+
x_cpu = copy.deepcopy(x).cpu()
57+
y_cpu = model_cpu(x_cpu)
58+
59+
# NPU version
60+
model_npu = model_cpu.to(device).eval()
61+
x_npu = copy.deepcopy(x).to(device)
62+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
63+
y_npu = compiled_model_npu(x_npu)
64+
65+
# Compare results
66+
# YOLOv5 output is typically a list or tensor, handle both cases
67+
if isinstance(y_cpu, (list, tuple)):
68+
for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
69+
test_result(f"YOLOv5 Output {i}", out_npu, out_cpu)
70+
else:
71+
test_result("YOLOv5 Output", y_npu, y_cpu)
72+
4673
print("Yolo Simulation Done")
4774

4875

76+
def test_c3_module(device, batch=1, c1=64, c2=128, n=1, h=64, w=64):
77+
import copy
78+
import sys
79+
80+
# Import C3 module from YOLOv5
81+
try:
82+
# Load model first to ensure hub cache is populated
83+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
84+
85+
# Try to import from torch hub cache
86+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
87+
if os.path.exists(hub_path):
88+
sys.path.insert(0, hub_path)
89+
# Import C3 module
90+
from models.common import C3 # noqa: F401
91+
except Exception as e:
92+
print(f"Warning: Could not import C3 module: {e}")
93+
print("Skipping C3 module test")
94+
return
95+
96+
torch.manual_seed(0)
97+
98+
# Create input tensor
99+
x = torch.randn(batch, c1, h, w)
100+
101+
# CPU version
102+
model_cpu = C3(c1, c2, n=n, shortcut=True, g=1, e=0.5).cpu().eval()
103+
x_cpu = copy.deepcopy(x).cpu()
104+
y_cpu = model_cpu(x_cpu)
105+
106+
# NPU version
107+
model_npu = model_cpu.to(device).eval()
108+
x_npu = copy.deepcopy(x).to(device)
109+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
110+
y_npu = compiled_model_npu(x_npu)
111+
112+
# Compare results
113+
if isinstance(y_cpu, (list, tuple)):
114+
for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
115+
test_result(f"C3 Output {i}", out_npu, out_cpu)
116+
else:
117+
test_result("C3 Output", y_npu, y_cpu)
118+
print("C3 Module Test Done")
119+
120+
121+
def test_bottleneck_module(device, batch=1, c1=64, c2=64, shortcut=True, g=1, e=0.5, h=16, w=16):
122+
import copy
123+
import sys
124+
125+
# Import Bottleneck module from YOLOv5
126+
try:
127+
# Load model first to ensure hub cache is populated
128+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
129+
130+
# Try to import from torch hub cache
131+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
132+
if os.path.exists(hub_path):
133+
sys.path.insert(0, hub_path)
134+
# Import Bottleneck module
135+
from models.common import Bottleneck # noqa: F401
136+
except Exception as e:
137+
print(f"Warning: Could not import Bottleneck module: {e}")
138+
print("Skipping Bottleneck module test")
139+
return
140+
141+
torch.manual_seed(0)
142+
143+
# Create input tensor
144+
x = torch.randn(batch, c1, h, w)
145+
146+
# CPU version
147+
model_cpu = Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e).cpu().eval()
148+
x_cpu = copy.deepcopy(x).cpu()
149+
y_cpu = model_cpu(x_cpu)
150+
151+
# NPU version
152+
model_npu = model_cpu.to(device).eval()
153+
x_npu = copy.deepcopy(x).to(device)
154+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
155+
y_npu = compiled_model_npu(x_npu)
156+
157+
# Compare results
158+
test_result("Bottleneck Module", y_npu, y_cpu)
159+
print("Bottleneck Module Test Done")
160+
161+
162+
def test_conv_module(device, batch=1, c1=32, c2=64, k=3, s=1, p=None, g=1, d=1, act=True, h=16, w=16):
163+
import copy
164+
import sys
165+
166+
# Import Conv module from YOLOv5
167+
try:
168+
# Load model first to ensure hub cache is populated
169+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
170+
171+
# Try to import from torch hub cache
172+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
173+
if os.path.exists(hub_path):
174+
sys.path.insert(0, hub_path)
175+
# Import Conv module
176+
from models.common import Conv # noqa: F401
177+
except Exception as e:
178+
print(f"Warning: Could not import Conv module: {e}")
179+
print("Skipping Conv module test")
180+
return
181+
182+
torch.manual_seed(0)
183+
184+
# Create input tensor
185+
x = torch.randn(batch, c1, h, w)
186+
187+
# CPU version
188+
model_cpu = Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act).cpu().eval()
189+
x_cpu = copy.deepcopy(x).cpu()
190+
y_cpu = model_cpu(x_cpu)
191+
192+
# NPU version
193+
model_npu = model_cpu.to(device).eval()
194+
x_npu = copy.deepcopy(x).to(device)
195+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
196+
y_npu = compiled_model_npu(x_npu)
197+
198+
# Compare results
199+
test_result("Conv Module", y_npu, y_cpu)
200+
print("Conv Module Test Done")
201+
202+
203+
def test_concat_4d(device):
204+
"""
205+
Test concatenating 3 tensors along dimension 4
206+
Shapes: (1, 3, 4, 4, 2), (1, 3, 4, 4, 2), (1, 3, 4, 4, 81)
207+
Result: (1, 3, 4, 4, 85)
208+
"""
209+
import copy
210+
211+
torch.manual_seed(0)
212+
213+
# Create 3 input tensors
214+
x1 = torch.ones(1, 3, 4, 4, 2)
215+
x2 = torch.ones(1, 3, 4, 4, 2) * 2
216+
x3 = torch.ones(1, 3, 4, 4, 81) * 3
217+
218+
# CPU version
219+
x1_cpu = copy.deepcopy(x1).cpu()
220+
x2_cpu = copy.deepcopy(x2).cpu()
221+
x3_cpu = copy.deepcopy(x3).cpu()
222+
y_cpu = torch.cat([x1_cpu, x2_cpu, x3_cpu], dim=4)
223+
224+
# NPU version
225+
x1_npu = copy.deepcopy(x1).to(device)
226+
x2_npu = copy.deepcopy(x2).to(device)
227+
x3_npu = copy.deepcopy(x3).to(device)
228+
229+
def concat_fn(x1, x2, x3):
230+
return torch.cat([x1, x2, x3], dim=4)
231+
232+
compiled_concat = torch.compile(dynamic=False)(concat_fn)
233+
y_npu = compiled_concat(x1_npu, x2_npu, x3_npu)
234+
235+
# Compare results
236+
test_result("Concat 4D", y_npu, y_cpu)
237+
print(f"Output shape: {y_npu.shape}")
238+
print("Concat 4D Test Done")
239+
49240
if __name__ == "__main__":
50241

51242
base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
@@ -59,4 +250,34 @@ def run_yolo(batch, config):
59250
args = args.parse_args()
60251
batch = args.batch
61252

253+
device = torch.device("npu:0")
254+
255+
# Test Concat 4D
256+
# print("=" * 80)
257+
# print("Testing Concat 4D")
258+
# print("=" * 80)
259+
# test_concat_4d(device)
260+
261+
# Test Conv module
262+
# print("\n" + "=" * 80)
263+
# print("Testing Conv Module")
264+
# print("=" * 80)
265+
# test_conv_module(device, batch=batch, c1=32, c2=32, k=1, s=1, p=None, g=1, d=1, act=False, h=16, w=16)
266+
267+
# Test Bottleneck module
268+
# print("\n" + "=" * 80)
269+
# print("Testing Bottleneck Module")
270+
# print("=" * 80)
271+
# test_bottleneck_module(device, batch=batch, c1=32, c2=32, shortcut=True, g=1, e=0.5, h=16, w=16)
272+
273+
# Test C3 module
274+
# print("\n" + "=" * 80)
275+
# print("Testing C3 Module")
276+
# print("=" * 80)
277+
# test_c3_module(device, batch=batch, c1=64, c2=64, n=1, h=16, w=16)
278+
279+
# Test full YOLOv5 model
280+
print("\n" + "=" * 80)
281+
print("Testing Full YOLOv5 Model")
282+
print("=" * 80)
62283
run_yolo(batch, config)

0 commit comments

Comments (0)