Skip to content

Commit e840786

Browse files
committed
[Fix] Index_expr ops codegen issue
1 parent a13f37b commit e840786

6 files changed

Lines changed: 245 additions & 25 deletions

File tree

PyTorchSimFrontend/mlir/mlir_caller_codegen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def generate_args_define(self):
101101
bits = 8
102102
else:
103103
bits = torch.iinfo(arg_type).bits
104-
buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes
104+
buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) * 2 # Round up to 64 bytes + Add some padding for safety
105105
self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}')
106106
name_set.add(arg_name)
107107
self.writeline(self.newline)

PyTorchSimFrontend/mlir/mlir_codegen_backend.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,6 @@ def load(self, name: str, index: sympy.Expr):
470470
tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
471471
tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
472472
tile_stride = local_tile_desc.get_tile_stride()
473-
474473
# Compute vector unit size
475474
vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
476475
compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
@@ -697,7 +696,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
697696
self.reset("recompile")
698697
raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})")
699698

700-
tile_size = tile_desc.get_tile_size_per_lane()
699+
tile_size_per_lane = tile_desc.get_tile_size_per_lane()
701700
compute_vec_size = tile_desc.get_compute_vec_size()
702701
strides = tile_desc.get_tile_stride_per_lane()
703702

@@ -707,13 +706,13 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
707706

708707
# Create tile_dim index
709708
dim_list = []
710-
for idx in range(len(tile_size)):
709+
for idx in range(len(tile_size_per_lane)):
711710
# Prepare initial values
712711
offset = tile_desc.vmap.vlane_stride #* strides[idx]
713-
outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride
712+
outer_sz = tile_desc.get_tile_size()[idx] // tile_desc.vmap.vlane_stride
714713
with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
715714
div_coeff = self.get_const_cse(strides[idx], "index")
716-
mod_coeff = self.get_const_cse(tile_size[idx], "index")
715+
mod_coeff = self.get_const_cse(tile_size_per_lane[idx], "index")
717716
vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index")
718717
vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
719718
nr_vector_lane = self.get_const_cse(self.vector_lane, "index")

PyTorchSimFrontend/mlir/mlir_conv_common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def outer_func_render(self, kernel_name, input_args):
8585
options = dict(
8686
kernel=self.kernel,
8787
KERNEL_NAME=kernel_name,
88-
FUNC_NAME=self.function_name + f"_{len(input_args)}",
88+
FUNC_NAME="wrapper_" + kernel_name,
8989
INPUT=X,
9090
WEIGHT=W,
9191
BIAS=Bias,
@@ -96,7 +96,7 @@ def outer_func_render(self, kernel_name, input_args):
9696
input_reorder=self.input_reorder
9797
)
9898
code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options)
99-
return code, self.function_name + f"_{len(input_args)}"
99+
return code, "wrapper_" + kernel_name
100100

101101
def get_arg_attributes(self):
102102
arg_attributes = []

PyTorchSimFrontend/mlir/mlir_scheduling.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@ def can_fuse_horizontal(self, node1, node2):
154154
}
155155
# Buffers still required by the activation node (unmet) or read by it
156156
epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies }
157-
has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes)
158-
if not has_depedency:
157+
has_dependency = bool(template_writes) and epilogue_unmet.issubset(template_writes) and not bool(reads1 & writes2)
158+
if not has_dependency:
159159
return False
160160

161161
# Revert act_node.group : simplify_and_reorder() modified _body, _size, group

PyTorchSimFrontend/mlir/mlir_template.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def call_kernel(self, kernel_name):
403403
_, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
404404
# generate the code to call this
405405
wrapper.generate_kernel_call(
406-
kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
406+
kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args)
407407

408408
def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
409409
with self as kernel:

tests/Yolov5/test_yolov5.py

Lines changed: 235 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,39 +13,230 @@
1313
import os
1414
import shutil
1515

16-
16+
def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
17+
if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
18+
message = f"|{name} Test Passed|"
19+
print("-" * len(message))
20+
print(message)
21+
print("-" * len(message))
22+
else:
23+
message = f"|{name} Test Failed|"
24+
print("-" * len(message))
25+
print(message)
26+
print("-" * len(message))
27+
print("custom out: ", out.cpu())
28+
print("cpu out: ", cpu_out)
29+
exit(1)
1730

1831
def run_yolo(batch, config):
32+
import copy
33+
1934
device = torch.device("npu:0")
2035

2136
torch._dynamo.config.recompile_limit = 64
2237
torch._dynamo.config.cache_size_limit = 128
23-
38+
39+
# Load model and prepare input
2440
model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval()
2541
url = "https://ultralytics.com/images/zidane.jpg"
26-
42+
2743
response = requests.get(url)
2844
img = Image.open(BytesIO(response.content)).convert("RGB")
29-
45+
3046
imgsz = 64
3147
transform = transforms.Compose([
3248
transforms.Resize((imgsz, imgsz)),
3349
transforms.ToTensor(),
3450
])
35-
51+
3652
x = transform(img).unsqueeze(0) # [1, 3, H, W]
37-
x = x.to(device)
38-
39-
40-
model.to(device)
41-
x = x.to(device)
42-
43-
# Compile and run the model with PyTorchSim
44-
compiled_model = torch.compile(dynamic=False)(model)
45-
y = compiled_model(x)
53+
54+
# CPU version
55+
model_cpu = copy.deepcopy(model).cpu().eval()
56+
x_cpu = copy.deepcopy(x).cpu()
57+
y_cpu = model_cpu(x_cpu)
58+
59+
# NPU version
60+
model_npu = model_cpu.to(device).eval()
61+
x_npu = copy.deepcopy(x).to(device)
62+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
63+
y_npu = compiled_model_npu(x_npu)
64+
65+
# Compare results
66+
# YOLOv5 output is typically a list or tensor, handle both cases
67+
if isinstance(y_cpu, (list, tuple)):
68+
for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
69+
test_result(f"YOLOv5 Output {i}", out_npu, out_cpu)
70+
else:
71+
test_result("YOLOv5 Output", y_npu, y_cpu)
72+
4673
print("Yolo Simulation Done")
4774

4875

76+
def test_c3_module(device, batch=1, c1=64, c2=128, n=1, h=64, w=64):
77+
import copy
78+
import sys
79+
80+
# Import C3 module from YOLOv5
81+
try:
82+
# Load model first to ensure hub cache is populated
83+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
84+
85+
# Try to import from torch hub cache
86+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
87+
if os.path.exists(hub_path):
88+
sys.path.insert(0, hub_path)
89+
# Import C3 module
90+
from models.common import C3 # noqa: F401
91+
except Exception as e:
92+
print(f"Warning: Could not import C3 module: {e}")
93+
print("Skipping C3 module test")
94+
return
95+
96+
torch.manual_seed(0)
97+
98+
# Create input tensor
99+
x = torch.randn(batch, c1, h, w)
100+
101+
# CPU version
102+
model_cpu = C3(c1, c2, n=n, shortcut=True, g=1, e=0.5).cpu().eval()
103+
x_cpu = copy.deepcopy(x).cpu()
104+
y_cpu = model_cpu(x_cpu)
105+
106+
# NPU version
107+
model_npu = model_cpu.to(device).eval()
108+
x_npu = copy.deepcopy(x).to(device)
109+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
110+
y_npu = compiled_model_npu(x_npu)
111+
112+
# Compare results
113+
if isinstance(y_cpu, (list, tuple)):
114+
for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
115+
test_result(f"C3 Output {i}", out_npu, out_cpu)
116+
else:
117+
test_result("C3 Output", y_npu, y_cpu)
118+
print("C3 Module Test Done")
119+
120+
121+
def test_bottleneck_module(device, batch=1, c1=64, c2=64, shortcut=True, g=1, e=0.5, h=16, w=16):
122+
import copy
123+
import sys
124+
125+
# Import Bottleneck module from YOLOv5
126+
try:
127+
# Load model first to ensure hub cache is populated
128+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
129+
130+
# Try to import from torch hub cache
131+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
132+
if os.path.exists(hub_path):
133+
sys.path.insert(0, hub_path)
134+
# Import Bottleneck module
135+
from models.common import Bottleneck # noqa: F401
136+
except Exception as e:
137+
print(f"Warning: Could not import Bottleneck module: {e}")
138+
print("Skipping Bottleneck module test")
139+
return
140+
141+
torch.manual_seed(0)
142+
143+
# Create input tensor
144+
x = torch.randn(batch, c1, h, w)
145+
146+
# CPU version
147+
model_cpu = Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e).cpu().eval()
148+
x_cpu = copy.deepcopy(x).cpu()
149+
y_cpu = model_cpu(x_cpu)
150+
151+
# NPU version
152+
model_npu = model_cpu.to(device).eval()
153+
x_npu = copy.deepcopy(x).to(device)
154+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
155+
y_npu = compiled_model_npu(x_npu)
156+
157+
# Compare results
158+
test_result("Bottleneck Module", y_npu, y_cpu)
159+
print("Bottleneck Module Test Done")
160+
161+
162+
def test_conv_module(device, batch=1, c1=32, c2=64, k=3, s=1, p=None, g=1, d=1, act=True, h=16, w=16):
163+
import copy
164+
import sys
165+
166+
# Import Conv module from YOLOv5
167+
try:
168+
# Load model first to ensure hub cache is populated
169+
_ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
170+
171+
# Try to import from torch hub cache
172+
hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
173+
if os.path.exists(hub_path):
174+
sys.path.insert(0, hub_path)
175+
# Import Conv module
176+
from models.common import Conv # noqa: F401
177+
except Exception as e:
178+
print(f"Warning: Could not import Conv module: {e}")
179+
print("Skipping Conv module test")
180+
return
181+
182+
torch.manual_seed(0)
183+
184+
# Create input tensor
185+
x = torch.randn(batch, c1, h, w)
186+
187+
# CPU version
188+
model_cpu = Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act).cpu().eval()
189+
x_cpu = copy.deepcopy(x).cpu()
190+
y_cpu = model_cpu(x_cpu)
191+
192+
# NPU version
193+
model_npu = model_cpu.to(device).eval()
194+
x_npu = copy.deepcopy(x).to(device)
195+
compiled_model_npu = torch.compile(dynamic=False)(model_npu)
196+
y_npu = compiled_model_npu(x_npu)
197+
198+
# Compare results
199+
test_result("Conv Module", y_npu, y_cpu)
200+
print("Conv Module Test Done")
201+
202+
203+
def test_concat_4d(device):
204+
"""
205+
Test concatenating 3 tensors along dimension 4
206+
Shapes: (1, 3, 4, 4, 2), (1, 3, 4, 4, 2), (1, 3, 4, 4, 81)
207+
Result: (1, 3, 4, 4, 85)
208+
"""
209+
import copy
210+
211+
torch.manual_seed(0)
212+
213+
# Create 3 input tensors
214+
x1 = torch.ones(1, 3, 4, 4, 2)
215+
x2 = torch.ones(1, 3, 4, 4, 2) * 2
216+
x3 = torch.ones(1, 3, 4, 4, 81) * 3
217+
218+
# CPU version
219+
x1_cpu = copy.deepcopy(x1).cpu()
220+
x2_cpu = copy.deepcopy(x2).cpu()
221+
x3_cpu = copy.deepcopy(x3).cpu()
222+
y_cpu = torch.cat([x1_cpu, x2_cpu, x3_cpu], dim=4)
223+
224+
# NPU version
225+
x1_npu = copy.deepcopy(x1).to(device)
226+
x2_npu = copy.deepcopy(x2).to(device)
227+
x3_npu = copy.deepcopy(x3).to(device)
228+
229+
def concat_fn(x1, x2, x3):
230+
return torch.cat([x1, x2, x3], dim=4)
231+
232+
compiled_concat = torch.compile(dynamic=False)(concat_fn)
233+
y_npu = compiled_concat(x1_npu, x2_npu, x3_npu)
234+
235+
# Compare results
236+
test_result("Concat 4D", y_npu, y_cpu)
237+
print(f"Output shape: {y_npu.shape}")
238+
print("Concat 4D Test Done")
239+
49240
if __name__ == "__main__":
50241

51242
base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
@@ -59,4 +250,34 @@ def run_yolo(batch, config):
59250
args = args.parse_args()
60251
batch = args.batch
61252

253+
device = torch.device("npu:0")
254+
255+
# Test Concat 4D
256+
# print("=" * 80)
257+
# print("Testing Concat 4D")
258+
# print("=" * 80)
259+
# test_concat_4d(device)
260+
261+
# Test Conv module
262+
# print("\n" + "=" * 80)
263+
# print("Testing Conv Module")
264+
# print("=" * 80)
265+
# test_conv_module(device, batch=batch, c1=32, c2=32, k=1, s=1, p=None, g=1, d=1, act=False, h=16, w=16)
266+
267+
# Test Bottleneck module
268+
# print("\n" + "=" * 80)
269+
# print("Testing Bottleneck Module")
270+
# print("=" * 80)
271+
# test_bottleneck_module(device, batch=batch, c1=32, c2=32, shortcut=True, g=1, e=0.5, h=16, w=16)
272+
273+
# Test C3 module
274+
# print("\n" + "=" * 80)
275+
# print("Testing C3 Module")
276+
# print("=" * 80)
277+
# test_c3_module(device, batch=batch, c1=64, c2=64, n=1, h=16, w=16)
278+
279+
# Test full YOLOv5 model
280+
print("\n" + "=" * 80)
281+
print("Testing Full YOLOv5 Model")
282+
print("=" * 80)
62283
run_yolo(batch, config)

0 commit comments

Comments (0)