1313import os
1414import shutil
1515
16-
def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
    """Compare a device result against the CPU reference and report it.

    Prints a framed "Test Passed" or "Test Failed" banner. On failure both
    tensors are printed as well and the process is terminated with exit
    status 1.

    Args:
        name: Label shown inside the banner.
        out: Tensor produced by the device under test; it is copied to the
            CPU before being compared.
        cpu_out: Reference tensor computed on the CPU.
        rtol: Relative tolerance forwarded to ``torch.allclose``.
        atol: Absolute tolerance forwarded to ``torch.allclose``.

    Raises:
        SystemExit: With code 1 when the shapes differ or the values are not
            within tolerance.
    """
    out_on_cpu = out.cpu()

    # torch.allclose broadcasts its operands, so a result with the wrong
    # shape could still compare equal (or raise when it cannot broadcast).
    # Require identical shapes before looking at the values.
    same_shape = out_on_cpu.shape == cpu_out.shape
    passed = same_shape and torch.allclose(
        out_on_cpu, cpu_out, rtol=rtol, atol=atol
    )

    message = f"|{name} Test Passed|" if passed else f"|{name} Test Failed|"
    frame = "-" * len(message)
    print(frame)
    print(message)
    print(frame)

    if passed:
        return

    if not same_shape:
        print("shape mismatch: ", tuple(out_on_cpu.shape), "vs", tuple(cpu_out.shape))
    print("custom out: ", out_on_cpu)
    print("cpu out: ", cpu_out)
    # The bare exit() helper is injected by the site module and is not
    # guaranteed to exist; raising SystemExit gives the same exit status.
    raise SystemExit(1)


# This is a comparison helper, not a test case: keep pytest from collecting
# it (it would fail looking for fixtures named after the parameters).
test_result.__test__ = False
1730
def run_yolo(batch, config):
    """Run YOLOv5s on the NPU device and check it against a CPU reference.

    Loads ``yolov5s`` through ``torch.hub``, downloads one sample image,
    resizes it to 64x64, runs the model eagerly on the CPU and compiled with
    ``torch.compile`` on ``npu:0``, then compares every output tensor with
    ``test_result`` (which terminates the process on a mismatch).

    Args:
        batch: Not used by this function; the input always has batch size 1.
        config: Not used by this function.

    Raises:
        requests.RequestException: If the sample image cannot be downloaded
            (timeout, connection error or HTTP error status).
        SystemExit: With code 1 when the CPU and NPU outputs differ in
            structure or in value.
    """
    import copy

    device = torch.device("npu:0")

    torch._dynamo.config.recompile_limit = 64
    torch._dynamo.config.cache_size_limit = 128

    # Load model and prepare input
    model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval()
    url = "https://ultralytics.com/images/zidane.jpg"

    # Bound the download and fail on an HTTP error status instead of handing
    # an error page to the image decoder.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")

    imgsz = 64
    transform = transforms.Compose([
        transforms.Resize((imgsz, imgsz)),
        transforms.ToTensor(),
    ])

    x = transform(img).unsqueeze(0)  # [1, 3, H, W]

    # CPU version
    model_cpu = copy.deepcopy(model).cpu().eval()
    x_cpu = copy.deepcopy(x).cpu()
    y_cpu = model_cpu(x_cpu)

    # NPU version
    # Module.to() moves the module in place, so model_cpu and model_npu are
    # the same object from here on; the CPU reference was computed above.
    model_npu = model_cpu.to(device).eval()
    x_npu = copy.deepcopy(x).to(device)
    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
    y_npu = compiled_model_npu(x_npu)

    def compare(path, npu_part, cpu_part):
        """Recursively compare matching tensors of two output structures."""
        cpu_is_seq = isinstance(cpu_part, (list, tuple))
        npu_is_seq = isinstance(npu_part, (list, tuple))
        # A plain zip() would silently drop trailing outputs, so a structural
        # difference is reported as a failure instead.
        if cpu_is_seq != npu_is_seq or (
            cpu_is_seq and len(npu_part) != len(cpu_part)
        ):
            print(f"YOLOv5 Output structure mismatch at index path {list(path)}")
            raise SystemExit(1)

        if cpu_is_seq:
            # The output may nest (a sequence that itself holds sequences of
            # tensors), so descend until tensors are reached.
            for idx, (npu_item, cpu_item) in enumerate(zip(npu_part, cpu_part)):
                compare(path + (idx,), npu_item, cpu_item)
            return

        if path:
            label = f"YOLOv5 Output {'.'.join(str(idx) for idx in path)} "
        else:
            label = "YOLOv5 Output"
        test_result(label, npu_part, cpu_part)

    # Compare results
    compare((), y_npu, y_cpu)

    print("Yolo Simulation Done")
4774
4875
def test_c3_module(device, batch=1, c1=64, c2=128, n=1, h=64, w=64):
    """Check the YOLOv5 ``C3`` module on ``device`` against the CPU.

    Builds ``C3(c1, c2, n=n, shortcut=True, g=1, e=0.5)``, runs it eagerly on
    the CPU and compiled with ``torch.compile`` on ``device`` using the same
    random input of shape ``(batch, c1, h, w)``, then compares the outputs
    with ``test_result``. If the module cannot be imported a warning is
    printed and the test is skipped.

    Args:
        device: Device the compiled module and its input are moved to.
        batch: First dimension of the random input.
        c1: Second dimension of the random input; also the first argument
            passed to ``C3``.
        c2: Second argument passed to ``C3`` (presumably the output channel
            count; confirm against the YOLOv5 sources).
        n: Forwarded to ``C3`` as ``n``.
        h: Third dimension of the random input.
        w: Fourth dimension of the random input.
    """
    import copy
    import sys

    # Import C3 module from YOLOv5
    try:
        # Load model first to ensure hub cache is populated
        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)

        # Ask torch.hub where its cache lives so that TORCH_HOME and
        # torch.hub.set_dir() are honoured instead of assuming ~/.cache.
        hub_path = os.path.join(torch.hub.get_dir(), "ultralytics_yolov5_master")
        if os.path.exists(hub_path):
            # Keep the checkout first on sys.path without stacking a
            # duplicate entry on every call.
            if hub_path in sys.path:
                sys.path.remove(hub_path)
            sys.path.insert(0, hub_path)
        from models.common import C3
    except Exception as e:
        # Best effort: without the YOLOv5 sources the test cannot run.
        print(f"Warning: Could not import C3 module: {e} ")
        print("Skipping C3 module test")
        return

    torch.manual_seed(0)

    # Create input tensor
    x = torch.randn(batch, c1, h, w)

    # CPU version
    model_cpu = C3(c1, c2, n=n, shortcut=True, g=1, e=0.5).cpu().eval()
    x_cpu = copy.deepcopy(x).cpu()
    y_cpu = model_cpu(x_cpu)

    # NPU version
    # Module.to() moves the module in place, so model_cpu and model_npu are
    # the same object from here on; the CPU reference was computed above.
    model_npu = model_cpu.to(device).eval()
    x_npu = copy.deepcopy(x).to(device)
    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
    y_npu = compiled_model_npu(x_npu)

    # Compare results
    if isinstance(y_cpu, (list, tuple)):
        for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
            test_result(f"C3 Output {i} ", out_npu, out_cpu)
    else:
        test_result("C3 Output", y_npu, y_cpu)
    print("C3 Module Test Done")
119+
120+
def test_bottleneck_module(device, batch=1, c1=64, c2=64, shortcut=True, g=1, e=0.5, h=16, w=16):
    """Check the YOLOv5 ``Bottleneck`` module on ``device`` against the CPU.

    Builds ``Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e)``, runs it
    eagerly on the CPU and compiled with ``torch.compile`` on ``device``
    using the same random input of shape ``(batch, c1, h, w)``, then compares
    the two outputs with ``test_result``. If the module cannot be imported a
    warning is printed and the test is skipped.

    Args:
        device: Device the compiled module and its input are moved to.
        batch: First dimension of the random input.
        c1: Second dimension of the random input; also the first argument
            passed to ``Bottleneck``.
        c2: Second argument passed to ``Bottleneck`` (presumably the output
            channel count; confirm against the YOLOv5 sources).
        shortcut: Forwarded to ``Bottleneck`` as ``shortcut``.
        g: Forwarded to ``Bottleneck`` as ``g``.
        e: Forwarded to ``Bottleneck`` as ``e``.
        h: Third dimension of the random input.
        w: Fourth dimension of the random input.
    """
    import copy
    import sys

    # Import Bottleneck module from YOLOv5
    try:
        # Load model first to ensure hub cache is populated
        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)

        # Ask torch.hub where its cache lives so that TORCH_HOME and
        # torch.hub.set_dir() are honoured instead of assuming ~/.cache.
        hub_path = os.path.join(torch.hub.get_dir(), "ultralytics_yolov5_master")
        if os.path.exists(hub_path):
            # Keep the checkout first on sys.path without stacking a
            # duplicate entry on every call.
            if hub_path in sys.path:
                sys.path.remove(hub_path)
            sys.path.insert(0, hub_path)
        from models.common import Bottleneck
    except Exception as e:
        # Best effort: without the YOLOv5 sources the test cannot run.
        print(f"Warning: Could not import Bottleneck module: {e} ")
        print("Skipping Bottleneck module test")
        return

    torch.manual_seed(0)

    # Create input tensor
    x = torch.randn(batch, c1, h, w)

    # CPU version
    model_cpu = Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e).cpu().eval()
    x_cpu = copy.deepcopy(x).cpu()
    y_cpu = model_cpu(x_cpu)

    # NPU version
    # Module.to() moves the module in place, so model_cpu and model_npu are
    # the same object from here on; the CPU reference was computed above.
    model_npu = model_cpu.to(device).eval()
    x_npu = copy.deepcopy(x).to(device)
    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
    y_npu = compiled_model_npu(x_npu)

    # Compare results
    test_result("Bottleneck Module", y_npu, y_cpu)
    print("Bottleneck Module Test Done")
160+
161+
def test_conv_module(device, batch=1, c1=32, c2=64, k=3, s=1, p=None, g=1, d=1, act=True, h=16, w=16):
    """Check the YOLOv5 ``Conv`` module on ``device`` against the CPU.

    Builds ``Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act)``, runs it
    eagerly on the CPU and compiled with ``torch.compile`` on ``device``
    using the same random input of shape ``(batch, c1, h, w)``, then compares
    the two outputs with ``test_result``. If the module cannot be imported a
    warning is printed and the test is skipped.

    Args:
        device: Device the compiled module and its input are moved to.
        batch: First dimension of the random input.
        c1: Second dimension of the random input; also the first argument
            passed to ``Conv``.
        c2: Second argument passed to ``Conv`` (presumably the output channel
            count; confirm against the YOLOv5 sources).
        k: Forwarded to ``Conv`` as ``k``.
        s: Forwarded to ``Conv`` as ``s``.
        p: Forwarded to ``Conv`` as ``p``.
        g: Forwarded to ``Conv`` as ``g``.
        d: Forwarded to ``Conv`` as ``d``.
        act: Forwarded to ``Conv`` as ``act``.
        h: Third dimension of the random input.
        w: Fourth dimension of the random input.
    """
    import copy
    import sys

    # Import Conv module from YOLOv5
    try:
        # Load model first to ensure hub cache is populated
        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)

        # Ask torch.hub where its cache lives so that TORCH_HOME and
        # torch.hub.set_dir() are honoured instead of assuming ~/.cache.
        hub_path = os.path.join(torch.hub.get_dir(), "ultralytics_yolov5_master")
        if os.path.exists(hub_path):
            # Keep the checkout first on sys.path without stacking a
            # duplicate entry on every call.
            if hub_path in sys.path:
                sys.path.remove(hub_path)
            sys.path.insert(0, hub_path)
        from models.common import Conv
    except Exception as e:
        # Best effort: without the YOLOv5 sources the test cannot run.
        print(f"Warning: Could not import Conv module: {e} ")
        print("Skipping Conv module test")
        return

    torch.manual_seed(0)

    # Create input tensor
    x = torch.randn(batch, c1, h, w)

    # CPU version
    model_cpu = Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act).cpu().eval()
    x_cpu = copy.deepcopy(x).cpu()
    y_cpu = model_cpu(x_cpu)

    # NPU version
    # Module.to() moves the module in place, so model_cpu and model_npu are
    # the same object from here on; the CPU reference was computed above.
    model_npu = model_cpu.to(device).eval()
    x_npu = copy.deepcopy(x).to(device)
    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
    y_npu = compiled_model_npu(x_npu)

    # Compare results
    test_result("Conv Module", y_npu, y_cpu)
    print("Conv Module Test Done")
201+
202+
def test_concat_4d(device):
    """
    Concatenate three 5-D tensors along dim 4 on ``device`` and check the
    result against the same concatenation done on the CPU.
    Input shapes: (1, 3, 4, 4, 2), (1, 3, 4, 4, 2), (1, 3, 4, 4, 81)
    Output shape: (1, 3, 4, 4, 85)
    """
    import copy

    torch.manual_seed(0)

    # Constant-valued inputs (1, 2 and 3) so each source tensor is easy to
    # tell apart in the concatenated result.
    first = torch.ones(1, 3, 4, 4, 2)
    second = torch.ones(1, 3, 4, 4, 2) * 2
    third = torch.ones(1, 3, 4, 4, 81) * 3
    sources = (first, second, third)

    # Reference result computed on CPU copies of the inputs.
    reference = torch.cat([copy.deepcopy(t).cpu() for t in sources], dim=4)

    # Independent copies of the same inputs placed on the target device.
    device_inputs = [copy.deepcopy(t).to(device) for t in sources]

    def join_last_axis(a, b, c):
        return torch.cat([a, b, c], dim=4)

    compiled_join = torch.compile(join_last_axis, dynamic=False)
    result = compiled_join(*device_inputs)

    # Report the comparison, then the shape that came back from the device.
    test_result("Concat 4D", result, reference)
    print(f"Output shape: {result.shape} ")
    print("Concat 4D Test Done")
239+
49240if __name__ == "__main__" :
50241
51242 base_dir = os .environ .get ("TORCHSIM_DIR" , default = "/workspace/PyTorchSim" )
@@ -59,4 +250,34 @@ def run_yolo(batch, config):
59250 args = args .parse_args ()
60251 batch = args .batch
61252
253+ device = torch .device ("npu:0" )
254+
255+ # Test Concat 4D
256+ # print("=" * 80)
257+ # print("Testing Concat 4D")
258+ # print("=" * 80)
259+ # test_concat_4d(device)
260+
261+ # Test Conv module
262+ # print("\n" + "=" * 80)
263+ # print("Testing Conv Module")
264+ # print("=" * 80)
265+ # test_conv_module(device, batch=batch, c1=32, c2=32, k=1, s=1, p=None, g=1, d=1, act=False, h=16, w=16)
266+
267+ # Test Bottleneck module
268+ # print("\n" + "=" * 80)
269+ # print("Testing Bottleneck Module")
270+ # print("=" * 80)
271+ # test_bottleneck_module(device, batch=batch, c1=32, c2=32, shortcut=True, g=1, e=0.5, h=16, w=16)
272+
273+ # Test C3 module
274+ # print("\n" + "=" * 80)
275+ # print("Testing C3 Module")
276+ # print("=" * 80)
277+ # test_c3_module(device, batch=batch, c1=64, c2=64, n=1, h=16, w=16)
278+
279+ # Test full YOLOv5 model
280+ print ("\n " + "=" * 80 )
281+ print ("Testing Full YOLOv5 Model" )
282+ print ("=" * 80 )
62283 run_yolo (batch , config )
0 commit comments