
Commit 2c16654

[Doc] Refresh README for the latest version
1 parent 9b483f7 commit 2c16654

3 files changed: 126 additions & 57 deletions


.github/workflows/tag_release.yml

Lines changed: 10 additions & 2 deletions
```diff
@@ -7,7 +7,7 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
 
     permissions:
       contents: read
@@ -28,9 +28,17 @@ jobs:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
 
+    - name: Set Tag Environment
+      run: |
+        echo "IMAGE_TAG=torchsim-ci:${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
+        echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV
+        echo "GITHUB_SHA=$GITHUB_SHA"
+
     - name: Build and Push Docker Image
       uses: docker/build-push-action@v6
       with:
         context: .
         file: ./Dockerfile
-        tags: ghcr.io/psal-postech/torchsim-release:${{ github.ref_name }}
+        secrets: |
+          GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
+        tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
```
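For reference, the `Set Tag Environment` step above relies on shell parameter expansion to strip the `refs/tags/` prefix from the pushed tag ref. A quick local sketch of that expansion (the tag value here is made up; in Actions, `GITHUB_REF` is provided at runtime):

```bash
# Reproduce the expansion outside of Actions with an illustrative tag ref.
GITHUB_REF="refs/tags/v1.0.1"
echo "torchsim-ci:${GITHUB_REF#refs/tags/}"   # prints: torchsim-ci:v1.0.1
```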

PyTorchSimFrontend/extension_config.py

Lines changed: 29 additions & 8 deletions
```diff
@@ -57,23 +57,44 @@ def __getattr__(name):
 
     # Compiler Optimization
     if name == "codegen_compiler_optimization":
-        return config_json["codegen_compiler_optimization"]
+        opt_level = config_json["codegen_compiler_optimization"]
+        valid_opts = {
+            "fusion",
+            "reduction_epilogue",
+            "reduction_reduction",
+            "prologue",
+            "single_batch_conv",
+            "multi_tile_conv",
+            "subtile"
+        }
+        if opt_level == "all" or opt_level == "none":
+            pass
+        elif isinstance(opt_level, list):
+            # Check that the provided list contains only valid options
+            invalids = set(opt_level) - valid_opts
+            assert not invalids, f"Invalid optimization options found: {invalids}"
+        else:
+            assert False, "Invalid format: must be 'all', 'none', or a list of options."
+        return opt_level
 
     # Advanced fusion options
+    is_opt_enabled = lambda key: (__getattr__("codegen_compiler_optimization") == "all") or \
+                                 (isinstance(__getattr__("codegen_compiler_optimization"), list) and \
+                                  key in __getattr__("codegen_compiler_optimization"))
     if name == "CONFIG_FUSION":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "fusion" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("fusion")
     if name == "CONFIG_FUSION_REDUCTION_EPILOGUE":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "reduction_epliogue" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("reduction_epilogue")  # Fixed typo here as well
     if name == "CONFIG_FUSION_REDUCTION_REDUCTION":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "reduction_reduction" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("reduction_reduction")
     if name == "CONFIG_FUSION_PROLOGUE":
-        return True if ((__getattr__("codegen_compiler_optimization") == "all") or ("prologue" in __getattr__("codegen_compiler_optimization"))) else False
+        return is_opt_enabled("prologue")
     if name == "CONFIG_SINGLE_BATCH_CONV":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "single_batch_conv" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("single_batch_conv")
     if name == "CONFIG_MULTI_TILE_CONV":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "multi_tile_conv" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("multi_tile_conv")
     if name == "CONFIG_SUBTILE":
-        return True if (__getattr__("codegen_compiler_optimization") == "all" or "subtile" in __getattr__("codegen_compiler_optimization")) else False
+        return is_opt_enabled("subtile")
 
     if name == "CONFIG_TOGSIM_DEBUG_LEVEL":
         return os.environ.get("TOGSIM_DEBUG_LEVEL", "")
```
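To illustrate the accepted formats for `codegen_compiler_optimization`, here is a minimal standalone sketch of the validation above (a hypothetical reimplementation for illustration, not the actual module):

```python
# Mirrors the valid_opts set from extension_config.py above.
valid_opts = {"fusion", "reduction_epilogue", "reduction_reduction", "prologue",
              "single_batch_conv", "multi_tile_conv", "subtile"}

def check_opt_level(opt_level):
    # Accepts "all", "none", or a list drawn from valid_opts.
    if opt_level in ("all", "none"):
        return opt_level
    if isinstance(opt_level, list):
        invalids = set(opt_level) - valid_opts
        assert not invalids, f"Invalid optimization options found: {invalids}"
        return opt_level
    raise AssertionError("Invalid format: must be 'all', 'none', or a list of options.")

print(check_opt_level("all"))                  # -> all
print(check_opt_level(["fusion", "subtile"]))  # -> ['fusion', 'subtile']
# check_opt_level(["fussion"]) raises AssertionError naming the invalid option.
```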

README.md

Lines changed: 87 additions & 47 deletions
````diff
@@ -78,6 +78,7 @@ Any x86 hardware capable of running Docker with more than 20 GB of memory -->
 - View
 - Activation
 - Pooling
+- Etc (WIP)
 
 ## Getting Started
 ### Quick start with pre-built Docker image
@@ -86,7 +87,7 @@ To download the latest Docker image and set up the environment, use the followin
 
 ```bash
 # Run the Docker container
-docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:v1.0.0 bash
+docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:v1.0.1 bash
 ```
 ### Manual Setting (Optional)
 This script builds the [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) simulators from source code for advanced users.
````
````diff
@@ -105,17 +106,18 @@ You can run your own PyTorch model on PyTorchSim by setting up a custom NPU devi
 This method also applies when you want to simulate models beyond the provided examples.
 ```python
 import torch
-from Scheduler.scheduler import ExecutionEngine
+from Scheduler.scheduler import PyTorchSimRunner
 # Declare a custom NPU device
-device = ExecutionEngine.setup_device().custom_device()
+device = PyTorchSimRunner.setup_device().custom_device()
 
 # Declare your own model (e.g. resnet18 from torchvision)
 from torchvision.models import resnet18
-model = resnet50().eval()
+model = resnet18().eval()
+x = torch.randn(1, 3, 224, 224, dtype=torch.float32)
 
 # Move model and input tensors to the custom device
 model.to(device)
-x.to(device)
+x = x.to(device)
 
 # Compile and run the model with PyTorchSim
 compiled_model = torch.compile(dynamic=False)(model)
````
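The hunk ends at the `torch.compile` call; presumably the original example goes on to invoke the compiled model, which is what triggers the TOG generation and simulation described next. A minimal sketch of that assumed continuation:

```python
# Assumed continuation (not shown in this hunk): run one inference on the NPU device.
with torch.no_grad():
    out = compiled_model(x)
print(out.shape)
```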
````diff
@@ -127,41 +129,73 @@ PyTorchSim automatically generates a Tile-Operation Graph (TOG), and runs it thr
 ### Result
 Running log in CLI
 ```bash
-Wrapper Codegen Path = /tmp/torchinductor_root/yd/cyda7nhzv5mtakfhfcxtmmhtsv6kg7sza4k6wpkdgk7oxbpvqnlz.py
-[Gem5Simulator] cmd> /workspace/gem5/build/RISCV/gem5.opt -r --stdout-file=sto.log -d /tmp/torchinductor/tmp/fy6nnyudtno/m5out /root/workspace/PyTorchSim/gem5_script/script_systolic.py -c /tmp/torchinductor/tmp/fy6nnyudtno/cycle_bin --vlane 128
-[Gem5Simulator] Simulation is still running...
-[SpikeSimulator] cmd> spike --isa rv64gcv --varch=vlen:256,elen:64 --vectorlane-size=128 -m0x80000000:0x1900000000,0x2000000000:0x1000000 --scratchpad-base-paddr=137438953472 --scratchpad-base-vaddr=3489660928 --scratchpad-size=131072 --kernel-addr=0000000000010400:10846 --base-path=/tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001 /workspace/riscv-pk/build/pk /tmp/torchinductor/tmp/fy6nnyudtno/validation_binary /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg0_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/arg1_1/0.raw /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/buf0/0.raw
-[TOGSimulator] cmd> /root/workspace/PyTorchSim/TOGSim/build/bin/Simulator --config /root/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json --models_list /tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx --attributes_list /tmp/torchinductor/tmp/fy6nnyudtno/runtime_0001/attribute/0
-[TOGSimulator] Simulation is still running..
-[TOGSimulator] Simulation of "/tmp/torchinductor/tmp/fy6nnyudtno/tile_graph.onnx" is stored to "/tmp/torchinductor/tmp/fy6nnyudtno/togsim_result/0"
+Wrapper Codegen Path = /tmp/torchinductor_root/fo/cfofsp5nwmpqxctouan2v2t5y7qp5vwrgvw4swssx4ca4us3c5tx.py
+[Gem5] Gem5 is running.
+[Spike] Running Spike simulator
+[TOGSim] TOGSim is running..
+[TOGSim] Simulation log is stored to "/workspace/PyTorchSim/togsim_results/20251205_080553.log"
 ----------------------------
 |Matmul Forward Test Passed|
 ----------------------------
 ```
 
 Simulation consists of three steps:
 
-1. `Gem5Simulator` obatins compute latency for TOG.
-2. `SpikeSimulator` verifies the output code.
-3. `TOGSimulator` simulates a NPU architecture.
+1. `Gem5` obtains compute latency for TOG.
+2. `Spike` verifies the output code.
+3. `TOGSim` simulates an NPU architecture.
 
 If you want to turn off the `SpikeSimulator` for fast simulation, you can set it as below.
 ```bash
 export pytorchsim_functional_mode=False
 ```
 The log contains memory & core stats.
 ```bash
-[info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes)
-[info] Row hits: 359, Row misses: 26, Row conflicts: 0
-[info] ========= Core stat =========
-[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active_cycles 0, idle_cycles 1014
-[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active_cycles 128, idle_cycles 886
-[info] Core [0] : DMA active_cycles 3 DMA idle_cycles 1011 DRAM BW 182.000 GB/s (6144)
-[info] Core [0] : Vector Unit Utilization(%) 4.34, active_cycles 44, idle_cycle 0
-[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests
-[info] Core [0] : Total_cycles 1014
-[info] Total execution cycles: 1014
-[info] Wall-clock time for simulation: 0.039296 seconds
+[2025-12-05 08:05:52.538] [info] HBM2-CH_0: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
+[2025-12-05 08:05:52.538] [info] HBM2-CH_1: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
+[2025-12-05 08:05:52.538] [info] HBM2-CH_2: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_3: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
+[2025-12-05 08:05:52.538] [info] HBM2-CH_4: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_5: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_6: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
+[2025-12-05 08:05:52.538] [info] HBM2-CH_7: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
+[2025-12-05 08:05:52.538] [info] HBM2-CH_8: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_9: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_10: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
+[2025-12-05 08:05:52.538] [info] HBM2-CH_11: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_12: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
+[2025-12-05 08:05:52.538] [info] HBM2-CH_13: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
+[2025-12-05 08:05:52.538] [info] HBM2-CH_14: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
+[2025-12-05 08:05:52.538] [info] HBM2-CH_15: avg BW utilization 49% (768 reads, 256 writes)
+[2025-12-05 08:05:52.538] [info] ===== Instructions count =====
+[2025-12-05 08:05:52.538] [info] Core [0] : MOVIN inst_count 3
+[2025-12-05 08:05:52.538] [info] Core [0] : MOVOUT inst_count 1
+[2025-12-05 08:05:52.538] [info] Core [0] : COMP inst_count 10 (GEMM: 8, Vector: 2)
+[2025-12-05 08:05:52.538] [info] Core [0] : BAR inst_count 8
+[2025-12-05 08:05:52.538] [info] ========= Core stat =========
+[2025-12-05 08:05:52.538] [info] Core [0] : Systolic array [0] utilization(%) 12.40, active_cycles 256, idle_cycles 1809
+[2025-12-05 08:05:52.538] [info] Core [0] : Systolic array [1] utilization(%) 12.40, active_cycles 256, idle_cycles 1809
+[2025-12-05 08:05:52.538] [info] Core [0] : DMA active_cycles, 1024 DMA idle_cycles 1041, DRAM BW 238.000 GB/s (16384 responses)
+[2025-12-05 08:05:52.538] [info] Core [0] : Vector unit utilization(%) 2.42, active cycle 50, idle_cycle 0
+[2025-12-05 08:05:52.538] [info] Core [0] : NUMA local memory: 16384 requests, remote memory: 0 requests
+[2025-12-05 08:05:52.538] [info] Core [0] : Total_cycles 2065
+[2025-12-05 08:05:52.538] [info] Total execution cycles: 2065
+[2025-12-05 08:05:52.538] [info] Wall-clock time for simulation: 0.147463 seconds
 ```
 The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below.
 ```bash
````
````diff
@@ -193,12 +227,12 @@ import os
 import sys
 import torch
 from torchvision.models import resnet18
-from test_transformer import EncoderBlock
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json'
 
 sys.path.append(base_path)
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+from tests.test_transformer import EncoderBlock
+from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
 scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
 
 # Register compiled model
````
````diff
@@ -232,13 +266,13 @@ model1_lambda = 3.0
 max_time = 1000.0 # [s]
 
 # Generate Poisson distribution requests for model0
-for model0_request_time in poisson_request_generator(model0_lambda, total_time=max_time):
+for model0_request_time in poisson_request_generator(model0_lambda, max_msec_time=max_time):
     x = torch.randn(1, 3, 224, 224)
     new_request = Request("model0", [x], [], request_queue_idx=0)
     scheduler.add_request(new_request, request_time=model0_request_time)
 
 # Generate Poisson distribution requests for model1
-for model1_request_time in poisson_request_generator(model1_lambda, total_time=max_time):
+for model1_request_time in poisson_request_generator(model1_lambda, max_msec_time=max_time):
     x = torch.randn(128, 768)
     new_request = Request("model1", [x], [], request_queue_idx=1)
     scheduler.add_request(new_request, request_time=model1_request_time)
````
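For intuition about `poisson_request_generator`: arrivals of a Poisson process with rate λ have exponentially distributed inter-arrival gaps, so a generator of this shape can be sketched as below (a hypothetical reimplementation for illustration, not PyTorchSim's actual code):

```python
import random

def poisson_arrival_times(lam, max_msec_time):
    # Draw exponential inter-arrival gaps and yield cumulative arrival
    # timestamps until the time horizon is exceeded.
    t = 0.0
    while True:
        t += random.expovariate(lam)
        if t > max_msec_time:
            return
        yield t

for arrival in poisson_arrival_times(3.0, 1000.0):
    pass  # e.g., build a Request and call scheduler.add_request(..., request_time=arrival)
```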
````diff
@@ -267,18 +301,19 @@ PyTorchSim provides three mapping strategies.
 ### Heuristic-based mapping
 By default, we adopt a modified version of [GEMMINI](https://github.com/ucb-bar/gemmini)'s heuristic-based mapping, which maximizes the utilization of scratchpad memory.
 ### Auto-tuning
-Heuristic method is not optimal for some cases. PyTorchSim provides auto-tuning to find best mapping for GEMM, CONV, and vector operations. It reduces searching space by sorting of scratchpad memory utilization and pick top-k candiates. Searching parameters are tile shape and vector lane stride.
+The heuristic method may not be optimal for all cases. PyTorchSim provides auto-tuning to find the best mapping for GEMM, CONV, and vector operations. It reduces the search space by sorting candidates based on scratchpad memory utilization and picking the top-k candidates. Search parameters include tile shape and vector lane stride.
+
+To enable this, update your configuration file as follows:
 ```bash
-export AUTOTUNE=True
-export AUTOTUNE_TEMPLATE=True
+"codegen_mapping_strategy" : "autotune"
 ```
 ### Manual setting
-User can exploit third-party(e.g. Timeloop) mapping. Set the cheatsheet path and write down their own mapping.
-
+Users can utilize third-party mapping tools (e.g., Timeloop). You can explicitly set the mapping file path in the configuration file to apply your own mapping strategies.
 ```bash
-export CONFIG_GEMM_CHEATSHEET_PATH=validation/gemm_tpuv3_cheatsheet.json
+"codegen_mapping_strategy" : "external",
+"codegen_external_mapping_file" : "path/to/mapping_file.json",
 ```
-Key: "M_K_N" for GEMM
+Key: "M_N_K" for GEMM
 ```
 {
     "512_2048_8192" : {
````
````diff
@@ -298,13 +333,7 @@ Key: "M_K_N" for GEMM
     }
 }
 ```
-If you want to explore specific tile size, set the environment variable as below.
-```bash
-export TORCHSIM_MANUAL_TILE_SIZE=1
-export TORCHSIM_TILE_M=512
-export TORCHSIM_TILE_N=512
-export TORCHSIM_TILE_K=512
-```
+
 ## L2 Cache
 PyTorchSim supports the L2 cache as a persistent cache. Users can provide a software-managed allocation/eviction strategy for tensors in the persistent cache.
````

````diff
@@ -329,8 +358,6 @@ Last but not least, you must set `l2d_type` and `l2d_config` in the [TOGSim conf
 
 You can configure these options using environment variables.
 ```bash
-export vpu_num_lanes=128 # vector lane size
-export vpu_num_lanes_STRIDE=2 # vector lane stride for DMA
 export TORCHSIM_DIR=/workspace/PyTorchSim # home directory
 
 # Plan which tensors are allocated in TPUv4's CMEM
````
````diff
@@ -348,6 +375,10 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 "core_freq_mhz" : 940, // Core's frequency (MHz)
 "num_systolic_array_per_core" : 2, // Number of systolic arrays per core
 
+"vpu_num_lanes" : 128, // Number of VPU lanes
+"vpu_spad_size_kb_per_lane" : 128, // Scratchpad memory size per lane (KB)
+"vpu_vector_length_bits" : 256, // VPU vector register length (bits)
+
 "dram_type" : "ramulator2", // DRAM type (ex. ramulator2, simple)
 "dram_freq_mhz" : 940, // DRAM frequency (MHz)
 "dram_channels": 32, // Number of DRAM channels
````
````diff
@@ -371,7 +402,16 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 "partition": { // allocate request queue index
     "core_0":0,
     "core_1":1
-}
+},
+
+"codegen_mapping_strategy" : "heuristic", // Compiler mapping strategy (ex. "heuristic", "autotune", "external-then-heuristic", "external-then-autotune")
+"codegen_external_mapping_file" : "", // Path to external mapping file
+"codegen_autotune_max_retry": 10, // Maximum retries for autotuning
+"codegen_autotune_template_topk": 4, // Top-K templates to consider during autotuning
+// Compiler optimization level/options.
+// Value can be "all", "none", or a list of specific optimizations:
+// ["fusion", "reduction_epilogue", "reduction_reduction", "prologue", "single_batch_conv", "multi_tile_conv", "subtile"]
+"codegen_compiler_optimization" : "all"
 ```
 You can set TOGSim config path as below.
 ```bash
````
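Per the comment in the hunk above, `"codegen_compiler_optimization"` also accepts a subset of the listed passes instead of `"all"`, e.g. `"codegen_compiler_optimization" : ["fusion", "subtile"]` (an illustrative selection), which enables only the named optimizations.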
