Skip to content

Commit 66b0103

Browse files
committed
[TOGSim] Fix DMA stat logging and unify stat formatting
1 parent a553288 commit 66b0103

15 files changed

Lines changed: 135 additions & 70 deletions

File tree

README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,14 @@ Log contains memory & core stats.
154154
[info] HBM2-CH_0: avg BW utilization 37% (255 reads, 128 writes)
155155
[info] Row hits: 359, Row misses: 26, Row conflicts: 0
156156
[info] ========= Core stat =========
157-
[info] Core [0] : Systolic array [0] Utilization(%) 0.00, active cycle 0, idle cycle 1014
158-
[info] Core [0] : Systolic array [1] Utilization(%) 12.62, active cycle 128, idle cycle 886
159-
[info] Core [0] : DMA active cycle 3 DMA idle cycle 1011 DRAM BW 182.000 GB/s (6144)
160-
[info] Core [0] : Vector Unit Utilization(%) 4.34, active cycle 44, idle_cycle 0
161-
[info] Core [0] : NUMA local access count : 0, NUMA remote access count : 0
162-
[info] Core [0] : Total cycle 1014
163-
[info] Total execution cycle: 1014
164-
[info] Simulation wall clock time: 0.039296 seconds
157+
[info] Core [0] : Systolic array [0] utilization(%) 0.00, active_cycles 0, idle_cycles 1014
158+
[info] Core [0] : Systolic array [1] utilization(%) 12.62, active_cycles 128, idle_cycles 886
159+
[info] Core [0] : DMA active_cycles 3, DMA idle_cycles 1011, DRAM BW 182.000 GB/s (6144 responses)
160+
[info] Core [0] : Vector unit utilization(%) 4.34, active_cycles 44, idle_cycles 0
161+
[info] Core [0] : NUMA local memory: 34 requests, remote memory: 0 requests
162+
[info] Core [0] : Total_cycles 1014
163+
[info] Total execution cycles: 1014
164+
[info] Wall-clock time for simulation: 0.039296 seconds
165165
```
166166
The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below.
167167
```bash

Simulator/simulator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -443,12 +443,12 @@ def get_result_from_file(result_path):
443443
if 'DRAM: AVG BW Util' in line:
444444
avg_dram_bw = float(re.search(r'AVG BW Util (\d+\.?\d*)%', line).group(1))
445445

446-
if 'Total execution cycle' in line:
447-
total_cycle = int(re.search(r'Total execution cycle: (\d+)', line).group(1))
446+
if 'Total execution cycles' in line:
447+
total_cycle = int(re.search(r'Total execution cycles: (\d+)', line).group(1))
448448

449449
# Parse total simulation time
450-
if 'Simulation wall clock time' in line:
451-
simulation_time = float(re.search(r'Simulation wall clock time: (\d+\.?\d*) seconds', line).group(1))
450+
if 'Wall-clock time for simulation' in line:
451+
simulation_time = float(re.search(r'Wall-clock time for simulation: (\d+\.?\d*) seconds', line).group(1))
452452
return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle
453453

454454
if __name__ == "__main__":

TOGSim/include/Core.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ class Core {
2828
virtual mem_fetch* top_memory_request() { return _request_queue.front(); }
2929
virtual void push_memory_response(mem_fetch* response);
3030
void check_tag() { _dma.check_table(); }
31-
void inc_numa_hit() { _stat_numa_hit++; }
32-
void inc_numa_miss() { _stat_numa_miss++; }
31+
void inc_numa_local_access() { _stat_numa_local_access++; }
32+
void inc_numa_remote_access() { _stat_numa_remote_access++; }
3333

3434
std::queue<std::shared_ptr<Instruction>>& get_compute_pipeline(int compute_type);
3535
enum {
@@ -69,8 +69,8 @@ class Core {
6969
uint64_t _stat_tot_mem_response = 0;
7070
uint64_t _stat_gemm_inst = 0;
7171
uint64_t _stat_skip_dma = 0;
72-
uint64_t _stat_numa_hit = 0;
73-
uint64_t _stat_numa_miss = 0;
72+
uint64_t _stat_numa_local_access = 0;
73+
uint64_t _stat_numa_remote_access = 0;
7474

7575
cycle_type _stat_vu_compute_cycle = 0;
7676
std::vector<cycle_type> _stat_sa_compute_cycle;

TOGSim/include/DMA.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <cstdint>
55
#include <memory>
6+
#include <queue>
67
#include <map>
78
#include <vector>
89
#include "Instruction.h"
@@ -114,7 +115,7 @@ class DMA {
114115
}
115116

116117
std::shared_ptr<Instruction>& get_current_inst() { return _current_inst; }
117-
std::shared_ptr<std::vector<mem_fetch*>> get_memory_access(cycle_type core_cycle);
118+
std::shared_ptr<std::vector<mem_fetch*>> get_memory_access(cycle_type core_cycle, int nr_req);
118119
uint32_t generate_mem_access_id();
119120
const uint32_t get_max_dim() { return _max_dim; }
120121

@@ -130,5 +131,7 @@ class DMA {
130131
bool _finished=true;
131132
std::map<int, std::map<std::vector<int>, uint32_t>> tag_table;
132133
std::map<int, std::map<std::vector<int>, std::vector<std::shared_ptr<Instruction>>>> waiters;
134+
std::queue<mem_fetch*> _pending_accesses;
135+
bool _generated_once = false;
133136
};
134137
#endif

TOGSim/include/SimulationConfig.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ struct SimulationConfig {
2727
/* DRAM config */
2828
DramType dram_type;
2929
uint32_t dram_num_partitions = 1;
30+
uint32_t dram_channels_per_partitions = 0;
3031
uint32_t dram_freq_mhz;
3132
uint32_t dram_channels;
3233
uint32_t dram_req_size;

TOGSim/src/Common.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,15 @@ SimulationConfig initialize_config(json config) {
7474
parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"];
7575
if(config.contains("dram_num_burst_length"))
7676
parsed_config.dram_nbl = config["dram_num_burst_length"];
77-
if (config.contains("dram_num_partitions"))
77+
if (config.contains("dram_num_partitions")) {
7878
parsed_config.dram_num_partitions = config["dram_num_partitions"];
79+
if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) {
80+
throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions");
81+
}
82+
}
83+
parsed_config.dram_channels_per_partitions =
84+
parsed_config.dram_channels / parsed_config.dram_num_partitions;
85+
7986

8087
/* L2D config */
8188
if (config.contains("l2d_type")) {

TOGSim/src/Core.cc

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ void Core::dma_cycle() {
175175
}
176176
}
177177
/* Generate memfetch */
178-
auto access_vec = _dma.get_memory_access(_core_cycle);
178+
auto access_vec = _dma.get_memory_access(_core_cycle, _config.icnt_injection_ports_per_core);
179179
for (auto access : *access_vec) {
180180
access->set_start_cycle(_core_cycle);
181181
_request_queue.push(access);
@@ -411,24 +411,43 @@ void Core::print_stats() {
411411
std::vector<float> sa_utilization;
412412
update_stats();
413413
spdlog::info("===== Instructions count =====");
414-
for (int i=0; i < static_cast<size_t>(Opcode::COUNT); i++) {
415-
if (i == static_cast<size_t>(Opcode::COMP))
416-
spdlog::info("Core [{}] : {} inst count {} (GEMM: {}, Vector: {}), skipped inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_inst_count.at(i), _stat_gemm_inst, _stat_inst_count.at(i) - _stat_gemm_inst, _stat_tot_skipped_inst.at(i));
417-
else
418-
spdlog::info("Core [{}] : {} inst count {}, skipped inst count {}", _id, opcode_to_string(static_cast<Opcode>(i)), _stat_inst_count.at(i), _stat_tot_skipped_inst.at(i));
414+
for (int i = 0; i < static_cast<size_t>(Opcode::COUNT); i++) {
415+
auto opcode = static_cast<Opcode>(i);
416+
auto inst = _stat_inst_count.at(i);
417+
auto skipped = _stat_tot_skipped_inst.at(i);
418+
auto name = opcode_to_string(opcode);
419+
420+
if (opcode == Opcode::COMP) {
421+
auto gemm = _stat_gemm_inst;
422+
auto vector = inst - gemm;
423+
if (skipped)
424+
spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}",
425+
_id, name, inst, gemm, vector, skipped);
426+
else
427+
spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})",
428+
_id, name, inst, gemm, vector);
429+
}
430+
else {
431+
if (skipped)
432+
spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}",
433+
_id, name, inst, skipped);
434+
else
435+
spdlog::info("Core [{}] : {:8} inst_count {}",
436+
_id, name, inst);
437+
}
419438
}
420439
spdlog::info("========= Core stat =========");
421440
for (int i=0; i<_num_systolic_array_per_core; i++)
422441
sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
423442
for (int i=0; i<_num_systolic_array_per_core; i++)
424-
spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
443+
spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i),
425444
_stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i));
426445
float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle
427-
spdlog::info("Core [{}] : DMA active cycle {} DMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response);
428-
spdlog::info("Core [{}] : Vector Unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
446+
// Each counter is followed by its separator, so the line reads
// "DMA active_cycles N, DMA idle_cycles M, DRAM BW ..." like the DMA line in print_current_stats().
spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response);
447+
// Vector unit totals: utilization is active cycles as a percentage of _core_cycle.
// Labels use the same active_cycles / idle_cycles spelling as the systolic-array line.
spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id,
    static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle);
430-
spdlog::info("Core [{}] : NUMA local access count : {}, NUMA remote access count : {}", _id, _stat_numa_hit, _stat_numa_miss);
431-
spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
449+
spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access);
450+
spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle);
432451
}
433452

434453
void Core::print_current_stats() {
@@ -442,12 +461,12 @@ void Core::print_current_stats() {
442461

443462
spdlog::info("========= Core stat =========");
444463
for (int i=0; i<_num_systolic_array_per_core; i++)
445-
spdlog::info("Core [{}] : Systolic array [{}] Utilization(%) {:.2f}, active cycle {}, idle cycle {}", _id, i, sa_utilization.at(i),
464+
spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i),
446465
_stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i));
447-
spdlog::info("Core [{}] : DMA active cycle {} DMA idle cycle {} DRAM BW {:.3f} GB/s ({})", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response);
448-
spdlog::info("Core [{}] : Vector Unit Utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
466+
spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response);
467+
// Vector unit stats for the current interval: utilization is active cycles as a
// percentage of core_print_interval. "utilization" is lower-case to match the systolic-array line.
spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id,
    static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
450-
spdlog::info("Core [{}] : Total cycle {}", _id, _core_cycle);
469+
spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle);
451470
update_stats();
452471
}
453472

TOGSim/src/DMA.cc

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,62 @@ void DMA::issue_tile(std::shared_ptr<Instruction> inst) {
1818
_finished = false;
1919
}
2020

21-
std::shared_ptr<std::vector<mem_fetch*>> DMA::get_memory_access(cycle_type core_cycle) {
22-
auto addr_set = _current_inst->get_dram_address(_dram_req_size);
21+
std::shared_ptr<std::vector<mem_fetch*>> DMA::get_memory_access(cycle_type core_cycle, int nr_req) {
22+
23+
if (!_generated_once) {
24+
std::shared_ptr<std::set<addr_type>> addr_set =
25+
_current_inst->get_dram_address(_dram_req_size);
26+
27+
Tile* owner = (Tile*)_current_inst->get_owner();
28+
std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
29+
unsigned long long base_daddr = _current_inst->get_base_dram_address();
30+
31+
bool is_cacheable =
32+
owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
33+
34+
spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}",
35+
core_cycle, _id, base_daddr, is_cacheable);
36+
spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
37+
core_cycle, _id, owner_subgraph->get_core_id(),
38+
_current_inst->get_numa_id(), _current_inst->get_addr_name(),
39+
_current_inst->is_dma_write());
40+
for (const auto& addr : *addr_set) {
41+
mem_access_type acc_type =
42+
_current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W
43+
: mem_access_type::GLOBAL_ACC_R;
44+
mf_type type =
45+
_current_inst->is_dma_write() ? mf_type::WRITE_REQUEST
46+
: mf_type::READ_REQUEST;
47+
48+
mem_fetch* access = new mem_fetch(
49+
addr, acc_type, type, _dram_req_size,
50+
_current_inst->get_numa_id(),
51+
static_cast<void*>(_current_inst.get()));
52+
53+
access->set_cacheable(is_cacheable);
54+
_current_inst->inc_waiting_request();
55+
_pending_accesses.push(access);
56+
}
57+
_generated_once = true;
58+
}
59+
60+
if (nr_req == -1)
61+
nr_req = _pending_accesses.size();
62+
63+
// Return pending accesses up to nr_req
2364
auto access_vec = std::make_shared<std::vector<mem_fetch *>>();
24-
Tile* owner = (Tile*)_current_inst->get_owner();
25-
std::shared_ptr<TileSubGraph> owner_subgraph = owner->get_owner();
26-
unsigned long long base_daddr = _current_inst->get_base_dram_address();
27-
// Todo. We use a ternsor level buffer allocation, so we don't need to check all memfetch
28-
bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
29-
spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", core_cycle, _id, base_daddr, is_cacheable);
30-
spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
31-
core_cycle, _id, owner_subgraph->get_core_id(), _current_inst->get_numa_id(), _current_inst->get_addr_name(), _current_inst->is_dma_write());
32-
33-
for (auto addr: *addr_set) {
34-
mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W : mem_access_type::GLOBAL_ACC_R;
35-
mf_type type = _current_inst->is_dma_write() ? mf_type::WRITE_REQUEST : mf_type::READ_REQUEST;
36-
mem_fetch* access = new mem_fetch(addr, acc_type, type, _dram_req_size, _current_inst->get_numa_id(), static_cast<void*>(_current_inst.get()));
37-
access->set_cacheable(is_cacheable);
38-
_current_inst->inc_waiting_request();
39-
access_vec->push_back(access);
65+
for (int i = 0; i < nr_req; i++) {
66+
if (_pending_accesses.empty())
67+
break;
68+
access_vec->push_back(_pending_accesses.front());
69+
_pending_accesses.pop();
4070
}
41-
_finished = true;
71+
72+
if (_pending_accesses.empty()) {
73+
_finished = true;
74+
_generated_once = false;
75+
}
76+
4277
return access_vec;
4378
}
4479

TOGSim/src/Dram.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
1717
_n_bl = config.dram_nbl;
1818
_req_size = config.dram_req_size;
1919
_n_partitions = config.dram_num_partitions;
20-
_n_ch_per_partition = _n_ch / _n_partitions;
20+
_n_ch_per_partition = config.dram_channels_per_partitions;
2121
_config = config;
2222

2323
spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size);

TOGSim/src/Simulator.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,11 +117,11 @@ void Simulator::icnt_cycle() {
117117
mem_fetch *front = _cores[core_id]->top_memory_request();
118118
front->set_core_id(core_id);
119119
if (!_icnt->is_full(port_id, front)) {
120-
//int node_id = _dram->get_channel_id(front) / 16;
121-
//if (core_id == node_id)
122-
// _cores[core_id]->inc_numa_hit();
123-
//else
124-
// _cores[core_id]->inc_numa_miss();
120+
int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions;
121+
if (core_id == node_id)
122+
_cores[core_id]->inc_numa_local_access();
123+
else
124+
_cores[core_id]->inc_numa_remote_access();
125125
_icnt->push(port_id , get_dest_node(front), front);
126126
_cores[core_id]->pop_memory_request();
127127
_nr_from_core++;
@@ -291,5 +291,5 @@ void Simulator::print_core_stat()
291291
for (int core_id = 0; core_id < _n_cores; core_id++) {
292292
_cores[core_id]->print_stats();
293293
}
294-
spdlog::info("Total execution cycle: {}", _core_cycles);
294+
spdlog::info("Total execution cycles: {}", _core_cycles);
295295
}

0 commit comments

Comments
 (0)