From 26eecb254f0aca61f98c550eacf2a1cedc809393 Mon Sep 17 00:00:00 2001 From: Yanbo Date: Thu, 18 Dec 2025 04:41:18 -0500 Subject: [PATCH 1/6] cuThermo heatmap and block divergence feature --- include/sanalyzer.h | 2 + include/tools/block_divergence_analysis.h | 73 ++++++++ include/tools/heatmap_analysis.h | 78 +++++++++ include/tools/tool_type.h | 4 +- src/sanalyzer.cpp | 14 ++ src/tools/block_divergence_analysis.cpp | 190 +++++++++++++++++++++ src/tools/heatmap_analysis.cpp | 198 ++++++++++++++++++++++ 7 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 include/tools/block_divergence_analysis.h create mode 100644 include/tools/heatmap_analysis.h create mode 100644 src/tools/block_divergence_analysis.cpp create mode 100644 src/tools/heatmap_analysis.cpp diff --git a/include/sanalyzer.h b/include/sanalyzer.h index e42ae53..8525d1f 100644 --- a/include/sanalyzer.h +++ b/include/sanalyzer.h @@ -25,6 +25,8 @@ typedef enum { GPU_PATCH_TIME_HOTNESS_CPU = 8, GPU_PATCH_ROOFLINE_FLOPS_NVBIT = 9, GPU_PATCH_ROOFLINE_SIZE = 10, + GPU_PATCH_HEATMAP_ANALYSIS = 11, + GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12, } AccelProfPatchName_t; diff --git a/include/tools/block_divergence_analysis.h b/include/tools/block_divergence_analysis.h new file mode 100644 index 0000000..4fb52dd --- /dev/null +++ b/include/tools/block_divergence_analysis.h @@ -0,0 +1,73 @@ +#ifndef YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H +#define YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +namespace yosemite { + +class BlockDivergenceAnalysis final : public Tool { +public: + BlockDivergenceAnalysis(); + + ~BlockDivergenceAnalysis(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void evt_callback(EventPtr_t 
evt); + + void flush(); + +private: + void kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + +/* +********************************* variables ********************************* +*/ + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + struct BlockStat { + std::unordered_map pc_counts; + uint64_t read_count = 0; + uint64_t write_count = 0; + }; + + std::unordered_map _block_entries; + std::set _unique_pcs; +}; + +} // yosemite +#endif // YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h new file mode 100644 index 0000000..0206a56 --- /dev/null +++ b/include/tools/heatmap_analysis.h @@ -0,0 +1,78 @@ +#ifndef YOSEMITE_HEATMAP_ANALYSIS_H +#define YOSEMITE_HEATMAP_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +#include + +#define SECTOR_TAG_SHIFT 5 + +namespace yosemite { + +class HeatmapAnalysis final : public Tool { +public: + HeatmapAnalysis(); + + ~HeatmapAnalysis(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void evt_callback(EventPtr_t evt); + + void flush(); + +private: + void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length); + + void add_sector_pc_information(uint32_t sector_tag, uint64_t pc); + + void 
kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + +/* +********************************* variables ********************************* +*/ + + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + std::vector _traces; + std::unordered_map> _heatmap_data; + std::unordered_map> _sector_pc_information; + +}; + +} // namespace yosemite +#endif // YOSEMITE_HEATMAP_ANALYSIS_H diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h index 4115ebb..fdc70dc 100644 --- a/include/tools/tool_type.h +++ b/include/tools/tool_type.h @@ -16,7 +16,9 @@ typedef enum { ROOFLINE_FLOPS = 11, ROOFLINE_SIZE = 12, ROOFLINE_TIME = 13, - TOOL_NUMS = 14 + HEATMAP_ANALYSIS = 14, + BLOCK_DIVERGENCE_ANALYSIS = 15, + TOOL_NUMS = 16 } AnalysisTool_t; #endif // TOOL_TYPE_H \ No newline at end of file diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp index 0f250c5..038c37c 100644 --- a/src/sanalyzer.cpp +++ b/src/sanalyzer.cpp @@ -16,6 +16,8 @@ #include "tools/time_hotness_cpu.h" #include "tools/event_trace.h" #include "tools/event_trace_mgpu.h" +#include "tools/heatmap_analysis.h" +#include "tools/block_divergence_analysis.h" #include #include @@ -104,6 +106,12 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) { } else if (std::string(tool_name) == "event_trace_mgpu") { tool = EVENT_TRACE_MGPU; _tools.emplace(EVENT_TRACE_MGPU, std::make_shared()); + } else if (std::string(tool_name) == "heatmap_analysis") { + tool = HEATMAP_ANALYSIS; + _tools.emplace(HEATMAP_ANALYSIS, std::make_shared()); + } else if 
(std::string(tool_name) == "block_divergence_analysis") { + tool = BLOCK_DIVERGENCE_ANALYSIS; + _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared()); } else { fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n"); fflush(stderr); @@ -249,6 +257,12 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) { options.patch_name = GPU_NO_PATCH; } else if (tool == EVENT_TRACE_MGPU) { options.patch_name = GPU_NO_PATCH; + } else if (tool == HEATMAP_ANALYSIS) { + options.patch_name = GPU_PATCH_HEATMAP_ANALYSIS; + options.patch_file = "gpu_patch_heatmap_analysis.fatbin"; + } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) { + options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS; + options.patch_file = "gpu_patch_block_divergence_analysis.fatbin"; } // enable torch profiler? diff --git a/src/tools/block_divergence_analysis.cpp b/src/tools/block_divergence_analysis.cpp new file mode 100644 index 0000000..8dd16d6 --- /dev/null +++ b/src/tools/block_divergence_analysis.cpp @@ -0,0 +1,190 @@ +#include "tools/block_divergence_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ +#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE +#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2 +#endif + +using namespace yosemite; + + +BlockDivergenceAnalysis::BlockDivergenceAnalysis() : Tool(MEM_TRACE) { + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in BlockDivergenceAnalysis.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); + if (env_app_name != nullptr) { + output_directory = "block_distribution_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = 
"block_distribution_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); +} + + +BlockDivergenceAnalysis::~BlockDivergenceAnalysis() {} + + +void BlockDivergenceAnalysis::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _block_entries.clear(); + _unique_pcs.clear(); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::kernel_trace_flush(std::shared_ptr kernel) { + std::string filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping traces to %s\n", filename.c_str()); + + std::ofstream out(filename); + std::vector pc_list(_unique_pcs.begin(), _unique_pcs.end()); + std::sort(pc_list.begin(), pc_list.end()); + + std::vector block_ids; + block_ids.reserve(_block_entries.size()); + for (const auto& entry : _block_entries) { + block_ids.push_back(entry.first); + } + std::sort(block_ids.begin(), block_ids.end()); + + out << "blockidx,blockidy,blockidz"; + for (const auto pc : pc_list) { + out << ",0x" << std::hex << std::setw(16) << std::setfill('0') << pc << std::dec; + } + out << ",read_count,write_count" << std::endl; + + for (const auto block_id : block_ids) { + const auto& stats = _block_entries.at(block_id); + out << block_id << ",0,0"; + for (const auto pc : pc_list) { + auto pc_it = stats.pc_counts.find(pc); + uint64_t count = (pc_it != stats.pc_counts.end()) ? 
pc_it->second : 0; + out << "," << count; + } + out << "," << stats.read_count << "," << stats.write_count << std::endl; + } +} + + +void BlockDivergenceAnalysis::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::mem_alloc_callback(std::shared_ptr mem) { + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::mem_free_callback(std::shared_ptr mem) { + auto it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + active_memories.erase(it); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + active_tensors.erase(it); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + for (uint64_t i = 0; i < size; i++) { + const MemoryAccess& trace = accesses_buffer[i]; + uint64_t executed_inst_count = static_cast(__builtin_popcount(trace.active_mask)); + uint64_t pc = trace.pc; + uint64_t cta_id = trace.ctaId; + + auto& entry = _block_entries[cta_id]; + entry.pc_counts[pc] += executed_inst_count; + if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) { + entry.read_count += executed_inst_count; + } + if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) { + entry.write_count += executed_inst_count; + } + + _unique_pcs.insert(pc); + } + +} + + +void BlockDivergenceAnalysis::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case 
EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void BlockDivergenceAnalysis::flush() { +} diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp new file mode 100644 index 0000000..23510fe --- /dev/null +++ b/src/tools/heatmap_analysis.cpp @@ -0,0 +1,198 @@ +#include "tools/heatmap_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace yosemite; + + +HeatmapAnalysis::HeatmapAnalysis() : Tool(HEATMAP_ANALYSIS) { + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in HeatmapAnalysis.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); + if (env_app_name != nullptr) { + output_directory = "heatmap_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = "heatmap_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); +} + + +HeatmapAnalysis::~HeatmapAnalysis() {} + + +void HeatmapAnalysis::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _traces.clear(); + _heatmap_data.clear(); + _sector_pc_information.clear(); + + _timer.increment(true); +} + + +void HeatmapAnalysis::kernel_trace_flush(std::shared_ptr kernel) { + std::string 
filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping block 0 heatmap to %s\n", filename.c_str()); + + std::ofstream out(filename); + std::stringstream ss; + + std::vector>> sorted_heatmap_data(_heatmap_data.begin(), _heatmap_data.end()); + std::sort(sorted_heatmap_data.begin(), sorted_heatmap_data.end(), [](const std::pair>& a, const std::pair>& b) { + return a.first < b.first; + }); + ss << "Sector Tag,\t\tDistinct Warp Count,\tAccess Count,\t\t\tTouched PC" << std::endl; + for (auto& [tag, data] : sorted_heatmap_data) { + ss << "0x"<(data[i]).count() << ","; + } + ss << "\t\t"; + for (int i = 9; i < 18; i++) { + ss << data[i] << ","; + } + for (auto pc : _sector_pc_information[tag]) { + ss << "\t\t0x" << std::hex << pc << std::dec << ","; + } + ss << std::endl; + } + + out << ss.str(); + + out.close(); +} + + +void HeatmapAnalysis::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void HeatmapAnalysis::mem_alloc_callback(std::shared_ptr mem) { + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + + _timer.increment(true); +} + + +void HeatmapAnalysis::mem_free_callback(std::shared_ptr mem) { + auto it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + active_memories.erase(it); + + _timer.increment(true); +} + + +void HeatmapAnalysis::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + + _timer.increment(true); +} + + +void HeatmapAnalysis::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + active_tensors.erase(it); + + _timer.increment(true); +} + +// function signature: +// addr: the address of the memory access +// warp_id: the warp id of the memory access 
+// sector_tag: the sector tag of the memory access +// offset: the offset of the memory access +// length: the length of the memory access +// count_access_flag: whether to count the access flag +// return: void +void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length) { + + // heatmap_data[tag][0-7]: distinct warp id mask for each word in this sector; + // heatmap_data[tag][8]: distinct warp id mask for entire sector; + // heatmap_data[tag][9-17]: access count for each word and the last is for entire sector; + // // if count_access_flag is true, then the access count for the entire sector is incremented by 1; + auto& sector_data = _heatmap_data[sector_tag]; + auto mask = (1u << warp_id); + for (int i = 0; i < length; i+=4) { + sector_data[offset+i/4] |= mask; + sector_data[8] |= mask; + sector_data[9+offset+i/4] += 1; + } + sector_data[17] += 1; +} + +void HeatmapAnalysis::add_sector_pc_information(uint32_t sector_tag, uint64_t pc) { + _sector_pc_information[sector_tag].insert(pc); +} + + +void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + for (uint64_t i = 0; i < size; i++) { + auto trace = accesses_buffer[i]; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (trace.active_mask & (1u << j)) { + auto sector_tag = trace.addresses[j] >> SECTOR_TAG_SHIFT; + auto offset = (trace.addresses[j] & 31) >> 2; + unit_access(trace.warpId, sector_tag, offset, trace.accessSize); + add_sector_pc_information(sector_tag, trace.pc); + } + } + } +} +void HeatmapAnalysis::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + 
mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void HeatmapAnalysis::flush() { +} From 885927936cd35666d78aaee15c316eb277e39e80 Mon Sep 17 00:00:00 2001 From: Yanbo Date: Sun, 21 Dec 2025 05:05:33 -0500 Subject: [PATCH 2/6] local memory request support --- include/tools/heatmap_analysis.h | 2 +- src/tools/heatmap_analysis.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h index 0206a56..2a50475 100644 --- a/include/tools/heatmap_analysis.h +++ b/include/tools/heatmap_analysis.h @@ -35,7 +35,7 @@ class HeatmapAnalysis final : public Tool { private: void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length); - void add_sector_pc_information(uint32_t sector_tag, uint64_t pc); + void add_sector_pc_information(uint64_t sector_tag, uint64_t pc); void kernel_start_callback(std::shared_ptr kernel); diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp index 23510fe..6be87ce 100644 --- a/src/tools/heatmap_analysis.cpp +++ b/src/tools/heatmap_analysis.cpp @@ -149,7 +149,7 @@ void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_ sector_data[17] += 1; } -void HeatmapAnalysis::add_sector_pc_information(uint32_t sector_tag, uint64_t pc) { +void HeatmapAnalysis::add_sector_pc_information(uint64_t sector_tag, uint64_t pc) { _sector_pc_information[sector_tag].insert(pc); } @@ -168,6 +168,7 @@ void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) { } } } + void HeatmapAnalysis::evt_callback(EventPtr_t evt) { switch (evt->evt_type) { case EventType_KERNEL_LAUNCH: From 5431757d81a24caec06cb90a6a5ae1d727dc3b9f Mon Sep 17 00:00:00 2001 From: Yanbo Date: Tue, 6 Jan 2026 12:56:07 -0500 
Subject: [PATCH 3/6] cuVein initialize --- include/sanalyzer.h | 1 + include/tools/pc_dependency_analysis.h | 186 +++++++++++ include/tools/tool_type.h | 3 +- include/utils/event.h | 1 + src/sanalyzer.cpp | 9 + src/tools/pc_dependency_analysis.cpp | 408 +++++++++++++++++++++++++ 6 files changed, 607 insertions(+), 1 deletion(-) create mode 100644 include/tools/pc_dependency_analysis.h create mode 100644 src/tools/pc_dependency_analysis.cpp diff --git a/include/sanalyzer.h b/include/sanalyzer.h index 8525d1f..0d9e188 100644 --- a/include/sanalyzer.h +++ b/include/sanalyzer.h @@ -27,6 +27,7 @@ typedef enum { GPU_PATCH_ROOFLINE_SIZE = 10, GPU_PATCH_HEATMAP_ANALYSIS = 11, GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12, + GPU_PATCH_PC_DEPENDENCY_ANALYSIS = 13, } AccelProfPatchName_t; diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h new file mode 100644 index 0000000..1d2206c --- /dev/null +++ b/include/tools/pc_dependency_analysis.h @@ -0,0 +1,186 @@ +#ifndef YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H +#define YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace yosemite { + +/* we choose to use PC offset instead of PC because the PC is too long for shadow memory and it is not necessary to track the original PC. +The offset will be calculated during trace collection. + +Every memory allocation will cause a shadow memory to be created. +Every memory deallocation will cause a shadow memory to be destroyed. +Shadow memory bitmask will be reset when a kernel finished. (to avoid mass shadow memory reset) + +The gpu data analysis will +1.iterate the trace buffer and query the shadow memory to get the corresponding shadow memory entry. +2. compare the last access information with the current access information with the rules below: + 0. 
if the bitmask of this access is 0, the current access is a cold miss; set its ancient pc to 0xFFFFFFFF. + 1. if last access and current access are from the same thread, then it is an intra thread access. + 2. if last access and current access are from the same warp, then it is an intra warp access. + 3. if last access and current access are from the same block, then it is an intra block access. + 4. if last access and current access are from the same grid, then it is an intra grid access. +3. update the pc_statistics with the current pc, ancient pc and the distance. +4. update the shadow memory entry with the current pc and the flat thread id. +*/ + + +class memory_region{ +public: + memory_region() : start(0), end(0) {}; + memory_region(uint64_t start, uint64_t end) : start(start), end(end) {}; + ~memory_region() {}; + + bool contains(uint64_t ptr) const { + return ptr >= start && ptr < end; + }; + + bool operator==(const memory_region& other) const { + return start == other.start && end == other.end; + }; + + bool operator<(const memory_region& other) const { + // strict-weak-ordering: compare both start and end + if (start != other.start) return start < other.start; + return end < other.end; + }; + + uint64_t get_start() const { + return start; + }; + uint64_t get_end() const { + return end; + }; + +private: + uint64_t start; + uint64_t end; +}; + +class shadow_memory_entry{ +public: + shadow_memory_entry() {}; + ~shadow_memory_entry() {}; + + uint32_t last_pc = 0xFFFFFFFFu; // using offset of pc instead of original pc to save space and keep alignment; + uint32_t last_flat_thread_id = 0xFFFFFFFFu; // 0-5 bits for lane id, 6-10 bits for warp id, 11-31 bits for block id to save space; +}; + +class shadow_memory{ +public: + shadow_memory(uint64_t size) + : _shadow_memory_entries(std::make_unique(size)), + _size(size), + _shadow_memory_bitmap(std::vector((size + 7) / 8, 0)) { + printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size); + 
printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry)); + printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry)); + printf("[PC_DEPENDENCY] Shadow memory bitmap size: %lu\n", _shadow_memory_bitmap.size()); + }; + ~shadow_memory() = default; + void reset_bitmap() { + std::fill(_shadow_memory_bitmap.begin(), _shadow_memory_bitmap.end(), 0); + }; + shadow_memory_entry& get_entry(uint64_t offset) { + assert(offset < _size); + return _shadow_memory_entries[offset]; + } + bool is_valid(uint64_t ptr) { + return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); + } + void set_valid(uint64_t ptr) { + _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8)); + } + uint64_t _size; + std::unique_ptr _shadow_memory_entries; + std::vector _shadow_memory_bitmap; +}; + + +class PC_statisitics{ +public: + std::array dist = {0, 0, 0, 0}; + // 0: intra thread + // 1: intra warp + // 2: intra block + // 3: intra grid +}; + +class PcDependency final : public Tool { +public: + PcDependency(); + + ~PcDependency(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void allocation_callback(uint64_t ptr, uint64_t size); + + void deallocation_callback(uint64_t ptr); + + void evt_callback(EventPtr_t evt); + + void flush(); + +private: + void kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int 
access_size); + + +/* +********************************* variables ********************************* +*/ + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + + std::vector _memory_regions; + + std::map> _shadow_memories; // memory region, shadow memory + std::unordered_map> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics + std::unordered_map _pc_flags; // pc offset, flags +}; + +} // yosemite +#endif // YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h index fdc70dc..b4d39dd 100644 --- a/include/tools/tool_type.h +++ b/include/tools/tool_type.h @@ -18,7 +18,8 @@ typedef enum { ROOFLINE_TIME = 13, HEATMAP_ANALYSIS = 14, BLOCK_DIVERGENCE_ANALYSIS = 15, - TOOL_NUMS = 16 + PC_DEPENDENCY_ANALYSIS = 16, + TOOL_NUMS = 17 } AnalysisTool_t; #endif // TOOL_TYPE_H \ No newline at end of file diff --git a/include/utils/event.h b/include/utils/event.h index 903fc43..c29dd4e 100644 --- a/include/utils/event.h +++ b/include/utils/event.h @@ -69,6 +69,7 @@ typedef struct KernelLaunch : public Event { uint32_t touched_objects; uint32_t touched_objects_size; uint64_t key; // for UVM Advisor + uint64_t kernel_pc; KernelLaunch() { this->evt_type = EventType_KERNEL_LAUNCH; diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp index 038c37c..faa6056 100644 --- a/src/sanalyzer.cpp +++ b/src/sanalyzer.cpp @@ -18,6 +18,7 @@ #include "tools/event_trace_mgpu.h" #include "tools/heatmap_analysis.h" #include "tools/block_divergence_analysis.h" +#include "tools/pc_dependency_analysis.h" #include #include @@ -112,6 +113,9 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) { } else if (std::string(tool_name) == "block_divergence_analysis") { tool = BLOCK_DIVERGENCE_ANALYSIS; _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared()); + } 
else if (std::string(tool_name) == "pc_dependency_analysis") { + tool = PC_DEPENDENCY_ANALYSIS; + _tools.emplace(PC_DEPENDENCY_ANALYSIS, std::make_shared()); } else { fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n"); fflush(stderr); @@ -263,6 +267,11 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) { } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) { options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS; options.patch_file = "gpu_patch_block_divergence_analysis.fatbin"; + } else if (tool == PC_DEPENDENCY_ANALYSIS) { + options.patch_name = GPU_PATCH_PC_DEPENDENCY_ANALYSIS; + // nv-compute/Makefile generates fatbins based on gpu_src/*.cu filenames. + // The source file for this tool is nv-compute/gpu_src/gpu_patch_pc_dependency.cu + options.patch_file = "gpu_patch_pc_dependency.fatbin"; } // enable torch profiler? diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp new file mode 100644 index 0000000..e4f66a3 --- /dev/null +++ b/src/tools/pc_dependency_analysis.cpp @@ -0,0 +1,408 @@ +#include "tools/pc_dependency_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace yosemite; + +namespace { +static std::string json_escape(const std::string& s) { + std::string out; + out.reserve(s.size() + 8); + for (char c : s) { + switch (c) { + case '\"': out += "\\\""; break; + case '\\': out += "\\\\"; break; + case '\b': out += "\\b"; break; + case '\f': out += "\\f"; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + // control chars + if (static_cast(c) < 0x20) { + std::ostringstream oss; + oss << "\\u" + << std::hex << std::setw(4) << std::setfill('0') + << (int)static_cast(c); + out += oss.str(); + } else { + out += c; + } + } + } + return out; +} + +static std::string hex_u32(uint32_t v) { + std::ostringstream oss; + oss << "0x" << std::hex << v; + 
return oss.str(); +} +} // namespace + + +PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) { + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in PcDependency.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); + if (env_app_name != nullptr) { + output_directory = "dependency_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = "dependency_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); +} + + +PcDependency::~PcDependency() {} + + +void PcDependency::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _pc_statistics.clear(); + _pc_flags.clear(); + for (auto& shadow_memory_iter : _shadow_memories) { + shadow_memory_iter.second->reset_bitmap(); + } + printf("[PC_DEPENDENCY] Resetting shadow memory bitmap\n"); + _timer.increment(true); +} + + +void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { + std::string filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping pc dependency to %s\n", filename.c_str()); + + std::ofstream out(filename); + out << "current_pc_offset,ancient_pc_offset,flags,intra_thread,intra_warp,intra_block,intra_grid\n"; + + std::vector>> outer( + _pc_statistics.begin(), _pc_statistics.end()); + std::sort(outer.begin(), outer.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + for (auto& [cur_pc, inner_map] : outer) { + std::vector> inner(inner_map.begin(), inner_map.end()); + std::sort(inner.begin(), inner.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + uint32_t flags = 0; + auto fit = _pc_flags.find(cur_pc); + if (fit != _pc_flags.end()) flags = fit->second; + + for (auto& [anc_pc, st] : inner) { + out << "0x" << std::hex << cur_pc + 
<< ",0x" << anc_pc + << ",0x" << flags + << std::dec + << "," << st.dist[0] + << "," << st.dist[1] + << "," << st.dist[2] + << "," << st.dist[3] + << "\n"; + } + } + + // JSON output for building PC dependency graph (joinable with CFG) + std::string json_filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".json"; + std::ofstream jout(json_filename); + jout << "{\n"; + jout << " \"tool\": \"pc_dependency_analysis\",\n"; + jout << " \"kernel\": {\n"; + jout << " \"kernel_id\": " << kernel->kernel_id << ",\n"; + jout << " \"kernel_name\": \"" << json_escape(kernel->kernel_name) << "\",\n"; + jout << " \"device_id\": " << kernel->device_id << ",\n"; + jout << " \"kernel_pc\": " << kernel->kernel_pc << ",\n"; + jout << " \"kernel_pc_hex\": \"" << hex_u32((uint32_t)kernel->kernel_pc) << "\"\n"; + jout << " },\n"; + jout << " \"shadow_memory_granularity_bytes\": 1,\n"; + jout << " \"sample_stride_bytes\": 4,\n"; + + // Collect nodes (all current PCs + all non-cold ancient PCs) + std::set nodes; + for (const auto& [cur_pc, inner_map] : _pc_statistics) { + nodes.insert(cur_pc); + for (const auto& [anc_pc, st] : inner_map) { + (void)st; + if (anc_pc != 0xFFFFFFFFu) nodes.insert(anc_pc); + } + } + + jout << " \"nodes\": [\n"; + { + bool first = true; + for (uint32_t pc : nodes) { + if (!first) jout << ",\n"; + first = false; + auto fit = _pc_flags.find(pc); + bool has_flags = (fit != _pc_flags.end()); + uint32_t flags = has_flags ? fit->second : 0; + jout << " {\"pc\": " << pc + << ", \"pc_hex\": \"" << hex_u32(pc) << "\""; + if (has_flags) { + jout << ", \"flags\": " << flags + << ", \"flags_hex\": \"" << hex_u32(flags) << "\""; + } else { + jout << ", \"flags\": null, \"flags_hex\": null"; + } + jout << "}"; + } + jout << "\n"; + } + jout << " ],\n"; + + // Edges: ancient_pc -> current_pc, with per-scope counts. 
+ jout << " \"edges\": [\n"; + { + // Stable order: sort by current pc then ancient pc + std::vector>> outer2( + _pc_statistics.begin(), _pc_statistics.end()); + std::sort(outer2.begin(), outer2.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + bool first_edge = true; + for (auto& [cur_pc, inner_map] : outer2) { + std::vector> inner2(inner_map.begin(), inner_map.end()); + std::sort(inner2.begin(), inner2.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + // current flags if available + auto cfit = _pc_flags.find(cur_pc); + bool has_cflags = (cfit != _pc_flags.end()); + uint32_t cflags = has_cflags ? cfit->second : 0; + + for (auto& [anc_pc, st] : inner2) { + if (!first_edge) jout << ",\n"; + first_edge = false; + + bool cold_miss = (anc_pc == 0xFFFFFFFFu); + + jout << " {\"current_pc\": " << cur_pc + << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\"" + << ", \"ancient_pc\": "; + if (cold_miss) { + jout << "null"; + } else { + jout << anc_pc; + } + jout << ", \"ancient_pc_hex\": "; + if (cold_miss) { + jout << "null"; + } else { + jout << "\"" << hex_u32(anc_pc) << "\""; + } + jout << ", \"cold_miss\": " << (cold_miss ? 
"true" : "false"); + + if (has_cflags) { + jout << ", \"current_flags\": " << cflags + << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\""; + } else { + jout << ", \"current_flags\": null, \"current_flags_hex\": null"; + } + + jout << ", \"dist\": {" + << "\"intra_thread\": " << st.dist[0] + << ", \"intra_warp\": " << st.dist[1] + << ", \"intra_block\": " << st.dist[2] + << ", \"intra_grid\": " << st.dist[3] + << "}}"; + } + } + jout << "\n"; + } + jout << " ]\n"; + jout << "}\n"; + printf("Dumping pc dependency graph json to %s\n", json_filename.c_str()); +} + + +void PcDependency::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void PcDependency::mem_alloc_callback(std::shared_ptr mem) { + // TODO: add shadow memory allocation here + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + memory_region memory_region_current = memory_region((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size)); + _memory_regions.push_back(memory_region_current); + _shadow_memories.emplace(memory_region_current, std::make_unique(mem->size)); + + printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size); + _timer.increment(true); +} + +void PcDependency::mem_free_callback(std::shared_ptr mem) { + auto it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + + uint64_t sz = it->second->size; // 从 alloc 事件拿 size + active_memories.erase(it); + + memory_region r((uint64_t)mem->addr, (uint64_t)mem->addr + sz); + + auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end()) _memory_regions.erase(vit); + + _shadow_memories.erase(r); + printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, 
size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz); + _timer.increment(true); +} + + +void PcDependency::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + + _timer.increment(true); +} + + +void PcDependency::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + active_tensors.erase(it); + + _timer.increment(true); +} + +void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size) { + // auto& shadow_memory = this->_shadow_memories[memory_region_target]; + auto shadow_memory_it = this->_shadow_memories.find(memory_region_target); + if (shadow_memory_it == this->_shadow_memories.end()) { + printf("shadow memory not found for memory region: %lu - %lu\n", memory_region_target.get_start(), memory_region_target.get_end()); + return; + } + auto& shadow_memory = *(shadow_memory_it->second); + + for (int i = 0; i < access_size; i += 4) { + auto addr = ptr + i; + // Byte-granularity shadow memory: addr is byte offset within allocation. + // Bound check to avoid OOB on allocations at end boundary or odd sizes. 
+ if (addr >= shadow_memory._size) { + break; + } + if (shadow_memory.is_valid(addr) == false) { + // cold miss + _pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1; + shadow_memory.set_valid(addr); + auto& shadow_memory_entry = shadow_memory.get_entry(addr); + shadow_memory_entry.last_pc = pc_offset; + shadow_memory_entry.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id; + continue; + } + auto& last_access = shadow_memory.get_entry(addr); + uint64_t last_block_id = last_access.last_flat_thread_id >> 10; + uint64_t last_warp_id = (last_access.last_flat_thread_id >> 5) & 0x1F; + uint64_t last_lane_id = last_access.last_flat_thread_id & 0x1F; + + uint32_t last_pc = last_access.last_pc; + if (last_block_id != current_block_id) { + this->_pc_statistics[pc_offset][last_pc].dist[3] += 1; + }else if (last_warp_id != current_warp_id) { + this->_pc_statistics[pc_offset][last_pc].dist[2] += 1; + }else if (last_lane_id != current_lane_id) { + this->_pc_statistics[pc_offset][last_pc].dist[1] += 1; + }else { + this->_pc_statistics[pc_offset][last_pc].dist[0] += 1; + } + last_access.last_pc = pc_offset; + last_access.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id; + } +} + + +void PcDependency::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + for (uint64_t i = 0; i < size; i++) { + MemoryAccess trace = accesses_buffer[i]; + uint32_t pc_offset = trace.pc; + this->_pc_flags[pc_offset] = trace.flags; + if (trace.type != MemoryType::Global) { + //only analyze global memory accesses currently + continue; + } + uint32_t access_size = trace.accessSize; + memory_region memory_region_target; + uint64_t first_valid_address = 0; + + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (trace.active_mask & (1u << j)) { + first_valid_address = trace.addresses[j]; + break; + } + } + + + assert(first_valid_address != 0); + for (auto memory_region_iter : 
this->_memory_regions) { + if (memory_region_iter.contains(first_valid_address)) { + memory_region_target = memory_region_iter; + break; + } + } + uint64_t memory_region_start = memory_region_target.get_start(); + assert(memory_region_start != 0); + for ( int j = 0; j < GPU_WARP_SIZE; j++) { + if (trace.active_mask & (1u << j)) { + unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size); + } + } + } + +} + + +void PcDependency::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void PcDependency::flush() { +} From 6020b6585df30f4671f1fbde99b29cb58d0cdb2b Mon Sep 17 00:00:00 2001 From: Yanbo Zhao Date: Wed, 11 Feb 2026 14:09:06 -0500 Subject: [PATCH 4/6] cuVein update --- include/tools/pc_dependency_analysis.h | 58 +++++++- src/tools/pc_dependency_analysis.cpp | 196 ++++++++++++++++++++----- 2 files changed, 214 insertions(+), 40 deletions(-) diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h index 1d2206c..2d42264 100644 --- a/include/tools/pc_dependency_analysis.h +++ b/include/tools/pc_dependency_analysis.h @@ -17,6 +17,39 @@ #include #include + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ +#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE +#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2 +#endif + +#ifndef 
SANITIZER_MEMORY_DEVICE_FLAG_RED +#define SANITIZER_MEMORY_DEVICE_FLAG_RED 0x3 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC +#define SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC 0x4 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH +#define SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH 0x8 +#endif + +#ifndef SANITIZER_MEMORY_GLOBAL +#define SANITIZER_MEMORY_GLOBAL 0x10 +#endif + +#ifndef SANITIZER_MEMORY_SHARED +#define SANITIZER_MEMORY_SHARED 0x20 +#endif + +#ifndef SANITIZER_MEMORY_LOCAL +#define SANITIZER_MEMORY_LOCAL 0x40 +#endif + namespace yosemite { /* we choose to use PC offset instead of PC because the PC is too long for shadow memory and it is not necessary to track the original PC. @@ -83,8 +116,10 @@ class shadow_memory_entry{ class shadow_memory{ public: shadow_memory(uint64_t size) - : _shadow_memory_entries(std::make_unique(size)), - _size(size), + :_size(size), + _size_celled((size + 3) / 4 * 4), + _stride(_size_celled / 4), + _shadow_memory_entries(std::make_unique(_size_celled)), _shadow_memory_bitmap(std::vector((size + 7) / 8, 0)) { printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size); printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry)); @@ -97,7 +132,9 @@ class shadow_memory{ }; shadow_memory_entry& get_entry(uint64_t offset) { assert(offset < _size); - return _shadow_memory_entries[offset]; + //update layout: use offset/4 + offset%4 * _size/4 to make every 4 bytes adjacent in one cache line + return _shadow_memory_entries[(offset/4) + (offset%4) * _stride]; + // return _shadow_memory_entries[offset]; } bool is_valid(uint64_t ptr) { return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); @@ -106,6 +143,8 @@ class shadow_memory{ _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8)); } uint64_t _size; + uint64_t _size_celled; + uint64_t _stride; std::unique_ptr _shadow_memory_entries; std::vector _shadow_memory_bitmap; }; @@ -155,7 +194,11 @@ class PcDependency final : public Tool { void 
kernel_trace_flush(std::shared_ptr kernel); - void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size); + void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size); + + void unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); + + void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); /* @@ -178,8 +221,13 @@ class PcDependency final : public Tool { std::vector _memory_regions; std::map> _shadow_memories; // memory region, shadow memory + std::unordered_map _shadow_memory_shared; // shared memory address (packed as block_id << 32 | address low 32 bits to reduce aliasing), shadow memory shared std::unordered_map> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics - std::unordered_map _pc_flags; // pc offset, flags + std::unordered_map> _pc_flags; // pc offset, flags, size of the access + // Index [0..31] stores distinct sector count 1..32. + // Index [32..64] stores active lane count 0..32. 
+ std::unordered_map> _distinct_sector_count; // pc offset, distinct sector distribution + }; } // yosemite diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp index e4f66a3..80053cc 100644 --- a/src/tools/pc_dependency_analysis.cpp +++ b/src/tools/pc_dependency_analysis.cpp @@ -10,6 +10,7 @@ #include #include #include +#include using namespace yosemite; @@ -48,6 +49,19 @@ static std::string hex_u32(uint32_t v) { oss << "0x" << std::hex << v; return oss.str(); } +static std::string flags_to_string(uint32_t flags) { + std::ostringstream oss; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) oss << "READ"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) oss << "WRITE"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC) oss << "ATOMIC"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH) oss << "PREFETCH"; + oss << " "; + if (flags & SANITIZER_MEMORY_GLOBAL) oss << "GLOBAL"; + if (flags & SANITIZER_MEMORY_SHARED) oss << "SHARED"; + if (flags & SANITIZER_MEMORY_LOCAL) oss << "LOCAL"; + + return oss.str(); +} } // namespace @@ -105,8 +119,9 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { [](auto& a, auto& b){ return a.first < b.first; }); uint32_t flags = 0; + uint32_t access_size = 0; auto fit = _pc_flags.find(cur_pc); - if (fit != _pc_flags.end()) flags = fit->second; + if (fit != _pc_flags.end()){ flags = fit->second.first; access_size = fit->second.second;} for (auto& [anc_pc, st] : inner) { out << "0x" << std::hex << cur_pc @@ -155,14 +170,37 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { first = false; auto fit = _pc_flags.find(pc); bool has_flags = (fit != _pc_flags.end()); - uint32_t flags = has_flags ? fit->second : 0; + uint32_t flags = has_flags ? fit->second.first : 0; + uint32_t access_size = has_flags ? 
fit->second.second : 0; + bool has_distinct_sector_count = (_distinct_sector_count.find(pc) != _distinct_sector_count.end()); jout << " {\"pc\": " << pc << ", \"pc_hex\": \"" << hex_u32(pc) << "\""; if (has_flags) { - jout << ", \"flags\": " << flags - << ", \"flags_hex\": \"" << hex_u32(flags) << "\""; + jout << ", \"flags\": \"" << flags_to_string(flags) << "\"" + << ", \"flags_hex\": \"" << hex_u32(flags) << "\"" + << ", \"access_size\": " << access_size; } else { - jout << ", \"flags\": null, \"flags_hex\": null"; + jout << ", \"flags\": null, \"flags_hex\": null, \"access_size\": null"; + } + if (has_distinct_sector_count) { + jout << ", \"distinct_sector_count\": {"; + for (int i = 1; i <= 32; i++) { + jout << "\"" << i << "\": " << _distinct_sector_count[pc][i - 1]; + if (i != 32) { + jout << ", "; + } + } + jout << "}"; + jout << ", \"active_lane_count\": {"; + for (int i = 0; i <= 32; i++) { + jout << "\"" << i << "\": " << _distinct_sector_count[pc][32 + i]; + if (i != 32) { + jout << ", "; + } + } + jout << "}"; + } else { + jout << ", \"distinct_sector_count\": null, \"active_lane_count\": null"; } jout << "}"; } @@ -188,7 +226,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { // current flags if available auto cfit = _pc_flags.find(cur_pc); bool has_cflags = (cfit != _pc_flags.end()); - uint32_t cflags = has_cflags ? cfit->second : 0; + uint32_t cflags = has_cflags ? cfit->second.first : 0; + uint32_t c_access_size = has_cflags ? 
cfit->second.second : 0; for (auto& [anc_pc, st] : inner2) { if (!first_edge) jout << ",\n"; @@ -214,7 +253,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { if (has_cflags) { jout << ", \"current_flags\": " << cflags - << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\""; + << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\"" + << ", \"current_access_size\": " << c_access_size; } else { jout << ", \"current_flags\": null, \"current_flags_hex\": null"; } @@ -238,7 +278,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { void PcDependency::kernel_end_callback(std::shared_ptr kernel) { auto evt = std::prev(kernel_events.end())->second; evt->end_time = _timer.get(); - + this->_shadow_memory_shared.clear(); + printf("[PC_DEPENDENCY] Clearing shadow memory shared\n"); kernel_trace_flush(evt); _timer.increment(true); @@ -278,6 +319,9 @@ void PcDependency::mem_free_callback(std::shared_ptr mem) { void PcDependency::ten_alloc_callback(std::shared_ptr ten) { tensor_events.emplace(_timer.get(), ten); active_tensors.emplace(ten->addr, ten); + _memory_regions.push_back(memory_region((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size))); + _shadow_memories.emplace(_memory_regions.back(), std::make_unique(ten->size)); + printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size); _timer.increment(true); } @@ -286,12 +330,25 @@ void PcDependency::ten_alloc_callback(std::shared_ptr ten) { void PcDependency::ten_free_callback(std::shared_ptr ten) { auto it = active_tensors.find(ten->addr); assert(it != active_tensors.end()); + + // TenFree.size may be negative (e.g., accounting-style events). Use size from TenAlloc. 
+ const uint64_t sz = static_cast(it->second->size); active_tensors.erase(it); + memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz); + + auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end()) { + _memory_regions.erase(vit); + } + + _shadow_memories.erase(r); + printf("[PC_DEPENDENCY] Freeing shadow memory for tensor region: %p - %p, size: %lu\n", + (void*)r.get_start(), (void*)r.get_end(), sz); _timer.increment(true); } -void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size) { +void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size) { // auto& shadow_memory = this->_shadow_memories[memory_region_target]; auto shadow_memory_it = this->_shadow_memories.find(memory_region_target); if (shadow_memory_it == this->_shadow_memories.end()) { @@ -336,42 +393,111 @@ void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t curren } } +void PcDependency::unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) { + // 共享内存地址在同一个 block 内唯一,使用 block_id 高位 + 地址低 32 位作为 key, + const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32) + | (ptr & 0xFFFFFFFFull); + + for (int i = 0; i < access_size; i += 4) { + const uint64_t addr = packed_base + i; // 4 字节粒度 + + auto it = this->_shadow_memory_shared.find(addr); + if (it == this->_shadow_memory_shared.end()) { + // cold miss + this->_pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1; + auto& entry = this->_shadow_memory_shared.emplace(addr, shadow_memory_entry()).first->second; + entry.last_pc = pc_offset; + entry.last_flat_thread_id = (current_warp_id << 5) | 
current_lane_id; // 只编码 warp/lane + continue; + } + + auto& entry = it->second; + const uint64_t last_warp_id = (entry.last_flat_thread_id >> 5) & 0x1F; + const uint64_t last_lane_id = entry.last_flat_thread_id & 0x1F; + const uint32_t last_pc = entry.last_pc; + + if (last_warp_id != current_warp_id) { + // 不同 warp 同 block + this->_pc_statistics[pc_offset][last_pc].dist[2] += 1; + } else if (last_lane_id != current_lane_id) { + // 同 warp 不同 lane + this->_pc_statistics[pc_offset][last_pc].dist[1] += 1; + } else { + // 同一线程 + this->_pc_statistics[pc_offset][last_pc].dist[0] += 1; + } + + entry.last_pc = pc_offset; + entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id; + } +} + +void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) { + // TODO: implement local memory access +} + void PcDependency::gpu_data_analysis(void* data, uint64_t size) { MemoryAccess* accesses_buffer = (MemoryAccess*)data; for (uint64_t i = 0; i < size; i++) { MemoryAccess trace = accesses_buffer[i]; uint32_t pc_offset = trace.pc; - this->_pc_flags[pc_offset] = trace.flags; - if (trace.type != MemoryType::Global) { - //only analyze global memory accesses currently - continue; - } + uint32_t flags = trace.flags; uint32_t access_size = trace.accessSize; - memory_region memory_region_target; - uint64_t first_valid_address = 0; - - for (int j = 0; j < GPU_WARP_SIZE; j++) { - if (trace.active_mask & (1u << j)) { - first_valid_address = trace.addresses[j]; + uint32_t distinct_sector_count = trace.distinct_sector_count; + uint32_t active_mask = trace.active_mask; + switch (trace.type) { + case MemoryType::Local:{ + flags |= SANITIZER_MEMORY_LOCAL; + break; + } + case MemoryType::Shared:{ + flags |= SANITIZER_MEMORY_SHARED; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + unit_access_shared(trace.addresses[j], pc_offset, trace.ctaId, 
trace.warpId, j, trace.accessSize); + } + } + break; + } + case MemoryType::Global:{ + flags |= SANITIZER_MEMORY_GLOBAL; + memory_region memory_region_target; + uint64_t first_valid_address = 0; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + first_valid_address = trace.addresses[j]; + break; + } + } + assert(first_valid_address != 0); + for (auto memory_region_iter : this->_memory_regions) { + if (memory_region_iter.contains(first_valid_address)) { + memory_region_target = memory_region_iter; + break; + } + } + uint64_t memory_region_start = memory_region_target.get_start(); + assert(memory_region_start != 0); + for ( int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size); + } + } + break; + } + default: + printf("unknown memory type\n"); break; - } } - - - assert(first_valid_address != 0); - for (auto memory_region_iter : this->_memory_regions) { - if (memory_region_iter.contains(first_valid_address)) { - memory_region_target = memory_region_iter; - break; - } + this->_pc_flags[pc_offset] = std::make_pair(flags, access_size); + // Defensive bounds checks: GPU side should produce [1, 32]. 
+ if (distinct_sector_count >= 1 && distinct_sector_count <= 32) { + this->_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1; } - uint64_t memory_region_start = memory_region_target.get_start(); - assert(memory_region_start != 0); - for ( int j = 0; j < GPU_WARP_SIZE; j++) { - if (trace.active_mask & (1u << j)) { - unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size); - } + const uint32_t active_lane_count = __builtin_popcount(active_mask); + if (active_lane_count <= 32) { + this->_distinct_sector_count[pc_offset][32 + active_lane_count] += 1; } } From e5c235146266de89755e37c4fbd492d4363d41a3 Mon Sep 17 00:00:00 2001 From: Yanbo Zhao Date: Fri, 13 Feb 2026 16:36:32 -0500 Subject: [PATCH 5/6] cuVein parallel optimization --- include/tools/pc_dependency_analysis.h | 91 ++++-- src/tools/pc_dependency_analysis.cpp | 398 +++++++++++++++++++------ 2 files changed, 368 insertions(+), 121 deletions(-) diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h index 2d42264..717307a 100644 --- a/include/tools/pc_dependency_analysis.h +++ b/include/tools/pc_dependency_analysis.h @@ -16,6 +16,11 @@ #include #include #include +#include +#include +#include +#include +#include #ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ @@ -104,13 +109,14 @@ class memory_region{ uint64_t end; }; -class shadow_memory_entry{ +class alignas(8) shadow_memory_entry{ public: shadow_memory_entry() {}; ~shadow_memory_entry() {}; - - uint32_t last_pc = 0xFFFFFFFFu; // using offset of pc instead of original pc to save space and keep alignment; - uint32_t last_flat_thread_id = 0xFFFFFFFFu; // 0-5 bits for lane id, 6-10 bits for warp id, 11-31 bits for block id to save space; + // Packed representation: low 32 bits = last_pc, high 32 bits = last_flat_thread_id. + // Keeping a single 64-bit field avoids type-punning UB in atomic exchange. 
+ // packed == 0 means invalid/uninitialized (cold). + uint64_t packed = 0; }; class shadow_memory{ @@ -119,16 +125,26 @@ class shadow_memory{ :_size(size), _size_celled((size + 3) / 4 * 4), _stride(_size_celled / 4), - _shadow_memory_entries(std::make_unique(_size_celled)), - _shadow_memory_bitmap(std::vector((size + 7) / 8, 0)) { + _entries_bytes(std::max(1, _size_celled * sizeof(shadow_memory_entry))) { + _shadow_memory_entries = static_cast( + mmap(nullptr, _entries_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) + ); + assert(_shadow_memory_entries != MAP_FAILED); + printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size); printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry)); printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry)); - printf("[PC_DEPENDENCY] Shadow memory bitmap size: %lu\n", _shadow_memory_bitmap.size()); }; - ~shadow_memory() = default; - void reset_bitmap() { - std::fill(_shadow_memory_bitmap.begin(), _shadow_memory_bitmap.end(), 0); + ~shadow_memory() { + if (_shadow_memory_entries != nullptr && _shadow_memory_entries != MAP_FAILED) { + munmap(_shadow_memory_entries, _entries_bytes); + _shadow_memory_entries = nullptr; + } + } + void reset_entries() { + if (madvise(_shadow_memory_entries, _entries_bytes, MADV_DONTNEED) != 0) { + std::memset(_shadow_memory_entries, 0, _entries_bytes); + } }; shadow_memory_entry& get_entry(uint64_t offset) { assert(offset < _size); @@ -136,17 +152,11 @@ class shadow_memory{ return _shadow_memory_entries[(offset/4) + (offset%4) * _stride]; // return _shadow_memory_entries[offset]; } - bool is_valid(uint64_t ptr) { - return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); - } - void set_valid(uint64_t ptr) { - _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8)); - } uint64_t _size; uint64_t _size_celled; uint64_t _stride; - std::unique_ptr _shadow_memory_entries; - std::vector _shadow_memory_bitmap; + uint64_t 
_entries_bytes; + shadow_memory_entry* _shadow_memory_entries = nullptr; }; @@ -194,11 +204,30 @@ class PcDependency final : public Tool { void kernel_trace_flush(std::shared_ptr kernel); - void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size); - - void unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); + void unit_access( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + memory_region& memory_region_target, + int access_size, + std::unordered_map>& local_pc_statistics + ); + + void unit_access_shared( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + int access_size, + std::unordered_map>& local_pc_statistics, + std::unordered_map& local_shadow_memory_shared + ); void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); + void worker_loop(uint64_t worker_idx); /* @@ -221,13 +250,31 @@ class PcDependency final : public Tool { std::vector _memory_regions; std::map> _shadow_memories; // memory region, shadow memory - std::unordered_map _shadow_memory_shared; // shared memory address (packed as block_id << 32 | address low 32 bits to reduce aliasing), shadow memory shared std::unordered_map> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics std::unordered_map> _pc_flags; // pc offset, flags, size of the access // Index [0..31] stores distinct sector count 1..32. // Index [32..64] stores active lane count 0..32. std::unordered_map> _distinct_sector_count; // pc offset, distinct sector distribution + // Persistent worker pool and per-worker shared-memory shadow state. 
+ uint64_t _worker_count = 1; + std::vector _workers; + std::vector> _worker_shadow_memory_shared; + + // Per-batch job data produced by gpu_data_analysis and consumed by workers. + const MemoryAccess* _job_accesses_buffer = nullptr; + std::vector> _job_worker_trace_indices; + std::vector>> _job_worker_pc_statistics; + std::vector>> _job_worker_pc_flags; + std::vector>> _job_worker_distinct_sector_count; + + std::mutex _worker_pool_mutex; + std::condition_variable _worker_pool_cv; + std::condition_variable _worker_pool_done_cv; + bool _worker_pool_shutdown = false; + uint64_t _worker_job_generation = 0; + uint64_t _worker_pending_jobs = 0; + }; } // yosemite diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp index 80053cc..6dadf09 100644 --- a/src/tools/pc_dependency_analysis.cpp +++ b/src/tools/pc_dependency_analysis.cpp @@ -11,6 +11,7 @@ #include #include #include +#include using namespace yosemite; @@ -62,6 +63,18 @@ static std::string flags_to_string(uint32_t flags) { return oss.str(); } + +static inline uint64_t pack_shadow_entry(uint32_t pc, uint32_t flat_thread_id) { + return (static_cast(flat_thread_id) << 32) | static_cast(pc); +} + +static inline uint32_t unpack_shadow_pc(uint64_t packed) { + return static_cast(packed & 0xFFFFFFFFu); +} + +static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) { + return static_cast(packed >> 32); +} } // namespace @@ -80,10 +93,33 @@ PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) { output_directory = "dependency_" + get_current_date_n_time(); } check_folder_existance(output_directory); + + _worker_count = std::max(1u, std::thread::hardware_concurrency()); + _worker_shadow_memory_shared.resize(_worker_count); + _job_worker_trace_indices.resize(_worker_count); + _job_worker_pc_statistics.resize(_worker_count); + _job_worker_pc_flags.resize(_worker_count); + _job_worker_distinct_sector_count.resize(_worker_count); + _workers.reserve(_worker_count); + for (uint64_t 
worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + _workers.emplace_back(&PcDependency::worker_loop, this, worker_idx); + } } -PcDependency::~PcDependency() {} +PcDependency::~PcDependency() { + { + std::lock_guard guard(_worker_pool_mutex); + _worker_pool_shutdown = true; + ++_worker_job_generation; + } + _worker_pool_cv.notify_all(); + for (auto& worker : _workers) { + if (worker.joinable()) { + worker.join(); + } + } +} void PcDependency::kernel_start_callback(std::shared_ptr kernel) { @@ -92,10 +128,14 @@ void PcDependency::kernel_start_callback(std::shared_ptr kernel) kernel_events.emplace(_timer.get(), kernel); _pc_statistics.clear(); _pc_flags.clear(); + _distinct_sector_count.clear(); + for (auto& shared_map : _worker_shadow_memory_shared) { + shared_map.clear(); + } for (auto& shadow_memory_iter : _shadow_memories) { - shadow_memory_iter.second->reset_bitmap(); + shadow_memory_iter.second->reset_entries(); } - printf("[PC_DEPENDENCY] Resetting shadow memory bitmap\n"); + printf("[PC_DEPENDENCY] Resetting shadow memory entries\n"); _timer.increment(true); } @@ -158,7 +198,7 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { nodes.insert(cur_pc); for (const auto& [anc_pc, st] : inner_map) { (void)st; - if (anc_pc != 0xFFFFFFFFu) nodes.insert(anc_pc); + if (anc_pc != 0u) nodes.insert(anc_pc); } } @@ -233,7 +273,7 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { if (!first_edge) jout << ",\n"; first_edge = false; - bool cold_miss = (anc_pc == 0xFFFFFFFFu); + bool cold_miss = (anc_pc == 0u); jout << " {\"current_pc\": " << cur_pc << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\"" @@ -278,7 +318,9 @@ void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { void PcDependency::kernel_end_callback(std::shared_ptr kernel) { auto evt = std::prev(kernel_events.end())->second; evt->end_time = _timer.get(); - this->_shadow_memory_shared.clear(); + for (auto& shared_map : _worker_shadow_memory_shared) { + 
shared_map.clear(); + } printf("[PC_DEPENDENCY] Clearing shadow memory shared\n"); kernel_trace_flush(evt); @@ -348,7 +390,16 @@ void PcDependency::ten_free_callback(std::shared_ptr ten) { _timer.increment(true); } -void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size) { +void PcDependency::unit_access( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + memory_region& memory_region_target, + int access_size, + std::unordered_map>& local_pc_statistics +) { // auto& shadow_memory = this->_shadow_memories[memory_region_target]; auto shadow_memory_it = this->_shadow_memories.find(memory_region_target); if (shadow_memory_it == this->_shadow_memories.end()) { @@ -356,79 +407,94 @@ void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t curren return; } auto& shadow_memory = *(shadow_memory_it->second); + const uint32_t current_flat_thread_id = + static_cast((current_block_id << 10) | (current_warp_id << 5) | current_lane_id); for (int i = 0; i < access_size; i += 4) { - auto addr = ptr + i; + const uint64_t addr = ptr + i; // Byte-granularity shadow memory: addr is byte offset within allocation. // Bound check to avoid OOB on allocations at end boundary or odd sizes. 
if (addr >= shadow_memory._size) { break; } - if (shadow_memory.is_valid(addr) == false) { - // cold miss - _pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1; - shadow_memory.set_valid(addr); - auto& shadow_memory_entry = shadow_memory.get_entry(addr); - shadow_memory_entry.last_pc = pc_offset; - shadow_memory_entry.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id; + + auto& entry = shadow_memory.get_entry(addr); + const uint64_t old_packed = __atomic_exchange_n( + &entry.packed, + pack_shadow_entry(pc_offset, current_flat_thread_id), + __ATOMIC_ACQ_REL + ); + const bool is_cold_miss = (old_packed == 0); + + if (is_cold_miss) { + local_pc_statistics[pc_offset][0].dist[0] += 1; continue; } - auto& last_access = shadow_memory.get_entry(addr); - uint64_t last_block_id = last_access.last_flat_thread_id >> 10; - uint64_t last_warp_id = (last_access.last_flat_thread_id >> 5) & 0x1F; - uint64_t last_lane_id = last_access.last_flat_thread_id & 0x1F; - uint32_t last_pc = last_access.last_pc; + const uint32_t last_pc = unpack_shadow_pc(old_packed); + const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); + const uint64_t last_block_id = static_cast(last_flat_thread_id >> 10); + const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); + const uint64_t last_lane_id = static_cast(last_flat_thread_id & 0x1F); if (last_block_id != current_block_id) { - this->_pc_statistics[pc_offset][last_pc].dist[3] += 1; - }else if (last_warp_id != current_warp_id) { - this->_pc_statistics[pc_offset][last_pc].dist[2] += 1; - }else if (last_lane_id != current_lane_id) { - this->_pc_statistics[pc_offset][last_pc].dist[1] += 1; - }else { - this->_pc_statistics[pc_offset][last_pc].dist[0] += 1; + local_pc_statistics[pc_offset][last_pc].dist[3] += 1; + } else if (last_warp_id != current_warp_id) { + local_pc_statistics[pc_offset][last_pc].dist[2] += 1; + } else if (last_lane_id != current_lane_id) { + 
local_pc_statistics[pc_offset][last_pc].dist[1] += 1; + } else { + local_pc_statistics[pc_offset][last_pc].dist[0] += 1; } - last_access.last_pc = pc_offset; - last_access.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id; } } -void PcDependency::unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) { +void PcDependency::unit_access_shared( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + int access_size, + std::unordered_map>& local_pc_statistics, + std::unordered_map& local_shadow_memory_shared +) { // 共享内存地址在同一个 block 内唯一,使用 block_id 高位 + 地址低 32 位作为 key, const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32) | (ptr & 0xFFFFFFFFull); for (int i = 0; i < access_size; i += 4) { const uint64_t addr = packed_base + i; // 4 字节粒度 - - auto it = this->_shadow_memory_shared.find(addr); - if (it == this->_shadow_memory_shared.end()) { - // cold miss - this->_pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1; - auto& entry = this->_shadow_memory_shared.emplace(addr, shadow_memory_entry()).first->second; - entry.last_pc = pc_offset; - entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id; // 只编码 warp/lane + const uint32_t current_flat_thread_id = + static_cast((current_warp_id << 5) | current_lane_id); + + auto [it, inserted] = local_shadow_memory_shared.emplace(addr, shadow_memory_entry()); + const bool is_cold_miss = inserted; + const uint64_t old_packed = __atomic_exchange_n( + &(it->second.packed), + pack_shadow_entry(pc_offset, current_flat_thread_id), + __ATOMIC_ACQ_REL + ); + + if (is_cold_miss) { + local_pc_statistics[pc_offset][0].dist[0] += 1; continue; } - auto& entry = it->second; - const uint64_t last_warp_id = (entry.last_flat_thread_id >> 5) & 0x1F; - const uint64_t last_lane_id = entry.last_flat_thread_id & 0x1F; - const 
uint32_t last_pc = entry.last_pc; + const uint32_t last_pc = unpack_shadow_pc(old_packed); + const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); + const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); + const uint64_t last_lane_id = static_cast(last_flat_thread_id & 0x1F); if (last_warp_id != current_warp_id) { // 不同 warp 同 block - this->_pc_statistics[pc_offset][last_pc].dist[2] += 1; + local_pc_statistics[pc_offset][last_pc].dist[2] += 1; } else if (last_lane_id != current_lane_id) { // 同 warp 不同 lane - this->_pc_statistics[pc_offset][last_pc].dist[1] += 1; + local_pc_statistics[pc_offset][last_pc].dist[1] += 1; } else { // 同一线程 - this->_pc_statistics[pc_offset][last_pc].dist[0] += 1; + local_pc_statistics[pc_offset][last_pc].dist[0] += 1; } - - entry.last_pc = pc_offset; - entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id; } } @@ -437,67 +503,201 @@ void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t } -void PcDependency::gpu_data_analysis(void* data, uint64_t size) { - MemoryAccess* accesses_buffer = (MemoryAccess*)data; - for (uint64_t i = 0; i < size; i++) { - MemoryAccess trace = accesses_buffer[i]; - uint32_t pc_offset = trace.pc; - uint32_t flags = trace.flags; - uint32_t access_size = trace.accessSize; - uint32_t distinct_sector_count = trace.distinct_sector_count; - uint32_t active_mask = trace.active_mask; - switch (trace.type) { - case MemoryType::Local:{ - flags |= SANITIZER_MEMORY_LOCAL; - break; - } - case MemoryType::Shared:{ - flags |= SANITIZER_MEMORY_SHARED; - for (int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - unit_access_shared(trace.addresses[j], pc_offset, trace.ctaId, trace.warpId, j, trace.accessSize); - } +void PcDependency::worker_loop(uint64_t worker_idx) { + uint64_t seen_generation = 0; + while (true) { + uint64_t current_generation = 0; + { + std::unique_lock lock(_worker_pool_mutex); + _worker_pool_cv.wait(lock, [&]{ 
+ return _worker_pool_shutdown || _worker_job_generation > seen_generation; + }); + if (_worker_pool_shutdown) { + return; + } + current_generation = _worker_job_generation; + } + + auto& local_pc_statistics = _job_worker_pc_statistics[worker_idx]; + auto& local_pc_flags = _job_worker_pc_flags[worker_idx]; + auto& local_distinct_sector_count = _job_worker_distinct_sector_count[worker_idx]; + auto& local_shadow_memory_shared = _worker_shadow_memory_shared[worker_idx]; + const auto& trace_indices = _job_worker_trace_indices[worker_idx]; + + for (uint64_t i : trace_indices) { + MemoryAccess trace = _job_accesses_buffer[i]; + uint32_t pc_offset = trace.pc; + uint32_t flags = trace.flags; + uint32_t access_size = trace.accessSize; + uint32_t distinct_sector_count = trace.distinct_sector_count; + uint32_t active_mask = trace.active_mask; + switch (trace.type) { + case MemoryType::Local:{ + flags |= SANITIZER_MEMORY_LOCAL; + break; } - break; - } - case MemoryType::Global:{ - flags |= SANITIZER_MEMORY_GLOBAL; - memory_region memory_region_target; - uint64_t first_valid_address = 0; - for (int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - first_valid_address = trace.addresses[j]; - break; + case MemoryType::Shared:{ + flags |= SANITIZER_MEMORY_SHARED; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + unit_access_shared( + trace.addresses[j], + pc_offset, + trace.ctaId, + trace.warpId, + j, + trace.accessSize, + local_pc_statistics, + local_shadow_memory_shared + ); + } } + break; } - assert(first_valid_address != 0); - for (auto memory_region_iter : this->_memory_regions) { - if (memory_region_iter.contains(first_valid_address)) { - memory_region_target = memory_region_iter; - break; + case MemoryType::Global:{ + flags |= SANITIZER_MEMORY_GLOBAL; + memory_region memory_region_target; + uint64_t first_valid_address = 0; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + first_valid_address = 
trace.addresses[j]; + break; + } } - } - uint64_t memory_region_start = memory_region_target.get_start(); - assert(memory_region_start != 0); - for ( int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size); + assert(first_valid_address != 0); + for (auto memory_region_iter : this->_memory_regions) { + if (memory_region_iter.contains(first_valid_address)) { + memory_region_target = memory_region_iter; + break; + } + } + uint64_t memory_region_start = memory_region_target.get_start(); + assert(memory_region_start != 0); + for ( int j = 0; j < GPU_WARP_SIZE; j++) { + if (active_mask & (1u << j)) { + unit_access( + trace.addresses[j] - memory_region_start, + pc_offset, + trace.ctaId, + trace.warpId, + j, + memory_region_target, + access_size, + local_pc_statistics + ); + } } + break; } + default: + printf("unknown memory type\n"); break; + } + auto& local_flag = local_pc_flags[pc_offset]; + local_flag.first |= flags; + if (local_flag.second == 0) { + local_flag.second = access_size; + } else if (local_flag.second != access_size) { + local_flag.second = std::max(local_flag.second, access_size); + } + if (distinct_sector_count >= 1 && distinct_sector_count <= 32) { + local_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1; + } + const uint32_t active_lane_count = __builtin_popcount(active_mask); + if (active_lane_count <= 32) { + local_distinct_sector_count[pc_offset][32 + active_lane_count] += 1; + } + } + + { + std::lock_guard guard(_worker_pool_mutex); + seen_generation = current_generation; + if (!trace_indices.empty()) { + assert(_worker_pending_jobs > 0); + _worker_pending_jobs -= 1; + if (_worker_pending_jobs == 0) { + _worker_pool_done_cv.notify_one(); } - default: - printf("unknown memory type\n"); - break; + } + } + } +} + + +void PcDependency::gpu_data_analysis(void* data, uint64_t size) { + 
MemoryAccess* accesses_buffer = (MemoryAccess*)data; + if (size == 0) { + return; + } + + for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + _job_worker_trace_indices[worker_idx].clear(); + _job_worker_pc_statistics[worker_idx].clear(); + _job_worker_pc_flags[worker_idx].clear(); + _job_worker_distinct_sector_count[worker_idx].clear(); + _job_worker_trace_indices[worker_idx].reserve((size / _worker_count) + 1); + } + + // Stable assignment by block id keeps intra-block trace order. + for (uint64_t i = 0; i < size; ++i) { + const uint64_t worker_idx = accesses_buffer[i].ctaId % _worker_count; + _job_worker_trace_indices[worker_idx].push_back(i); + } + + uint64_t pending_jobs = 0; + for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + if (!_job_worker_trace_indices[worker_idx].empty()) { + pending_jobs += 1; } - this->_pc_flags[pc_offset] = std::make_pair(flags, access_size); - // Defensive bounds checks: GPU side should produce [1, 32]. - if (distinct_sector_count >= 1 && distinct_sector_count <= 32) { - this->_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1; + } + if (pending_jobs == 0) { + return; + } + + { + std::lock_guard guard(_worker_pool_mutex); + _job_accesses_buffer = accesses_buffer; + _worker_pending_jobs = pending_jobs; + ++_worker_job_generation; + } + _worker_pool_cv.notify_all(); + { + std::unique_lock lock(_worker_pool_mutex); + _worker_pool_done_cv.wait(lock, [&]{ + return _worker_pending_jobs == 0; + }); + } + + for (auto& local_flags_map : _job_worker_pc_flags) { + for (auto& [pc, local_flag] : local_flags_map) { + auto& global_flag = this->_pc_flags[pc]; + global_flag.first |= local_flag.first; + if (global_flag.second == 0) { + global_flag.second = local_flag.second; + } else if (global_flag.second != local_flag.second) { + global_flag.second = std::max(global_flag.second, local_flag.second); + } + } + } + + for (auto& local_distinct_map : _job_worker_distinct_sector_count) { + 
for (auto& [pc, local_hist] : local_distinct_map) { + auto& global_hist = this->_distinct_sector_count[pc]; + for (size_t idx = 0; idx < global_hist.size(); ++idx) { + global_hist[idx] += local_hist[idx]; + } } - const uint32_t active_lane_count = __builtin_popcount(active_mask); - if (active_lane_count <= 32) { - this->_distinct_sector_count[pc_offset][32 + active_lane_count] += 1; + } + + for (auto& local_map : _job_worker_pc_statistics) { + for (auto& [cur_pc, local_inner] : local_map) { + auto& global_inner = this->_pc_statistics[cur_pc]; + for (auto& [anc_pc, local_stats] : local_inner) { + auto& global_stats = global_inner[anc_pc]; + for (int d = 0; d < 4; ++d) { + global_stats.dist[d] += local_stats.dist[d]; + } + } } } From 98c0353dab89adbee0f730f141fc3af4a4285bbf Mon Sep 17 00:00:00 2001 From: Yanbo Zhao Date: Mon, 16 Feb 2026 16:28:41 -0500 Subject: [PATCH 6/6] lock free parallelization optimization --- include/tools/pc_dependency_analysis.h | 8 +- src/tools/pc_dependency_analysis.cpp | 174 +++++++++++++++---------- 2 files changed, 110 insertions(+), 72 deletions(-) diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h index 717307a..1df9813 100644 --- a/include/tools/pc_dependency_analysis.h +++ b/include/tools/pc_dependency_analysis.h @@ -113,7 +113,8 @@ class alignas(8) shadow_memory_entry{ public: shadow_memory_entry() {}; ~shadow_memory_entry() {}; - // Packed representation: low 32 bits = last_pc, high 32 bits = last_flat_thread_id. + // Packed representation: low 32 bits = (generation:8 | pc24:24), + // high 32 bits = last_flat_thread_id. // Keeping a single 64-bit field avoids type-punning UB in atomic exchange. // packed == 0 means invalid/uninitialized (cold). 
uint64_t packed = 0; @@ -223,7 +224,7 @@ class PcDependency final : public Tool { uint32_t current_lane_id, int access_size, std::unordered_map>& local_pc_statistics, - std::unordered_map& local_shadow_memory_shared + std::unordered_map>& local_shadow_memory_shared ); void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); @@ -237,6 +238,7 @@ class PcDependency final : public Tool { std::string output_directory; uint32_t kernel_id = 0; + uint8_t _kernel_generation = 0; std::map> kernel_events; @@ -259,7 +261,7 @@ class PcDependency final : public Tool { // Persistent worker pool and per-worker shared-memory shadow state. uint64_t _worker_count = 1; std::vector _workers; - std::vector> _worker_shadow_memory_shared; + std::vector>> _worker_shadow_memory_shared; // Per-batch job data produced by gpu_data_analysis and consumed by workers. const MemoryAccess* _job_accesses_buffer = nullptr; diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp index 6dadf09..aa3f525 100644 --- a/src/tools/pc_dependency_analysis.cpp +++ b/src/tools/pc_dependency_analysis.cpp @@ -64,17 +64,38 @@ static std::string flags_to_string(uint32_t flags) { return oss.str(); } -static inline uint64_t pack_shadow_entry(uint32_t pc, uint32_t flat_thread_id) { - return (static_cast(flat_thread_id) << 32) | static_cast(pc); +static inline uint64_t pack_shadow_entry(uint8_t generation, uint32_t pc24, uint32_t flat_thread_id) { + const uint32_t encoded_pc = (static_cast(generation) << 24) + | (pc24 & 0x00FFFFFFu); + return (static_cast(flat_thread_id) << 32) | static_cast(encoded_pc); } -static inline uint32_t unpack_shadow_pc(uint64_t packed) { +static inline uint32_t unpack_shadow_pc_encoded(uint64_t packed) { return static_cast(packed & 0xFFFFFFFFu); } static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) { return static_cast(packed >> 32); } + +static inline 
const memory_region* find_memory_region_containing( + const std::vector& regions, + uint64_t addr +) { + auto it = std::upper_bound( + regions.begin(), + regions.end(), + addr, + [](uint64_t value, const memory_region& region) { + return value < region.get_start(); + } + ); + if (it == regions.begin()) { + return nullptr; + } + --it; + return it->contains(addr) ? &(*it) : nullptr; +} } // namespace @@ -132,10 +153,13 @@ void PcDependency::kernel_start_callback(std::shared_ptr kernel) for (auto& shared_map : _worker_shadow_memory_shared) { shared_map.clear(); } - for (auto& shadow_memory_iter : _shadow_memories) { - shadow_memory_iter.second->reset_entries(); + _kernel_generation = static_cast(_kernel_generation + 1u); + if (_kernel_generation == 0) { + for (auto& shadow_memory_iter : _shadow_memories) { + shadow_memory_iter.second->reset_entries(); + } + printf("[PC_DEPENDENCY] Shadow generation wrapped, resetting entries\n"); } - printf("[PC_DEPENDENCY] Resetting shadow memory entries\n"); _timer.increment(true); } @@ -333,7 +357,10 @@ void PcDependency::mem_alloc_callback(std::shared_ptr mem) { alloc_events.emplace(_timer.get(), mem); active_memories.emplace(mem->addr, mem); memory_region memory_region_current = memory_region((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size)); - _memory_regions.push_back(memory_region_current); + _memory_regions.insert( + std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current), + memory_region_current + ); _shadow_memories.emplace(memory_region_current, std::make_unique(mem->size)); printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size); @@ -349,8 +376,8 @@ void PcDependency::mem_free_callback(std::shared_ptr mem) { memory_region r((uint64_t)mem->addr, (uint64_t)mem->addr + sz); - auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r); - if (vit != 
_memory_regions.end()) _memory_regions.erase(vit); + auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end() && *vit == r) _memory_regions.erase(vit); _shadow_memories.erase(r); printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz); @@ -361,8 +388,12 @@ void PcDependency::mem_free_callback(std::shared_ptr mem) { void PcDependency::ten_alloc_callback(std::shared_ptr ten) { tensor_events.emplace(_timer.get(), ten); active_tensors.emplace(ten->addr, ten); - _memory_regions.push_back(memory_region((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size))); - _shadow_memories.emplace(_memory_regions.back(), std::make_unique(ten->size)); + memory_region memory_region_current((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size)); + _memory_regions.insert( + std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current), + memory_region_current + ); + _shadow_memories.emplace(memory_region_current, std::make_unique(ten->size)); printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size); _timer.increment(true); @@ -379,8 +410,8 @@ void PcDependency::ten_free_callback(std::shared_ptr ten) { memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz); - auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r); - if (vit != _memory_regions.end()) { + auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end() && *vit == r) { _memory_regions.erase(vit); } @@ -421,7 +452,7 @@ void PcDependency::unit_access( auto& entry = shadow_memory.get_entry(addr); const uint64_t old_packed = __atomic_exchange_n( &entry.packed, - pack_shadow_entry(pc_offset, current_flat_thread_id), + pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id), 
__ATOMIC_ACQ_REL ); const bool is_cold_miss = (old_packed == 0); @@ -431,7 +462,13 @@ void PcDependency::unit_access( continue; } - const uint32_t last_pc = unpack_shadow_pc(old_packed); + const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed); + const uint8_t last_generation = static_cast(last_pc_encoded >> 24); + if (last_generation != _kernel_generation) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu); const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); const uint64_t last_block_id = static_cast(last_flat_thread_id >> 10); const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); @@ -456,31 +493,34 @@ void PcDependency::unit_access_shared( uint32_t current_lane_id, int access_size, std::unordered_map>& local_pc_statistics, - std::unordered_map& local_shadow_memory_shared + std::unordered_map>& local_shadow_memory_shared ) { - // 共享内存地址在同一个 block 内唯一,使用 block_id 高位 + 地址低 32 位作为 key, - const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32) - | (ptr & 0xFFFFFFFFull); + // Per-CTA layered shadow map: local_shadow_memory_shared[cta_id][addr_low32] + auto& cta_shadow = local_shadow_memory_shared[current_block_id]; + const uint32_t base_addr_low32 = static_cast(ptr & 0xFFFFFFFFull); for (int i = 0; i < access_size; i += 4) { - const uint64_t addr = packed_base + i; // 4 字节粒度 + const uint32_t addr = base_addr_low32 + static_cast(i); // 4 字节粒度 const uint32_t current_flat_thread_id = static_cast((current_warp_id << 5) | current_lane_id); - auto [it, inserted] = local_shadow_memory_shared.emplace(addr, shadow_memory_entry()); + auto [it, inserted] = cta_shadow.emplace(addr, shadow_memory_entry()); const bool is_cold_miss = inserted; - const uint64_t old_packed = __atomic_exchange_n( - &(it->second.packed), - pack_shadow_entry(pc_offset, current_flat_thread_id), - __ATOMIC_ACQ_REL - ); + const uint64_t old_packed = 
it->second.packed; + it->second.packed = pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id); if (is_cold_miss) { local_pc_statistics[pc_offset][0].dist[0] += 1; continue; } - const uint32_t last_pc = unpack_shadow_pc(old_packed); + const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed); + const uint8_t last_generation = static_cast(last_pc_encoded >> 24); + if (last_generation != _kernel_generation) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu); const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); const uint64_t last_lane_id = static_cast(last_flat_thread_id & 0x1F); @@ -525,8 +565,8 @@ void PcDependency::worker_loop(uint64_t worker_idx) { const auto& trace_indices = _job_worker_trace_indices[worker_idx]; for (uint64_t i : trace_indices) { - MemoryAccess trace = _job_accesses_buffer[i]; - uint32_t pc_offset = trace.pc; + const MemoryAccess& trace = _job_accesses_buffer[i]; + uint32_t pc_offset = (trace.pc & 0x00FFFFFFu); uint32_t flags = trace.flags; uint32_t access_size = trace.accessSize; uint32_t distinct_sector_count = trace.distinct_sector_count; @@ -538,54 +578,50 @@ void PcDependency::worker_loop(uint64_t worker_idx) { } case MemoryType::Shared:{ flags |= SANITIZER_MEMORY_SHARED; - for (int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - unit_access_shared( - trace.addresses[j], - pc_offset, - trace.ctaId, - trace.warpId, - j, - trace.accessSize, - local_pc_statistics, - local_shadow_memory_shared - ); - } + uint32_t remaining_mask = active_mask; + while (remaining_mask != 0) { + const uint32_t j = static_cast(__builtin_ctz(remaining_mask)); + remaining_mask &= (remaining_mask - 1); + unit_access_shared( + trace.addresses[j], + pc_offset, + trace.ctaId, + trace.warpId, + j, + trace.accessSize, + local_pc_statistics, + 
local_shadow_memory_shared + ); } break; } case MemoryType::Global:{ flags |= SANITIZER_MEMORY_GLOBAL; - memory_region memory_region_target; - uint64_t first_valid_address = 0; - for (int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - first_valid_address = trace.addresses[j]; - break; - } - } - assert(first_valid_address != 0); - for (auto memory_region_iter : this->_memory_regions) { - if (memory_region_iter.contains(first_valid_address)) { - memory_region_target = memory_region_iter; - break; - } + if (active_mask == 0) { + break; } + const uint32_t first_lane = static_cast(__builtin_ctz(active_mask)); + const uint64_t first_valid_address = trace.addresses[first_lane]; + const memory_region* memory_region_target_ptr = + find_memory_region_containing(this->_memory_regions, first_valid_address); + assert(memory_region_target_ptr != nullptr); + memory_region memory_region_target = *memory_region_target_ptr; uint64_t memory_region_start = memory_region_target.get_start(); assert(memory_region_start != 0); - for ( int j = 0; j < GPU_WARP_SIZE; j++) { - if (active_mask & (1u << j)) { - unit_access( - trace.addresses[j] - memory_region_start, - pc_offset, - trace.ctaId, - trace.warpId, - j, - memory_region_target, - access_size, - local_pc_statistics - ); - } + uint32_t remaining_mask = active_mask; + while (remaining_mask != 0) { + const uint32_t j = static_cast(__builtin_ctz(remaining_mask)); + remaining_mask &= (remaining_mask - 1); + unit_access( + trace.addresses[j] - memory_region_start, + pc_offset, + trace.ctaId, + trace.warpId, + j, + memory_region_target, + access_size, + local_pc_statistics + ); } break; }