diff --git a/include/sanalyzer.h b/include/sanalyzer.h index e42ae53..0d9e188 100644 --- a/include/sanalyzer.h +++ b/include/sanalyzer.h @@ -25,6 +25,9 @@ typedef enum { GPU_PATCH_TIME_HOTNESS_CPU = 8, GPU_PATCH_ROOFLINE_FLOPS_NVBIT = 9, GPU_PATCH_ROOFLINE_SIZE = 10, + GPU_PATCH_HEATMAP_ANALYSIS = 11, + GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12, + GPU_PATCH_PC_DEPENDENCY_ANALYSIS = 13, } AccelProfPatchName_t; diff --git a/include/tools/block_divergence_analysis.h b/include/tools/block_divergence_analysis.h new file mode 100644 index 0000000..4fb52dd --- /dev/null +++ b/include/tools/block_divergence_analysis.h @@ -0,0 +1,73 @@ +#ifndef YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H +#define YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +namespace yosemite { + +class BlockDivergenceAnalysis final : public Tool { +public: + BlockDivergenceAnalysis(); + + ~BlockDivergenceAnalysis(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void evt_callback(EventPtr_t evt); + + void flush(); + +private: + void kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + +/* +********************************* variables ********************************* +*/ + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + struct BlockStat { + 
std::unordered_map pc_counts; + uint64_t read_count = 0; + uint64_t write_count = 0; + }; + + std::unordered_map _block_entries; + std::set _unique_pcs; +}; + +} // yosemite +#endif // YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h new file mode 100644 index 0000000..2a50475 --- /dev/null +++ b/include/tools/heatmap_analysis.h @@ -0,0 +1,78 @@ +#ifndef YOSEMITE_HEATMAP_ANALYSIS_H +#define YOSEMITE_HEATMAP_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +#include + +#define SECTOR_TAG_SHIFT 5 + +namespace yosemite { + +class HeatmapAnalysis final : public Tool { +public: + HeatmapAnalysis(); + + ~HeatmapAnalysis(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void evt_callback(EventPtr_t evt); + + void flush(); + +private: + void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length); + + void add_sector_pc_information(uint64_t sector_tag, uint64_t pc); + + void kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + +/* +********************************* variables ********************************* +*/ + + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + std::vector _traces; + std::unordered_map> _heatmap_data; + std::unordered_map> 
_sector_pc_information; + +}; + +} // namespace yosemite +#endif // YOSEMITE_HEATMAP_ANALYSIS_H diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h new file mode 100644 index 0000000..1df9813 --- /dev/null +++ b/include/tools/pc_dependency_analysis.h @@ -0,0 +1,283 @@ +#ifndef YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H +#define YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H + + +#include "tools/tool.h" +#include "utils/event.h" +#include "gpu_patch.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ +#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE +#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_RED +#define SANITIZER_MEMORY_DEVICE_FLAG_RED 0x3 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC +#define SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC 0x4 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH +#define SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH 0x8 +#endif + +#ifndef SANITIZER_MEMORY_GLOBAL +#define SANITIZER_MEMORY_GLOBAL 0x10 +#endif + +#ifndef SANITIZER_MEMORY_SHARED +#define SANITIZER_MEMORY_SHARED 0x20 +#endif + +#ifndef SANITIZER_MEMORY_LOCAL +#define SANITIZER_MEMORY_LOCAL 0x40 +#endif + +namespace yosemite { + +/* we choose to use PC offset instead of PC because the PC is too long for shadow memory and it is not necessary to track the original PC. +The offset will be calculated during trace collection. + +Every memory allocation will cause a shadow memory to be created. +Every memory deallocation will cause a shadow memory to be destroyed. +Shadow memory bitmask will be reset when a kernel finished. (to avoid mass shadow memory reset) + +The gpu data analysis will +1.iterate the trace buffer and query the shadow memory to get the corresponding shadow memory entry. +2. 
compare the last access information with the current access information with the rules below: + 0. if the bitmask of this access is 0, the current access is a cold miss: set its ancient pc to 0xFFFFFFFF (NOTE(review): kernel_trace_flush treats ancient pc 0 as the cold-miss marker; confirm which sentinel is current). + 1. if last access and current access are from the same thread, then it is an intra thread access. + 2. if last access and current access are from the same warp, then it is an intra warp access. + 3. if last access and current access are from the same block, then it is an intra block access. + 4. if last access and current access are from the same grid, then it is an intra grid access. +3. update the pc_statistics with the current pc, ancient pc and the distance. +4. update the shadow memory entry with the current pc and the flat thread id. +*/ + + +class memory_region{ // half-open address range [start, end) +public: + memory_region() : start(0), end(0) {}; + memory_region(uint64_t start, uint64_t end) : start(start), end(end) {}; + ~memory_region() {}; + + bool contains(uint64_t ptr) const { // true when start <= ptr < end (end is exclusive) + return ptr >= start && ptr < end; + }; + + bool operator==(const memory_region& other) const { + return start == other.start && end == other.end; + }; + + bool operator<(const memory_region& other) const { + // strict-weak-ordering: compare both start and end (ordered by start first, then by end) + if (start != other.start) return start < other.start; + return end < other.end; + }; + + uint64_t get_start() const { + return start; + }; + uint64_t get_end() const { + return end; + }; + +private: + uint64_t start; + uint64_t end; +}; + +class alignas(8) shadow_memory_entry{ +public: + shadow_memory_entry() {}; + ~shadow_memory_entry() {}; + // Packed representation: low 32 bits = (generation:8 | pc24:24), + // high 32 bits = last_flat_thread_id. + // Keeping a single 64-bit field avoids type-punning UB in atomic exchange. + // packed == 0 means invalid/uninitialized (cold). 
+ uint64_t packed = 0; +}; + +class shadow_memory{ +public: + shadow_memory(uint64_t size) + :_size(size), + _size_celled((size + 3) / 4 * 4), + _stride(_size_celled / 4), + _entries_bytes(std::max(1, _size_celled * sizeof(shadow_memory_entry))) { + _shadow_memory_entries = static_cast( + mmap(nullptr, _entries_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) + ); + assert(_shadow_memory_entries != MAP_FAILED); + + printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size); + printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry)); + printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry)); + }; + ~shadow_memory() { + if (_shadow_memory_entries != nullptr && _shadow_memory_entries != MAP_FAILED) { + munmap(_shadow_memory_entries, _entries_bytes); + _shadow_memory_entries = nullptr; + } + } + void reset_entries() { + if (madvise(_shadow_memory_entries, _entries_bytes, MADV_DONTNEED) != 0) { + std::memset(_shadow_memory_entries, 0, _entries_bytes); + } + }; + shadow_memory_entry& get_entry(uint64_t offset) { + assert(offset < _size); + //update layout: use offset/4 + offset%4 * _size/4 to make every 4 bytes adjacent in one cache line + return _shadow_memory_entries[(offset/4) + (offset%4) * _stride]; + // return _shadow_memory_entries[offset]; + } + uint64_t _size; + uint64_t _size_celled; + uint64_t _stride; + uint64_t _entries_bytes; + shadow_memory_entry* _shadow_memory_entries = nullptr; +}; + + +class PC_statisitics{ +public: + std::array dist = {0, 0, 0, 0}; + // 0: intra thread + // 1: intra warp + // 2: intra block + // 3: intra grid +}; + +class PcDependency final : public Tool { +public: + PcDependency(); + + ~PcDependency(); + + void gpu_data_analysis(void* data, uint64_t size); + + void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {}; + + void 
allocation_callback(uint64_t ptr, uint64_t size); + + void deallocation_callback(uint64_t ptr); + + void evt_callback(EventPtr_t evt); + + void flush(); + +private: + void kernel_start_callback(std::shared_ptr kernel); + + void kernel_end_callback(std::shared_ptr kernel); + + void mem_alloc_callback(std::shared_ptr mem); + + void mem_free_callback(std::shared_ptr mem); + + void ten_alloc_callback(std::shared_ptr ten); + + void ten_free_callback(std::shared_ptr ten); + + void kernel_trace_flush(std::shared_ptr kernel); + + void unit_access( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + memory_region& memory_region_target, + int access_size, + std::unordered_map>& local_pc_statistics + ); + + void unit_access_shared( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + int access_size, + std::unordered_map>& local_pc_statistics, + std::unordered_map>& local_shadow_memory_shared + ); + + void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size); + void worker_loop(uint64_t worker_idx); + + +/* +********************************* variables ********************************* +*/ + Timer_t _timer; + + std::string output_directory; + uint32_t kernel_id = 0; + uint8_t _kernel_generation = 0; + + + std::map> kernel_events; + std::map> alloc_events; + std::map> active_memories; + + std::map> tensor_events; + std::map> active_tensors; + + + std::vector _memory_regions; + + std::map> _shadow_memories; // memory region, shadow memory + std::unordered_map> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics + std::unordered_map> _pc_flags; // pc offset, flags, size of the access + // Index [0..31] stores distinct sector count 1..32. + // Index [32..64] stores active lane count 0..32. 
+ std::unordered_map> _distinct_sector_count; // pc offset, distinct sector distribution + + // Persistent worker pool and per-worker shared-memory shadow state. + uint64_t _worker_count = 1; + std::vector _workers; + std::vector>> _worker_shadow_memory_shared; + + // Per-batch job data produced by gpu_data_analysis and consumed by workers. + const MemoryAccess* _job_accesses_buffer = nullptr; + std::vector> _job_worker_trace_indices; + std::vector>> _job_worker_pc_statistics; + std::vector>> _job_worker_pc_flags; + std::vector>> _job_worker_distinct_sector_count; + + std::mutex _worker_pool_mutex; + std::condition_variable _worker_pool_cv; + std::condition_variable _worker_pool_done_cv; + bool _worker_pool_shutdown = false; + uint64_t _worker_job_generation = 0; + uint64_t _worker_pending_jobs = 0; + +}; + +} // yosemite +#endif // YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h index 4115ebb..b4d39dd 100644 --- a/include/tools/tool_type.h +++ b/include/tools/tool_type.h @@ -16,7 +16,10 @@ typedef enum { ROOFLINE_FLOPS = 11, ROOFLINE_SIZE = 12, ROOFLINE_TIME = 13, - TOOL_NUMS = 14 + HEATMAP_ANALYSIS = 14, + BLOCK_DIVERGENCE_ANALYSIS = 15, + PC_DEPENDENCY_ANALYSIS = 16, + TOOL_NUMS = 17 } AnalysisTool_t; #endif // TOOL_TYPE_H \ No newline at end of file diff --git a/include/utils/event.h b/include/utils/event.h index 903fc43..c29dd4e 100644 --- a/include/utils/event.h +++ b/include/utils/event.h @@ -69,6 +69,7 @@ typedef struct KernelLaunch : public Event { uint32_t touched_objects; uint32_t touched_objects_size; uint64_t key; // for UVM Advisor + uint64_t kernel_pc; KernelLaunch() { this->evt_type = EventType_KERNEL_LAUNCH; diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp index 0f250c5..faa6056 100644 --- a/src/sanalyzer.cpp +++ b/src/sanalyzer.cpp @@ -16,6 +16,9 @@ #include "tools/time_hotness_cpu.h" #include "tools/event_trace.h" #include "tools/event_trace_mgpu.h" +#include "tools/heatmap_analysis.h" 
+#include "tools/block_divergence_analysis.h" +#include "tools/pc_dependency_analysis.h" #include #include @@ -104,6 +107,15 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) { } else if (std::string(tool_name) == "event_trace_mgpu") { tool = EVENT_TRACE_MGPU; _tools.emplace(EVENT_TRACE_MGPU, std::make_shared()); + } else if (std::string(tool_name) == "heatmap_analysis") { + tool = HEATMAP_ANALYSIS; + _tools.emplace(HEATMAP_ANALYSIS, std::make_shared()); + } else if (std::string(tool_name) == "block_divergence_analysis") { + tool = BLOCK_DIVERGENCE_ANALYSIS; + _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared()); + } else if (std::string(tool_name) == "pc_dependency_analysis") { + tool = PC_DEPENDENCY_ANALYSIS; + _tools.emplace(PC_DEPENDENCY_ANALYSIS, std::make_shared()); } else { fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n"); fflush(stderr); @@ -249,6 +261,17 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) { options.patch_name = GPU_NO_PATCH; } else if (tool == EVENT_TRACE_MGPU) { options.patch_name = GPU_NO_PATCH; + } else if (tool == HEATMAP_ANALYSIS) { + options.patch_name = GPU_PATCH_HEATMAP_ANALYSIS; + options.patch_file = "gpu_patch_heatmap_analysis.fatbin"; + } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) { + options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS; + options.patch_file = "gpu_patch_block_divergence_analysis.fatbin"; + } else if (tool == PC_DEPENDENCY_ANALYSIS) { + options.patch_name = GPU_PATCH_PC_DEPENDENCY_ANALYSIS; + // nv-compute/Makefile generates fatbins based on gpu_src/*.cu filenames. + // The source file for this tool is nv-compute/gpu_src/gpu_patch_pc_dependency.cu + options.patch_file = "gpu_patch_pc_dependency.fatbin"; } // enable torch profiler? 
diff --git a/src/tools/block_divergence_analysis.cpp b/src/tools/block_divergence_analysis.cpp new file mode 100644 index 0000000..8dd16d6 --- /dev/null +++ b/src/tools/block_divergence_analysis.cpp @@ -0,0 +1,190 @@ +#include "tools/block_divergence_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ +#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1 +#endif + +#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE +#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2 +#endif + +using namespace yosemite; + + +BlockDivergenceAnalysis::BlockDivergenceAnalysis() : Tool(BLOCK_DIVERGENCE_ANALYSIS) { // identify as this tool's own AnalysisTool_t id, the one it is registered under in yosemite_tool_enable + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); // torch profiling is opt-in: only the exact value "1" enables it + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in BlockDivergenceAnalysis.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); // output directory: block_distribution_[<YOSEMITE_APP_NAME>_]<date and time> + if (env_app_name != nullptr) { + output_directory = "block_distribution_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = "block_distribution_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); +} + + +BlockDivergenceAnalysis::~BlockDivergenceAnalysis() {} + + +void BlockDivergenceAnalysis::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _block_entries.clear(); + _unique_pcs.clear(); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::kernel_trace_flush(std::shared_ptr kernel) { + std::string filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping traces to %s\n", filename.c_str()); + + std::ofstream out(filename); + std::vector pc_list(_unique_pcs.begin(), _unique_pcs.end()); + std::sort(pc_list.begin(), pc_list.end()); + + 
std::vector block_ids; + block_ids.reserve(_block_entries.size()); + for (const auto& entry : _block_entries) { + block_ids.push_back(entry.first); + } + std::sort(block_ids.begin(), block_ids.end()); + + out << "blockidx,blockidy,blockidz"; + for (const auto pc : pc_list) { + out << ",0x" << std::hex << std::setw(16) << std::setfill('0') << pc << std::dec; + } + out << ",read_count,write_count" << std::endl; + + for (const auto block_id : block_ids) { + const auto& stats = _block_entries.at(block_id); + out << block_id << ",0,0"; + for (const auto pc : pc_list) { + auto pc_it = stats.pc_counts.find(pc); + uint64_t count = (pc_it != stats.pc_counts.end()) ? pc_it->second : 0; + out << "," << count; + } + out << "," << stats.read_count << "," << stats.write_count << std::endl; + } +} + + +void BlockDivergenceAnalysis::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::mem_alloc_callback(std::shared_ptr mem) { + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::mem_free_callback(std::shared_ptr mem) { + auto it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + active_memories.erase(it); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + active_tensors.erase(it); + + _timer.increment(true); +} + + +void BlockDivergenceAnalysis::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + for (uint64_t i 
= 0; i < size; i++) { + const MemoryAccess& trace = accesses_buffer[i]; + uint64_t executed_inst_count = static_cast(__builtin_popcount(trace.active_mask)); + uint64_t pc = trace.pc; + uint64_t cta_id = trace.ctaId; + + auto& entry = _block_entries[cta_id]; + entry.pc_counts[pc] += executed_inst_count; + if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) { + entry.read_count += executed_inst_count; + } + if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) { + entry.write_count += executed_inst_count; + } + + _unique_pcs.insert(pc); + } + +} + + +void BlockDivergenceAnalysis::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void BlockDivergenceAnalysis::flush() { +} diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp new file mode 100644 index 0000000..6be87ce --- /dev/null +++ b/src/tools/heatmap_analysis.cpp @@ -0,0 +1,199 @@ +#include "tools/heatmap_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace yosemite; + + +HeatmapAnalysis::HeatmapAnalysis() : Tool(HEATMAP_ANALYSIS) { + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in HeatmapAnalysis.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); + if 
(env_app_name != nullptr) { + output_directory = "heatmap_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = "heatmap_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); +} + + +HeatmapAnalysis::~HeatmapAnalysis() {} + + +void HeatmapAnalysis::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _traces.clear(); + _heatmap_data.clear(); + _sector_pc_information.clear(); + + _timer.increment(true); +} + + +void HeatmapAnalysis::kernel_trace_flush(std::shared_ptr kernel) { + std::string filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping block 0 heatmap to %s\n", filename.c_str()); + + std::ofstream out(filename); + std::stringstream ss; + + std::vector>> sorted_heatmap_data(_heatmap_data.begin(), _heatmap_data.end()); + std::sort(sorted_heatmap_data.begin(), sorted_heatmap_data.end(), [](const std::pair>& a, const std::pair>& b) { + return a.first < b.first; + }); + ss << "Sector Tag,\t\tDistinct Warp Count,\tAccess Count,\t\t\tTouched PC" << std::endl; + for (auto& [tag, data] : sorted_heatmap_data) { + ss << "0x"<(data[i]).count() << ","; + } + ss << "\t\t"; + for (int i = 9; i < 18; i++) { + ss << data[i] << ","; + } + for (auto pc : _sector_pc_information[tag]) { + ss << "\t\t0x" << std::hex << pc << std::dec << ","; + } + ss << std::endl; + } + + out << ss.str(); + + out.close(); +} + + +void HeatmapAnalysis::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void HeatmapAnalysis::mem_alloc_callback(std::shared_ptr mem) { + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + + _timer.increment(true); +} + + +void HeatmapAnalysis::mem_free_callback(std::shared_ptr mem) { + auto 
it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + active_memories.erase(it); + + _timer.increment(true); +} + + +void HeatmapAnalysis::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + + _timer.increment(true); +} + + +void HeatmapAnalysis::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + active_tensors.erase(it); + + _timer.increment(true); +} + +// Record one lane's access in the per-sector heatmap. +// warp_id: bit position set in the distinct-warp masks (used as a shift on a 32-bit value, so it must be < 32) +// sector_tag: key of the sector in _heatmap_data (callers pass address >> SECTOR_TAG_SHIFT) +// offset: index (0-7) of the first 4-byte word touched inside the sector +// length: access size in bytes; one word slot is updated per started 4 bytes +// Words that would fall past the last word of the sector are ignored, so the +// sector-wide slots 8 and 17 are only ever updated by the statements that name them. +// return: void +void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length) { + + // heatmap_data[tag][0-7]: distinct warp id mask for each word in this sector; + // heatmap_data[tag][8]: distinct warp id mask for entire sector; + // heatmap_data[tag][9-17]: access count for each word and the last is for entire sector; + // the entire-sector access count (index 17) is incremented once per call; + auto& sector_data = _heatmap_data[sector_tag]; + auto mask = (1u << warp_id); + for (uint32_t i = 0; i < length && offset + i/4 < 8; i+=4) { // stop at word 8: a sector holds 8 words, later ones belong to the next sector + sector_data[offset+i/4] |= mask; + sector_data[8] |= mask; + sector_data[9+offset+i/4] += 1; + } + sector_data[17] += 1; +} + +void HeatmapAnalysis::add_sector_pc_information(uint64_t sector_tag, uint64_t pc) { + _sector_pc_information[sector_tag].insert(pc); +} + + +void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + for (uint64_t i = 0; i < size; i++) { + auto trace = accesses_buffer[i]; + for (int j = 0; j < GPU_WARP_SIZE; j++) { + if (trace.active_mask & (1u << j)) { + auto 
sector_tag = trace.addresses[j] >> SECTOR_TAG_SHIFT; + auto offset = (trace.addresses[j] & 31) >> 2; + unit_access(trace.warpId, sector_tag, offset, trace.accessSize); + add_sector_pc_information(sector_tag, trace.pc); + } + } + } +} + +void HeatmapAnalysis::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void HeatmapAnalysis::flush() { +} diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp new file mode 100644 index 0000000..aa3f525 --- /dev/null +++ b/src/tools/pc_dependency_analysis.cpp @@ -0,0 +1,770 @@ +#include "tools/pc_dependency_analysis.h" +#include "utils/helper.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace yosemite; + +namespace { +static std::string json_escape(const std::string& s) { + std::string out; + out.reserve(s.size() + 8); + for (char c : s) { + switch (c) { + case '\"': out += "\\\""; break; + case '\\': out += "\\\\"; break; + case '\b': out += "\\b"; break; + case '\f': out += "\\f"; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + // control chars + if (static_cast(c) < 0x20) { + std::ostringstream oss; + oss << "\\u" + << std::hex << std::setw(4) << std::setfill('0') + << (int)static_cast(c); + out += oss.str(); + } else { + out += c; + } + } + } + return out; 
+} + +static std::string hex_u32(uint32_t v) { + std::ostringstream oss; + oss << "0x" << std::hex << v; + return oss.str(); +} +static std::string flags_to_string(uint32_t flags) { + std::ostringstream oss; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) oss << "READ"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) oss << "WRITE"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC) oss << "ATOMIC"; + if (flags & SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH) oss << "PREFETCH"; + oss << " "; + if (flags & SANITIZER_MEMORY_GLOBAL) oss << "GLOBAL"; + if (flags & SANITIZER_MEMORY_SHARED) oss << "SHARED"; + if (flags & SANITIZER_MEMORY_LOCAL) oss << "LOCAL"; + + return oss.str(); +} + +static inline uint64_t pack_shadow_entry(uint8_t generation, uint32_t pc24, uint32_t flat_thread_id) { + const uint32_t encoded_pc = (static_cast(generation) << 24) + | (pc24 & 0x00FFFFFFu); + return (static_cast(flat_thread_id) << 32) | static_cast(encoded_pc); +} + +static inline uint32_t unpack_shadow_pc_encoded(uint64_t packed) { + return static_cast(packed & 0xFFFFFFFFu); +} + +static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) { + return static_cast(packed >> 32); +} + +static inline const memory_region* find_memory_region_containing( + const std::vector& regions, + uint64_t addr +) { + auto it = std::upper_bound( + regions.begin(), + regions.end(), + addr, + [](uint64_t value, const memory_region& region) { + return value < region.get_start(); + } + ); + if (it == regions.begin()) { + return nullptr; + } + --it; + return it->contains(addr) ? 
&(*it) : nullptr; +} +} // namespace + + +PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) { + const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED"); + if (torch_prof && std::string(torch_prof) == "1") { + fprintf(stdout, "Enabling torch profiler in PcDependency.\n"); + _torch_enabled = true; + } + + const char* env_app_name = std::getenv("YOSEMITE_APP_NAME"); + if (env_app_name != nullptr) { + output_directory = "dependency_" + std::string(env_app_name) + + "_" + get_current_date_n_time(); + } else { + output_directory = "dependency_" + get_current_date_n_time(); + } + check_folder_existance(output_directory); + + _worker_count = std::max(1u, std::thread::hardware_concurrency()); + _worker_shadow_memory_shared.resize(_worker_count); + _job_worker_trace_indices.resize(_worker_count); + _job_worker_pc_statistics.resize(_worker_count); + _job_worker_pc_flags.resize(_worker_count); + _job_worker_distinct_sector_count.resize(_worker_count); + _workers.reserve(_worker_count); + for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + _workers.emplace_back(&PcDependency::worker_loop, this, worker_idx); + } +} + + +PcDependency::~PcDependency() { + { + std::lock_guard guard(_worker_pool_mutex); + _worker_pool_shutdown = true; + ++_worker_job_generation; + } + _worker_pool_cv.notify_all(); + for (auto& worker : _workers) { + if (worker.joinable()) { + worker.join(); + } + } +} + + +void PcDependency::kernel_start_callback(std::shared_ptr kernel) { + + kernel->kernel_id = kernel_id++; + kernel_events.emplace(_timer.get(), kernel); + _pc_statistics.clear(); + _pc_flags.clear(); + _distinct_sector_count.clear(); + for (auto& shared_map : _worker_shadow_memory_shared) { + shared_map.clear(); + } + _kernel_generation = static_cast(_kernel_generation + 1u); + if (_kernel_generation == 0) { + for (auto& shadow_memory_iter : _shadow_memories) { + shadow_memory_iter.second->reset_entries(); + } + printf("[PC_DEPENDENCY] Shadow generation 
wrapped, resetting entries\n"); + } + _timer.increment(true); +} + + +void PcDependency::kernel_trace_flush(std::shared_ptr kernel) { + std::string filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".csv"; + printf("Dumping pc dependency to %s\n", filename.c_str()); + + std::ofstream out(filename); + out << "current_pc_offset,ancient_pc_offset,flags,intra_thread,intra_warp,intra_block,intra_grid\n"; + + std::vector>> outer( + _pc_statistics.begin(), _pc_statistics.end()); + std::sort(outer.begin(), outer.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + for (auto& [cur_pc, inner_map] : outer) { + std::vector> inner(inner_map.begin(), inner_map.end()); + std::sort(inner.begin(), inner.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + uint32_t flags = 0; + uint32_t access_size = 0; + auto fit = _pc_flags.find(cur_pc); + if (fit != _pc_flags.end()){ flags = fit->second.first; access_size = fit->second.second;} + + for (auto& [anc_pc, st] : inner) { + out << "0x" << std::hex << cur_pc + << ",0x" << anc_pc + << ",0x" << flags + << std::dec + << "," << st.dist[0] + << "," << st.dist[1] + << "," << st.dist[2] + << "," << st.dist[3] + << "\n"; + } + } + + // JSON output for building PC dependency graph (joinable with CFG) + std::string json_filename = output_directory + "/kernel_" + + std::to_string(kernel->kernel_id) + ".json"; + std::ofstream jout(json_filename); + jout << "{\n"; + jout << " \"tool\": \"pc_dependency_analysis\",\n"; + jout << " \"kernel\": {\n"; + jout << " \"kernel_id\": " << kernel->kernel_id << ",\n"; + jout << " \"kernel_name\": \"" << json_escape(kernel->kernel_name) << "\",\n"; + jout << " \"device_id\": " << kernel->device_id << ",\n"; + jout << " \"kernel_pc\": " << kernel->kernel_pc << ",\n"; + jout << " \"kernel_pc_hex\": \"" << hex_u32((uint32_t)kernel->kernel_pc) << "\"\n"; + jout << " },\n"; + jout << " \"shadow_memory_granularity_bytes\": 1,\n"; + jout << " 
\"sample_stride_bytes\": 4,\n"; + + // Collect nodes (all current PCs + all non-cold ancient PCs) + std::set nodes; + for (const auto& [cur_pc, inner_map] : _pc_statistics) { + nodes.insert(cur_pc); + for (const auto& [anc_pc, st] : inner_map) { + (void)st; + if (anc_pc != 0u) nodes.insert(anc_pc); + } + } + + jout << " \"nodes\": [\n"; + { + bool first = true; + for (uint32_t pc : nodes) { + if (!first) jout << ",\n"; + first = false; + auto fit = _pc_flags.find(pc); + bool has_flags = (fit != _pc_flags.end()); + uint32_t flags = has_flags ? fit->second.first : 0; + uint32_t access_size = has_flags ? fit->second.second : 0; + bool has_distinct_sector_count = (_distinct_sector_count.find(pc) != _distinct_sector_count.end()); + jout << " {\"pc\": " << pc + << ", \"pc_hex\": \"" << hex_u32(pc) << "\""; + if (has_flags) { + jout << ", \"flags\": \"" << flags_to_string(flags) << "\"" + << ", \"flags_hex\": \"" << hex_u32(flags) << "\"" + << ", \"access_size\": " << access_size; + } else { + jout << ", \"flags\": null, \"flags_hex\": null, \"access_size\": null"; + } + if (has_distinct_sector_count) { + jout << ", \"distinct_sector_count\": {"; + for (int i = 1; i <= 32; i++) { + jout << "\"" << i << "\": " << _distinct_sector_count[pc][i - 1]; + if (i != 32) { + jout << ", "; + } + } + jout << "}"; + jout << ", \"active_lane_count\": {"; + for (int i = 0; i <= 32; i++) { + jout << "\"" << i << "\": " << _distinct_sector_count[pc][32 + i]; + if (i != 32) { + jout << ", "; + } + } + jout << "}"; + } else { + jout << ", \"distinct_sector_count\": null, \"active_lane_count\": null"; + } + jout << "}"; + } + jout << "\n"; + } + jout << " ],\n"; + + // Edges: ancient_pc -> current_pc, with per-scope counts. 
+ jout << " \"edges\": [\n"; + { + // Stable order: sort by current pc then ancient pc + std::vector>> outer2( + _pc_statistics.begin(), _pc_statistics.end()); + std::sort(outer2.begin(), outer2.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + bool first_edge = true; + for (auto& [cur_pc, inner_map] : outer2) { + std::vector> inner2(inner_map.begin(), inner_map.end()); + std::sort(inner2.begin(), inner2.end(), + [](auto& a, auto& b){ return a.first < b.first; }); + + // current flags if available + auto cfit = _pc_flags.find(cur_pc); + bool has_cflags = (cfit != _pc_flags.end()); + uint32_t cflags = has_cflags ? cfit->second.first : 0; + uint32_t c_access_size = has_cflags ? cfit->second.second : 0; + + for (auto& [anc_pc, st] : inner2) { + if (!first_edge) jout << ",\n"; + first_edge = false; + + bool cold_miss = (anc_pc == 0u); + + jout << " {\"current_pc\": " << cur_pc + << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\"" + << ", \"ancient_pc\": "; + if (cold_miss) { + jout << "null"; + } else { + jout << anc_pc; + } + jout << ", \"ancient_pc_hex\": "; + if (cold_miss) { + jout << "null"; + } else { + jout << "\"" << hex_u32(anc_pc) << "\""; + } + jout << ", \"cold_miss\": " << (cold_miss ? 
"true" : "false"); + + if (has_cflags) { + jout << ", \"current_flags\": " << cflags + << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\"" + << ", \"current_access_size\": " << c_access_size; + } else { + jout << ", \"current_flags\": null, \"current_flags_hex\": null"; + } + + jout << ", \"dist\": {" + << "\"intra_thread\": " << st.dist[0] + << ", \"intra_warp\": " << st.dist[1] + << ", \"intra_block\": " << st.dist[2] + << ", \"intra_grid\": " << st.dist[3] + << "}}"; + } + } + jout << "\n"; + } + jout << " ]\n"; + jout << "}\n"; + printf("Dumping pc dependency graph json to %s\n", json_filename.c_str()); +} + + +void PcDependency::kernel_end_callback(std::shared_ptr kernel) { + auto evt = std::prev(kernel_events.end())->second; + evt->end_time = _timer.get(); + for (auto& shared_map : _worker_shadow_memory_shared) { + shared_map.clear(); + } + printf("[PC_DEPENDENCY] Clearing shadow memory shared\n"); + kernel_trace_flush(evt); + + _timer.increment(true); +} + + +void PcDependency::mem_alloc_callback(std::shared_ptr mem) { + // TODO: add shadow memory allocation here + alloc_events.emplace(_timer.get(), mem); + active_memories.emplace(mem->addr, mem); + memory_region memory_region_current = memory_region((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size)); + _memory_regions.insert( + std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current), + memory_region_current + ); + _shadow_memories.emplace(memory_region_current, std::make_unique(mem->size)); + + printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size); + _timer.increment(true); +} + +void PcDependency::mem_free_callback(std::shared_ptr mem) { + auto it = active_memories.find(mem->addr); + assert(it != active_memories.end()); + + uint64_t sz = it->second->size; // 从 alloc 事件拿 size + active_memories.erase(it); + + memory_region 
r((uint64_t)mem->addr, (uint64_t)mem->addr + sz); + + auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end() && *vit == r) _memory_regions.erase(vit); + + _shadow_memories.erase(r); + printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz); + _timer.increment(true); +} + + +void PcDependency::ten_alloc_callback(std::shared_ptr ten) { + tensor_events.emplace(_timer.get(), ten); + active_tensors.emplace(ten->addr, ten); + memory_region memory_region_current((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size)); + _memory_regions.insert( + std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current), + memory_region_current + ); + _shadow_memories.emplace(memory_region_current, std::make_unique(ten->size)); + printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size); + + _timer.increment(true); +} + + +void PcDependency::ten_free_callback(std::shared_ptr ten) { + auto it = active_tensors.find(ten->addr); + assert(it != active_tensors.end()); + + // TenFree.size may be negative (e.g., accounting-style events). Use size from TenAlloc. 
+ const uint64_t sz = static_cast(it->second->size); + active_tensors.erase(it); + + memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz); + + auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r); + if (vit != _memory_regions.end() && *vit == r) { + _memory_regions.erase(vit); + } + + _shadow_memories.erase(r); + printf("[PC_DEPENDENCY] Freeing shadow memory for tensor region: %p - %p, size: %lu\n", + (void*)r.get_start(), (void*)r.get_end(), sz); + _timer.increment(true); +} + +void PcDependency::unit_access( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + memory_region& memory_region_target, + int access_size, + std::unordered_map>& local_pc_statistics +) { + // auto& shadow_memory = this->_shadow_memories[memory_region_target]; + auto shadow_memory_it = this->_shadow_memories.find(memory_region_target); + if (shadow_memory_it == this->_shadow_memories.end()) { + printf("shadow memory not found for memory region: %lu - %lu\n", memory_region_target.get_start(), memory_region_target.get_end()); + return; + } + auto& shadow_memory = *(shadow_memory_it->second); + const uint32_t current_flat_thread_id = + static_cast((current_block_id << 10) | (current_warp_id << 5) | current_lane_id); + + for (int i = 0; i < access_size; i += 4) { + const uint64_t addr = ptr + i; + // Byte-granularity shadow memory: addr is byte offset within allocation. + // Bound check to avoid OOB on allocations at end boundary or odd sizes. 
+ if (addr >= shadow_memory._size) { + break; + } + + auto& entry = shadow_memory.get_entry(addr); + const uint64_t old_packed = __atomic_exchange_n( + &entry.packed, + pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id), + __ATOMIC_ACQ_REL + ); + const bool is_cold_miss = (old_packed == 0); + + if (is_cold_miss) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + + const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed); + const uint8_t last_generation = static_cast(last_pc_encoded >> 24); + if (last_generation != _kernel_generation) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu); + const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); + const uint64_t last_block_id = static_cast(last_flat_thread_id >> 10); + const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); + const uint64_t last_lane_id = static_cast(last_flat_thread_id & 0x1F); + if (last_block_id != current_block_id) { + local_pc_statistics[pc_offset][last_pc].dist[3] += 1; + } else if (last_warp_id != current_warp_id) { + local_pc_statistics[pc_offset][last_pc].dist[2] += 1; + } else if (last_lane_id != current_lane_id) { + local_pc_statistics[pc_offset][last_pc].dist[1] += 1; + } else { + local_pc_statistics[pc_offset][last_pc].dist[0] += 1; + } + } +} + +void PcDependency::unit_access_shared( + uint64_t ptr, + uint32_t pc_offset, + uint64_t current_block_id, + uint32_t current_warp_id, + uint32_t current_lane_id, + int access_size, + std::unordered_map>& local_pc_statistics, + std::unordered_map>& local_shadow_memory_shared +) { + // Per-CTA layered shadow map: local_shadow_memory_shared[cta_id][addr_low32] + auto& cta_shadow = local_shadow_memory_shared[current_block_id]; + const uint32_t base_addr_low32 = static_cast(ptr & 0xFFFFFFFFull); + + for (int i = 0; i < access_size; i += 4) { + const uint32_t addr = base_addr_low32 + 
static_cast(i); // 4 字节粒度 + const uint32_t current_flat_thread_id = + static_cast((current_warp_id << 5) | current_lane_id); + + auto [it, inserted] = cta_shadow.emplace(addr, shadow_memory_entry()); + const bool is_cold_miss = inserted; + const uint64_t old_packed = it->second.packed; + it->second.packed = pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id); + + if (is_cold_miss) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + + const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed); + const uint8_t last_generation = static_cast(last_pc_encoded >> 24); + if (last_generation != _kernel_generation) { + local_pc_statistics[pc_offset][0].dist[0] += 1; + continue; + } + const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu); + const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed); + const uint64_t last_warp_id = static_cast((last_flat_thread_id >> 5) & 0x1F); + const uint64_t last_lane_id = static_cast(last_flat_thread_id & 0x1F); + + if (last_warp_id != current_warp_id) { + // 不同 warp 同 block + local_pc_statistics[pc_offset][last_pc].dist[2] += 1; + } else if (last_lane_id != current_lane_id) { + // 同 warp 不同 lane + local_pc_statistics[pc_offset][last_pc].dist[1] += 1; + } else { + // 同一线程 + local_pc_statistics[pc_offset][last_pc].dist[0] += 1; + } + } +} + +void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) { + // TODO: implement local memory access +} + + +void PcDependency::worker_loop(uint64_t worker_idx) { + uint64_t seen_generation = 0; + while (true) { + uint64_t current_generation = 0; + { + std::unique_lock lock(_worker_pool_mutex); + _worker_pool_cv.wait(lock, [&]{ + return _worker_pool_shutdown || _worker_job_generation > seen_generation; + }); + if (_worker_pool_shutdown) { + return; + } + current_generation = _worker_job_generation; + } + + auto& 
local_pc_statistics = _job_worker_pc_statistics[worker_idx]; + auto& local_pc_flags = _job_worker_pc_flags[worker_idx]; + auto& local_distinct_sector_count = _job_worker_distinct_sector_count[worker_idx]; + auto& local_shadow_memory_shared = _worker_shadow_memory_shared[worker_idx]; + const auto& trace_indices = _job_worker_trace_indices[worker_idx]; + + for (uint64_t i : trace_indices) { + const MemoryAccess& trace = _job_accesses_buffer[i]; + uint32_t pc_offset = (trace.pc & 0x00FFFFFFu); + uint32_t flags = trace.flags; + uint32_t access_size = trace.accessSize; + uint32_t distinct_sector_count = trace.distinct_sector_count; + uint32_t active_mask = trace.active_mask; + switch (trace.type) { + case MemoryType::Local:{ + flags |= SANITIZER_MEMORY_LOCAL; + break; + } + case MemoryType::Shared:{ + flags |= SANITIZER_MEMORY_SHARED; + uint32_t remaining_mask = active_mask; + while (remaining_mask != 0) { + const uint32_t j = static_cast(__builtin_ctz(remaining_mask)); + remaining_mask &= (remaining_mask - 1); + unit_access_shared( + trace.addresses[j], + pc_offset, + trace.ctaId, + trace.warpId, + j, + trace.accessSize, + local_pc_statistics, + local_shadow_memory_shared + ); + } + break; + } + case MemoryType::Global:{ + flags |= SANITIZER_MEMORY_GLOBAL; + if (active_mask == 0) { + break; + } + const uint32_t first_lane = static_cast(__builtin_ctz(active_mask)); + const uint64_t first_valid_address = trace.addresses[first_lane]; + const memory_region* memory_region_target_ptr = + find_memory_region_containing(this->_memory_regions, first_valid_address); + assert(memory_region_target_ptr != nullptr); + memory_region memory_region_target = *memory_region_target_ptr; + uint64_t memory_region_start = memory_region_target.get_start(); + assert(memory_region_start != 0); + uint32_t remaining_mask = active_mask; + while (remaining_mask != 0) { + const uint32_t j = static_cast(__builtin_ctz(remaining_mask)); + remaining_mask &= (remaining_mask - 1); + unit_access( + 
trace.addresses[j] - memory_region_start, + pc_offset, + trace.ctaId, + trace.warpId, + j, + memory_region_target, + access_size, + local_pc_statistics + ); + } + break; + } + default: + printf("unknown memory type\n"); + break; + } + auto& local_flag = local_pc_flags[pc_offset]; + local_flag.first |= flags; + if (local_flag.second == 0) { + local_flag.second = access_size; + } else if (local_flag.second != access_size) { + local_flag.second = std::max(local_flag.second, access_size); + } + if (distinct_sector_count >= 1 && distinct_sector_count <= 32) { + local_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1; + } + const uint32_t active_lane_count = __builtin_popcount(active_mask); + if (active_lane_count <= 32) { + local_distinct_sector_count[pc_offset][32 + active_lane_count] += 1; + } + } + + { + std::lock_guard guard(_worker_pool_mutex); + seen_generation = current_generation; + if (!trace_indices.empty()) { + assert(_worker_pending_jobs > 0); + _worker_pending_jobs -= 1; + if (_worker_pending_jobs == 0) { + _worker_pool_done_cv.notify_one(); + } + } + } + } +} + + +void PcDependency::gpu_data_analysis(void* data, uint64_t size) { + MemoryAccess* accesses_buffer = (MemoryAccess*)data; + if (size == 0) { + return; + } + + for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + _job_worker_trace_indices[worker_idx].clear(); + _job_worker_pc_statistics[worker_idx].clear(); + _job_worker_pc_flags[worker_idx].clear(); + _job_worker_distinct_sector_count[worker_idx].clear(); + _job_worker_trace_indices[worker_idx].reserve((size / _worker_count) + 1); + } + + // Stable assignment by block id keeps intra-block trace order. 
+ for (uint64_t i = 0; i < size; ++i) { + const uint64_t worker_idx = accesses_buffer[i].ctaId % _worker_count; + _job_worker_trace_indices[worker_idx].push_back(i); + } + + uint64_t pending_jobs = 0; + for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) { + if (!_job_worker_trace_indices[worker_idx].empty()) { + pending_jobs += 1; + } + } + if (pending_jobs == 0) { + return; + } + + { + std::lock_guard guard(_worker_pool_mutex); + _job_accesses_buffer = accesses_buffer; + _worker_pending_jobs = pending_jobs; + ++_worker_job_generation; + } + _worker_pool_cv.notify_all(); + { + std::unique_lock lock(_worker_pool_mutex); + _worker_pool_done_cv.wait(lock, [&]{ + return _worker_pending_jobs == 0; + }); + } + + for (auto& local_flags_map : _job_worker_pc_flags) { + for (auto& [pc, local_flag] : local_flags_map) { + auto& global_flag = this->_pc_flags[pc]; + global_flag.first |= local_flag.first; + if (global_flag.second == 0) { + global_flag.second = local_flag.second; + } else if (global_flag.second != local_flag.second) { + global_flag.second = std::max(global_flag.second, local_flag.second); + } + } + } + + for (auto& local_distinct_map : _job_worker_distinct_sector_count) { + for (auto& [pc, local_hist] : local_distinct_map) { + auto& global_hist = this->_distinct_sector_count[pc]; + for (size_t idx = 0; idx < global_hist.size(); ++idx) { + global_hist[idx] += local_hist[idx]; + } + } + } + + for (auto& local_map : _job_worker_pc_statistics) { + for (auto& [cur_pc, local_inner] : local_map) { + auto& global_inner = this->_pc_statistics[cur_pc]; + for (auto& [anc_pc, local_stats] : local_inner) { + auto& global_stats = global_inner[anc_pc]; + for (int d = 0; d < 4; ++d) { + global_stats.dist[d] += local_stats.dist[d]; + } + } + } + } + +} + + +void PcDependency::evt_callback(EventPtr_t evt) { + switch (evt->evt_type) { + case EventType_KERNEL_LAUNCH: + kernel_start_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_KERNEL_END: + 
kernel_end_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_ALLOC: + mem_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_MEM_FREE: + mem_free_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_ALLOC: + ten_alloc_callback(std::dynamic_pointer_cast(evt)); + break; + case EventType_TEN_FREE: + ten_free_callback(std::dynamic_pointer_cast(evt)); + break; + default: + break; + } +} + + +void PcDependency::flush() { +}