From 26eecb254f0aca61f98c550eacf2a1cedc809393 Mon Sep 17 00:00:00 2001
From: Yanbo <mu.ra.flag@outlook.com>
Date: Thu, 18 Dec 2025 04:41:18 -0500
Subject: [PATCH 1/6] cuThermo heatmap and block divergence feature

---
 include/sanalyzer.h                       |   2 +
 include/tools/block_divergence_analysis.h |  73 ++++++++
 include/tools/heatmap_analysis.h          |  78 +++++++++
 include/tools/tool_type.h                 |   4 +-
 src/sanalyzer.cpp                         |  14 ++
 src/tools/block_divergence_analysis.cpp   | 190 +++++++++++++++++++++
 src/tools/heatmap_analysis.cpp            | 198 ++++++++++++++++++++++
 7 files changed, 558 insertions(+), 1 deletion(-)
 create mode 100644 include/tools/block_divergence_analysis.h
 create mode 100644 include/tools/heatmap_analysis.h
 create mode 100644 src/tools/block_divergence_analysis.cpp
 create mode 100644 src/tools/heatmap_analysis.cpp
diff --git a/include/sanalyzer.h b/include/sanalyzer.h
index e42ae53..8525d1f 100644
--- a/include/sanalyzer.h
+++ b/include/sanalyzer.h
@@ -25,6 +25,8 @@ typedef enum {
     GPU_PATCH_TIME_HOTNESS_CPU = 8,
     GPU_PATCH_ROOFLINE_FLOPS_NVBIT = 9,
     GPU_PATCH_ROOFLINE_SIZE = 10,
+    GPU_PATCH_HEATMAP_ANALYSIS = 11,
+    GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12,
 } AccelProfPatchName_t;
 
 
diff --git a/include/tools/block_divergence_analysis.h b/include/tools/block_divergence_analysis.h
new file mode 100644
index 0000000..4fb52dd
--- /dev/null
+++ b/include/tools/block_divergence_analysis.h
@@ -0,0 +1,73 @@
+#ifndef YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
+#define YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <map>
+#include <vector>
+#include <set>
+#include <unordered_map>
+namespace yosemite {
+
+class BlockDivergenceAnalysis final : public Tool {
+public:
+    BlockDivergenceAnalysis();
+
+    ~BlockDivergenceAnalysis();
+
+    void gpu_data_analysis(void* data, uint64_t size);
+
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void evt_callback(EventPtr_t evt);
+
+    void flush();
+
+private:
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
+
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+
+
+/*
+********************************* variables *********************************
+*/
+    Timer_t _timer;
+
+    std::string output_directory;
+    uint32_t kernel_id = 0;
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc_t>> tensor_events;
+    std::map<DevPtr, std::shared_ptr<TenAlloc_t>> active_tensors;
+
+    struct BlockStat {
+        std::unordered_map<uint64_t, uint64_t> pc_counts;
+        uint64_t read_count = 0;
+        uint64_t write_count = 0;
+    };
+
+    std::unordered_map<uint64_t, BlockStat> _block_entries;
+    std::set<uint64_t> _unique_pcs;
+};
+
+}   // yosemite
+#endif // YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h
new file mode 100644
index 0000000..0206a56
--- /dev/null
+++ b/include/tools/heatmap_analysis.h
@@ -0,0 +1,78 @@
+#ifndef YOSEMITE_HEATMAP_ANALYSIS_H
+#define YOSEMITE_HEATMAP_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <array>
+
+#define SECTOR_TAG_SHIFT 5
+
+namespace yosemite {
+
+class HeatmapAnalysis final : public Tool {
+public:
+    HeatmapAnalysis();
+
+    ~HeatmapAnalysis();
+
+    void gpu_data_analysis(void* data, uint64_t size);
+
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void evt_callback(EventPtr_t evt);
+
+    void flush();
+    
+private:
+    void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length);
+    
+    void add_sector_pc_information(uint32_t sector_tag, uint64_t pc);
+
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
+
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+
+
+/*
+********************************* variables *********************************
+*/
+
+    Timer_t _timer;
+
+    std::string output_directory;
+    uint32_t kernel_id = 0;
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc>> tensor_events;
+    std::map<DevPtr, std::shared_ptr<TenAlloc>> active_tensors;
+
+    std::vector<MemoryAccess> _traces;
+    std::unordered_map<uint64_t, std::array<uint32_t, 18>> _heatmap_data;
+    std::unordered_map<uint64_t, std::set<uint64_t>> _sector_pc_information;
+
+};
+
+}   // namespace yosemite
+#endif // YOSEMITE_HEATMAP_ANALYSIS_H
diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h
index 4115ebb..fdc70dc 100644
--- a/include/tools/tool_type.h
+++ b/include/tools/tool_type.h
@@ -16,7 +16,9 @@ typedef enum {
     ROOFLINE_FLOPS = 11,
     ROOFLINE_SIZE = 12,
     ROOFLINE_TIME = 13,
-    TOOL_NUMS = 14
+    HEATMAP_ANALYSIS = 14,
+    BLOCK_DIVERGENCE_ANALYSIS = 15,
+    TOOL_NUMS = 16
 } AnalysisTool_t;
 
 #endif // TOOL_TYPE_H
\ No newline at end of file
diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp
index 0f250c5..038c37c 100644
--- a/src/sanalyzer.cpp
+++ b/src/sanalyzer.cpp
@@ -16,6 +16,8 @@
 #include "tools/time_hotness_cpu.h"
 #include "tools/event_trace.h"
 #include "tools/event_trace_mgpu.h"
+#include "tools/heatmap_analysis.h"
+#include "tools/block_divergence_analysis.h"
 
 #include <memory>
 #include <map>
@@ -104,6 +106,12 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) {
     } else if (std::string(tool_name) == "event_trace_mgpu") {
         tool = EVENT_TRACE_MGPU;
         _tools.emplace(EVENT_TRACE_MGPU, std::make_shared<EventTraceMGPU>());
+    } else if (std::string(tool_name) == "heatmap_analysis") {
+        tool = HEATMAP_ANALYSIS;
+        _tools.emplace(HEATMAP_ANALYSIS, std::make_shared<HeatmapAnalysis>());
+    } else if (std::string(tool_name) == "block_divergence_analysis") {
+        tool = BLOCK_DIVERGENCE_ANALYSIS;
+        _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared<BlockDivergenceAnalysis>());
     } else {
         fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n");
         fflush(stderr);
@@ -249,6 +257,12 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) {
         options.patch_name = GPU_NO_PATCH;
     } else if (tool == EVENT_TRACE_MGPU) {
         options.patch_name = GPU_NO_PATCH;
+    } else if (tool == HEATMAP_ANALYSIS) {
+        options.patch_name = GPU_PATCH_HEATMAP_ANALYSIS;
+        options.patch_file = "gpu_patch_heatmap_analysis.fatbin";
+    } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) {
+        options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS;
+        options.patch_file = "gpu_patch_block_divergence_analysis.fatbin";
     }
 
     // enable torch profiler?
diff --git a/src/tools/block_divergence_analysis.cpp b/src/tools/block_divergence_analysis.cpp
new file mode 100644
index 0000000..8dd16d6
--- /dev/null
+++ b/src/tools/block_divergence_analysis.cpp
@@ -0,0 +1,190 @@
+#include "tools/block_divergence_analysis.h"
+#include "utils/helper.h"
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <cassert>
+#include <algorithm>
+#include <iomanip>
+#include <vector>
+
+#ifdef __has_include
+#if __has_include(<sanitizer_patching.h>)
+#include <sanitizer_patching.h>
+#endif
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ
+#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE
+#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2
+#endif
+
+using namespace yosemite;
+
+
+BlockDivergenceAnalysis::BlockDivergenceAnalysis() : Tool(BLOCK_DIVERGENCE_ANALYSIS) {  // register under this tool's own id, not MEM_TRACE
+    const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED");
+    if (torch_prof && std::string(torch_prof) == "1") {
+        fprintf(stdout, "Enabling torch profiler in BlockDivergenceAnalysis.\n");
+        _torch_enabled = true;
+    }
+
+    const char* env_app_name = std::getenv("YOSEMITE_APP_NAME");
+    if (env_app_name != nullptr) {
+        output_directory = "block_distribution_" + std::string(env_app_name)
+                            + "_" + get_current_date_n_time();
+    } else {
+        output_directory = "block_distribution_" + get_current_date_n_time();
+    }
+    check_folder_existance(output_directory);
+}
+
+
+BlockDivergenceAnalysis::~BlockDivergenceAnalysis() {}
+
+
+void BlockDivergenceAnalysis::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
+
+    kernel->kernel_id = kernel_id++;
+    kernel_events.emplace(_timer.get(), kernel);
+    _block_entries.clear();
+    _unique_pcs.clear();
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
+    std::string filename = output_directory + "/kernel_"
+                            + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping traces to %s\n", filename.c_str());
+
+    std::ofstream out(filename);
+    std::vector<uint64_t> pc_list(_unique_pcs.begin(), _unique_pcs.end());  // std::set iterates in ascending order, so this is already sorted
+    if (!out.is_open()) { fprintf(stderr, "Failed to open %s for writing\n", filename.c_str()); return; }
+
+    std::vector<uint64_t> block_ids;
+    block_ids.reserve(_block_entries.size());
+    for (const auto& entry : _block_entries) {
+        block_ids.push_back(entry.first);
+    }
+    std::sort(block_ids.begin(), block_ids.end());
+
+    out << "blockidx,blockidy,blockidz";
+    for (const auto pc : pc_list) {
+        out << ",0x" << std::hex << std::setw(16) << std::setfill('0') << pc << std::dec;
+    }
+    out << ",read_count,write_count" << '\n';  // '\n' instead of std::endl: no flush per row, the stream flushes when it is destroyed
+
+    for (const auto block_id : block_ids) {
+        const auto& stats = _block_entries.at(block_id);
+        out << block_id << ",0,0";
+        for (const auto pc : pc_list) {
+            auto pc_it = stats.pc_counts.find(pc);
+            uint64_t count = (pc_it != stats.pc_counts.end()) ? pc_it->second : 0;
+            out << "," << count;
+        }
+        out << "," << stats.read_count << "," << stats.write_count << '\n';
+    }
+}
+
+
+void BlockDivergenceAnalysis::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
+    auto evt = std::prev(kernel_events.end())->second;
+    evt->end_time = _timer.get();
+
+    kernel_trace_flush(evt);
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
+    alloc_events.emplace(_timer.get(), mem);
+    active_memories.emplace(mem->addr, mem);
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
+    auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());
+    active_memories.erase(it);
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    tensor_events.emplace(_timer.get(), ten);
+    active_tensors.emplace(ten->addr, ten);
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    active_tensors.erase(it);
+
+    _timer.increment(true);
+}
+
+
+void BlockDivergenceAnalysis::gpu_data_analysis(void* data, uint64_t size) {
+    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    for (uint64_t i = 0; i < size; i++) {
+        const MemoryAccess& trace = accesses_buffer[i];
+        uint64_t executed_inst_count = static_cast<uint64_t>(__builtin_popcount(trace.active_mask));
+        uint64_t pc = trace.pc;
+        uint64_t cta_id = trace.ctaId;
+
+        auto& entry = _block_entries[cta_id];
+        entry.pc_counts[pc] += executed_inst_count;
+        if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) {
+            entry.read_count += executed_inst_count;
+        }
+        if (trace.flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) {
+            entry.write_count += executed_inst_count;
+        }
+
+        _unique_pcs.insert(pc);
+    }
+
+}
+
+
+void BlockDivergenceAnalysis::evt_callback(EventPtr_t evt) {
+    switch (evt->evt_type) {
+        case EventType_KERNEL_LAUNCH:
+            kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));
+            break;
+        case EventType_KERNEL_END:
+            kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+            break;
+        case EventType_MEM_ALLOC:
+            mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+            break;
+        case EventType_MEM_FREE:
+            mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+            break;
+        case EventType_TEN_ALLOC:
+            ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+            break;
+        case EventType_TEN_FREE:
+            ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+            break;
+        default:
+            break;
+    }
+}
+
+
+void BlockDivergenceAnalysis::flush() {
+}
diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp
new file mode 100644
index 0000000..23510fe
--- /dev/null
+++ b/src/tools/heatmap_analysis.cpp
@@ -0,0 +1,198 @@
+#include "tools/heatmap_analysis.h"
+#include "utils/helper.h"
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <cassert>
+#include <iostream>
+#include <bitset>
+#include <sstream>
+#include <algorithm>
+#include <vector>
+
+
+using namespace yosemite;
+
+
+HeatmapAnalysis::HeatmapAnalysis() : Tool(HEATMAP_ANALYSIS) {
+    const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED");
+    if (torch_prof && std::string(torch_prof) == "1") {
+        fprintf(stdout, "Enabling torch profiler in HeatmapAnalysis.\n");
+        _torch_enabled = true;
+    }
+
+    const char* env_app_name = std::getenv("YOSEMITE_APP_NAME");
+    if (env_app_name != nullptr) {
+        output_directory = "heatmap_" + std::string(env_app_name)
+                            + "_" + get_current_date_n_time();
+    } else {
+        output_directory = "heatmap_" + get_current_date_n_time();
+    }
+    check_folder_existance(output_directory);
+}
+
+
+HeatmapAnalysis::~HeatmapAnalysis() {}
+
+
+void HeatmapAnalysis::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
+
+    kernel->kernel_id = kernel_id++;
+    kernel_events.emplace(_timer.get(), kernel);
+    _traces.clear();
+    _heatmap_data.clear();
+    _sector_pc_information.clear();
+
+    _timer.increment(true);
+}
+
+
+void HeatmapAnalysis::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
+    std::string filename = output_directory + "/kernel_"
+                            + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping block 0 heatmap to %s\n", filename.c_str());
+
+    std::ofstream out(filename);
+    std::stringstream ss;
+
+    std::vector<std::pair<uint64_t, std::array<uint32_t, 18>>> sorted_heatmap_data(_heatmap_data.begin(), _heatmap_data.end());
+    std::sort(sorted_heatmap_data.begin(), sorted_heatmap_data.end(), [](const std::pair<uint64_t, std::array<uint32_t, 18>>& a, const std::pair<uint64_t, std::array<uint32_t, 18>>& b) {
+        return a.first < b.first;
+    });
+    ss << "Sector Tag,\t\tDistinct Warp Count,\tAccess Count,\t\t\tTouched PC" << std::endl;
+    for (auto& [tag, data] : sorted_heatmap_data) {
+        ss << "0x"<<std::hex << tag << std::dec << ",\t\t";
+        for (int i = 0; i < 9; i++) {
+            ss << std::bitset<32>(data[i]).count() << ",";
+        }
+        ss << "\t\t";
+        for (int i = 9; i < 18; i++) {
+            ss << data[i] << ",";
+        }
+        for (auto pc : _sector_pc_information[tag]) {
+            ss << "\t\t0x" << std::hex << pc << std::dec << ",";
+        }
+        ss << std::endl;
+    }
+
+    out << ss.str();
+
+    out.close();
+}
+
+
+void HeatmapAnalysis::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
+    auto evt = std::prev(kernel_events.end())->second;
+    evt->end_time = _timer.get();
+
+    kernel_trace_flush(evt);
+
+    _timer.increment(true);
+}
+
+
+void HeatmapAnalysis::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
+    alloc_events.emplace(_timer.get(), mem);
+    active_memories.emplace(mem->addr, mem);
+
+    _timer.increment(true);
+}
+
+
+void HeatmapAnalysis::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
+    auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());
+    active_memories.erase(it);
+
+    _timer.increment(true);
+}
+
+
+void HeatmapAnalysis::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    tensor_events.emplace(_timer.get(), ten);
+    active_tensors.emplace(ten->addr, ten);
+
+    _timer.increment(true);
+}
+
+
+void HeatmapAnalysis::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    active_tensors.erase(it);
+
+    _timer.increment(true);
+}
+
+// Record one lane's access in the heatmap entry of the sector it touches.
+// warp_id: warp id of the access; used as the bit index into a 32-bit mask
+// sector_tag: address >> SECTOR_TAG_SHIFT, i.e. the 32-byte sector accessed
+// offset: index of the first 4-byte word touched inside the sector
+// length: access size in bytes; one word slot is updated per 4 bytes
+// NOTE: offset + ceil(length / 4) must stay <= 8, otherwise the writes land
+//       in the sector-wide slots ([8], [17]) or past the end of the array.
+// return: void
+void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length) {
+    
+    // sector_data[0-7]: per-word bitmask of the warp ids that touched that word;
+    // sector_data[8]: bitmask of the warp ids that touched any word of the sector;
+    // sector_data[9-16]: per-word access counts;
+    // sector_data[17]: number of unit_access() calls recorded for this sector.
+    auto& sector_data = _heatmap_data[sector_tag];
+    auto mask = (1u << warp_id);
+    for (int i = 0; i < length; i+=4) {
+        sector_data[offset+i/4] |= mask;
+        sector_data[8] |= mask;
+        sector_data[9+offset+i/4] += 1;
+    }
+    sector_data[17] += 1;
+}
+
+void HeatmapAnalysis::add_sector_pc_information(uint32_t sector_tag, uint64_t pc) {
+    _sector_pc_information[sector_tag].insert(pc);
+}
+
+
+void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) {
+    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    for (uint64_t i = 0; i < size; i++) {
+        const MemoryAccess& trace = accesses_buffer[i];  // bind by reference: avoids copying the whole record per iteration
+        for (int j = 0; j < GPU_WARP_SIZE; j++) {
+            if (trace.active_mask & (1u << j)) {
+                auto sector_tag = trace.addresses[j] >> SECTOR_TAG_SHIFT;
+                auto offset = (trace.addresses[j] & 31) >> 2;
+                unit_access(trace.warpId, sector_tag, offset, trace.accessSize);
+                add_sector_pc_information(sector_tag, trace.pc);
+            }
+        }
+    } 
+}
+void HeatmapAnalysis::evt_callback(EventPtr_t evt) {
+    switch (evt->evt_type) {
+        case EventType_KERNEL_LAUNCH:
+            kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));
+            break;
+        case EventType_KERNEL_END:
+            kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+            break;
+        case EventType_MEM_ALLOC:
+            mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+            break;
+        case EventType_MEM_FREE:
+            mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+            break;
+        case EventType_TEN_ALLOC:
+            ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+            break;
+        case EventType_TEN_FREE:
+            ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+            break;
+        default:
+            break;
+    }
+}
+
+
+void HeatmapAnalysis::flush() {
+}

From 885927936cd35666d78aaee15c316eb277e39e80 Mon Sep 17 00:00:00 2001
From: Yanbo <mu.ra.flag@outlook.com>
Date: Sun, 21 Dec 2025 05:05:33 -0500
Subject: [PATCH 2/6] local memory request support

---
 include/tools/heatmap_analysis.h | 2 +-
 src/tools/heatmap_analysis.cpp   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h
index 0206a56..2a50475 100644
--- a/include/tools/heatmap_analysis.h
+++ b/include/tools/heatmap_analysis.h
@@ -35,7 +35,7 @@ class HeatmapAnalysis final : public Tool {
 private:
     void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length);
     
-    void add_sector_pc_information(uint32_t sector_tag, uint64_t pc);
+    void add_sector_pc_information(uint64_t sector_tag, uint64_t pc);
 
     void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
 
diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp
index 23510fe..6be87ce 100644
--- a/src/tools/heatmap_analysis.cpp
+++ b/src/tools/heatmap_analysis.cpp
@@ -149,7 +149,7 @@ void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_
     sector_data[17] += 1;
 }
 
-void HeatmapAnalysis::add_sector_pc_information(uint32_t sector_tag, uint64_t pc) {
+void HeatmapAnalysis::add_sector_pc_information(uint64_t sector_tag, uint64_t pc) {
     _sector_pc_information[sector_tag].insert(pc);
 }
 
@@ -168,6 +168,7 @@ void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) {
         }
     } 
 }
+
 void HeatmapAnalysis::evt_callback(EventPtr_t evt) {
     switch (evt->evt_type) {
         case EventType_KERNEL_LAUNCH:

From 5431757d81a24caec06cb90a6a5ae1d727dc3b9f Mon Sep 17 00:00:00 2001
From: Yanbo <mu.ra.flag@outlook.com>
Date: Tue, 6 Jan 2026 12:56:07 -0500
Subject: [PATCH 3/6] cuVein initialize

---
 include/sanalyzer.h                    |   1 +
 include/tools/pc_dependency_analysis.h | 186 +++++++++++
 include/tools/tool_type.h              |   3 +-
 include/utils/event.h                  |   1 +
 src/sanalyzer.cpp                      |   9 +
 src/tools/pc_dependency_analysis.cpp   | 408 +++++++++++++++++++++++++
 6 files changed, 607 insertions(+), 1 deletion(-)
 create mode 100644 include/tools/pc_dependency_analysis.h
 create mode 100644 src/tools/pc_dependency_analysis.cpp

diff --git a/include/sanalyzer.h b/include/sanalyzer.h
index 8525d1f..0d9e188 100644
--- a/include/sanalyzer.h
+++ b/include/sanalyzer.h
@@ -27,6 +27,7 @@ typedef enum {
     GPU_PATCH_ROOFLINE_SIZE = 10,
     GPU_PATCH_HEATMAP_ANALYSIS = 11,
     GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12,
+    GPU_PATCH_PC_DEPENDENCY_ANALYSIS = 13,
 } AccelProfPatchName_t;
 
 
diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h
new file mode 100644
index 0000000..1d2206c
--- /dev/null
+++ b/include/tools/pc_dependency_analysis.h
@@ -0,0 +1,186 @@
+#ifndef YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
+#define YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <map>
+#include <vector>
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <array>
+#include <cstdint>
+#include <string>
+#include <memory>
+#include <cassert>
+
+namespace yosemite {
+
+/* we choose to use PC offset instead of PC because the PC is too long for shadow memory and it is not necessary to track the original PC.
+The offset will be calculated during trace collection.
+
+Every memory allocation will cause a shadow memory to be created.
+Every memory deallocation will cause a shadow memory to be destroyed.
+Shadow memory bitmask will be reset when a kernel finishes (to avoid a mass shadow memory reset).
+
+The gpu data analysis will:
+1. iterate the trace buffer and query the shadow memory to get the corresponding shadow memory entry.
+2. compare the last access information with the current access information using the rules below:
+    0. if the bitmask of this access is 0, the current access is a cold miss: set its ancient pc to 0xFFFFFFFF.
+    1. if last access and current access are from the same thread, then it is an intra thread access.
+    2. if last access and current access are from the same warp, then it is an intra warp access.
+    3. if last access and current access are from the same block, then it is an intra block access.
+    4. if last access and current access are from the same grid, then it is an intra grid access.
+3. update the pc_statistics with the current pc, ancient pc and the distance.
+4. update the shadow memory entry with the current pc and the flat thread id.
+*/
+
+
+class memory_region{
+public:
+    memory_region() : start(0), end(0) {};
+    memory_region(uint64_t start, uint64_t end) : start(start), end(end) {};
+    ~memory_region() {};
+
+    bool contains(uint64_t ptr) const {
+        return ptr >= start && ptr < end;
+    };
+
+    bool operator==(const memory_region& other) const {
+        return start == other.start && end == other.end;
+    };
+
+    bool operator<(const memory_region& other) const {
+        // strict-weak-ordering: compare both start and end
+        if (start != other.start) return start < other.start;
+        return end < other.end;
+    };
+
+    uint64_t get_start() const {
+        return start;
+    };
+    uint64_t get_end() const {
+        return end;
+    };
+
+private:
+    uint64_t start;
+    uint64_t end;
+};
+
+class shadow_memory_entry{
+public:
+    shadow_memory_entry() {};
+    ~shadow_memory_entry() {};
+
+    uint32_t last_pc = 0xFFFFFFFFu; // using offset of pc instead of original pc to save space and keep alignment;
+    uint32_t last_flat_thread_id = 0xFFFFFFFFu; // 0-5 bits for lane id, 6-10 bits for warp id, 11-31 bits for block id to save space;
+};
+
+class shadow_memory{
+public:
+    shadow_memory(uint64_t size) 
+    : _shadow_memory_entries(std::make_unique<shadow_memory_entry[]>(size)), 
+      _size(size), 
+      _shadow_memory_bitmap(std::vector<uint8_t>((size + 7) / 8, 0)) {
+        printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size);
+        printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry));
+        printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry));
+        printf("[PC_DEPENDENCY] Shadow memory bitmap size: %lu\n", _shadow_memory_bitmap.size());
+      };
+    ~shadow_memory() = default;
+    void reset_bitmap() {
+        std::fill(_shadow_memory_bitmap.begin(), _shadow_memory_bitmap.end(), 0);
+    };
+    shadow_memory_entry& get_entry(uint64_t offset) {
+        assert(offset < _size);
+        return _shadow_memory_entries[offset];
+    }
+    bool is_valid(uint64_t ptr) {
+        return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); 
+    }
+    void set_valid(uint64_t ptr) {
+        _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8));
+    }
+    uint64_t _size;
+    std::unique_ptr<shadow_memory_entry[]> _shadow_memory_entries;
+    std::vector<uint8_t> _shadow_memory_bitmap;
+};
+
+
+class PC_statisitics{
+public:
+    std::array<uint64_t, 4> dist = {0, 0, 0, 0}; 
+    // 0: intra thread
+    // 1: intra warp
+    // 2: intra block
+    // 3: intra grid
+};
+
+class PcDependency final : public Tool {
+public:
+    PcDependency();
+
+    ~PcDependency();
+
+    void gpu_data_analysis(void* data, uint64_t size);
+
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void allocation_callback(uint64_t ptr, uint64_t size);
+
+    void deallocation_callback(uint64_t ptr);
+
+    void evt_callback(EventPtr_t evt);
+
+    void flush();
+
+private:
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
+
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+
+    void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size);
+
+
+/*
+********************************* variables *********************************
+*/
+    Timer_t _timer;
+
+    std::string output_directory;
+    uint32_t kernel_id = 0;
+
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc>> tensor_events;
+    std::map<DevPtr, std::shared_ptr<TenAlloc>> active_tensors;
+
+
+    std::vector<memory_region> _memory_regions;
+
+    std::map<memory_region, std::unique_ptr<shadow_memory>> _shadow_memories; // memory region, shadow memory
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics
+    std::unordered_map<uint32_t, uint32_t> _pc_flags; // pc offset, flags
+};
+
+}   // yosemite
+#endif // YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h
index fdc70dc..b4d39dd 100644
--- a/include/tools/tool_type.h
+++ b/include/tools/tool_type.h
@@ -18,7 +18,8 @@ typedef enum {
     ROOFLINE_TIME = 13,
     HEATMAP_ANALYSIS = 14,
     BLOCK_DIVERGENCE_ANALYSIS = 15,
-    TOOL_NUMS = 16
+    PC_DEPENDENCY_ANALYSIS = 16,
+    TOOL_NUMS = 17
 } AnalysisTool_t;
 
 #endif // TOOL_TYPE_H
\ No newline at end of file
diff --git a/include/utils/event.h b/include/utils/event.h
index 903fc43..c29dd4e 100644
--- a/include/utils/event.h
+++ b/include/utils/event.h
@@ -69,6 +69,7 @@ typedef struct KernelLaunch : public Event {
     uint32_t touched_objects;
     uint32_t touched_objects_size;
     uint64_t key;   // for UVM Advisor
+    uint64_t kernel_pc;
 
     KernelLaunch() {
         this->evt_type = EventType_KERNEL_LAUNCH;
diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp
index 038c37c..faa6056 100644
--- a/src/sanalyzer.cpp
+++ b/src/sanalyzer.cpp
@@ -18,6 +18,7 @@
 #include "tools/event_trace_mgpu.h"
 #include "tools/heatmap_analysis.h"
 #include "tools/block_divergence_analysis.h"
+#include "tools/pc_dependency_analysis.h"
 
 #include <memory>
 #include <map>
@@ -112,6 +113,9 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) {
     } else if (std::string(tool_name) == "block_divergence_analysis") {
         tool = BLOCK_DIVERGENCE_ANALYSIS;
         _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared<BlockDivergenceAnalysis>());
+    } else if (std::string(tool_name) == "pc_dependency_analysis") {
+        tool = PC_DEPENDENCY_ANALYSIS;
+        _tools.emplace(PC_DEPENDENCY_ANALYSIS, std::make_shared<PcDependency>());
     } else {
         fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n");
         fflush(stderr);
@@ -263,6 +267,11 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) {
     } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) {
         options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS;
         options.patch_file = "gpu_patch_block_divergence_analysis.fatbin";
+    } else if (tool == PC_DEPENDENCY_ANALYSIS) {
+        options.patch_name = GPU_PATCH_PC_DEPENDENCY_ANALYSIS;
+        // nv-compute/Makefile generates fatbins based on gpu_src/*.cu filenames.
+        // The source file for this tool is nv-compute/gpu_src/gpu_patch_pc_dependency.cu
+        options.patch_file = "gpu_patch_pc_dependency.fatbin";
     }
 
     // enable torch profiler?
diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp
new file mode 100644
index 0000000..e4f66a3
--- /dev/null
+++ b/src/tools/pc_dependency_analysis.cpp
@@ -0,0 +1,408 @@
+#include "tools/pc_dependency_analysis.h"
+#include "utils/helper.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <memory>
+#include <cassert>
+#include <iostream>
+#include <sstream>
+#include <set>
+#include <iomanip>
+
+
+using namespace yosemite;
+
+namespace {
+static std::string json_escape(const std::string& s) {  // Returns s escaped for use inside a JSON string literal; the surrounding quotes are not added.
+    std::string out;
+    out.reserve(s.size() + 8);  // a little headroom so a few escapes do not force a reallocation
+    for (char c : s) {
+        switch (c) {
+            case '\"': out += "\\\""; break;
+            case '\\': out += "\\\\"; break;
+            case '\b': out += "\\b"; break;
+            case '\f': out += "\\f"; break;
+            case '\n': out += "\\n"; break;
+            case '\r': out += "\\r"; break;
+            case '\t': out += "\\t"; break;
+            default:
+                // Remaining control characters (< 0x20) are written as \u00XX; every other byte is copied through unchanged.
+                if (static_cast<unsigned char>(c) < 0x20) {
+                    std::ostringstream oss;
+                    oss << "\\u"
+                        << std::hex << std::setw(4) << std::setfill('0')
+                        << (int)static_cast<unsigned char>(c);  // widen to int so the value is printed as a number, not as a character
+                    out += oss.str();
+                } else {
+                    out += c;
+                }
+        }
+    }
+    return out;
+}
+
+static std::string hex_u32(uint32_t v) {  // Formats v as "0x" followed by its hexadecimal digits, without zero padding.
+    std::ostringstream oss;
+    oss << "0x" << std::hex << v;
+    return oss.str();
+}
+} // namespace
+
+
+PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) {  // Reads environment configuration and chooses the output directory name for this run.
+    const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED");
+    if (torch_prof && std::string(torch_prof) == "1") {  // enabled only by the exact value "1"
+        fprintf(stdout, "Enabling torch profiler in PcDependency.\n");
+        _torch_enabled = true;
+    }
+
+    const char* env_app_name = std::getenv("YOSEMITE_APP_NAME");
+    if (env_app_name != nullptr) {  // with an app name: dependency_<app>_<date-time>
+        output_directory = "dependency_" + std::string(env_app_name)
+                            + "_" + get_current_date_n_time();
+    } else {  // without an app name: dependency_<date-time>
+        output_directory = "dependency_" + get_current_date_n_time();
+    }
+    check_folder_existance(output_directory);  // helper is defined elsewhere; presumably creates the directory when it is missing -- TODO confirm
+}
+
+
+PcDependency::~PcDependency() {}  // No explicit cleanup is performed here.
+
+
+void PcDependency::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {  // Starts a kernel: assigns its id, records the launch event and resets per-kernel state.
+
+    kernel->kernel_id = kernel_id++;
+    kernel_events.emplace(_timer.get(), kernel);  // keyed by the current _timer value; kernel_end_callback later picks the last entry
+    _pc_statistics.clear();  // statistics and flags are dumped per kernel by kernel_trace_flush, so each kernel starts empty
+    _pc_flags.clear();
+    for (auto& shadow_memory_iter : _shadow_memories) {
+        shadow_memory_iter.second->reset_bitmap();  // reset_bitmap() lives in the header; presumably marks every byte as not yet accessed -- TODO confirm
+    }
+    printf("[PC_DEPENDENCY] Resetting shadow memory bitmap\n");  // printed after the reset loop has already run
+    _timer.increment(true);
+}
+
+
+void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {  // Writes this kernel's PC dependency statistics to kernel_<id>.csv and kernel_<id>.json under output_directory.
+    std::string filename = output_directory + "/kernel_"
+                            + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping pc dependency to %s\n", filename.c_str());
+
+    std::ofstream out(filename);  // NOTE(review): the open result is not checked; on failure the rows below are silently dropped
+    out << "current_pc_offset,ancient_pc_offset,flags,intra_thread,intra_warp,intra_block,intra_grid\n";
+
+    std::vector<std::pair<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> outer(
+        _pc_statistics.begin(), _pc_statistics.end());  // copy into a vector so rows can be emitted in ascending current-PC order
+    std::sort(outer.begin(), outer.end(),
+              [](auto& a, auto& b){ return a.first < b.first; });
+
+    for (auto& [cur_pc, inner_map] : outer) {
+        std::vector<std::pair<uint32_t, PC_statisitics>> inner(inner_map.begin(), inner_map.end());
+        std::sort(inner.begin(), inner.end(),
+                  [](auto& a, auto& b){ return a.first < b.first; });
+
+        uint32_t flags = 0;
+        auto fit = _pc_flags.find(cur_pc);
+        if (fit != _pc_flags.end()) flags = fit->second;
+
+        for (auto& [anc_pc, st] : inner) {
+            out << "0x" << std::hex << cur_pc
+                << ",0x" << anc_pc
+                << ",0x" << flags
+                << std::dec  // counters are written in decimal
+                << "," << st.dist[0]  // dist[0..3] follow the header order: intra_thread, intra_warp, intra_block, intra_grid
+                << "," << st.dist[1]
+                << "," << st.dist[2]
+                << "," << st.dist[3]
+                << "\n";
+        }
+    }
+
+    // JSON output for building PC dependency graph (joinable with CFG)
+    std::string json_filename = output_directory + "/kernel_"
+                                + std::to_string(kernel->kernel_id) + ".json";
+    std::ofstream jout(json_filename);  // NOTE(review): open result is not checked here either
+    jout << "{\n";
+    jout << "  \"tool\": \"pc_dependency_analysis\",\n";
+    jout << "  \"kernel\": {\n";
+    jout << "    \"kernel_id\": " << kernel->kernel_id << ",\n";
+    jout << "    \"kernel_name\": \"" << json_escape(kernel->kernel_name) << "\",\n";
+    jout << "    \"device_id\": " << kernel->device_id << ",\n";
+    jout << "    \"kernel_pc\": " << kernel->kernel_pc << ",\n";
+    jout << "    \"kernel_pc_hex\": \"0x" << std::hex << kernel->kernel_pc << std::dec << "\"\n";  // kernel_pc is 64-bit: print all of it (was truncated to 32 bits), then restore decimal
+    jout << "  },\n";
+    jout << "  \"shadow_memory_granularity_bytes\": 1,\n";  // hard-coded metadata describing the shadow memory
+    jout << "  \"sample_stride_bytes\": 4,\n";  // hard-coded; matches the i += 4 step in unit_access
+
+    // Collect nodes (all current PCs + all non-cold ancient PCs)
+    std::set<uint32_t> nodes;
+    for (const auto& [cur_pc, inner_map] : _pc_statistics) {
+        nodes.insert(cur_pc);
+        for (const auto& [anc_pc, st] : inner_map) {
+            (void)st;  // only the key is needed here
+            if (anc_pc != 0xFFFFFFFFu) nodes.insert(anc_pc);  // 0xFFFFFFFF is the cold-miss sentinel, not a real PC
+        }
+    }
+
+    jout << "  \"nodes\": [\n";
+    {
+        bool first = true;  // tracks whether a separating comma is needed before the next element
+        for (uint32_t pc : nodes) {
+            if (!first) jout << ",\n";
+            first = false;
+            auto fit = _pc_flags.find(pc);
+            bool has_flags = (fit != _pc_flags.end());
+            uint32_t flags = has_flags ? fit->second : 0;
+            jout << "    {\"pc\": " << pc
+                 << ", \"pc_hex\": \"" << hex_u32(pc) << "\"";
+            if (has_flags) {
+                jout << ", \"flags\": " << flags
+                     << ", \"flags_hex\": \"" << hex_u32(flags) << "\"";
+            } else {
+                jout << ", \"flags\": null, \"flags_hex\": null";
+            }
+            jout << "}";
+        }
+        jout << "\n";
+    }
+    jout << "  ],\n";
+
+    // Edges: ancient_pc -> current_pc, with per-scope counts.
+    jout << "  \"edges\": [\n";
+    {
+        // Stable order: sort by current pc then ancient pc
+        std::vector<std::pair<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> outer2(
+            _pc_statistics.begin(), _pc_statistics.end());
+        std::sort(outer2.begin(), outer2.end(),
+                  [](auto& a, auto& b){ return a.first < b.first; });
+
+        bool first_edge = true;  // shared across all current PCs so commas separate every edge
+        for (auto& [cur_pc, inner_map] : outer2) {
+            std::vector<std::pair<uint32_t, PC_statisitics>> inner2(inner_map.begin(), inner_map.end());
+            std::sort(inner2.begin(), inner2.end(),
+                      [](auto& a, auto& b){ return a.first < b.first; });
+
+            // current flags if available
+            auto cfit = _pc_flags.find(cur_pc);
+            bool has_cflags = (cfit != _pc_flags.end());
+            uint32_t cflags = has_cflags ? cfit->second : 0;
+
+            for (auto& [anc_pc, st] : inner2) {
+                if (!first_edge) jout << ",\n";
+                first_edge = false;
+
+                bool cold_miss = (anc_pc == 0xFFFFFFFFu);  // sentinel written by unit_access for a first access
+
+                jout << "    {\"current_pc\": " << cur_pc
+                     << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\""
+                     << ", \"ancient_pc\": ";
+                if (cold_miss) {
+                    jout << "null";  // a cold miss has no predecessor PC
+                } else {
+                    jout << anc_pc;
+                }
+                jout << ", \"ancient_pc_hex\": ";
+                if (cold_miss) {
+                    jout << "null";
+                } else {
+                    jout << "\"" << hex_u32(anc_pc) << "\"";
+                }
+                jout << ", \"cold_miss\": " << (cold_miss ? "true" : "false");
+
+                if (has_cflags) {
+                    jout << ", \"current_flags\": " << cflags
+                         << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\"";
+                } else {
+                    jout << ", \"current_flags\": null, \"current_flags_hex\": null";
+                }
+
+                jout << ", \"dist\": {"
+                     << "\"intra_thread\": " << st.dist[0]
+                     << ", \"intra_warp\": " << st.dist[1]
+                     << ", \"intra_block\": " << st.dist[2]
+                     << ", \"intra_grid\": " << st.dist[3]
+                     << "}}";
+            }
+        }
+        jout << "\n";
+    }
+    jout << "  ]\n";
+    jout << "}\n";
+    printf("Dumping pc dependency graph json to %s\n", json_filename.c_str());
+}
+
+
+void PcDependency::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
+    auto evt = std::prev(kernel_events.end())->second;
+    evt->end_time = _timer.get();
+
+    kernel_trace_flush(evt);
+
+    _timer.increment(true);
+}
+
+
+void PcDependency::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {  // Records a device allocation and creates a shadow memory object for its address range.
+    // NOTE(review): the original TODO asked for shadow memory allocation here; the emplace into _shadow_memories below appears to cover it.
+    alloc_events.emplace(_timer.get(), mem);
+    active_memories.emplace(mem->addr, mem);  // looked up again by address in mem_free_callback
+    memory_region memory_region_current = memory_region((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size));  // region runs from addr to addr + size
+    _memory_regions.push_back(memory_region_current);
+    _shadow_memories.emplace(memory_region_current, std::make_unique<shadow_memory>(mem->size));  // shadow memory is sized from the allocation size
+
+    printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size);
+    _timer.increment(true);
+}
+
+void PcDependency::mem_free_callback(std::shared_ptr<MemFree_t> mem) {  // Drops the bookkeeping and shadow memory of a freed device allocation.
+    auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());  // debug builds still stop on an untracked pointer
+    if (it == active_memories.end()) { fprintf(stderr, "[PC_DEPENDENCY] Ignoring free of untracked pointer\n"); _timer.increment(true); return; }  // NDEBUG builds: avoid dereferencing end()
+    uint64_t sz = it->second->size;   // take the size from the matching alloc event
+    active_memories.erase(it);
+
+    memory_region r((uint64_t)mem->addr, (uint64_t)mem->addr + sz);  // rebuild the same region key that mem_alloc_callback used
+
+    auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end()) _memory_regions.erase(vit);
+
+    _shadow_memories.erase(r);  // releases the shadow memory owned by the map entry
+    printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz);
+    _timer.increment(true);
+}
+
+
+void PcDependency::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    tensor_events.emplace(_timer.get(), ten);
+    active_tensors.emplace(ten->addr, ten);
+
+    _timer.increment(true);
+}
+
+
+void PcDependency::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    active_tensors.erase(it);
+
+    _timer.increment(true);
+}
+
+void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size) {
+    // auto& shadow_memory = this->_shadow_memories[memory_region_target];
+    auto shadow_memory_it = this->_shadow_memories.find(memory_region_target);
+    if (shadow_memory_it == this->_shadow_memories.end()) {
+        printf("shadow memory not found for memory region: %lu - %lu\n", memory_region_target.get_start(), memory_region_target.get_end());
+        return;  // no shadow memory for this region: the access is not counted
+    }
+    auto& shadow_memory = *(shadow_memory_it->second);
+    // For one lane's access: compare each sampled byte with the previous access recorded for it and count the dependency by scope.
+    for (int i = 0; i < access_size; i += 4) {  // samples the access every 4 bytes; ptr is an offset from the region start (see the caller)
+        auto addr = ptr + i;
+        // Byte-granularity shadow memory: addr is byte offset within allocation.
+        // Bound check to avoid OOB on allocations at end boundary or odd sizes.
+        if (addr >= shadow_memory._size) {
+            break;
+        }
+        if (shadow_memory.is_valid(addr) == false) {
+            // cold miss: first access to this byte since the bitmap was reset; counted under the 0xFFFFFFFF sentinel PC
+            _pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1;
+            shadow_memory.set_valid(addr);
+            auto& shadow_memory_entry = shadow_memory.get_entry(addr);
+            shadow_memory_entry.last_pc = pc_offset;
+            shadow_memory_entry.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id;  // packed as block << 10 | warp << 5 | lane; assumes warp and lane ids fit in 5 bits each -- TODO confirm
+            continue;
+        }
+        auto& last_access = shadow_memory.get_entry(addr);
+        uint64_t last_block_id = last_access.last_flat_thread_id >> 10;  // unpack the fields stored by the previous access
+        uint64_t last_warp_id = (last_access.last_flat_thread_id >> 5) & 0x1F;
+        uint64_t last_lane_id = last_access.last_flat_thread_id & 0x1F;
+
+        uint32_t last_pc = last_access.last_pc;
+        if (last_block_id != current_block_id) {
+            this->_pc_statistics[pc_offset][last_pc].dist[3] += 1;  // previous access came from another block (reported as intra_grid)
+        }else if (last_warp_id != current_warp_id) {
+            this->_pc_statistics[pc_offset][last_pc].dist[2] += 1;  // same block, different warp (intra_block)
+        }else if (last_lane_id != current_lane_id) {
+            this->_pc_statistics[pc_offset][last_pc].dist[1] += 1;  // same warp, different lane (intra_warp)
+        }else {
+            this->_pc_statistics[pc_offset][last_pc].dist[0] += 1;  // same thread (intra_thread)
+        }
+        last_access.last_pc = pc_offset;  // this access becomes the "last access" for the byte
+        last_access.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id;
+    }
+}
+
+
+void PcDependency::gpu_data_analysis(void* data, uint64_t size) {
+    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    for (uint64_t i = 0; i < size; i++) {
+        MemoryAccess trace = accesses_buffer[i];
+        uint32_t pc_offset = trace.pc;
+        this->_pc_flags[pc_offset] = trace.flags;
+        if (trace.type != MemoryType::Global) {
+            //only analyze global memory accesses currently
+            continue;
+        }
+        uint32_t access_size = trace.accessSize;
+        memory_region memory_region_target;
+        uint64_t first_valid_address = 0;
+
+        for (int j = 0; j < GPU_WARP_SIZE; j++) {
+            if (trace.active_mask & (1u << j)) {
+                first_valid_address = trace.addresses[j];
+                break;
+            }
+        }
+        
+        
+        assert(first_valid_address != 0);
+        for (auto memory_region_iter : this->_memory_regions) {
+            if (memory_region_iter.contains(first_valid_address)) {
+                memory_region_target = memory_region_iter;
+                break;
+            }
+        }
+        uint64_t memory_region_start = memory_region_target.get_start();
+        assert(memory_region_start != 0);
+        for ( int j = 0; j < GPU_WARP_SIZE; j++) {
+            if (trace.active_mask & (1u << j)) {
+                unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size);
+            }
+        }
+    }
+
+}
+
+
+void PcDependency::evt_callback(EventPtr_t evt) {  // Routes an event to the matching handler based on evt_type.
+    switch (evt->evt_type) {
+        case EventType_KERNEL_LAUNCH:
+            kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));  // NOTE(review): a failed cast yields nullptr and the handlers do not check for it
+            break;
+        case EventType_KERNEL_END:
+            kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+            break;
+        case EventType_MEM_ALLOC:
+            mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+            break;
+        case EventType_MEM_FREE:
+            mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+            break;
+        case EventType_TEN_ALLOC:
+            ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+            break;
+        case EventType_TEN_FREE:
+            ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+            break;
+        default:  // every other event type is ignored
+            break;
+    }
+}
+
+
+void PcDependency::flush() {  // No-op: results are written per kernel by kernel_trace_flush, called from kernel_end_callback.
+}

From 6020b6585df30f4671f1fbde99b29cb58d0cdb2b Mon Sep 17 00:00:00 2001
From: Yanbo Zhao <yzhao62@eb2-3224-lin10.csc.ncsu.edu>
Date: Wed, 11 Feb 2026 14:09:06 -0500
Subject: [PATCH 4/6] cuVein update

---
 include/tools/pc_dependency_analysis.h |  58 +++++++-
 src/tools/pc_dependency_analysis.cpp   | 196 ++++++++++++++++++++-----
 2 files changed, 214 insertions(+), 40 deletions(-)

diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h
index 1d2206c..2d42264 100644
--- a/include/tools/pc_dependency_analysis.h
+++ b/include/tools/pc_dependency_analysis.h
@@ -17,6 +17,39 @@
 #include <memory>
 #include <cassert>
 
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ
+#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE
+#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_RED
+#define SANITIZER_MEMORY_DEVICE_FLAG_RED 0x3
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC
+#define SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC 0x4
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH
+#define SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH 0x8
+#endif
+
+#ifndef SANITIZER_MEMORY_GLOBAL
+#define SANITIZER_MEMORY_GLOBAL 0x10
+#endif
+
+#ifndef SANITIZER_MEMORY_SHARED
+#define SANITIZER_MEMORY_SHARED 0x20
+#endif
+
+#ifndef SANITIZER_MEMORY_LOCAL
+#define SANITIZER_MEMORY_LOCAL 0x40
+#endif
+
 namespace yosemite {
 
 /* we choose to use PC offset instead of PC because the PC is too long for shadow memory and it is not necessary to track the original PC.
@@ -83,8 +116,10 @@ class shadow_memory_entry{
 class shadow_memory{
 public:
     shadow_memory(uint64_t size) 
-    : _shadow_memory_entries(std::make_unique<shadow_memory_entry[]>(size)), 
-      _size(size), 
+    :_size(size),
+    _size_celled((size + 3) / 4 * 4),
+    _stride(_size_celled / 4),
+    _shadow_memory_entries(std::make_unique<shadow_memory_entry[]>(_size_celled)), 
       _shadow_memory_bitmap(std::vector<uint8_t>((size + 7) / 8, 0)) {
         printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size);
         printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry));
@@ -97,7 +132,9 @@ class shadow_memory{
     };
     shadow_memory_entry& get_entry(uint64_t offset) {
         assert(offset < _size);
-        return _shadow_memory_entries[offset];
+        // Strided layout: entry index = offset/4 + (offset%4) * _stride (with _stride = _size_celled/4), so the entries for offsets 0, 4, 8, ... sit next to each other.
+        return _shadow_memory_entries[(offset/4) + (offset%4) * _stride];
+        // return _shadow_memory_entries[offset];
     }
     bool is_valid(uint64_t ptr) {
         return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); 
@@ -106,6 +143,8 @@ class shadow_memory{
         _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8));
     }
     uint64_t _size;
+    uint64_t _size_celled;
+    uint64_t _stride;
     std::unique_ptr<shadow_memory_entry[]> _shadow_memory_entries;
     std::vector<uint8_t> _shadow_memory_bitmap;
 };
@@ -155,7 +194,11 @@ class PcDependency final : public Tool {
 
     void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
 
-    void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size);
+    void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size);
+
+    void unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
+
+    void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
 
 
 /*
@@ -178,8 +221,13 @@ class PcDependency final : public Tool {
     std::vector<memory_region> _memory_regions;
 
     std::map<memory_region, std::unique_ptr<shadow_memory>> _shadow_memories; // memory region, shadow memory
+    std::unordered_map<uint64_t, shadow_memory_entry> _shadow_memory_shared; // shared memory address (packed as block_id << 32 | address low 32 bits to reduce aliasing), shadow memory shared
     std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics
-    std::unordered_map<uint32_t, uint32_t> _pc_flags; // pc offset, flags
+    std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> _pc_flags; // pc offset, flags, size of the access
+    // Index [0..31] stores distinct sector count 1..32.
+    // Index [32..64] stores active lane count 0..32.
+    std::unordered_map<uint32_t, std::array<uint64_t, 65>> _distinct_sector_count; // pc offset, distinct sector distribution
+
 };
 
 }   // yosemite
diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp
index e4f66a3..80053cc 100644
--- a/src/tools/pc_dependency_analysis.cpp
+++ b/src/tools/pc_dependency_analysis.cpp
@@ -10,6 +10,7 @@
 #include <sstream>
 #include <set>
 #include <iomanip>
+#include <thread>
 
 
 using namespace yosemite;
@@ -48,6 +49,19 @@ static std::string hex_u32(uint32_t v) {
     oss << "0x" << std::hex << v;
     return oss.str();
 }
+static std::string flags_to_string(uint32_t flags) {  // Renders flags as "<access kinds> <memory spaces>", e.g. "READ GLOBAL".
+    std::ostringstream oss;
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) oss << "READ";  // names are appended without a separator, so READ plus WRITE prints "READWRITE"
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) oss << "WRITE";
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC) oss << "ATOMIC";
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH) oss << "PREFETCH";
+    oss << " ";  // always written, even when no access-kind bit matched
+    if (flags & SANITIZER_MEMORY_GLOBAL) oss << "GLOBAL";
+    if (flags & SANITIZER_MEMORY_SHARED) oss << "SHARED";
+    if (flags & SANITIZER_MEMORY_LOCAL) oss << "LOCAL";
+
+    return oss.str();
+}
 } // namespace
 
 
@@ -105,8 +119,9 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
                   [](auto& a, auto& b){ return a.first < b.first; });
 
         uint32_t flags = 0;
+        uint32_t access_size = 0;
         auto fit = _pc_flags.find(cur_pc);
-        if (fit != _pc_flags.end()) flags = fit->second;
+        if (fit != _pc_flags.end()){ flags = fit->second.first; access_size = fit->second.second;}
 
         for (auto& [anc_pc, st] : inner) {
             out << "0x" << std::hex << cur_pc
@@ -155,14 +170,37 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
             first = false;
             auto fit = _pc_flags.find(pc);
             bool has_flags = (fit != _pc_flags.end());
-            uint32_t flags = has_flags ? fit->second : 0;
+            uint32_t flags = has_flags ? fit->second.first : 0;
+            uint32_t access_size = has_flags ? fit->second.second : 0;
+            bool has_distinct_sector_count = (_distinct_sector_count.find(pc) != _distinct_sector_count.end());
             jout << "    {\"pc\": " << pc
                  << ", \"pc_hex\": \"" << hex_u32(pc) << "\"";
             if (has_flags) {
-                jout << ", \"flags\": " << flags
-                     << ", \"flags_hex\": \"" << hex_u32(flags) << "\"";
+                jout << ", \"flags\": \"" << flags_to_string(flags) << "\""
+                     << ", \"flags_hex\": \"" << hex_u32(flags) << "\""
+                     << ", \"access_size\": " << access_size;
             } else {
-                jout << ", \"flags\": null, \"flags_hex\": null";
+                jout << ", \"flags\": null, \"flags_hex\": null, \"access_size\": null";
+            }
+            if (has_distinct_sector_count) {
+                jout << ", \"distinct_sector_count\": {";
+                for (int i = 1; i <= 32; i++) {
+                    jout << "\"" << i << "\": " << _distinct_sector_count[pc][i - 1];
+                    if (i != 32) {
+                        jout << ", ";
+                    }
+                }
+                jout << "}";
+                jout << ", \"active_lane_count\": {";
+                for (int i = 0; i <= 32; i++) {
+                    jout << "\"" << i << "\": " << _distinct_sector_count[pc][32 + i];
+                    if (i != 32) {
+                        jout << ", ";
+                    }
+                }
+                jout << "}";
+            } else {
+                jout << ", \"distinct_sector_count\": null, \"active_lane_count\": null";
             }
             jout << "}";
         }
@@ -188,7 +226,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
             // current flags if available
             auto cfit = _pc_flags.find(cur_pc);
             bool has_cflags = (cfit != _pc_flags.end());
-            uint32_t cflags = has_cflags ? cfit->second : 0;
+            uint32_t cflags = has_cflags ? cfit->second.first : 0;
+            uint32_t c_access_size = has_cflags ? cfit->second.second : 0;
 
             for (auto& [anc_pc, st] : inner2) {
                 if (!first_edge) jout << ",\n";
@@ -214,7 +253,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
 
                 if (has_cflags) {
                     jout << ", \"current_flags\": " << cflags
-                         << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\"";
+                         << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\""
+                         << ", \"current_access_size\": " << c_access_size;
                 } else {
                     jout << ", \"current_flags\": null, \"current_flags_hex\": null";
                 }
@@ -238,7 +278,8 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
 void PcDependency::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
     auto evt = std::prev(kernel_events.end())->second;
     evt->end_time = _timer.get();
-
+    this->_shadow_memory_shared.clear();
+    printf("[PC_DEPENDENCY] Clearing shadow memory shared\n");
     kernel_trace_flush(evt);
 
     _timer.increment(true);
@@ -278,6 +319,9 @@ void PcDependency::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
 void PcDependency::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
     tensor_events.emplace(_timer.get(), ten);
     active_tensors.emplace(ten->addr, ten);
+    _memory_regions.push_back(memory_region((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size)));
+    _shadow_memories.emplace(_memory_regions.back(), std::make_unique<shadow_memory>(ten->size));
+    printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size);
 
     _timer.increment(true);
 }
@@ -286,12 +330,25 @@ void PcDependency::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
 void PcDependency::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
     auto it = active_tensors.find(ten->addr);
     assert(it != active_tensors.end());
+
+    // TenFree.size may be negative (e.g., accounting-style events). Use size from TenAlloc.
+    const uint64_t sz = static_cast<uint64_t>(it->second->size);
     active_tensors.erase(it);
 
+    memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz);
+
+    auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end()) {
+        _memory_regions.erase(vit);
+    }
+
+    _shadow_memories.erase(r);
+    printf("[PC_DEPENDENCY] Freeing shadow memory for tensor region: %p - %p, size: %lu\n",
+           (void*)r.get_start(), (void*)r.get_end(), sz);
     _timer.increment(true);
 }
 
-void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint64_t current_warp_id, uint64_t current_lane_id, memory_region& memory_region_target, int access_size) {
+void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size) {
     // auto& shadow_memory = this->_shadow_memories[memory_region_target];
     auto shadow_memory_it = this->_shadow_memories.find(memory_region_target);
     if (shadow_memory_it == this->_shadow_memories.end()) {
@@ -336,42 +393,111 @@ void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t curren
     }
 }
 
+void PcDependency::unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) {
+    // 共享内存地址在同一个 block 内唯一，使用 block_id 高位 + 地址低 32 位作为 key，
+    const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32)
+                                 | (ptr & 0xFFFFFFFFull);
+
+    for (int i = 0; i < access_size; i += 4) {
+        const uint64_t addr = packed_base + i;  // 4 字节粒度
+
+        auto it = this->_shadow_memory_shared.find(addr);
+        if (it == this->_shadow_memory_shared.end()) {
+            // cold miss
+            this->_pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1;
+            auto& entry = this->_shadow_memory_shared.emplace(addr, shadow_memory_entry()).first->second;
+            entry.last_pc = pc_offset;
+            entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id; // 只编码 warp/lane
+            continue;
+        }
+
+        auto& entry = it->second;
+        const uint64_t last_warp_id = (entry.last_flat_thread_id >> 5) & 0x1F;
+        const uint64_t last_lane_id = entry.last_flat_thread_id & 0x1F;
+        const uint32_t last_pc = entry.last_pc;
+
+        if (last_warp_id != current_warp_id) {
+            // 不同 warp 同 block
+            this->_pc_statistics[pc_offset][last_pc].dist[2] += 1;
+        } else if (last_lane_id != current_lane_id) {
+            // 同 warp 不同 lane
+            this->_pc_statistics[pc_offset][last_pc].dist[1] += 1;
+        } else {
+            // 同一线程
+            this->_pc_statistics[pc_offset][last_pc].dist[0] += 1;
+        }
+
+        entry.last_pc = pc_offset;
+        entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id;
+    }
+}
+
+void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) {
+    // TODO: implement local memory access
+}
+
 
 void PcDependency::gpu_data_analysis(void* data, uint64_t size) {
     MemoryAccess* accesses_buffer = (MemoryAccess*)data;
     for (uint64_t i = 0; i < size; i++) {
         MemoryAccess trace = accesses_buffer[i];
         uint32_t pc_offset = trace.pc;
-        this->_pc_flags[pc_offset] = trace.flags;
-        if (trace.type != MemoryType::Global) {
-            //only analyze global memory accesses currently
-            continue;
-        }
+        uint32_t flags = trace.flags;
         uint32_t access_size = trace.accessSize;
-        memory_region memory_region_target;
-        uint64_t first_valid_address = 0;
-
-        for (int j = 0; j < GPU_WARP_SIZE; j++) {
-            if (trace.active_mask & (1u << j)) {
-                first_valid_address = trace.addresses[j];
+        uint32_t distinct_sector_count = trace.distinct_sector_count;
+        uint32_t active_mask = trace.active_mask;
+        switch (trace.type) {
+            case MemoryType::Local:{
+                    flags |= SANITIZER_MEMORY_LOCAL;
+                    break;
+                }
+            case MemoryType::Shared:{
+                    flags |= SANITIZER_MEMORY_SHARED;
+                    for (int j = 0; j < GPU_WARP_SIZE; j++) {
+                        if (active_mask & (1u << j)) {
+                            unit_access_shared(trace.addresses[j], pc_offset, trace.ctaId, trace.warpId, j, trace.accessSize);
+                        }
+                    }
+                    break;
+                }
+            case MemoryType::Global:{
+                    flags |= SANITIZER_MEMORY_GLOBAL;
+                    memory_region memory_region_target;
+                    uint64_t first_valid_address = 0;
+                    for (int j = 0; j < GPU_WARP_SIZE; j++) {
+                        if (active_mask & (1u << j)) {
+                            first_valid_address = trace.addresses[j];
+                            break;
+                        }
+                    }
+                    assert(first_valid_address != 0);
+                    for (auto memory_region_iter : this->_memory_regions) {
+                        if (memory_region_iter.contains(first_valid_address)) {
+                            memory_region_target = memory_region_iter;
+                            break;
+                        }
+                    }
+                    uint64_t memory_region_start = memory_region_target.get_start();
+                    assert(memory_region_start != 0);
+                    for ( int j = 0; j < GPU_WARP_SIZE; j++) {
+                        if (active_mask & (1u << j)) {
+                            unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size);
+                        }
+                    }
+                    break;
+                }
+            default:
+                printf("unknown memory type\n");
                 break;
-            }
         }
-        
-        
-        assert(first_valid_address != 0);
-        for (auto memory_region_iter : this->_memory_regions) {
-            if (memory_region_iter.contains(first_valid_address)) {
-                memory_region_target = memory_region_iter;
-                break;
-            }
+        this->_pc_flags[pc_offset] = std::make_pair(flags, access_size);
+        // Defensive bounds checks: GPU side should produce [1, 32].
+        if (distinct_sector_count >= 1 && distinct_sector_count <= 32) {
+            this->_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1;
         }
-        uint64_t memory_region_start = memory_region_target.get_start();
-        assert(memory_region_start != 0);
-        for ( int j = 0; j < GPU_WARP_SIZE; j++) {
-            if (trace.active_mask & (1u << j)) {
-                unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size);
-            }
+        const uint32_t active_lane_count = __builtin_popcount(active_mask);
+        if (active_lane_count <= 32) {
+            this->_distinct_sector_count[pc_offset][32 + active_lane_count] += 1;
         }
     }
 

From e5c235146266de89755e37c4fbd492d4363d41a3 Mon Sep 17 00:00:00 2001
From: Yanbo Zhao <yzhao62@eb2-3224-lin10.csc.ncsu.edu>
Date: Fri, 13 Feb 2026 16:36:32 -0500
Subject: [PATCH 5/6] cuVein parallel optimization

---
 include/tools/pc_dependency_analysis.h |  91 ++++--
 src/tools/pc_dependency_analysis.cpp   | 398 +++++++++++++++++++------
 2 files changed, 368 insertions(+), 121 deletions(-)

diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h
index 2d42264..717307a 100644
--- a/include/tools/pc_dependency_analysis.h
+++ b/include/tools/pc_dependency_analysis.h
@@ -16,6 +16,11 @@
 #include <string>
 #include <memory>
 #include <cassert>
+#include <mutex>
+#include <thread>
+#include <condition_variable>
+#include <cstring>
+#include <sys/mman.h>
 
 
 #ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ
@@ -104,13 +109,14 @@ class memory_region{
     uint64_t end;
 };
 
-class shadow_memory_entry{
+class alignas(8) shadow_memory_entry{
 public:
     shadow_memory_entry() {};
     ~shadow_memory_entry() {};
-
-    uint32_t last_pc = 0xFFFFFFFFu; // using offset of pc instead of original pc to save space and keep alignment;
-    uint32_t last_flat_thread_id = 0xFFFFFFFFu; // 0-5 bits for lane id, 6-10 bits for warp id, 11-31 bits for block id to save space;
+    // Packed representation: low 32 bits = last_pc, high 32 bits = last_flat_thread_id.
+    // Keeping a single 64-bit field avoids type-punning UB in atomic exchange.
+    // packed == 0 means invalid/uninitialized (cold).
+    uint64_t packed = 0;
 };
 
 class shadow_memory{
@@ -119,16 +125,26 @@ class shadow_memory{
     :_size(size),
     _size_celled((size + 3) / 4 * 4),
     _stride(_size_celled / 4),
-    _shadow_memory_entries(std::make_unique<shadow_memory_entry[]>(_size_celled)), 
-      _shadow_memory_bitmap(std::vector<uint8_t>((size + 7) / 8, 0)) {
+    _entries_bytes(std::max<uint64_t>(1, _size_celled * sizeof(shadow_memory_entry))) {
+        _shadow_memory_entries = static_cast<shadow_memory_entry*>(
+            mmap(nullptr, _entries_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
+        );
+        assert(_shadow_memory_entries != MAP_FAILED);
+
         printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size);
         printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry));
         printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry));
-        printf("[PC_DEPENDENCY] Shadow memory bitmap size: %lu\n", _shadow_memory_bitmap.size());
       };
-    ~shadow_memory() = default;
-    void reset_bitmap() {
-        std::fill(_shadow_memory_bitmap.begin(), _shadow_memory_bitmap.end(), 0);
+    ~shadow_memory() {
+        if (_shadow_memory_entries != nullptr && _shadow_memory_entries != MAP_FAILED) {
+            munmap(_shadow_memory_entries, _entries_bytes);
+            _shadow_memory_entries = nullptr;
+        }
+    }
+    void reset_entries() {
+        if (madvise(_shadow_memory_entries, _entries_bytes, MADV_DONTNEED) != 0) {
+            std::memset(_shadow_memory_entries, 0, _entries_bytes);
+        }
     };
     shadow_memory_entry& get_entry(uint64_t offset) {
         assert(offset < _size);
@@ -136,17 +152,11 @@ class shadow_memory{
         return _shadow_memory_entries[(offset/4) + (offset%4) * _stride];
         // return _shadow_memory_entries[offset];
     }
-    bool is_valid(uint64_t ptr) {
-        return _shadow_memory_bitmap[ptr / 8] & (1u << (ptr % 8)); 
-    }
-    void set_valid(uint64_t ptr) {
-        _shadow_memory_bitmap[ptr / 8] |= (1u << (ptr % 8));
-    }
     uint64_t _size;
     uint64_t _size_celled;
     uint64_t _stride;
-    std::unique_ptr<shadow_memory_entry[]> _shadow_memory_entries;
-    std::vector<uint8_t> _shadow_memory_bitmap;
+    uint64_t _entries_bytes;
+    shadow_memory_entry* _shadow_memory_entries = nullptr;
 };
 
 
@@ -194,11 +204,30 @@ class PcDependency final : public Tool {
 
     void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
 
-    void unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size);
-
-    void unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
+    void unit_access(
+        uint64_t ptr,
+        uint32_t pc_offset,
+        uint64_t current_block_id,
+        uint32_t current_warp_id,
+        uint32_t current_lane_id,
+        memory_region& memory_region_target,
+        int access_size,
+        std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics
+    );
+
+    void unit_access_shared(
+        uint64_t ptr,
+        uint32_t pc_offset,
+        uint64_t current_block_id,
+        uint32_t current_warp_id,
+        uint32_t current_lane_id,
+        int access_size,
+        std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
+        std::unordered_map<uint64_t, shadow_memory_entry>& local_shadow_memory_shared
+    );
 
     void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
+    void worker_loop(uint64_t worker_idx);
 
 
 /*
@@ -221,13 +250,31 @@ class PcDependency final : public Tool {
     std::vector<memory_region> _memory_regions;
 
     std::map<memory_region, std::unique_ptr<shadow_memory>> _shadow_memories; // memory region, shadow memory
-    std::unordered_map<uint64_t, shadow_memory_entry> _shadow_memory_shared; // shared memory address (packed as block_id << 32 | address low 32 bits to reduce aliasing), shadow memory shared
     std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics
     std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> _pc_flags; // pc offset, flags, size of the access
     // Index [0..31] stores distinct sector count 1..32.
     // Index [32..64] stores active lane count 0..32.
     std::unordered_map<uint32_t, std::array<uint64_t, 65>> _distinct_sector_count; // pc offset, distinct sector distribution
 
+    // Persistent worker pool and per-worker shared-memory shadow state.
+    uint64_t _worker_count = 1;
+    std::vector<std::thread> _workers;
+    std::vector<std::unordered_map<uint64_t, shadow_memory_entry>> _worker_shadow_memory_shared;
+
+    // Per-batch job data produced by gpu_data_analysis and consumed by workers.
+    const MemoryAccess* _job_accesses_buffer = nullptr;
+    std::vector<std::vector<uint64_t>> _job_worker_trace_indices;
+    std::vector<std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> _job_worker_pc_statistics;
+    std::vector<std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>>> _job_worker_pc_flags;
+    std::vector<std::unordered_map<uint32_t, std::array<uint64_t, 65>>> _job_worker_distinct_sector_count;
+
+    std::mutex _worker_pool_mutex;
+    std::condition_variable _worker_pool_cv;
+    std::condition_variable _worker_pool_done_cv;
+    bool _worker_pool_shutdown = false;
+    uint64_t _worker_job_generation = 0;
+    uint64_t _worker_pending_jobs = 0;
+
 };
 
 }   // yosemite
diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp
index 80053cc..6dadf09 100644
--- a/src/tools/pc_dependency_analysis.cpp
+++ b/src/tools/pc_dependency_analysis.cpp
@@ -11,6 +11,7 @@
 #include <set>
 #include <iomanip>
 #include <thread>
+#include <atomic>
 
 
 using namespace yosemite;
@@ -62,6 +63,18 @@ static std::string flags_to_string(uint32_t flags) {
 
     return oss.str();
 }
+
+static inline uint64_t pack_shadow_entry(uint32_t pc, uint32_t flat_thread_id) {
+    return (static_cast<uint64_t>(flat_thread_id) << 32) | static_cast<uint64_t>(pc);
+}
+
+static inline uint32_t unpack_shadow_pc(uint64_t packed) {
+    return static_cast<uint32_t>(packed & 0xFFFFFFFFu);
+}
+
+static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) {
+    return static_cast<uint32_t>(packed >> 32);
+}
 } // namespace
 
 
@@ -80,10 +93,33 @@ PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) {
         output_directory = "dependency_" + get_current_date_n_time();
     }
     check_folder_existance(output_directory);
+
+    _worker_count = std::max(1u, std::thread::hardware_concurrency());
+    _worker_shadow_memory_shared.resize(_worker_count);
+    _job_worker_trace_indices.resize(_worker_count);
+    _job_worker_pc_statistics.resize(_worker_count);
+    _job_worker_pc_flags.resize(_worker_count);
+    _job_worker_distinct_sector_count.resize(_worker_count);
+    _workers.reserve(_worker_count);
+    for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) {
+        _workers.emplace_back(&PcDependency::worker_loop, this, worker_idx);
+    }
 }
 
 
-PcDependency::~PcDependency() {}
+PcDependency::~PcDependency() {
+    {
+        std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+        _worker_pool_shutdown = true;
+        ++_worker_job_generation;
+    }
+    _worker_pool_cv.notify_all();
+    for (auto& worker : _workers) {
+        if (worker.joinable()) {
+            worker.join();
+        }
+    }
+}
 
 
 void PcDependency::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
@@ -92,10 +128,14 @@ void PcDependency::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel)
     kernel_events.emplace(_timer.get(), kernel);
     _pc_statistics.clear();
     _pc_flags.clear();
+    _distinct_sector_count.clear();
+    for (auto& shared_map : _worker_shadow_memory_shared) {
+        shared_map.clear();
+    }
     for (auto& shadow_memory_iter : _shadow_memories) {
-        shadow_memory_iter.second->reset_bitmap();
+        shadow_memory_iter.second->reset_entries();
     }
-    printf("[PC_DEPENDENCY] Resetting shadow memory bitmap\n");
+    printf("[PC_DEPENDENCY] Resetting shadow memory entries\n");
     _timer.increment(true);
 }
 
@@ -158,7 +198,7 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
         nodes.insert(cur_pc);
         for (const auto& [anc_pc, st] : inner_map) {
             (void)st;
-            if (anc_pc != 0xFFFFFFFFu) nodes.insert(anc_pc);
+            if (anc_pc != 0u) nodes.insert(anc_pc);
         }
     }
 
@@ -233,7 +273,7 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
                 if (!first_edge) jout << ",\n";
                 first_edge = false;
 
-                bool cold_miss = (anc_pc == 0xFFFFFFFFu);
+                bool cold_miss = (anc_pc == 0u);
 
                 jout << "    {\"current_pc\": " << cur_pc
                      << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\""
@@ -278,7 +318,9 @@ void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
 void PcDependency::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
     auto evt = std::prev(kernel_events.end())->second;
     evt->end_time = _timer.get();
-    this->_shadow_memory_shared.clear();
+    for (auto& shared_map : _worker_shadow_memory_shared) {
+        shared_map.clear();
+    }
     printf("[PC_DEPENDENCY] Clearing shadow memory shared\n");
     kernel_trace_flush(evt);
 
@@ -348,7 +390,16 @@ void PcDependency::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
     _timer.increment(true);
 }
 
-void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, memory_region& memory_region_target, int access_size) {
+void PcDependency::unit_access(
+    uint64_t ptr,
+    uint32_t pc_offset,
+    uint64_t current_block_id,
+    uint32_t current_warp_id,
+    uint32_t current_lane_id,
+    memory_region& memory_region_target,
+    int access_size,
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics
+) {
     // auto& shadow_memory = this->_shadow_memories[memory_region_target];
     auto shadow_memory_it = this->_shadow_memories.find(memory_region_target);
     if (shadow_memory_it == this->_shadow_memories.end()) {
@@ -356,79 +407,94 @@ void PcDependency::unit_access(uint64_t ptr, uint32_t pc_offset, uint64_t curren
         return;
     }
     auto& shadow_memory = *(shadow_memory_it->second);
+    const uint32_t current_flat_thread_id =
+        static_cast<uint32_t>((current_block_id << 10) | (current_warp_id << 5) | current_lane_id);
 
     for (int i = 0; i < access_size; i += 4) {
-        auto addr = ptr + i;
+        const uint64_t addr = ptr + i;
         // Byte-granularity shadow memory: addr is byte offset within allocation.
         // Bound check to avoid OOB on allocations at end boundary or odd sizes.
         if (addr >= shadow_memory._size) {
             break;
         }
-        if (shadow_memory.is_valid(addr) == false) {
-            // cold miss
-            _pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1;
-            shadow_memory.set_valid(addr);
-            auto& shadow_memory_entry = shadow_memory.get_entry(addr);
-            shadow_memory_entry.last_pc = pc_offset;
-            shadow_memory_entry.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id;
+
+        auto& entry = shadow_memory.get_entry(addr);
+        const uint64_t old_packed = __atomic_exchange_n(
+            &entry.packed,
+            pack_shadow_entry(pc_offset, current_flat_thread_id),
+            __ATOMIC_ACQ_REL
+        );
+        const bool is_cold_miss = (old_packed == 0);
+
+        if (is_cold_miss) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
             continue;
         }
-        auto& last_access = shadow_memory.get_entry(addr);
-        uint64_t last_block_id = last_access.last_flat_thread_id >> 10;
-        uint64_t last_warp_id = (last_access.last_flat_thread_id >> 5) & 0x1F;
-        uint64_t last_lane_id = last_access.last_flat_thread_id & 0x1F;
 
-        uint32_t last_pc = last_access.last_pc;
+        const uint32_t last_pc = unpack_shadow_pc(old_packed);
+        const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
+        const uint64_t last_block_id = static_cast<uint64_t>(last_flat_thread_id >> 10);
+        const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
+        const uint64_t last_lane_id = static_cast<uint64_t>(last_flat_thread_id & 0x1F);
         if (last_block_id != current_block_id) {
-            this->_pc_statistics[pc_offset][last_pc].dist[3] += 1;
-        }else if (last_warp_id != current_warp_id) {
-            this->_pc_statistics[pc_offset][last_pc].dist[2] += 1;
-        }else if (last_lane_id != current_lane_id) {
-            this->_pc_statistics[pc_offset][last_pc].dist[1] += 1;
-        }else {
-            this->_pc_statistics[pc_offset][last_pc].dist[0] += 1;
+            local_pc_statistics[pc_offset][last_pc].dist[3] += 1;
+        } else if (last_warp_id != current_warp_id) {
+            local_pc_statistics[pc_offset][last_pc].dist[2] += 1;
+        } else if (last_lane_id != current_lane_id) {
+            local_pc_statistics[pc_offset][last_pc].dist[1] += 1;
+        } else {
+            local_pc_statistics[pc_offset][last_pc].dist[0] += 1;
         }
-        last_access.last_pc = pc_offset;
-        last_access.last_flat_thread_id = (current_block_id << 10) | (current_warp_id << 5) | current_lane_id;
     }
 }
 
-void PcDependency::unit_access_shared(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) {
+void PcDependency::unit_access_shared(
+    uint64_t ptr,
+    uint32_t pc_offset,
+    uint64_t current_block_id,
+    uint32_t current_warp_id,
+    uint32_t current_lane_id,
+    int access_size,
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
+    std::unordered_map<uint64_t, shadow_memory_entry>& local_shadow_memory_shared
+) {
     // 共享内存地址在同一个 block 内唯一，使用 block_id 高位 + 地址低 32 位作为 key，
     const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32)
                                  | (ptr & 0xFFFFFFFFull);
 
     for (int i = 0; i < access_size; i += 4) {
         const uint64_t addr = packed_base + i;  // 4 字节粒度
-
-        auto it = this->_shadow_memory_shared.find(addr);
-        if (it == this->_shadow_memory_shared.end()) {
-            // cold miss
-            this->_pc_statistics[pc_offset][0xFFFFFFFF].dist[0] += 1;
-            auto& entry = this->_shadow_memory_shared.emplace(addr, shadow_memory_entry()).first->second;
-            entry.last_pc = pc_offset;
-            entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id; // 只编码 warp/lane
+        const uint32_t current_flat_thread_id =
+            static_cast<uint32_t>((current_warp_id << 5) | current_lane_id);
+
+        auto [it, inserted] = local_shadow_memory_shared.emplace(addr, shadow_memory_entry());
+        const bool is_cold_miss = inserted;
+        const uint64_t old_packed = __atomic_exchange_n(
+            &(it->second.packed),
+            pack_shadow_entry(pc_offset, current_flat_thread_id),
+            __ATOMIC_ACQ_REL
+        );
+
+        if (is_cold_miss) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
             continue;
         }
 
-        auto& entry = it->second;
-        const uint64_t last_warp_id = (entry.last_flat_thread_id >> 5) & 0x1F;
-        const uint64_t last_lane_id = entry.last_flat_thread_id & 0x1F;
-        const uint32_t last_pc = entry.last_pc;
+        const uint32_t last_pc = unpack_shadow_pc(old_packed);
+        const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
+        const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
+        const uint64_t last_lane_id = static_cast<uint64_t>(last_flat_thread_id & 0x1F);
 
         if (last_warp_id != current_warp_id) {
             // 不同 warp 同 block
-            this->_pc_statistics[pc_offset][last_pc].dist[2] += 1;
+            local_pc_statistics[pc_offset][last_pc].dist[2] += 1;
         } else if (last_lane_id != current_lane_id) {
             // 同 warp 不同 lane
-            this->_pc_statistics[pc_offset][last_pc].dist[1] += 1;
+            local_pc_statistics[pc_offset][last_pc].dist[1] += 1;
         } else {
             // 同一线程
-            this->_pc_statistics[pc_offset][last_pc].dist[0] += 1;
+            local_pc_statistics[pc_offset][last_pc].dist[0] += 1;
         }
-
-        entry.last_pc = pc_offset;
-        entry.last_flat_thread_id = (current_warp_id << 5) | current_lane_id;
     }
 }
 
@@ -437,67 +503,201 @@ void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t
 }
 
 
-void PcDependency::gpu_data_analysis(void* data, uint64_t size) {
-    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
-    for (uint64_t i = 0; i < size; i++) {
-        MemoryAccess trace = accesses_buffer[i];
-        uint32_t pc_offset = trace.pc;
-        uint32_t flags = trace.flags;
-        uint32_t access_size = trace.accessSize;
-        uint32_t distinct_sector_count = trace.distinct_sector_count;
-        uint32_t active_mask = trace.active_mask;
-        switch (trace.type) {
-            case MemoryType::Local:{
-                    flags |= SANITIZER_MEMORY_LOCAL;
-                    break;
-                }
-            case MemoryType::Shared:{
-                    flags |= SANITIZER_MEMORY_SHARED;
-                    for (int j = 0; j < GPU_WARP_SIZE; j++) {
-                        if (active_mask & (1u << j)) {
-                            unit_access_shared(trace.addresses[j], pc_offset, trace.ctaId, trace.warpId, j, trace.accessSize);
-                        }
+void PcDependency::worker_loop(uint64_t worker_idx) {
+    uint64_t seen_generation = 0;
+    while (true) {
+        uint64_t current_generation = 0;
+        {
+            std::unique_lock<std::mutex> lock(_worker_pool_mutex);
+            _worker_pool_cv.wait(lock, [&]{
+                return _worker_pool_shutdown || _worker_job_generation > seen_generation;
+            });
+            if (_worker_pool_shutdown) {
+                return;
+            }
+            current_generation = _worker_job_generation;
+        }
+
+        auto& local_pc_statistics = _job_worker_pc_statistics[worker_idx];
+        auto& local_pc_flags = _job_worker_pc_flags[worker_idx];
+        auto& local_distinct_sector_count = _job_worker_distinct_sector_count[worker_idx];
+        auto& local_shadow_memory_shared = _worker_shadow_memory_shared[worker_idx];
+        const auto& trace_indices = _job_worker_trace_indices[worker_idx];
+
+        for (uint64_t i : trace_indices) {
+            MemoryAccess trace = _job_accesses_buffer[i];
+            uint32_t pc_offset = trace.pc;
+            uint32_t flags = trace.flags;
+            uint32_t access_size = trace.accessSize;
+            uint32_t distinct_sector_count = trace.distinct_sector_count;
+            uint32_t active_mask = trace.active_mask;
+            switch (trace.type) {
+                case MemoryType::Local:{
+                        flags |= SANITIZER_MEMORY_LOCAL;
+                        break;
                     }
-                    break;
-                }
-            case MemoryType::Global:{
-                    flags |= SANITIZER_MEMORY_GLOBAL;
-                    memory_region memory_region_target;
-                    uint64_t first_valid_address = 0;
-                    for (int j = 0; j < GPU_WARP_SIZE; j++) {
-                        if (active_mask & (1u << j)) {
-                            first_valid_address = trace.addresses[j];
-                            break;
+                case MemoryType::Shared:{
+                        flags |= SANITIZER_MEMORY_SHARED;
+                        for (int j = 0; j < GPU_WARP_SIZE; j++) {
+                            if (active_mask & (1u << j)) {
+                                unit_access_shared(
+                                    trace.addresses[j],
+                                    pc_offset,
+                                    trace.ctaId,
+                                    trace.warpId,
+                                    j,
+                                    trace.accessSize,
+                                    local_pc_statistics,
+                                    local_shadow_memory_shared
+                                );
+                            }
                         }
+                        break;
                     }
-                    assert(first_valid_address != 0);
-                    for (auto memory_region_iter : this->_memory_regions) {
-                        if (memory_region_iter.contains(first_valid_address)) {
-                            memory_region_target = memory_region_iter;
-                            break;
+                case MemoryType::Global:{
+                        flags |= SANITIZER_MEMORY_GLOBAL;
+                        memory_region memory_region_target;
+                        uint64_t first_valid_address = 0;
+                        for (int j = 0; j < GPU_WARP_SIZE; j++) {
+                            if (active_mask & (1u << j)) {
+                                first_valid_address = trace.addresses[j];
+                                break;
+                            }
                         }
-                    }
-                    uint64_t memory_region_start = memory_region_target.get_start();
-                    assert(memory_region_start != 0);
-                    for ( int j = 0; j < GPU_WARP_SIZE; j++) {
-                        if (active_mask & (1u << j)) {
-                            unit_access(trace.addresses[j] - memory_region_start, pc_offset, trace.ctaId, trace.warpId, j, memory_region_target, access_size);
+                        assert(first_valid_address != 0);
+                        for (auto memory_region_iter : this->_memory_regions) {
+                            if (memory_region_iter.contains(first_valid_address)) {
+                                memory_region_target = memory_region_iter;
+                                break;
+                            }
+                        }
+                        uint64_t memory_region_start = memory_region_target.get_start();
+                        assert(memory_region_start != 0);
+                        for ( int j = 0; j < GPU_WARP_SIZE; j++) {
+                            if (active_mask & (1u << j)) {
+                                unit_access(
+                                    trace.addresses[j] - memory_region_start,
+                                    pc_offset,
+                                    trace.ctaId,
+                                    trace.warpId,
+                                    j,
+                                    memory_region_target,
+                                    access_size,
+                                    local_pc_statistics
+                                );
+                            }
                         }
+                        break;
                     }
+                default:
+                    printf("unknown memory type\n");
                     break;
+            }
+            auto& local_flag = local_pc_flags[pc_offset];
+            local_flag.first |= flags;
+            if (local_flag.second == 0) {
+                local_flag.second = access_size;
+            } else if (local_flag.second != access_size) {
+                local_flag.second = std::max(local_flag.second, access_size);
+            }
+            if (distinct_sector_count >= 1 && distinct_sector_count <= 32) {
+                local_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1;
+            }
+            const uint32_t active_lane_count = __builtin_popcount(active_mask);
+            if (active_lane_count <= 32) {
+                local_distinct_sector_count[pc_offset][32 + active_lane_count] += 1;
+            }
+        }
+
+        {
+            std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+            seen_generation = current_generation;
+            if (!trace_indices.empty()) {
+                assert(_worker_pending_jobs > 0);
+                _worker_pending_jobs -= 1;
+                if (_worker_pending_jobs == 0) {
+                    _worker_pool_done_cv.notify_one();
                 }
-            default:
-                printf("unknown memory type\n");
-                break;
+            }
+        }
+    }
+}
+
+
+void PcDependency::gpu_data_analysis(void* data, uint64_t size) {
+    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    if (size == 0) {
+        return;
+    }
+
+    for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) {
+        _job_worker_trace_indices[worker_idx].clear();
+        _job_worker_pc_statistics[worker_idx].clear();
+        _job_worker_pc_flags[worker_idx].clear();
+        _job_worker_distinct_sector_count[worker_idx].clear();
+        _job_worker_trace_indices[worker_idx].reserve((size / _worker_count) + 1);
+    }
+
+    // Stable assignment by block id keeps intra-block trace order.
+    for (uint64_t i = 0; i < size; ++i) {
+        const uint64_t worker_idx = accesses_buffer[i].ctaId % _worker_count;
+        _job_worker_trace_indices[worker_idx].push_back(i);
+    }
+
+    uint64_t pending_jobs = 0;
+    for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) {
+        if (!_job_worker_trace_indices[worker_idx].empty()) {
+            pending_jobs += 1;
         }
-        this->_pc_flags[pc_offset] = std::make_pair(flags, access_size);
-        // Defensive bounds checks: GPU side should produce [1, 32].
-        if (distinct_sector_count >= 1 && distinct_sector_count <= 32) {
-            this->_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1;
+    }
+    if (pending_jobs == 0) {
+        return;
+    }
+
+    {
+        std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+        _job_accesses_buffer = accesses_buffer;
+        _worker_pending_jobs = pending_jobs;
+        ++_worker_job_generation;
+    }
+    _worker_pool_cv.notify_all();
+    {
+        std::unique_lock<std::mutex> lock(_worker_pool_mutex);
+        _worker_pool_done_cv.wait(lock, [&]{
+            return _worker_pending_jobs == 0;
+        });
+    }
+
+    for (auto& local_flags_map : _job_worker_pc_flags) {
+        for (auto& [pc, local_flag] : local_flags_map) {
+            auto& global_flag = this->_pc_flags[pc];
+            global_flag.first |= local_flag.first;
+            if (global_flag.second == 0) {
+                global_flag.second = local_flag.second;
+            } else if (global_flag.second != local_flag.second) {
+                global_flag.second = std::max(global_flag.second, local_flag.second);
+            }
+        }
+    }
+
+    for (auto& local_distinct_map : _job_worker_distinct_sector_count) {
+        for (auto& [pc, local_hist] : local_distinct_map) {
+            auto& global_hist = this->_distinct_sector_count[pc];
+            for (size_t idx = 0; idx < global_hist.size(); ++idx) {
+                global_hist[idx] += local_hist[idx];
+            }
         }
-        const uint32_t active_lane_count = __builtin_popcount(active_mask);
-        if (active_lane_count <= 32) {
-            this->_distinct_sector_count[pc_offset][32 + active_lane_count] += 1;
+    }
+
+    for (auto& local_map : _job_worker_pc_statistics) {
+        for (auto& [cur_pc, local_inner] : local_map) {
+            auto& global_inner = this->_pc_statistics[cur_pc];
+            for (auto& [anc_pc, local_stats] : local_inner) {
+                auto& global_stats = global_inner[anc_pc];
+                for (int d = 0; d < 4; ++d) {
+                    global_stats.dist[d] += local_stats.dist[d];
+                }
+            }
         }
     }
 

From 98c0353dab89adbee0f730f141fc3af4a4285bbf Mon Sep 17 00:00:00 2001
From: Yanbo Zhao <yzhao62@eb2-3224-lin10.csc.ncsu.edu>
Date: Mon, 16 Feb 2026 16:28:41 -0500
Subject: [PATCH 6/6] lock free parallelization optimization

---
 include/tools/pc_dependency_analysis.h |   8 +-
 src/tools/pc_dependency_analysis.cpp   | 174 +++++++++++++++----------
 2 files changed, 110 insertions(+), 72 deletions(-)

diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h
index 717307a..1df9813 100644
--- a/include/tools/pc_dependency_analysis.h
+++ b/include/tools/pc_dependency_analysis.h
@@ -113,7 +113,8 @@ class alignas(8) shadow_memory_entry{
 public:
     shadow_memory_entry() {};
     ~shadow_memory_entry() {};
-    // Packed representation: low 32 bits = last_pc, high 32 bits = last_flat_thread_id.
+    // Packed representation: low 32 bits = (generation:8 | pc24:24),
+    // high 32 bits = last_flat_thread_id.
     // Keeping a single 64-bit field avoids type-punning UB in atomic exchange.
     // packed == 0 means invalid/uninitialized (cold).
     uint64_t packed = 0;
@@ -223,7 +224,7 @@ class PcDependency final : public Tool {
         uint32_t current_lane_id,
         int access_size,
         std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
-        std::unordered_map<uint64_t, shadow_memory_entry>& local_shadow_memory_shared
+        std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>& local_shadow_memory_shared
     );
 
     void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
@@ -237,6 +238,7 @@ class PcDependency final : public Tool {
 
     std::string output_directory;
     uint32_t kernel_id = 0;
+    uint8_t _kernel_generation = 0;
 
 
     std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;
@@ -259,7 +261,7 @@ class PcDependency final : public Tool {
     // Persistent worker pool and per-worker shared-memory shadow state.
     uint64_t _worker_count = 1;
     std::vector<std::thread> _workers;
-    std::vector<std::unordered_map<uint64_t, shadow_memory_entry>> _worker_shadow_memory_shared;
+    std::vector<std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>> _worker_shadow_memory_shared;
 
     // Per-batch job data produced by gpu_data_analysis and consumed by workers.
     const MemoryAccess* _job_accesses_buffer = nullptr;
diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp
index 6dadf09..aa3f525 100644
--- a/src/tools/pc_dependency_analysis.cpp
+++ b/src/tools/pc_dependency_analysis.cpp
@@ -64,17 +64,38 @@ static std::string flags_to_string(uint32_t flags) {
     return oss.str();
 }
 
-static inline uint64_t pack_shadow_entry(uint32_t pc, uint32_t flat_thread_id) {
-    return (static_cast<uint64_t>(flat_thread_id) << 32) | static_cast<uint64_t>(pc);
+static inline uint64_t pack_shadow_entry(uint8_t generation, uint32_t pc24, uint32_t flat_thread_id) {  // Packs generation, 24-bit PC and flat thread id into one 64-bit shadow word.
+    const uint32_t encoded_pc = (static_cast<uint32_t>(generation) << 24)  // generation -> bits 31..24 of the low word
+                              | (pc24 & 0x00FFFFFFu);                      // PC truncated to bits 23..0
+    return (static_cast<uint64_t>(flat_thread_id) << 32) | static_cast<uint64_t>(encoded_pc);  // thread id -> bits 63..32; NOTE(review): all-zero inputs yield 0, which readers treat as a cold entry
 }
 
-static inline uint32_t unpack_shadow_pc(uint64_t packed) {
+static inline uint32_t unpack_shadow_pc_encoded(uint64_t packed) {  // Returns the low 32 bits of the shadow word: generation byte (bits 31..24) above the 24-bit PC.
     return static_cast<uint32_t>(packed & 0xFFFFFFFFu);
 }
 
 static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) {
     return static_cast<uint32_t>(packed >> 32);
 }
+
+// Returns the region containing `addr`, or nullptr if none does. `regions` must be sorted by
+// get_start(). Walks back from the last region starting at or before `addr` rather than testing
+// only that one: regions may overlap (NOTE(review): e.g. a tensor range nested in an allocation —
+// confirm), and the nearest start can end before `addr` while an earlier region still covers it.
+static inline const memory_region* find_memory_region_containing(
+    const std::vector<memory_region>& regions,
+    uint64_t addr
+) {
+    auto it = std::upper_bound(regions.begin(), regions.end(), addr,
+        [](uint64_t value, const memory_region& region) {
+            return value < region.get_start();
+        });
+    while (it != regions.begin()) {
+        --it;
+        if (it->contains(addr)) return &(*it);
+    }
+    return nullptr;
+}
 } // namespace
 
 
@@ -132,10 +153,13 @@ void PcDependency::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel)
     for (auto& shared_map : _worker_shadow_memory_shared) {
         shared_map.clear();
     }
-    for (auto& shadow_memory_iter : _shadow_memories) {
-        shadow_memory_iter.second->reset_entries();
+    _kernel_generation = static_cast<uint8_t>(_kernel_generation + 1u);
+    if (_kernel_generation == 0) {
+        for (auto& shadow_memory_iter : _shadow_memories) {
+            shadow_memory_iter.second->reset_entries();
+        }
+        printf("[PC_DEPENDENCY] Shadow generation wrapped, resetting entries\n");
     }
-    printf("[PC_DEPENDENCY] Resetting shadow memory entries\n");
     _timer.increment(true);
 }
 
@@ -333,7 +357,10 @@ void PcDependency::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
     alloc_events.emplace(_timer.get(), mem);
     active_memories.emplace(mem->addr, mem);
     memory_region memory_region_current = memory_region((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size));
-    _memory_regions.push_back(memory_region_current);
+    _memory_regions.insert(
+        std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current),
+        memory_region_current
+    );
     _shadow_memories.emplace(memory_region_current, std::make_unique<shadow_memory>(mem->size));
 
     printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size);
@@ -349,8 +376,8 @@ void PcDependency::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
 
     memory_region r((uint64_t)mem->addr, (uint64_t)mem->addr + sz);
 
-    auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r);
-    if (vit != _memory_regions.end()) _memory_regions.erase(vit);
+    auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end() && *vit == r) _memory_regions.erase(vit);
 
     _shadow_memories.erase(r);
     printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz);
@@ -361,8 +388,12 @@ void PcDependency::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
 void PcDependency::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
     tensor_events.emplace(_timer.get(), ten);
     active_tensors.emplace(ten->addr, ten);
-    _memory_regions.push_back(memory_region((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size)));
-    _shadow_memories.emplace(_memory_regions.back(), std::make_unique<shadow_memory>(ten->size));
+    memory_region memory_region_current((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size));
+    _memory_regions.insert(
+        std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current),
+        memory_region_current
+    );
+    _shadow_memories.emplace(memory_region_current, std::make_unique<shadow_memory>(ten->size));
     printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size);
 
     _timer.increment(true);
@@ -379,8 +410,8 @@ void PcDependency::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
 
     memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz);
 
-    auto vit = std::find(_memory_regions.begin(), _memory_regions.end(), r);
-    if (vit != _memory_regions.end()) {
+    auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end() && *vit == r) {
         _memory_regions.erase(vit);
     }
 
@@ -421,7 +452,7 @@ void PcDependency::unit_access(
         auto& entry = shadow_memory.get_entry(addr);
         const uint64_t old_packed = __atomic_exchange_n(
             &entry.packed,
-            pack_shadow_entry(pc_offset, current_flat_thread_id),
+            pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id),
             __ATOMIC_ACQ_REL
         );
         const bool is_cold_miss = (old_packed == 0);
@@ -431,7 +462,13 @@ void PcDependency::unit_access(
             continue;
         }
 
-        const uint32_t last_pc = unpack_shadow_pc(old_packed);
+        const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed);
+        const uint8_t last_generation = static_cast<uint8_t>(last_pc_encoded >> 24);
+        if (last_generation != _kernel_generation) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
+            continue;
+        }
+        const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu);
         const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
         const uint64_t last_block_id = static_cast<uint64_t>(last_flat_thread_id >> 10);
         const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
@@ -456,31 +493,34 @@ void PcDependency::unit_access_shared(
     uint32_t current_lane_id,
     int access_size,
     std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
-    std::unordered_map<uint64_t, shadow_memory_entry>& local_shadow_memory_shared
+    std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>& local_shadow_memory_shared
 ) {
-    // 共享内存地址在同一个 block 内唯一，使用 block_id 高位 + 地址低 32 位作为 key，
-    const uint64_t packed_base = ((current_block_id & 0xFFFFFFFFull) << 32)
-                                 | (ptr & 0xFFFFFFFFull);
+    // Per-CTA layered shadow map: local_shadow_memory_shared[cta_id][addr_low32]
+    auto& cta_shadow = local_shadow_memory_shared[current_block_id];
+    const uint32_t base_addr_low32 = static_cast<uint32_t>(ptr & 0xFFFFFFFFull);
 
     for (int i = 0; i < access_size; i += 4) {
-        const uint64_t addr = packed_base + i;  // 4 字节粒度
+            const uint32_t addr = base_addr_low32 + static_cast<uint32_t>(i);  // 4-byte granularity
         const uint32_t current_flat_thread_id =
             static_cast<uint32_t>((current_warp_id << 5) | current_lane_id);
 
-        auto [it, inserted] = local_shadow_memory_shared.emplace(addr, shadow_memory_entry());
+        auto [it, inserted] = cta_shadow.emplace(addr, shadow_memory_entry());
         const bool is_cold_miss = inserted;
-        const uint64_t old_packed = __atomic_exchange_n(
-            &(it->second.packed),
-            pack_shadow_entry(pc_offset, current_flat_thread_id),
-            __ATOMIC_ACQ_REL
-        );
+        const uint64_t old_packed = it->second.packed;
+        it->second.packed = pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id);
 
         if (is_cold_miss) {
             local_pc_statistics[pc_offset][0].dist[0] += 1;
             continue;
         }
 
-        const uint32_t last_pc = unpack_shadow_pc(old_packed);
+        const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed);
+        const uint8_t last_generation = static_cast<uint8_t>(last_pc_encoded >> 24);
+        if (last_generation != _kernel_generation) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
+            continue;
+        }
+        const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu);
         const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
         const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
         const uint64_t last_lane_id = static_cast<uint64_t>(last_flat_thread_id & 0x1F);
@@ -525,8 +565,8 @@ void PcDependency::worker_loop(uint64_t worker_idx) {
         const auto& trace_indices = _job_worker_trace_indices[worker_idx];
 
         for (uint64_t i : trace_indices) {
-            MemoryAccess trace = _job_accesses_buffer[i];
-            uint32_t pc_offset = trace.pc;
+            const MemoryAccess& trace = _job_accesses_buffer[i];
+            uint32_t pc_offset = (trace.pc & 0x00FFFFFFu);
             uint32_t flags = trace.flags;
             uint32_t access_size = trace.accessSize;
             uint32_t distinct_sector_count = trace.distinct_sector_count;
@@ -538,54 +578,50 @@ void PcDependency::worker_loop(uint64_t worker_idx) {
                     }
                 case MemoryType::Shared:{
                         flags |= SANITIZER_MEMORY_SHARED;
-                        for (int j = 0; j < GPU_WARP_SIZE; j++) {
-                            if (active_mask & (1u << j)) {
-                                unit_access_shared(
-                                    trace.addresses[j],
-                                    pc_offset,
-                                    trace.ctaId,
-                                    trace.warpId,
-                                    j,
-                                    trace.accessSize,
-                                    local_pc_statistics,
-                                    local_shadow_memory_shared
-                                );
-                            }
+                        uint32_t remaining_mask = active_mask;
+                        while (remaining_mask != 0) {
+                            const uint32_t j = static_cast<uint32_t>(__builtin_ctz(remaining_mask));
+                            remaining_mask &= (remaining_mask - 1);
+                            unit_access_shared(
+                                trace.addresses[j],
+                                pc_offset,
+                                trace.ctaId,
+                                trace.warpId,
+                                j,
+                                trace.accessSize,
+                                local_pc_statistics,
+                                local_shadow_memory_shared
+                            );
                         }
                         break;
                     }
                 case MemoryType::Global:{
                         flags |= SANITIZER_MEMORY_GLOBAL;
-                        memory_region memory_region_target;
-                        uint64_t first_valid_address = 0;
-                        for (int j = 0; j < GPU_WARP_SIZE; j++) {
-                            if (active_mask & (1u << j)) {
-                                first_valid_address = trace.addresses[j];
-                                break;
-                            }
-                        }
-                        assert(first_valid_address != 0);
-                        for (auto memory_region_iter : this->_memory_regions) {
-                            if (memory_region_iter.contains(first_valid_address)) {
-                                memory_region_target = memory_region_iter;
-                                break;
-                            }
+                        if (active_mask == 0) {
+                            break;
                         }
+                        const uint32_t first_lane = static_cast<uint32_t>(__builtin_ctz(active_mask));
+                        const uint64_t first_valid_address = trace.addresses[first_lane];
+                        const memory_region* memory_region_target_ptr =
+                            find_memory_region_containing(this->_memory_regions, first_valid_address);
+                        assert(memory_region_target_ptr != nullptr);
+                        memory_region memory_region_target = *memory_region_target_ptr;
                         uint64_t memory_region_start = memory_region_target.get_start();
                         assert(memory_region_start != 0);
-                        for ( int j = 0; j < GPU_WARP_SIZE; j++) {
-                            if (active_mask & (1u << j)) {
-                                unit_access(
-                                    trace.addresses[j] - memory_region_start,
-                                    pc_offset,
-                                    trace.ctaId,
-                                    trace.warpId,
-                                    j,
-                                    memory_region_target,
-                                    access_size,
-                                    local_pc_statistics
-                                );
-                            }
+                        uint32_t remaining_mask = active_mask;
+                        while (remaining_mask != 0) {
+                            const uint32_t j = static_cast<uint32_t>(__builtin_ctz(remaining_mask));
+                            remaining_mask &= (remaining_mask - 1);
+                            unit_access(
+                                trace.addresses[j] - memory_region_start,
+                                pc_offset,
+                                trace.ctaId,
+                                trace.warpId,
+                                j,
+                                memory_region_target,
+                                access_size,
+                                local_pc_statistics
+                            );
                         }
                         break;
                     }