diff --git a/include/sanalyzer.h b/include/sanalyzer.h
index e42ae53..0d9e188 100644
--- a/include/sanalyzer.h
+++ b/include/sanalyzer.h
@@ -25,6 +25,9 @@ typedef enum {
     GPU_PATCH_TIME_HOTNESS_CPU = 8,
     GPU_PATCH_ROOFLINE_FLOPS_NVBIT = 9,
     GPU_PATCH_ROOFLINE_SIZE = 10,
+    GPU_PATCH_HEATMAP_ANALYSIS = 11,
+    GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS = 12,
+    GPU_PATCH_PC_DEPENDENCY_ANALYSIS = 13,
 } AccelProfPatchName_t;
 
 
diff --git a/include/tools/block_divergence_analysis.h b/include/tools/block_divergence_analysis.h
new file mode 100644
index 0000000..4fb52dd
--- /dev/null
+++ b/include/tools/block_divergence_analysis.h
@@ -0,0 +1,73 @@
+#ifndef YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
+#define YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <map>
+#include <vector>
+#include <set>
+#include <unordered_map>
+namespace yosemite {
+
+class BlockDivergenceAnalysis final : public Tool {   // per-CTA, per-PC execution-count profiler
+public:
+    BlockDivergenceAnalysis();   // chooses the per-run output directory name
+
+    ~BlockDivergenceAnalysis();
+    // Accumulates `size` MemoryAccess records from `data` into _block_entries and _unique_pcs.
+    void gpu_data_analysis(void* data, uint64_t size);
+    // Range and tensor queries do nothing in this tool.
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+    // Dispatches on evt->evt_type to the private *_callback handlers below.
+    void evt_callback(EventPtr_t evt);
+    // Empty in this tool: results are written per kernel from kernel_end_callback.
+    void flush();
+
+private:
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);   // assigns kernel_id, records the launch, clears per-kernel stats
+    // Stamps end_time on the latest recorded launch and calls kernel_trace_flush for it.
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+    // Records the allocation in alloc_events and active_memories.
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+    // Removes the freed address from active_memories.
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+    // Records the tensor in tensor_events and active_tensors.
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+    // Removes the freed tensor address from active_tensors.
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+    // Writes <output_directory>/kernel_<kernel_id>.csv from _block_entries and _unique_pcs.
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+
+
+/*
+********************************* variables *********************************
+*/
+    Timer_t _timer;   // logical clock: get() keys the event maps, increment(true) follows every event
+
+    std::string output_directory;   // directory that receives the per-kernel CSV files
+    uint32_t kernel_id = 0;   // id handed to the next kernel launch
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;   // launches keyed by _timer.get() at launch
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;   // allocations keyed by _timer.get() at allocation
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;   // live allocations keyed by mem->addr
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc_t>> tensor_events;   // tensors keyed by _timer.get() at allocation
+    std::map<DevPtr, std::shared_ptr<TenAlloc_t>> active_tensors;   // live tensors keyed by ten->addr
+    // Statistics of one CTA within the current kernel.
+    struct BlockStat {
+        std::unordered_map<uint64_t, uint64_t> pc_counts;   // PC -> sum of popcount(active_mask) over records with that PC
+        uint64_t read_count = 0;   // same weight, summed over records carrying the READ flag
+        uint64_t write_count = 0;   // same weight, summed over records carrying the WRITE flag
+    };
+
+    std::unordered_map<uint64_t, BlockStat> _block_entries;   // keyed by trace.ctaId; cleared at every kernel launch
+    std::set<uint64_t> _unique_pcs;   // every PC seen in the current kernel; cleared at every kernel launch
+};
+
+}   // yosemite
+#endif // YOSEMITE_TOOL_BLOCK_DIVERGENCE_ANALYSIS_H
diff --git a/include/tools/heatmap_analysis.h b/include/tools/heatmap_analysis.h
new file mode 100644
index 0000000..2a50475
--- /dev/null
+++ b/include/tools/heatmap_analysis.h
@@ -0,0 +1,78 @@
+#ifndef YOSEMITE_HEATMAP_ANALYSIS_H
+#define YOSEMITE_HEATMAP_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <map>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <array>
+
+#define SECTOR_TAG_SHIFT 5
+
+namespace yosemite {
+
+class HeatmapAnalysis final : public Tool {   // per-sector access heatmap; a sector is address >> SECTOR_TAG_SHIFT
+public:
+    HeatmapAnalysis();   // chooses the per-run output directory name
+
+    ~HeatmapAnalysis();
+    // Feeds every active lane of `size` MemoryAccess records from `data` into the heatmap.
+    void gpu_data_analysis(void* data, uint64_t size);
+    // Range and tensor queries do nothing in this tool.
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+    // Dispatches on evt->evt_type to the private *_callback handlers below.
+    void evt_callback(EventPtr_t evt);
+    // Empty in this tool: results are written per kernel from kernel_end_callback.
+    void flush();
+
+private:
+    void unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length);   // records one lane's access in _heatmap_data
+    // Adds `pc` to the set of PCs that touched `sector_tag`.
+    void add_sector_pc_information(uint64_t sector_tag, uint64_t pc);
+    // Assigns kernel_id, records the launch, clears the per-kernel containers.
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
+    // Stamps end_time on the latest recorded launch and calls kernel_trace_flush for it.
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+    // Records the allocation in alloc_events and active_memories.
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+    // Removes the freed address from active_memories.
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+    // Records the tensor in tensor_events and active_tensors.
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+    // Removes the freed tensor address from active_tensors.
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+    // Writes <output_directory>/kernel_<kernel_id>.csv from _heatmap_data and _sector_pc_information.
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+
+
+/*
+********************************* variables *********************************
+*/
+
+    Timer_t _timer;   // logical clock: get() keys the event maps, increment(true) follows every event
+
+    std::string output_directory;   // directory that receives the per-kernel CSV files
+    uint32_t kernel_id = 0;   // id handed to the next kernel launch
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;   // launches keyed by _timer.get() at launch
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;   // allocations keyed by _timer.get() at allocation
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;   // live allocations keyed by mem->addr
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc>> tensor_events;   // tensors keyed by _timer.get() at allocation
+    std::map<DevPtr, std::shared_ptr<TenAlloc>> active_tensors;   // live tensors keyed by ten->addr
+
+    std::vector<MemoryAccess> _traces;   // cleared at kernel launch; not filled anywhere in the visible implementation
+    std::unordered_map<uint64_t, std::array<uint32_t, 18>> _heatmap_data;   // sector tag -> [0-8] warp bit masks, [9-17] access counts
+    std::unordered_map<uint64_t, std::set<uint64_t>> _sector_pc_information;   // sector tag -> PCs that touched it
+
+};
+
+}   // namespace yosemite
+#endif // YOSEMITE_HEATMAP_ANALYSIS_H
diff --git a/include/tools/pc_dependency_analysis.h b/include/tools/pc_dependency_analysis.h
new file mode 100644
index 0000000..1df9813
--- /dev/null
+++ b/include/tools/pc_dependency_analysis.h
@@ -0,0 +1,283 @@
+#ifndef YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
+#define YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
+
+
+#include "tools/tool.h"
+#include "utils/event.h"
+#include "gpu_patch.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+#include <sys/mman.h>
+// Fallback definitions for the SANITIZER_MEMORY_* constants; each one applies only when the macro is not already defined.
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ
+#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE
+#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2
+#endif
+// NOTE(review): 0x3 equals READ|WRITE (0x1|0x2), so a `flags & ..._RED` test also matches plain reads and writes — confirm the intended encoding.
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_RED
+#define SANITIZER_MEMORY_DEVICE_FLAG_RED 0x3
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC
+#define SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC 0x4
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH
+#define SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH 0x8
+#endif
+// The three values below share the flag word with the access-kind bits above (distinct bits 0x10/0x20/0x40).
+#ifndef SANITIZER_MEMORY_GLOBAL
+#define SANITIZER_MEMORY_GLOBAL 0x10
+#endif
+
+#ifndef SANITIZER_MEMORY_SHARED
+#define SANITIZER_MEMORY_SHARED 0x20
+#endif
+
+#ifndef SANITIZER_MEMORY_LOCAL
+#define SANITIZER_MEMORY_LOCAL 0x40
+#endif
+
+namespace yosemite {
+
+/* We use a PC offset instead of the full PC because the full PC is too long for a shadow memory entry and the original PC does not need to be tracked.
+The offset is calculated during trace collection.
+
+Every memory allocation causes a shadow memory to be created.
+Every memory deallocation causes a shadow memory to be destroyed.
+The shadow memory bitmask is reset when a kernel finishes (to avoid a mass shadow memory reset).
+
+The GPU data analysis will:
+1. Iterate over the trace buffer and query the shadow memory to get the corresponding shadow memory entry.
+2. Compare the last access information with the current access information using the rules below:
+    0. If the bitmask of this access is 0, the current access is a cold miss; set its ancient PC to 0xFFFFFFFF.
+    1. If the last access and the current access are from the same thread, it is an intra-thread access.
+    2. If the last access and the current access are from the same warp, it is an intra-warp access.
+    3. If the last access and the current access are from the same block, it is an intra-block access.
+    4. If the last access and the current access are from the same grid, it is an intra-grid access.
+3. Update pc_statistics with the current PC, the ancient PC and the distance.
+4. Update the shadow memory entry with the current PC and the flat thread id.
+*/
+
+
+// Half-open address interval [start, end). Totally ordered (by start, then end) so it
+// can serve as a std::map key.
+class memory_region{
+public:
+    // The default region is the empty interval [0, 0).
+    memory_region() : start(0), end(0) {}
+    memory_region(uint64_t start, uint64_t end) : start(start), end(end) {}
+    ~memory_region() {}
+
+    // True when start <= ptr < end.
+    bool contains(uint64_t ptr) const {
+        return !(ptr < start) && (ptr < end);
+    }
+
+    // Regions are equal only when both bounds match.
+    bool operator==(const memory_region& other) const {
+        return !(start != other.start || end != other.end);
+    }
+
+    // Lexicographic comparison on (start, end); a valid strict weak ordering.
+    bool operator<(const memory_region& other) const {
+        return (start < other.start) || (start == other.start && end < other.end);
+    }
+
+    uint64_t get_start() const { return start; }
+    uint64_t get_end() const { return end; }
+
+private:
+    uint64_t start;   // inclusive lower bound
+    uint64_t end;     // exclusive upper bound
+};
+
+// One shadow slot. Kept trivially copyable/destructible because slots live in raw mmap'd storage that is cleared with memset.
+class alignas(8) shadow_memory_entry{
+public:
+    shadow_memory_entry() = default;
+    ~shadow_memory_entry() = default;
+    // Packed representation: low 32 bits = (generation:8 | pc24:24), high 32 bits = last_flat_thread_id.
+    // Keeping a single 64-bit field avoids type-punning UB in atomic exchange.
+    // packed == 0 means invalid/uninitialized (cold).
+    uint64_t packed = 0;
+};
+
+// Owns an anonymous private mmap holding one shadow_memory_entry per byte of a region of `size` bytes
+// (rounded up to a multiple of 4). The process aborts with a message if the mapping cannot be created.
+class shadow_memory{
+public:
+    shadow_memory(uint64_t size)
+    :_size(size),
+    _size_celled((size + 3) / 4 * 4),
+    _stride(_size_celled / 4),
+    _entries_bytes(std::max<uint64_t>(1, _size_celled * sizeof(shadow_memory_entry))) {   // at least 1: mmap rejects a zero length
+        void* mapping = mmap(nullptr, _entries_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (mapping == MAP_FAILED) {   // checked unconditionally: assert() compiles away under NDEBUG
+            perror("[PC_DEPENDENCY] mmap of shadow memory failed");
+            std::abort();
+        }
+        _shadow_memory_entries = static_cast<shadow_memory_entry*>(mapping);
+        printf("[PC_DEPENDENCY] Shadow memory entries: %lu\n", size);
+        printf("[PC_DEPENDENCY] Shadow memory per entry size: %lu\n", sizeof(shadow_memory_entry));
+        printf("[PC_DEPENDENCY] Shadow memory size: %lu\n", size*sizeof(shadow_memory_entry));
+    }
+    shadow_memory(const shadow_memory&) = delete;   // single owner: a copy would munmap the same range twice
+    shadow_memory& operator=(const shadow_memory&) = delete;
+    ~shadow_memory() {
+        if (_shadow_memory_entries != nullptr) munmap(_shadow_memory_entries, _entries_bytes);
+    }
+    // Clears every entry. MADV_DONTNEED makes a private anonymous mapping read back as zeros; memset is the fallback.
+    void reset_entries() {
+        if (madvise(_shadow_memory_entries, _entries_bytes, MADV_DONTNEED) != 0) std::memset(_shadow_memory_entries, 0, _entries_bytes);
+    }
+    // Entry for byte `offset` (< _size). Index = offset/4 + (offset%4) * _stride, so bytes with equal offset%4 are contiguous.
+    shadow_memory_entry& get_entry(uint64_t offset) {
+        assert(offset < _size);
+        return _shadow_memory_entries[(offset/4) + (offset%4) * _stride];
+    }
+    uint64_t _size;           // number of bytes requested by the caller
+    uint64_t _size_celled;    // _size rounded up to a multiple of 4
+    uint64_t _stride;         // _size_celled / 4
+    uint64_t _entries_bytes;  // length of the mapping in bytes
+    shadow_memory_entry* _shadow_memory_entries = nullptr;   // start of the mapping
+};
+
+
+// Counters kept per (current pc offset, ancient pc offset) pair, one slot per sharing scope.
+class PC_statisitics{
+public:
+    // Slot meaning:
+    //   dist[0] intra thread, dist[1] intra warp,
+    //   dist[2] intra block,  dist[3] intra grid
+    std::array<uint64_t, 4> dist{};
+};
+
+class PcDependency final : public Tool {
+public:
+    PcDependency();
+
+    ~PcDependency();
+    // Entry point for a batch of GPU trace records; the implementation is not part of this header.
+    void gpu_data_analysis(void* data, uint64_t size);
+
+    void query_ranges(void* ranges, uint32_t limit, uint32_t* count) override {};
+
+    void query_tensors(void* ranges, uint32_t limit, uint32_t* count) override {};
+    // NOTE(review): presumably create / destroy the shadow_memory of a region, per the design comment above — confirm in the .cpp.
+    void allocation_callback(uint64_t ptr, uint64_t size);
+
+    void deallocation_callback(uint64_t ptr);
+
+    void evt_callback(EventPtr_t evt);
+
+    void flush();
+
+private:
+    void kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel);
+
+    void kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel);
+
+    void mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem);
+
+    void mem_free_callback(std::shared_ptr<MemFree_t> mem);
+
+    void ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten);
+
+    void ten_free_callback(std::shared_ptr<TenFree_t> ten);
+
+    void kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel);
+    // NOTE(review): presumably processes one access against the shadow memory of `memory_region_target`, updating the caller-supplied statistics map — confirm.
+    void unit_access(
+        uint64_t ptr,
+        uint32_t pc_offset,
+        uint64_t current_block_id,
+        uint32_t current_warp_id,
+        uint32_t current_lane_id,
+        memory_region& memory_region_target,
+        int access_size,
+        std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics
+    );
+    // NOTE(review): same parameters as unit_access, but the shadow state is the caller-supplied map instead of a memory_region — confirm semantics.
+    void unit_access_shared(
+        uint64_t ptr,
+        uint32_t pc_offset,
+        uint64_t current_block_id,
+        uint32_t current_warp_id,
+        uint32_t current_lane_id,
+        int access_size,
+        std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
+        std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>& local_shadow_memory_shared
+    );
+
+    void unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size);
+    void worker_loop(uint64_t worker_idx);   // NOTE(review): presumably the function run by each thread in _workers — confirm
+
+
+/*
+********************************* variables *********************************
+*/
+    Timer_t _timer;
+
+    std::string output_directory;
+    uint32_t kernel_id = 0;
+    uint8_t _kernel_generation = 0;   // NOTE(review): 8 bits wide like the generation:8 field of shadow_memory_entry, so it wraps after 256 increments — confirm wrap handling
+
+
+    std::map<uint64_t, std::shared_ptr<KernelLaunch_t>> kernel_events;
+    std::map<uint64_t, std::shared_ptr<MemAlloc_t>> alloc_events;
+    std::map<DevPtr, std::shared_ptr<MemAlloc_t>> active_memories;
+
+    std::map<uint64_t, std::shared_ptr<TenAlloc>> tensor_events;
+    std::map<DevPtr, std::shared_ptr<TenAlloc>> active_tensors;
+
+
+    std::vector<memory_region> _memory_regions;
+
+    std::map<memory_region, std::unique_ptr<shadow_memory>> _shadow_memories; // memory region, shadow memory
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>> _pc_statistics; // current pc offset, ancient pc offset, PC_statisitics
+    std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> _pc_flags; // pc offset, flags, size of the access
+    // Index [0..31] stores distinct sector count 1..32.
+    // Index [32..64] stores active lane count 0..32.
+    std::unordered_map<uint32_t, std::array<uint64_t, 65>> _distinct_sector_count; // pc offset, distinct sector distribution
+
+    // Persistent worker pool and per-worker shared-memory shadow state.
+    uint64_t _worker_count = 1;
+    std::vector<std::thread> _workers;
+    std::vector<std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>> _worker_shadow_memory_shared;
+
+    // Per-batch job data produced by gpu_data_analysis and consumed by workers.
+    const MemoryAccess* _job_accesses_buffer = nullptr;
+    std::vector<std::vector<uint64_t>> _job_worker_trace_indices;
+    std::vector<std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> _job_worker_pc_statistics;
+    std::vector<std::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>>> _job_worker_pc_flags;
+    std::vector<std::unordered_map<uint32_t, std::array<uint64_t, 65>>> _job_worker_distinct_sector_count;
+    // NOTE(review): presumably _worker_pool_mutex guards the _job_* data and the three state fields below — confirm in the .cpp.
+    std::mutex _worker_pool_mutex;
+    std::condition_variable _worker_pool_cv;
+    std::condition_variable _worker_pool_done_cv;
+    bool _worker_pool_shutdown = false;
+    uint64_t _worker_job_generation = 0;
+    uint64_t _worker_pending_jobs = 0;
+
+};
+
+}   // yosemite
+#endif // YOSEMITE_TOOL_PC_DEPENDENCY_ANALYSIS_H
diff --git a/include/tools/tool_type.h b/include/tools/tool_type.h
index 4115ebb..b4d39dd 100644
--- a/include/tools/tool_type.h
+++ b/include/tools/tool_type.h
@@ -16,7 +16,10 @@ typedef enum {
     ROOFLINE_FLOPS = 11,
     ROOFLINE_SIZE = 12,
     ROOFLINE_TIME = 13,
-    TOOL_NUMS = 14
+    HEATMAP_ANALYSIS = 14,
+    BLOCK_DIVERGENCE_ANALYSIS = 15,
+    PC_DEPENDENCY_ANALYSIS = 16,
+    TOOL_NUMS = 17
 } AnalysisTool_t;
 
 #endif // TOOL_TYPE_H
\ No newline at end of file
diff --git a/include/utils/event.h b/include/utils/event.h
index 903fc43..c29dd4e 100644
--- a/include/utils/event.h
+++ b/include/utils/event.h
@@ -69,6 +69,7 @@ typedef struct KernelLaunch : public Event {
     uint32_t touched_objects;
     uint32_t touched_objects_size;
     uint64_t key;   // for UVM Advisor
+    uint64_t kernel_pc;
 
     KernelLaunch() {
         this->evt_type = EventType_KERNEL_LAUNCH;
diff --git a/src/sanalyzer.cpp b/src/sanalyzer.cpp
index 0f250c5..faa6056 100644
--- a/src/sanalyzer.cpp
+++ b/src/sanalyzer.cpp
@@ -16,6 +16,9 @@
 #include "tools/time_hotness_cpu.h"
 #include "tools/event_trace.h"
 #include "tools/event_trace_mgpu.h"
+#include "tools/heatmap_analysis.h"
+#include "tools/block_divergence_analysis.h"
+#include "tools/pc_dependency_analysis.h"
 
 #include <memory>
 #include <map>
@@ -104,6 +107,15 @@ YosemiteResult_t yosemite_tool_enable(AnalysisTool_t& tool) {
     } else if (std::string(tool_name) == "event_trace_mgpu") {
         tool = EVENT_TRACE_MGPU;
         _tools.emplace(EVENT_TRACE_MGPU, std::make_shared<EventTraceMGPU>());
+    } else if (std::string(tool_name) == "heatmap_analysis") {
+        tool = HEATMAP_ANALYSIS;
+        _tools.emplace(HEATMAP_ANALYSIS, std::make_shared<HeatmapAnalysis>());
+    } else if (std::string(tool_name) == "block_divergence_analysis") {
+        tool = BLOCK_DIVERGENCE_ANALYSIS;
+        _tools.emplace(BLOCK_DIVERGENCE_ANALYSIS, std::make_shared<BlockDivergenceAnalysis>());
+    } else if (std::string(tool_name) == "pc_dependency_analysis") {
+        tool = PC_DEPENDENCY_ANALYSIS;
+        _tools.emplace(PC_DEPENDENCY_ANALYSIS, std::make_shared<PcDependency>());
     } else {
         fprintf(stderr, "[SANALYZER ERROR] Tool not found.\n");
         fflush(stderr);
@@ -249,6 +261,17 @@ YosemiteResult_t yosemite_init(AccelProfOptions_t& options) {
         options.patch_name = GPU_NO_PATCH;
     } else if (tool == EVENT_TRACE_MGPU) {
         options.patch_name = GPU_NO_PATCH;
+    } else if (tool == HEATMAP_ANALYSIS) {
+        options.patch_name = GPU_PATCH_HEATMAP_ANALYSIS;
+        options.patch_file = "gpu_patch_heatmap_analysis.fatbin";
+    } else if (tool == BLOCK_DIVERGENCE_ANALYSIS) {
+        options.patch_name = GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS;
+        options.patch_file = "gpu_patch_block_divergence_analysis.fatbin";
+    } else if (tool == PC_DEPENDENCY_ANALYSIS) {
+        options.patch_name = GPU_PATCH_PC_DEPENDENCY_ANALYSIS;
+        // nv-compute/Makefile generates fatbins based on gpu_src/*.cu filenames.
+        // The source file for this tool is nv-compute/gpu_src/gpu_patch_pc_dependency.cu
+        options.patch_file = "gpu_patch_pc_dependency.fatbin";
     }
 
     // enable torch profiler?
diff --git a/src/tools/block_divergence_analysis.cpp b/src/tools/block_divergence_analysis.cpp
new file mode 100644
index 0000000..8dd16d6
--- /dev/null
+++ b/src/tools/block_divergence_analysis.cpp
@@ -0,0 +1,190 @@
+#include "tools/block_divergence_analysis.h"
+#include "utils/helper.h"
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <cassert>
+#include <algorithm>
+#include <iomanip>
+#include <vector>
+// Include sanitizer_patching.h only when the compiler supports __has_include and can locate the header.
+#ifdef __has_include
+#if __has_include(<sanitizer_patching.h>)
+#include <sanitizer_patching.h>
+#endif
+#endif
+// Fallback values, applied only when the macros are still undefined at this point.
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_READ
+#define SANITIZER_MEMORY_DEVICE_FLAG_READ 0x1
+#endif
+
+#ifndef SANITIZER_MEMORY_DEVICE_FLAG_WRITE
+#define SANITIZER_MEMORY_DEVICE_FLAG_WRITE 0x2
+#endif
+
+using namespace yosemite;
+
+
+// Registers the tool under its own BLOCK_DIVERGENCE_ANALYSIS id, honours TORCH_PROFILE_ENABLED=1,
+// and picks the output directory "block_distribution_[<YOSEMITE_APP_NAME>_]<date-time>".
+BlockDivergenceAnalysis::BlockDivergenceAnalysis() : Tool(BLOCK_DIVERGENCE_ANALYSIS) {
+    const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED");
+    if (torch_prof && std::string(torch_prof) == "1") {
+        fprintf(stdout, "Enabling torch profiler in BlockDivergenceAnalysis.\n");
+        _torch_enabled = true;
+    }
+    const char* env_app_name = std::getenv("YOSEMITE_APP_NAME");
+    output_directory = "block_distribution_";
+    if (env_app_name != nullptr) {
+        output_directory += std::string(env_app_name) + "_";   // optional application tag
+    }
+    output_directory += get_current_date_n_time();
+    check_folder_existance(output_directory);
+}
+
+
+BlockDivergenceAnalysis::~BlockDivergenceAnalysis() {}   // empty body: no explicit cleanup is performed here
+
+
+// Kernel launch: reset the per-kernel statistics, number the launch, and record it at the current logical time.
+void BlockDivergenceAnalysis::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
+    _unique_pcs.clear();
+    _block_entries.clear();
+    kernel->kernel_id = kernel_id;
+    ++kernel_id;
+    kernel_events.emplace(_timer.get(), kernel);
+    _timer.increment(true);
+}
+
+
+// Writes <output_directory>/kernel_<id>.csv for `kernel`: one row per CTA id (ascending) with a
+// column per PC seen in the kernel (ascending) holding the active-lane-weighted count for that
+// PC, followed by the CTA's read and write totals.
+void BlockDivergenceAnalysis::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
+    const std::string filename = output_directory + "/kernel_"
+                            + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping traces to %s\n", filename.c_str());
+    std::ofstream out(filename);
+    if (!out.is_open()) {   // report instead of silently dropping the kernel's statistics
+        fprintf(stderr, "[BLOCK_DIVERGENCE] Failed to open %s for writing.\n", filename.c_str());
+        return;
+    }
+
+    // _block_entries is unordered, so the CTA ids are sorted here; _unique_pcs is a std::set
+    // and therefore already iterates in ascending order.
+    std::vector<uint64_t> block_ids;
+    block_ids.reserve(_block_entries.size());
+    for (const auto& entry : _block_entries) block_ids.push_back(entry.first);
+    std::sort(block_ids.begin(), block_ids.end());
+    // Rows are keyed by the single ctaId value, so the y and z columns are always written as 0.
+    out << "blockidx,blockidy,blockidz";
+    for (const auto pc : _unique_pcs) out << ",0x" << std::hex << std::setw(16) << std::setfill('0') << pc << std::dec;
+    out << ",read_count,write_count\n";
+    for (const auto block_id : block_ids) {
+        const auto& stats = _block_entries.at(block_id);
+        out << block_id << ",0,0";
+        for (const auto pc : _unique_pcs) {
+            const auto pc_it = stats.pc_counts.find(pc);
+            out << "," << ((pc_it != stats.pc_counts.end()) ? pc_it->second : 0);   // 0 when this CTA never executed the PC
+        }
+        out << "," << stats.read_count << "," << stats.write_count << '\n';
+    }
+}
+
+
+void BlockDivergenceAnalysis::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {   // closes the latest launch and dumps its CSV
+    if (!kernel_events.empty()) {   // guard: std::prev(end()) on an empty map is undefined
+        auto evt = kernel_events.rbegin()->second;
+        evt->end_time = _timer.get();
+        kernel_trace_flush(evt);
+    }
+    _timer.increment(true);   // every event advances the logical clock
+}
+
+
+// Allocation event: index the allocation by its address and log it at the current logical time.
+void BlockDivergenceAnalysis::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
+    active_memories.emplace(mem->addr, mem);
+    alloc_events.emplace(_timer.get(), mem);
+    _timer.increment(true);
+}
+
+
+// Free event: drop the allocation from the live set; an unknown address still trips the assert in debug builds.
+void BlockDivergenceAnalysis::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
+    const auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());
+    if (it != active_memories.end()) active_memories.erase(it);   // erase(end()) is undefined once NDEBUG strips the assert
+    _timer.increment(true);
+}
+
+
+// Tensor allocation event: index the tensor by its address and log it at the current logical time.
+void BlockDivergenceAnalysis::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    active_tensors.emplace(ten->addr, ten);
+    tensor_events.emplace(_timer.get(), ten);
+    _timer.increment(true);
+}
+
+
+// Tensor free event: drop the tensor from the live set; an unknown address still trips the assert in debug builds.
+void BlockDivergenceAnalysis::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    const auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    if (it != active_tensors.end()) active_tensors.erase(it);   // erase(end()) is undefined once NDEBUG strips the assert
+    _timer.increment(true);
+}
+
+
+// Folds a batch of `size` MemoryAccess records into the per-CTA statistics of the running kernel.
+// Every record adds the number of set bits in its active mask to the CTA's counter for the
+// record's PC, and to the CTA's read and/or write total when the matching flag bit is set.
+void BlockDivergenceAnalysis::gpu_data_analysis(void* data, uint64_t size) {
+    const MemoryAccess* const first = (MemoryAccess*)data;
+    const MemoryAccess* const last = first + size;
+
+    for (const MemoryAccess* rec = first; rec != last; ++rec) {
+        const uint64_t lanes = static_cast<uint64_t>(__builtin_popcount(rec->active_mask));
+        const uint64_t record_pc = rec->pc;
+        const bool is_read = (rec->flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) != 0;
+        const bool is_write = (rec->flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) != 0;
+
+        _unique_pcs.insert(record_pc);
+
+        BlockStat& stat = _block_entries[rec->ctaId];
+        stat.pc_counts[record_pc] += lanes;
+        if (is_read) stat.read_count += lanes;
+        if (is_write) stat.write_count += lanes;
+    }
+}
+
+
+// Entry point for framework events. Routes the event to the handler for its concrete
+// type after down-casting the shared pointer.
+void BlockDivergenceAnalysis::evt_callback(EventPtr_t evt) {
+    const auto type = evt->evt_type;
+
+    // Kernel boundaries.
+    if (type == EventType_KERNEL_LAUNCH) {
+        kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));
+    } else if (type == EventType_KERNEL_END) {
+        kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+    // Memory allocations.
+    } else if (type == EventType_MEM_ALLOC) {
+        mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+    } else if (type == EventType_MEM_FREE) {
+        mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+    // Tensors.
+    } else if (type == EventType_TEN_ALLOC) {
+        ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+    } else if (type == EventType_TEN_FREE) {
+        ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+    } else {
+        // Event types this tool does not track are ignored.
+    }
+}
+
+
+void BlockDivergenceAnalysis::flush() {   // empty: this tool does no work in flush()
+}
diff --git a/src/tools/heatmap_analysis.cpp b/src/tools/heatmap_analysis.cpp
new file mode 100644
index 0000000..6be87ce
--- /dev/null
+++ b/src/tools/heatmap_analysis.cpp
@@ -0,0 +1,199 @@
+#include "tools/heatmap_analysis.h"
+#include "utils/helper.h"
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <cassert>
+#include <iostream>
+#include <bitset>
+#include <sstream>
+#include <algorithm>
+#include <vector>
+
+
+using namespace yosemite;
+
+
+// Sets up the heatmap tool: honours TORCH_PROFILE_ENABLED=1 and picks the output directory
+// "heatmap_[<YOSEMITE_APP_NAME>_]<date-time>", which is then passed to check_folder_existance().
+HeatmapAnalysis::HeatmapAnalysis() : Tool(HEATMAP_ANALYSIS) {
+    if (const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED")) {
+        if (std::string(torch_prof) == "1") {
+            fprintf(stdout, "Enabling torch profiler in HeatmapAnalysis.\n");
+            _torch_enabled = true;
+        }
+    }
+    std::string directory_name = "heatmap_";
+    if (const char* env_app_name = std::getenv("YOSEMITE_APP_NAME")) {
+        directory_name += std::string(env_app_name) + "_";
+    }
+    output_directory = directory_name + get_current_date_n_time();
+    check_folder_existance(output_directory);
+}
+
+
+HeatmapAnalysis::~HeatmapAnalysis() {}   // empty body: no explicit cleanup is performed here
+
+
+// Kernel launch: discard the previous kernel's data, number the launch, and record it at the current logical time.
+void HeatmapAnalysis::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
+    _sector_pc_information.clear();
+    _heatmap_data.clear();
+    _traces.clear();
+    kernel->kernel_id = kernel_id;
+    ++kernel_id;
+    kernel_events.emplace(_timer.get(), kernel);
+    _timer.increment(true);
+}
+
+
+// Writes <output_directory>/kernel_<id>.csv for `kernel`: one row per touched sector, ordered by
+// sector tag. Row layout: tag, nine distinct-warp counts (words 0-7, then the whole sector), nine
+// access counts (same order), then every PC recorded for the sector.
+void HeatmapAnalysis::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
+    const std::string filename = output_directory + "/kernel_"
+                            + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping block 0 heatmap to %s\n", filename.c_str());
+    std::ofstream out(filename);
+    if (!out.is_open()) {   // report instead of silently losing the kernel's heatmap
+        fprintf(stderr, "[HEATMAP] Failed to open %s for writing.\n", filename.c_str());
+        return;
+    }
+
+    // The heatmap is a hash map; copy it out and order the rows by sector tag.
+    using Row = std::pair<uint64_t, std::array<uint32_t, 18>>;
+    std::vector<Row> rows(_heatmap_data.begin(), _heatmap_data.end());
+    std::sort(rows.begin(), rows.end(), [](const Row& a, const Row& b) { return a.first < b.first; });
+
+    out << "Sector Tag,\t\tDistinct Warp Count,\tAccess Count,\t\t\tTouched PC\n";
+    for (const auto& [tag, data] : rows) {
+        out << "0x" << std::hex << tag << std::dec << ",\t\t";
+        for (int i = 0; i < 9; i++) out << std::bitset<32>(data[i]).count() << ",";   // set bits = distinct warps
+        out << "\t\t";
+        for (int i = 9; i < 18; i++) out << data[i] << ",";
+        // find() rather than operator[]: dumping must not insert empty PC sets into the map.
+        const auto pcs = _sector_pc_information.find(tag);
+        if (pcs != _sector_pc_information.end()) {
+            for (const auto pc : pcs->second) out << "\t\t0x" << std::hex << pc << std::dec << ",";
+        }
+        out << '\n';
+    }
+}
+
+
+void HeatmapAnalysis::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {   // closes the latest launch and dumps its CSV
+    if (!kernel_events.empty()) {   // guard: std::prev(end()) on an empty map is undefined
+        auto evt = kernel_events.rbegin()->second;
+        evt->end_time = _timer.get();
+        kernel_trace_flush(evt);
+    }
+    _timer.increment(true);   // every event advances the logical clock
+}
+
+
+// Allocation event: index the allocation by its address and log it at the current logical time.
+void HeatmapAnalysis::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
+    active_memories.emplace(mem->addr, mem);
+    alloc_events.emplace(_timer.get(), mem);
+    _timer.increment(true);
+}
+
+
+// Free event: drop the allocation from the live set; an unknown address still trips the assert in debug builds.
+void HeatmapAnalysis::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
+    const auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());
+    if (it != active_memories.end()) active_memories.erase(it);   // erase(end()) is undefined once NDEBUG strips the assert
+    _timer.increment(true);
+}
+
+
+// Tensor allocation event: index the tensor by its address and log it at the current logical time.
+void HeatmapAnalysis::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    active_tensors.emplace(ten->addr, ten);
+    tensor_events.emplace(_timer.get(), ten);
+    _timer.increment(true);
+}
+
+
+// Tensor free event: drop the tensor from the live set; an unknown address still trips the assert in debug builds.
+void HeatmapAnalysis::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    const auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    if (it != active_tensors.end()) active_tensors.erase(it);   // erase(end()) is undefined once NDEBUG strips the assert
+    _timer.increment(true);
+}
+
+// Records one lane's access in the heatmap row of the sector it falls in.
+//   warp_id:    id of the accessing warp; its bit (taken modulo 32) is set in the warp masks
+//   sector_tag: key of the sector's row in _heatmap_data
+//   offset:     index (0-7) of the first 4-byte word touched inside the sector
+//   length:     access size in bytes; every started 4-byte word is counted
+// Row layout of _heatmap_data[tag]:
+//   [0-7]  distinct-warp bit mask per word      [8]  distinct-warp bit mask for the sector
+//   [9-16] access count per word                [17] access count for the sector
+void HeatmapAnalysis::unit_access(uint32_t warp_id, uint64_t sector_tag, uint32_t offset, uint32_t length) {
+    constexpr uint32_t kWordsPerSector = 8;
+    auto& sector_data = _heatmap_data[sector_tag];
+    // Reduce modulo 32 so a warp id >= 32 cannot trigger an undefined over-wide shift.
+    const uint32_t mask = 1u << (warp_id % 32u);
+    for (uint32_t i = 0; i < length; i += 4) {
+        const uint32_t word = offset + i / 4;
+        // An access that runs past the sector must not spill into the sector-wide slots.
+        if (word >= kWordsPerSector) break;
+        sector_data[word] |= mask;
+        sector_data[8] |= mask;
+        sector_data[9 + word] += 1;
+    }
+    sector_data[17] += 1;
+}
+
+void HeatmapAnalysis::add_sector_pc_information(uint64_t sector_tag, uint64_t pc) {   // remembers that `pc` touched the sector
+    _sector_pc_information[sector_tag].insert(pc);   // operator[] creates the sector's set on first use; the set deduplicates PCs
+}
+
+
+// Feeds every active lane of each of the `size` MemoryAccess records in `data` into the heatmap.
+void HeatmapAnalysis::gpu_data_analysis(void* data, uint64_t size) {
+    const MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    for (uint64_t i = 0; i < size; i++) {
+        const MemoryAccess& trace = accesses_buffer[i];   // by reference: avoids copying the per-lane address array for every record
+        for (int j = 0; j < GPU_WARP_SIZE; j++) {
+            if (!(trace.active_mask & (1u << j))) continue;   // lane j issued no access
+            const auto sector_tag = trace.addresses[j] >> SECTOR_TAG_SHIFT;
+            const auto offset = (trace.addresses[j] & 31) >> 2;   // 4-byte word index within the low 5 address bits
+            unit_access(trace.warpId, sector_tag, offset, trace.accessSize);
+            add_sector_pc_information(sector_tag, trace.pc);
+        }
+    }
+}
+
+// Entry point for framework events. Routes the event to the handler for its concrete
+// type after down-casting the shared pointer.
+void HeatmapAnalysis::evt_callback(EventPtr_t evt) {
+    const auto type = evt->evt_type;
+
+    // Kernel boundaries.
+    if (type == EventType_KERNEL_LAUNCH) {
+        kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));
+    } else if (type == EventType_KERNEL_END) {
+        kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+    // Memory allocations.
+    } else if (type == EventType_MEM_ALLOC) {
+        mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+    } else if (type == EventType_MEM_FREE) {
+        mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+    // Tensors.
+    } else if (type == EventType_TEN_ALLOC) {
+        ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+    } else if (type == EventType_TEN_FREE) {
+        ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+    } else {
+        // Event types this tool does not track are ignored.
+    }
+}
+
+
+void HeatmapAnalysis::flush() {   // empty: this tool does no work in flush()
+}
diff --git a/src/tools/pc_dependency_analysis.cpp b/src/tools/pc_dependency_analysis.cpp
new file mode 100644
index 0000000..aa3f525
--- /dev/null
+++ b/src/tools/pc_dependency_analysis.cpp
@@ -0,0 +1,770 @@
+#include "tools/pc_dependency_analysis.h"
+#include "utils/helper.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <thread>
+
+using namespace yosemite;
+
+namespace {
+// Escapes `s` so it can be embedded between double quotes in JSON: quote, backslash and the
+// common control characters get two-character escapes, any other byte below 0x20 becomes \u00XX.
+static std::string json_escape(const std::string& s) {
+    std::string escaped;
+    escaped.reserve(s.size() + 8);
+    for (const char ch : s) {
+        const auto byte = static_cast<unsigned char>(ch);
+        const char* named = nullptr;  // two-character escape, when one applies
+        if (ch == '\"') named = "\\\"";
+        else if (ch == '\\') named = "\\\\";
+        else if (ch == '\b') named = "\\b";
+        else if (ch == '\f') named = "\\f";
+        else if (ch == '\n') named = "\\n";
+        else if (ch == '\r') named = "\\r";
+        else if (ch == '\t') named = "\\t";
+        if (named != nullptr) {
+            escaped += named;
+        } else if (byte < 0x20) {
+            std::ostringstream hex_stream;
+            hex_stream << "\\u" << std::hex << std::setw(4) << std::setfill('0') << static_cast<int>(byte);
+            escaped += hex_stream.str();
+        } else {
+            escaped += ch;
+        }
+    }
+    return escaped;
+}
+
+// Formats v as "0x" followed by its hexadecimal digits (lowercase, no zero padding).
+static std::string hex_u32(uint32_t v) {
+    std::ostringstream digits;
+    digits << std::hex << v;
+    return "0x" + digits.str();
+}
+// Renders flags as "<access kinds> <memory spaces>"; names within a group are joined with '|' (e.g. "READ|WRITE GLOBAL").
+static std::string flags_to_string(uint32_t flags) {
+    std::string kinds, spaces;
+    const auto append = [](std::string& list, const char* name) { if (!list.empty()) list += '|'; list += name; };
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_READ) append(kinds, "READ");
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_WRITE) append(kinds, "WRITE");
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_ATOMIC) append(kinds, "ATOMIC");
+    if (flags & SANITIZER_MEMORY_DEVICE_FLAG_PREFETCH) append(kinds, "PREFETCH");
+    if (flags & SANITIZER_MEMORY_GLOBAL) append(spaces, "GLOBAL");
+    if (flags & SANITIZER_MEMORY_SHARED) append(spaces, "SHARED");
+    if (flags & SANITIZER_MEMORY_LOCAL) append(spaces, "LOCAL");
+    return kinds + " " + spaces;
+}
+
+// Layout: bits 63..32 flat thread id | bits 31..24 generation | bits 23..0 pc (higher pc bits are dropped).
+static inline uint64_t pack_shadow_entry(uint8_t generation, uint32_t pc24, uint32_t flat_thread_id) {
+    const uint64_t low_word = (static_cast<uint64_t>(generation) << 24) | (pc24 & 0x00FFFFFFu);
+    return (static_cast<uint64_t>(flat_thread_id) << 32) | low_word;
+}
+
+static inline uint32_t unpack_shadow_pc_encoded(uint64_t packed) {  // low 32 bits: generation byte + 24-bit pc
+    return static_cast<uint32_t>(packed);
+}
+
+static inline uint32_t unpack_shadow_flat_tid(uint64_t packed) {  // high 32 bits: flat thread id
+    return static_cast<uint32_t>((packed >> 32) & 0xFFFFFFFFull);
+}
+
+// Returns the region in `regions` for which contains(addr) holds, or nullptr when there is none.
+// NOTE(review): the binary search relies on `regions` being ordered by start address — confirm at call sites.
+static inline const memory_region* find_memory_region_containing(
+    const std::vector<memory_region>& regions,
+    uint64_t addr
+) {
+    const auto starts_after = [](uint64_t value, const memory_region& region) {
+        return value < region.get_start();
+    };
+    // First region that begins strictly after addr; the only possible match is the one before it.
+    const auto next_region = std::upper_bound(regions.begin(), regions.end(), addr, starts_after);
+    if (next_region == regions.begin()) {
+        return nullptr;
+    }
+    const auto candidate = next_region - 1;
+    if (!candidate->contains(addr)) return nullptr;
+    return &(*candidate);
+}
+} // namespace
+
+
+// Reads the environment, creates the output directory and starts the worker pool.
+PcDependency::PcDependency() : Tool(PC_DEPENDENCY_ANALYSIS) {
+    if (const char* torch_prof = std::getenv("TORCH_PROFILE_ENABLED")) {
+        if (std::string(torch_prof) == "1") {
+            fprintf(stdout, "Enabling torch profiler in PcDependency.\n");
+            _torch_enabled = true;
+        }
+    }
+    // Directory name: dependency_[<YOSEMITE_APP_NAME>_]<current date and time>.
+    output_directory = "dependency_";
+    if (const char* env_app_name = std::getenv("YOSEMITE_APP_NAME")) {
+        output_directory += std::string(env_app_name) + "_";
+    }
+    output_directory += get_current_date_n_time();
+    check_folder_existance(output_directory);
+    // At least one worker even when hardware_concurrency() reports 0; each worker owns one slot per container.
+    _worker_count = std::max(1u, std::thread::hardware_concurrency());
+    _worker_shadow_memory_shared.resize(_worker_count);
+    _job_worker_trace_indices.resize(_worker_count);
+    _job_worker_pc_statistics.resize(_worker_count);
+    _job_worker_pc_flags.resize(_worker_count);
+    _job_worker_distinct_sector_count.resize(_worker_count);
+    _workers.reserve(_worker_count);
+    for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) {
+        _workers.emplace_back(&PcDependency::worker_loop, this, worker_idx);
+    }
+}
+
+
+PcDependency::~PcDependency() {
+    {
+        // Raise the shutdown flag under the pool mutex, together with a generation bump, before waking the workers.
+        std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+        ++_worker_job_generation;
+        _worker_pool_shutdown = true;
+    }
+    _worker_pool_cv.notify_all();
+    for (std::size_t idx = 0; idx < _workers.size(); ++idx) {
+        if (!_workers[idx].joinable()) continue;
+        _workers[idx].join();  // wait for the thread to leave worker_loop()
+    }
+}
+
+
+// Opens a new kernel: assigns its id, empties the per-kernel accumulators and advances the shadow generation.
+void PcDependency::kernel_start_callback(std::shared_ptr<KernelLaunch_t> kernel) {
+    kernel->kernel_id = kernel_id++;
+    kernel_events.emplace(_timer.get(), kernel);
+    // Statistics and shared-memory shadows are collected per kernel, so start from empty containers.
+    _pc_statistics.clear();
+    _pc_flags.clear();
+    _distinct_sector_count.clear();
+    for (auto& per_worker_shared : _worker_shadow_memory_shared) per_worker_shared.clear();
+
+    // The generation is kept in 8 bits; when it wraps around to 0 every shadow entry is reset.
+    _kernel_generation = static_cast<uint8_t>(_kernel_generation + 1u);
+    const bool generation_wrapped = (_kernel_generation == 0);
+    if (generation_wrapped) {
+        for (auto& region_and_shadow : _shadow_memories) region_and_shadow.second->reset_entries();
+        printf("[PC_DEPENDENCY] Shadow generation wrapped, resetting entries\n");
+    }
+    _timer.increment(true);
+}
+
+
+// Dumps the statistics gathered for `kernel` to <output_directory>/kernel_<id>.csv and kernel_<id>.json.
+// JSON: "nodes" has one entry per PC, "edges" one per (ancient_pc -> current_pc) pair; ancient PC 0 denotes a cold miss.
+void PcDependency::kernel_trace_flush(std::shared_ptr<KernelLaunch_t> kernel) {
+    std::string filename = output_directory + "/kernel_" + std::to_string(kernel->kernel_id) + ".csv";
+    printf("Dumping pc dependency to %s\n", filename.c_str());
+
+    std::ofstream out(filename);
+    out << "current_pc_offset,ancient_pc_offset,flags,intra_thread,intra_warp,intra_block,intra_grid\n";
+
+    std::vector<std::pair<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> outer(
+        _pc_statistics.begin(), _pc_statistics.end());
+    std::sort(outer.begin(), outer.end(),
+              [](auto& a, auto& b){ return a.first < b.first; });
+
+    for (auto& [cur_pc, inner_map] : outer) {
+        std::vector<std::pair<uint32_t, PC_statisitics>> inner(inner_map.begin(), inner_map.end());
+        std::sort(inner.begin(), inner.end(),
+                  [](auto& a, auto& b){ return a.first < b.first; });
+
+        // Flags recorded for the current PC (0 when none were recorded); the CSV has no access-size column.
+        auto fit = _pc_flags.find(cur_pc);
+        const uint32_t flags = (fit != _pc_flags.end()) ? fit->second.first : 0;
+
+        for (auto& [anc_pc, st] : inner) {
+            out << "0x" << std::hex << cur_pc
+                << ",0x" << anc_pc
+                << ",0x" << flags
+                << std::dec
+                << "," << st.dist[0]
+                << "," << st.dist[1]
+                << "," << st.dist[2]
+                << "," << st.dist[3]
+                << "\n";
+        }
+    }
+
+    // JSON output for building PC dependency graph (joinable with CFG)
+    std::string json_filename = output_directory + "/kernel_" + std::to_string(kernel->kernel_id) + ".json";
+    std::ofstream jout(json_filename);
+    jout << "{\n";
+    jout << "  \"tool\": \"pc_dependency_analysis\",\n";
+    jout << "  \"kernel\": {\n";
+    jout << "    \"kernel_id\": " << kernel->kernel_id << ",\n";
+    jout << "    \"kernel_name\": \"" << json_escape(kernel->kernel_name) << "\",\n";
+    jout << "    \"device_id\": " << kernel->device_id << ",\n";
+    jout << "    \"kernel_pc\": " << kernel->kernel_pc << ",\n";
+    // NOTE(review): kernel_pc is narrowed to 32 bits for the hex form — confirm this is intended.
+    jout << "    \"kernel_pc_hex\": \"" << hex_u32((uint32_t)kernel->kernel_pc) << "\"\n";
+    jout << "  },\n";
+    jout << "  \"shadow_memory_granularity_bytes\": 1,\n";
+    jout << "  \"sample_stride_bytes\": 4,\n";
+
+    // Collect nodes (all current PCs + all non-cold ancient PCs)
+    std::set<uint32_t> nodes;
+    for (const auto& [cur_pc, inner_map] : _pc_statistics) {
+        nodes.insert(cur_pc);
+        for (const auto& [anc_pc, st] : inner_map) {
+            (void)st;
+            if (anc_pc != 0u) nodes.insert(anc_pc);
+        }
+    }
+
+    jout << "  \"nodes\": [\n";
+    {
+        bool first = true;
+        for (uint32_t pc : nodes) {
+            if (!first) jout << ",\n";
+            first = false;
+            auto fit = _pc_flags.find(pc);
+            bool has_flags = (fit != _pc_flags.end());
+            uint32_t flags = has_flags ? fit->second.first : 0;
+            uint32_t access_size = has_flags ? fit->second.second : 0;
+            bool has_distinct_sector_count = (_distinct_sector_count.find(pc) != _distinct_sector_count.end());
+            jout << "    {\"pc\": " << pc
+                 << ", \"pc_hex\": \"" << hex_u32(pc) << "\"";
+            if (has_flags) {
+                jout << ", \"flags\": \"" << flags_to_string(flags) << "\""
+                     << ", \"flags_hex\": \"" << hex_u32(flags) << "\""
+                     << ", \"access_size\": " << access_size;
+            } else {
+                jout << ", \"flags\": null, \"flags_hex\": null, \"access_size\": null";
+            }
+            if (has_distinct_sector_count) {
+                jout << ", \"distinct_sector_count\": {";
+                for (int i = 1; i <= 32; i++) {
+                    jout << "\"" << i << "\": " << _distinct_sector_count[pc][i - 1];
+                    if (i != 32) {
+                        jout << ", ";
+                    }
+                }
+                jout << "}";
+                jout << ", \"active_lane_count\": {";
+                for (int i = 0; i <= 32; i++) {
+                    jout << "\"" << i << "\": " << _distinct_sector_count[pc][32 + i];
+                    if (i != 32) {
+                        jout << ", ";
+                    }
+                }
+                jout << "}";
+            } else {
+                jout << ", \"distinct_sector_count\": null, \"active_lane_count\": null";
+            }
+            jout << "}";
+        }
+        jout << "\n";
+    }
+    jout << "  ],\n";
+
+    // Edges: ancient_pc -> current_pc, with per-scope counts.
+    jout << "  \"edges\": [\n";
+    {
+        // Stable order: sort by current pc then ancient pc
+        std::vector<std::pair<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>> outer2(
+            _pc_statistics.begin(), _pc_statistics.end());
+        std::sort(outer2.begin(), outer2.end(),
+                  [](auto& a, auto& b){ return a.first < b.first; });
+
+        bool first_edge = true;
+        for (auto& [cur_pc, inner_map] : outer2) {
+            std::vector<std::pair<uint32_t, PC_statisitics>> inner2(inner_map.begin(), inner_map.end());
+            std::sort(inner2.begin(), inner2.end(),
+                      [](auto& a, auto& b){ return a.first < b.first; });
+
+            // current flags if available
+            auto cfit = _pc_flags.find(cur_pc);
+            bool has_cflags = (cfit != _pc_flags.end());
+            uint32_t cflags = has_cflags ? cfit->second.first : 0;
+            uint32_t c_access_size = has_cflags ? cfit->second.second : 0;
+
+            for (auto& [anc_pc, st] : inner2) {
+                if (!first_edge) jout << ",\n";
+                first_edge = false;
+
+                bool cold_miss = (anc_pc == 0u);
+
+                jout << "    {\"current_pc\": " << cur_pc
+                     << ", \"current_pc_hex\": \"" << hex_u32(cur_pc) << "\""
+                     << ", \"ancient_pc\": ";
+                if (cold_miss) {
+                    jout << "null";
+                } else {
+                    jout << anc_pc;
+                }
+                jout << ", \"ancient_pc_hex\": ";
+                if (cold_miss) {
+                    jout << "null";
+                } else {
+                    jout << "\"" << hex_u32(anc_pc) << "\"";
+                }
+                jout << ", \"cold_miss\": " << (cold_miss ? "true" : "false");
+
+                if (has_cflags) {
+                    jout << ", \"current_flags\": " << cflags
+                         << ", \"current_flags_hex\": \"" << hex_u32(cflags) << "\""
+                         << ", \"current_access_size\": " << c_access_size;
+                } else {
+                    jout << ", \"current_flags\": null, \"current_flags_hex\": null, \"current_access_size\": null";
+                }
+
+                jout << ", \"dist\": {"
+                     << "\"intra_thread\": " << st.dist[0]
+                     << ", \"intra_warp\": " << st.dist[1]
+                     << ", \"intra_block\": " << st.dist[2]
+                     << ", \"intra_grid\": " << st.dist[3]
+                     << "}}";
+            }
+        }
+        jout << "\n";
+    }
+    jout << "  ]\n";
+    jout << "}\n";
+    printf("Dumping pc dependency graph json to %s\n", json_filename.c_str());
+}
+
+
+// Closes the most recently recorded kernel event, drops the shared-memory shadows and dumps that kernel's statistics.
+void PcDependency::kernel_end_callback(std::shared_ptr<KernelEnd_t> kernel) {
+    if (!kernel_events.empty()) {  // std::prev(end()) is undefined when no launch has been recorded
+        auto evt = std::prev(kernel_events.end())->second;
+        evt->end_time = _timer.get();
+        for (auto& shared_map : _worker_shadow_memory_shared) shared_map.clear();
+        printf("[PC_DEPENDENCY] Clearing shadow memory shared\n");
+        kernel_trace_flush(evt);
+    }
+    _timer.increment(true);
+}
+
+
+// Registers a new allocation: logs the event, adds its address range to _memory_regions and creates its shadow memory.
+void PcDependency::mem_alloc_callback(std::shared_ptr<MemAlloc_t> mem) {
+    alloc_events.emplace(_timer.get(), mem);
+    active_memories.emplace(mem->addr, mem);
+    memory_region memory_region_current((uint64_t)mem->addr, (uint64_t)(mem->addr + mem->size));
+    // Insert at the lower_bound position so an ordered vector stays ordered for later binary searches.
+    const auto insert_position =
+        std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current);
+    _memory_regions.insert(insert_position, memory_region_current);
+    _shadow_memories.emplace(memory_region_current, std::make_unique<shadow_memory>(mem->size));
+
+    printf("[PC_DEPENDENCY] Allocating shadow memory for memory region: %p - %p, size: %lu\n", (void*)memory_region_current.get_start(), (void*)memory_region_current.get_end(), mem->size);
+    _timer.increment(true);
+}
+
+// Forgets the allocation that starts at mem->addr: its region entry and its shadow memory are removed.
+void PcDependency::mem_free_callback(std::shared_ptr<MemFree_t> mem) {
+    auto it = active_memories.find(mem->addr);
+    assert(it != active_memories.end());
+    // With NDEBUG the assert vanishes; skip a free that has no matching alloc instead of dereferencing end().
+    if (it == active_memories.end()) { _timer.increment(true); return; }
+    const uint64_t sz = it->second->size;   // the size is taken from the matching alloc event
+    active_memories.erase(it);
+
+    memory_region r((uint64_t)mem->addr, (uint64_t)mem->addr + sz);
+    auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end() && *vit == r) _memory_regions.erase(vit);
+    _shadow_memories.erase(r);
+    printf("[PC_DEPENDENCY] Freeing shadow memory for memory region: %p - %p, size: %lu\n", (void*)r.get_start(), (void*)r.get_end(), sz);
+    _timer.increment(true);
+}
+
+
+// Registers a tensor allocation: logs the event, adds its address range to _memory_regions and creates its shadow memory.
+void PcDependency::ten_alloc_callback(std::shared_ptr<TenAlloc_t> ten) {
+    tensor_events.emplace(_timer.get(), ten);
+    active_tensors.emplace(ten->addr, ten);
+    memory_region memory_region_current((uint64_t)ten->addr, (uint64_t)(ten->addr + ten->size));
+    const auto insert_position =
+        std::lower_bound(_memory_regions.begin(), _memory_regions.end(), memory_region_current);
+    _memory_regions.insert(insert_position, memory_region_current);
+    _shadow_memories.emplace(memory_region_current, std::make_unique<shadow_memory>(ten->size));
+    printf("[PC_DEPENDENCY] Allocating shadow memory for tensor region: %p - %p, size: %lu\n", (void*)ten->addr, (void*)(ten->addr + ten->size), ten->size);
+
+    _timer.increment(true);
+}
+
+
+// Forgets the tensor that starts at ten->addr: its region entry and its shadow memory are removed.
+void PcDependency::ten_free_callback(std::shared_ptr<TenFree_t> ten) {
+    auto it = active_tensors.find(ten->addr);
+    assert(it != active_tensors.end());
+    // With NDEBUG the assert vanishes; skip a free that has no matching alloc instead of dereferencing end().
+    if (it == active_tensors.end()) { _timer.increment(true); return; }
+
+    // TenFree.size may be negative (e.g., accounting-style events). Use size from TenAlloc.
+    const uint64_t sz = static_cast<uint64_t>(it->second->size);
+    active_tensors.erase(it);
+
+    memory_region r((uint64_t)ten->addr, (uint64_t)ten->addr + sz);
+    auto vit = std::lower_bound(_memory_regions.begin(), _memory_regions.end(), r);
+    if (vit != _memory_regions.end() && *vit == r) _memory_regions.erase(vit);
+    _shadow_memories.erase(r);
+
+    printf("[PC_DEPENDENCY] Freeing shadow memory for tensor region: %p - %p, size: %lu\n",
+           (void*)r.get_start(), (void*)r.get_end(), sz);
+    _timer.increment(true);
+}
+
+// Records one lane's access in the shadow memory of `memory_region_target` (`ptr` is an offset into it) and counts,
+// per sampled slot, who touched it last: dist[0] same thread or cold miss (ancient pc 0), [1] warp, [2] block, [3] grid.
+void PcDependency::unit_access(
+    uint64_t ptr,
+    uint32_t pc_offset,
+    uint64_t current_block_id,
+    uint32_t current_warp_id,
+    uint32_t current_lane_id,
+    memory_region& memory_region_target,
+    int access_size,
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics
+) {
+    auto shadow_memory_it = this->_shadow_memories.find(memory_region_target);
+    if (shadow_memory_it == this->_shadow_memories.end()) {
+        printf("shadow memory not found for memory region: %lu - %lu\n", memory_region_target.get_start(), memory_region_target.get_end());
+        return;
+    }
+    auto& shadow_memory = *(shadow_memory_it->second);
+    // A packed entry keeps 22 bits of block id, 5 of warp id and 5 of lane id. Classify with the truncated values
+    // that get stored; otherwise a block id >= 2^22 never matches its own entry and is reported as intra-grid.
+    const uint64_t current_block_bits = current_block_id & 0x3FFFFFull;
+    const uint64_t current_warp_bits = current_warp_id & 0x1Fu;
+    const uint64_t current_lane_bits = current_lane_id & 0x1Fu;
+    const uint32_t current_flat_thread_id = static_cast<uint32_t>((current_block_bits << 10) | (current_warp_bits << 5) | current_lane_bits);
+
+    for (int i = 0; i < access_size; i += 4) {  // one sample every 4 bytes
+        const uint64_t addr = ptr + i;
+        // Stop at the end of the shadow array (covers accesses that run past the allocation).
+        if (addr >= shadow_memory._size) {
+            break;
+        }
+        auto& entry = shadow_memory.get_entry(addr);
+        // Publish this access and fetch the previous one in a single atomic step.
+        const uint64_t old_packed = __atomic_exchange_n(
+            &entry.packed,
+            pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id),
+            __ATOMIC_ACQ_REL
+        );
+        const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed);
+        const uint8_t last_generation = static_cast<uint8_t>(last_pc_encoded >> 24);
+        // Never written, or last written under another kernel generation: cold miss.
+        if (old_packed == 0 || last_generation != _kernel_generation) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
+            continue;
+        }
+
+        const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu);
+        const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
+        const uint64_t last_block_id = static_cast<uint64_t>(last_flat_thread_id >> 10);
+        const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
+        const uint64_t last_lane_id = static_cast<uint64_t>(last_flat_thread_id & 0x1F);
+        auto& reuse = local_pc_statistics[pc_offset][last_pc];
+        if (last_block_id != current_block_bits) {
+            reuse.dist[3] += 1;
+        } else if (last_warp_id != current_warp_bits) {
+            reuse.dist[2] += 1;
+        } else if (last_lane_id != current_lane_bits) {
+            reuse.dist[1] += 1;
+        } else {
+            reuse.dist[0] += 1;
+        }
+    }
+}
+
+// Shared-memory counterpart of unit_access(). Shadow state is kept per CTA in a hash map keyed by the
+// low 32 bits of the address, so a previous access can only come from the same thread (dist[0]),
+// the same warp (dist[1]) or the same block (dist[2]); cold misses are booked under ancient pc 0.
+void PcDependency::unit_access_shared(
+    uint64_t ptr,
+    uint32_t pc_offset,
+    uint64_t current_block_id,
+    uint32_t current_warp_id,
+    uint32_t current_lane_id,
+    int access_size,
+    std::unordered_map<uint32_t, std::unordered_map<uint32_t, PC_statisitics>>& local_pc_statistics,
+    std::unordered_map<uint64_t, std::unordered_map<uint32_t, shadow_memory_entry>>& local_shadow_memory_shared
+) {
+    // Layered shadow map: local_shadow_memory_shared[cta id][low 32 address bits].
+    auto& cta_shadow = local_shadow_memory_shared[current_block_id];
+    const uint32_t base_addr_low32 = static_cast<uint32_t>(ptr & 0xFFFFFFFFull);
+
+    // Neither the thread id (warp and lane; the CTA is implied by the map) nor the packed value
+    // written into the shadow slots changes from one sampled word to the next.
+    const uint32_t current_flat_thread_id = static_cast<uint32_t>((current_warp_id << 5) | current_lane_id);
+    const uint64_t new_packed = pack_shadow_entry(_kernel_generation, pc_offset, current_flat_thread_id);
+
+    for (int byte_offset = 0; byte_offset < access_size; byte_offset += 4) {  // 4-byte granularity
+        const uint32_t addr = base_addr_low32 + static_cast<uint32_t>(byte_offset);
+        const auto insertion = cta_shadow.emplace(addr, shadow_memory_entry());
+        shadow_memory_entry& slot = insertion.first->second;
+        const uint64_t old_packed = slot.packed;
+        slot.packed = new_packed;
+
+        const uint32_t last_pc_encoded = unpack_shadow_pc_encoded(old_packed);
+        const uint8_t last_generation = static_cast<uint8_t>(last_pc_encoded >> 24);
+        // First touch of this slot, or last touched under another generation: cold miss.
+        if (insertion.second || last_generation != _kernel_generation) {
+            local_pc_statistics[pc_offset][0].dist[0] += 1;
+            continue;
+        }
+
+        const uint32_t last_pc = (last_pc_encoded & 0x00FFFFFFu);
+        const uint32_t last_flat_thread_id = unpack_shadow_flat_tid(old_packed);
+        const uint64_t last_warp_id = static_cast<uint64_t>((last_flat_thread_id >> 5) & 0x1F);
+        const uint64_t last_lane_id = static_cast<uint64_t>(last_flat_thread_id & 0x1F);
+
+        auto& reuse = local_pc_statistics[pc_offset][last_pc];
+        if (last_warp_id != current_warp_id) {
+            reuse.dist[2] += 1;  // different warp, same block
+        } else if (last_lane_id != current_lane_id) {
+            reuse.dist[1] += 1;  // same warp, different lane
+        } else {
+            reuse.dist[0] += 1;  // same thread
+        }
+    }
+}
+
+void PcDependency::unit_access_local(uint64_t ptr, uint32_t pc_offset, uint64_t current_block_id, uint32_t current_warp_id, uint32_t current_lane_id, int access_size) {
+    // TODO: implement local memory access (currently a no-op; every parameter is unused)
+}
+
+
+// Body of every pool thread: waits for gpu_data_analysis() to publish a job generation, replays the trace
+// records assigned to this worker into its private accumulators, then acknowledges the generation.
+// Every worker acknowledges every generation, even when it received no records, so the producer
+// only rewrites the per-worker containers while all workers are parked on the condition variable.
+void PcDependency::worker_loop(uint64_t worker_idx) {
+    uint64_t seen_generation = 0;
+    while (true) {
+        uint64_t current_generation = 0;
+        {
+            std::unique_lock<std::mutex> lock(_worker_pool_mutex);
+            _worker_pool_cv.wait(lock, [&]{
+                return _worker_pool_shutdown || _worker_job_generation > seen_generation;
+            });
+            if (_worker_pool_shutdown) {
+                return;
+            }
+            current_generation = _worker_job_generation;
+        }
+
+        auto& local_pc_statistics = _job_worker_pc_statistics[worker_idx];
+        auto& local_pc_flags = _job_worker_pc_flags[worker_idx];
+        auto& local_distinct_sector_count = _job_worker_distinct_sector_count[worker_idx];
+        auto& local_shadow_memory_shared = _worker_shadow_memory_shared[worker_idx];
+        const auto& trace_indices = _job_worker_trace_indices[worker_idx];
+
+        for (uint64_t i : trace_indices) {
+            const MemoryAccess& trace = _job_accesses_buffer[i];
+            uint32_t pc_offset = (trace.pc & 0x00FFFFFFu);
+            uint32_t flags = trace.flags;
+            uint32_t access_size = trace.accessSize;
+            uint32_t distinct_sector_count = trace.distinct_sector_count;
+            uint32_t active_mask = trace.active_mask;
+            switch (trace.type) {
+                case MemoryType::Local:{
+                        flags |= SANITIZER_MEMORY_LOCAL;
+                        break;
+                    }
+                case MemoryType::Shared:{
+                        flags |= SANITIZER_MEMORY_SHARED;
+                        // Walk the active lanes from the lowest set bit upwards.
+                        uint32_t remaining_mask = active_mask;
+                        while (remaining_mask != 0) {
+                            const uint32_t j = static_cast<uint32_t>(__builtin_ctz(remaining_mask));
+                            remaining_mask &= (remaining_mask - 1);
+                            unit_access_shared(
+                                trace.addresses[j],
+                                pc_offset,
+                                trace.ctaId,
+                                trace.warpId,
+                                j,
+                                trace.accessSize,
+                                local_pc_statistics,
+                                local_shadow_memory_shared
+                            );
+                        }
+                        break;
+                    }
+                case MemoryType::Global:{
+                        flags |= SANITIZER_MEMORY_GLOBAL;
+                        if (active_mask == 0) break;
+                        const uint32_t first_lane = static_cast<uint32_t>(__builtin_ctz(active_mask));
+                        const uint64_t first_valid_address = trace.addresses[first_lane];
+                        // The allocation is resolved once, from the first active lane, and reused for the other lanes.
+                        const memory_region* memory_region_target_ptr =
+                            find_memory_region_containing(this->_memory_regions, first_valid_address);
+                        assert(memory_region_target_ptr != nullptr);
+                        if (memory_region_target_ptr == nullptr) break;  // NDEBUG builds: address outside every tracked region
+                        memory_region memory_region_target = *memory_region_target_ptr;
+                        uint64_t memory_region_start = memory_region_target.get_start();
+                        assert(memory_region_start != 0);
+                        uint32_t remaining_mask = active_mask;
+                        while (remaining_mask != 0) {
+                            const uint32_t j = static_cast<uint32_t>(__builtin_ctz(remaining_mask));
+                            remaining_mask &= (remaining_mask - 1);
+                            unit_access(
+                                trace.addresses[j] - memory_region_start,
+                                pc_offset,
+                                trace.ctaId,
+                                trace.warpId,
+                                j,
+                                memory_region_target,
+                                access_size,
+                                local_pc_statistics
+                            );
+                        }
+                        break;
+                    }
+                default:
+                    printf("unknown memory type\n");
+                    break;
+            }
+            // Per-PC bookkeeping: union of the flags and the largest access size seen.
+            auto& local_flag = local_pc_flags[pc_offset];
+            local_flag.first |= flags;
+            if (local_flag.second == 0) {
+                local_flag.second = access_size;
+            } else if (local_flag.second != access_size) {
+                local_flag.second = std::max(local_flag.second, access_size);
+            }
+            // Histogram layout: [0, 32) distinct sector count 1..32, [32, 65) active lane count 0..32.
+            if (distinct_sector_count >= 1 && distinct_sector_count <= 32) {
+                local_distinct_sector_count[pc_offset][distinct_sector_count - 1] += 1;
+            }
+            const uint32_t active_lane_count = __builtin_popcount(active_mask);
+            if (active_lane_count <= 32) {
+                local_distinct_sector_count[pc_offset][32 + active_lane_count] += 1;
+            }
+        }
+
+        {
+            std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+            seen_generation = current_generation;
+            assert(_worker_pending_jobs > 0);
+            _worker_pending_jobs -= 1;
+            if (_worker_pending_jobs == 0) {
+                _worker_pool_done_cv.notify_one();
+            }
+        }
+    }
+}
+
+
+// Distributes `size` trace records over the worker pool, waits until every worker has acknowledged
+// the job, then folds the per-worker results into the per-kernel totals.
+void PcDependency::gpu_data_analysis(void* data, uint64_t size) {
+    MemoryAccess* accesses_buffer = (MemoryAccess*)data;
+    if (size == 0) {
+        return;
+    }
+
+    for (uint64_t worker_idx = 0; worker_idx < _worker_count; ++worker_idx) {
+        _job_worker_trace_indices[worker_idx].clear();
+        _job_worker_pc_statistics[worker_idx].clear();
+        _job_worker_pc_flags[worker_idx].clear();
+        _job_worker_distinct_sector_count[worker_idx].clear();
+        _job_worker_trace_indices[worker_idx].reserve((size / _worker_count) + 1);
+    }
+
+    // Stable assignment by block id keeps intra-block trace order.
+    for (uint64_t i = 0; i < size; ++i) {
+        const uint64_t worker_idx = accesses_buffer[i].ctaId % _worker_count;
+        _job_worker_trace_indices[worker_idx].push_back(i);
+    }
+
+    // All workers take part in every job (idle ones just acknowledge), which keeps them from
+    // reading their index list while the next call is already refilling it.
+    {
+        std::lock_guard<std::mutex> guard(_worker_pool_mutex);
+        _job_accesses_buffer = accesses_buffer;
+        _worker_pending_jobs = _worker_count;
+        ++_worker_job_generation;
+    }
+    _worker_pool_cv.notify_all();
+    {
+        std::unique_lock<std::mutex> lock(_worker_pool_mutex);
+        _worker_pool_done_cv.wait(lock, [&]{
+            return _worker_pending_jobs == 0;
+        });
+    }
+
+    // Workers are idle again: fold their private results into the per-kernel totals.
+    for (auto& local_flags_map : _job_worker_pc_flags) {
+        for (auto& [pc, local_flag] : local_flags_map) {
+            auto& global_flag = this->_pc_flags[pc];
+            global_flag.first |= local_flag.first;
+            if (global_flag.second == 0) {
+                global_flag.second = local_flag.second;
+            } else if (global_flag.second != local_flag.second) {
+                global_flag.second = std::max(global_flag.second, local_flag.second);
+            }
+        }
+    }
+
+    for (auto& local_distinct_map : _job_worker_distinct_sector_count) {
+        for (auto& [pc, local_hist] : local_distinct_map) {
+            auto& global_hist = this->_distinct_sector_count[pc];
+            for (size_t idx = 0; idx < global_hist.size(); ++idx) {
+                global_hist[idx] += local_hist[idx];
+            }
+        }
+    }
+
+    for (auto& local_map : _job_worker_pc_statistics) {
+        for (auto& [cur_pc, local_inner] : local_map) {
+            auto& global_inner = this->_pc_statistics[cur_pc];
+            for (auto& [anc_pc, local_stats] : local_inner) {
+                auto& global_stats = global_inner[anc_pc];
+                for (int d = 0; d < 4; ++d) {
+                    global_stats.dist[d] += local_stats.dist[d];
+                }
+            }
+        }
+    }
+
+}
+
+
+// Routes an event to the callback that handles its evt_type. Each cast narrows the shared
+// event pointer to the concrete event type; event types without a callback are ignored.
+void PcDependency::evt_callback(EventPtr_t evt) {
+    const auto type = evt->evt_type;
+    if (type == EventType_KERNEL_LAUNCH) {
+        // kernel launch
+        kernel_start_callback(std::dynamic_pointer_cast<KernelLaunch_t>(evt));
+    } else if (type == EventType_KERNEL_END) {
+        // kernel completion
+        kernel_end_callback(std::dynamic_pointer_cast<KernelEnd_t>(evt));
+    } else if (type == EventType_MEM_ALLOC) {
+        // memory allocation
+        mem_alloc_callback(std::dynamic_pointer_cast<MemAlloc_t>(evt));
+    } else if (type == EventType_MEM_FREE) {
+        // memory release
+        mem_free_callback(std::dynamic_pointer_cast<MemFree_t>(evt));
+    } else if (type == EventType_TEN_ALLOC) {
+        // tensor allocation
+        ten_alloc_callback(std::dynamic_pointer_cast<TenAlloc_t>(evt));
+    } else if (type == EventType_TEN_FREE) {
+        // tensor release
+        ten_free_callback(std::dynamic_pointer_cast<TenFree_t>(evt));
+    }
+}
+
+
+void PcDependency::flush() {  // Currently a no-op: per-kernel output is written by kernel_trace_flush().
+}