From 8cfb0ecb98997a0b895e90b89156a86e63b2ce0f Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 7 Oct 2025 10:32:59 +0300
Subject: [PATCH 1/4] Implement Vulkan matmul profiling instrumentation

---
 .../rx580-vulkan-instrumentation-plan.md      | 105 +++++
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 422 ++++++++++++++++++
 2 files changed, 527 insertions(+)
 create mode 100644 docs/development/rx580-vulkan-instrumentation-plan.md

diff --git a/docs/development/rx580-vulkan-instrumentation-plan.md b/docs/development/rx580-vulkan-instrumentation-plan.md
new file mode 100644
index 00000000000..c4086948200
--- /dev/null
+++ b/docs/development/rx580-vulkan-instrumentation-plan.md
@@ -0,0 +1,105 @@
+# RX580 Vulkan Instrumentation Plan
+
+This document expands the instrumentation portion of the optimization plan for the AMD RX580 (GCN gfx803) Vulkan backend path in `ggml`. The focus is on capturing detailed GPU timings and pipeline statistics to identify the prefill (`mul_mat*`) and infill (`mul_mat_vec*`) bottlenecks.
+
+## Goals
+
+1. Provide accurate per-dispatch GPU timings for all critical matmul pipelines.
+2. Collect pipeline statistics when `VK_KHR_pipeline_executable_properties` is supported to understand register, LDS, and instruction usage.
+3. Keep instrumentation optional and low-overhead in production builds.
+4. Surface the collected data in a format that is easy to analyze.
+
+## Workstreams
+
+### 1. Timestamp Query Instrumentation
+
+1. **Query pool management**
+   - Extend the Vulkan backend context (`ggml_backend_vk_context`) to own one or more timestamp query pools sized for the maximum number of concurrent dispatches we might record in a command buffer.
+   - Ensure pools are created only when instrumentation is enabled, falling back to no-op implementations otherwise.
+   - Implement recycling logic so pools can be reset between frames without recreating them.
+
+2. **Command buffer integration**
+   - Update `ggml_vk_dispatch_pipeline()` to write timestamps immediately before and after each dispatch when instrumentation is active.
+   - Handle command buffers that are pre-recorded vs. dynamically recorded; ensure that timestamp commands are inserted alongside the existing pipeline barrier logic.
+   - Guard timestamp writes behind a check of the `VkPhysicalDeviceLimits::timestampComputeAndGraphics` limit to avoid validation errors on devices lacking compute timestamp support.
+
+3. **Result retrieval**
+   - After submission, collect timestamp results using `vkGetQueryPoolResults()` with `VK_QUERY_RESULT_64_BIT` to maintain precision.
+   - Convert timestamp differences to nanoseconds using the device's timestampPeriod.
+   - Aggregate results by pipeline name (e.g., `pipeline->name`) and by phase (prefill vs. infill) for easy reporting.
+
+### 2. Pipeline Executable Properties (PEP) Support
+
+1. **Capability detection**
+   - During device creation, probe for `VK_KHR_pipeline_executable_properties` and store the support flag in the device capabilities structure.
+   - Gate any PEP usage behind this flag so unsupported drivers do not incur additional calls.
+
+2. **Data collection**
+   - Add helper routines that call `vkGetPipelineExecutablePropertiesKHR` and `vkGetPipelineExecutableStatisticsKHR` for pipelines that are executed when instrumentation is enabled.
+   - Focus on collecting metrics relevant to matmul tuning, such as LDS usage, SGPR/VGPR counts, and instruction counts.
+   - Cache the results per pipeline to avoid repeated expensive queries.
+
+3. **Reporting**
+   - Integrate PEP data into the same reporting channel as timestamp results, clearly annotating pipelines with their resource usage stats.
+   - Provide a summary table in the logs or exported JSON to highlight potential register pressure or occupancy issues specific to the RX580.
+
+### 3. Configuration & UX
+
+1. **Runtime controls**
+   - Introduce an environment variable (e.g., `GGML_VK_PROFILING=1`) or a build-time option to toggle instrumentation. Default to disabled.
+   - When enabled, log a concise message describing which instrumentation features are active (timestamps, PEP).
+
+2. **Data output**
+   - Emit human-readable log lines summarizing per-dispatch timings and pipeline stats.
+   - Optionally generate a structured JSON blob that contains:
+     ```json
+     {
+       "device": "AMD Radeon RX580",
+       "timestamp_period_ns": <number>,
+       "dispatches": [
+         {
+           "pipeline": "mul_mat_q4_0_l",
+           "phase": "prefill",
+           "time_us": 123.4,
+           "executables": {
+             "LDSUsage": "32KB",
+             "VGPRs": 64,
+             "SGPRs": 96
+           }
+         }
+       ]
+     }
+     ```
+   - Ensure the logging respects existing verbosity settings to avoid flooding standard output during regular runs.
+
+3. **Validation & Testing**
+   - Add unit/integration tests in the Vulkan backend (where feasible) to confirm instrumentation paths do not crash when enabled/disabled.
+   - Run manual validation on an RX580: execute representative prefill and infill workloads, capture the logs/JSON, and verify that timings are recorded for all relevant pipelines.
+
+## Implementation Checklist
+
+- [x] Add instrumentation configuration flag and device capability storage.
+- [x] Create timestamp query pools and wire them into `ggml_vk_dispatch_pipeline()`.
+- [x] Implement result aggregation and logging/JSON export.
+- [x] Hook up `VK_KHR_pipeline_executable_properties` data collection.
+- [x] Document usage instructions for developers profiling the RX580 path.
+
+## Usage
+
+Set `GGML_VK_PROFILING=1` to enable the Vulkan profiler. The backend logs the active features (timestamps and pipeline executable
+properties) and prints a per-dispatch breakdown for every `mul_mat*` and `mul_mat_vec*` kernel, followed by an aggregated summary
+with the most relevant AMD statistics (VGPRs, SGPRs, LDS usage, etc.). Set `GGML_VK_PROFILING=json` to emit the same information
+as a JSON blob in addition to the human-readable log. Unset the environment variable to return to the zero-overhead fast path; any value, including `0`, enables the profiler.
+
+The output contains:
+
+- Individual dispatch timings with workgroup sizes for prefill (`mul_mat*`) and infill (`mul_mat_vec*`) pipelines.
+- Aggregated averages and totals grouped by pipeline and phase, annotated with cached pipeline executable statistics when the
+  device supports `VK_KHR_pipeline_executable_properties`.
+- Optional structured JSON mirroring the log content for downstream analysis.
+
+## Expected Outcomes
+
+- Developers can pinpoint the specific matmul kernels that dominate RX580 runtime, with precise GPU timings.
+- Pipeline statistics illuminate whether occupancy, register pressure, or LDS saturation contribute to bottlenecks.
+- Instrumentation remains optional, enabling routine builds to stay lightweight while providing deep insights when needed.
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ebbb412e55f..673cb0f0286 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -29,6 +29,9 @@ VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
 #include <mutex>
 #include <future>
 #include <thread>
+#include <set>
+#include <cctype>
+#include <cstdio>
 
 #if defined(_MSC_VER)
 # define NOMINMAX 1
@@ -129,6 +132,8 @@ struct vk_pipeline_struct {
     bool compiled {};
     // number of registers used, extracted from pipeline executable properties
     uint32_t register_count {};
+    bool profiling_stats_cached {};
+    std::map<std::string, std::string> profiling_stats;
 };
 
 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -1409,6 +1414,357 @@ class vk_perf_logger {
     std::map<std::string, std::vector<uint64_t>> flops;
 };
 
+struct vk_profiling_dispatch_record {
+    vk_pipeline pipeline;
+    std::string pipeline_name;
+    std::string phase;
+    uint32_t query_begin {};
+    uint32_t query_end {};
+    std::array<uint32_t, 3> elements {};
+    std::array<uint32_t, 3> workgroups {};
+};
+
+struct vk_profiling_state {
+    vk::QueryPool query_pool;
+    uint32_t capacity {};
+    uint32_t next_query {};
+    bool overflowed {};
+    bool timestamps_supported {};
+    bool logged_features {};
+    bool warned_no_timestamps {};
+    std::vector<vk_profiling_dispatch_record> dispatches;
+};
+
+static bool ggml_vk_profiler_matches_pipeline(const std::string& name) {
+    return name.find("matmul") != std::string::npos || name.find("mul_mat") != std::string::npos;
+}
+
+static std::string ggml_vk_profiler_phase(const std::string& name) {
+    if (name.find("mul_mat_vec") != std::string::npos) {
+        return "infill";
+    }
+    if (ggml_vk_profiler_matches_pipeline(name)) {
+        return "prefill";
+    }
+    return "other";
+}
+
+static bool ggml_vk_profiler_is_relevant_stat(const std::string& name) {
+    std::string lowered(name.size(), '\0');
+    std::transform(name.begin(), name.end(), lowered.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    return lowered.find("vgpr") != std::string::npos ||
+           lowered.find("sgpr") != std::string::npos ||
+           lowered.find("lds")  != std::string::npos ||
+           lowered.find("instr") != std::string::npos ||
+           lowered.find("register") != std::string::npos;
+}
+
+static std::string ggml_vk_profiler_json_escape(const std::string& value) {
+    std::string escaped;
+    escaped.reserve(value.size());
+    for (char c : value) {
+        switch (c) {
+            case '\\': escaped += "\\\\"; break;
+            case '\"': escaped += "\\\""; break;
+            case '\n': escaped += "\\n"; break;
+            case '\r': escaped += "\\r"; break;
+            case '\t': escaped += "\\t"; break;
+            default:
+                if (static_cast<unsigned char>(c) < 0x20) {
+                    char buffer[7];
+                    snprintf(buffer, sizeof(buffer), "\\u%04x", c & 0xff);
+                    escaped += buffer;
+                } else {
+                    escaped += c;
+                }
+        }
+    }
+    return escaped;
+}
+
+static std::string ggml_vk_profiler_format_statistic(const vk::PipelineExecutableStatisticKHR & stat) {
+    switch (stat.format) {
+        case vk::PipelineExecutableStatisticFormatKHR::eBool32:
+            return stat.value.b32 ? "true" : "false";
+        case vk::PipelineExecutableStatisticFormatKHR::eInt64:
+            return std::to_string(stat.value.i64);
+        case vk::PipelineExecutableStatisticFormatKHR::eUint64:
+            return std::to_string(stat.value.u64);
+        case vk::PipelineExecutableStatisticFormatKHR::eFloat64: {
+            std::ostringstream ss;
+            ss.setf(std::ios::fixed);
+            ss << std::setprecision(3) << stat.value.f64;
+            return ss.str();
+        }
+        default:
+            return "unknown";
+    }
+}
+
+static void ggml_vk_profiler_cache_pipeline_stats(vk_device& device, vk_pipeline& pipeline) {
+    if (!pipeline) {
+        return;
+    }
+    if (!device->pipeline_executable_properties_support) {
+        if (pipeline->register_count && pipeline->profiling_stats.find("Register Count") == pipeline->profiling_stats.end()) {
+            pipeline->profiling_stats["Register Count"] = std::to_string(pipeline->register_count);
+        }
+        return;
+    }
+    if (!pipeline->profiling_stats_cached) {
+        try {
+            vk::PipelineInfoKHR pipeline_info;
+            pipeline_info.pipeline = pipeline->pipeline;
+            auto executables = device->device.getPipelineExecutablePropertiesKHR(pipeline_info);
+            for (uint32_t executable_index = 0; executable_index < executables.size(); ++executable_index) {
+                vk::PipelineExecutableInfoKHR executable_info;
+                executable_info.pipeline = pipeline->pipeline;
+                executable_info.executableIndex = executable_index;
+                auto statistics = device->device.getPipelineExecutableStatisticsKHR(executable_info);
+                for (const auto & stat : statistics) {
+                    pipeline->profiling_stats[stat.name] = ggml_vk_profiler_format_statistic(stat);
+                }
+            }
+        } catch (const vk::SystemError& e) {
+            GGML_LOG_WARN("ggml_vulkan: failed to query pipeline executable statistics for %s: %s\n", pipeline->name.c_str(), e.what());
+        }
+        pipeline->profiling_stats_cached = true;
+    }
+    if (pipeline->register_count && pipeline->profiling_stats.find("Register Count") == pipeline->profiling_stats.end()) {
+        pipeline->profiling_stats["Register Count"] = std::to_string(pipeline->register_count);
+    }
+}
+
+static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t estimated_dispatches) {
+    if (!vk_profiling_enabled) {
+        return;
+    }
+
+    if (!ctx->profiling) {
+        ctx->profiling = std::make_unique<vk_profiling_state>();
+    }
+
+    vk_profiling_state & profiler = *ctx->profiling;
+    profiler.dispatches.clear();
+    profiler.next_query = 0;
+    profiler.overflowed = false;
+    profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0;
+
+    if (!profiler.timestamps_supported) {
+        if (!profiler.warned_no_timestamps) {
+            GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", ctx->device->name.c_str());
+            profiler.warned_no_timestamps = true;
+        }
+        return;
+    }
+
+    uint32_t queries_needed = std::max<uint32_t>(256u, estimated_dispatches * 2u + 2u);
+    if (queries_needed > profiler.capacity) {
+        if (profiler.query_pool) {
+            ctx->device->device.destroyQueryPool(profiler.query_pool);
+        }
+        vk::QueryPoolCreateInfo query_info;
+        query_info.queryType = vk::QueryType::eTimestamp;
+        query_info.queryCount = queries_needed;
+        profiler.query_pool = ctx->device->device.createQueryPool(query_info);
+        profiler.capacity = queries_needed;
+    }
+
+    if (profiler.query_pool) {
+        ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
+    }
+
+    if (!profiler.logged_features) {
+        GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamps %s, pipeline executable properties %s)%s\n",
+                      ctx->device->name.c_str(),
+                      profiler.timestamps_supported ? "enabled" : "unavailable",
+                      ctx->device->pipeline_executable_properties_support ? "available" : "unavailable",
+                      vk_profiling_json_enabled ? " [json output]" : "");
+        profiler.logged_features = true;
+    }
+}
+
+static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
+    if (!vk_profiling_enabled || !ctx->profiling) {
+        return;
+    }
+
+    vk_profiling_state & profiler = *ctx->profiling;
+    if (!profiler.timestamps_supported || profiler.dispatches.empty() || !profiler.query_pool) {
+        profiler.dispatches.clear();
+        profiler.next_query = 0;
+        return;
+    }
+
+    if (profiler.overflowed) {
+        GGML_LOG_WARN("ggml_vulkan: profiling query pool exhausted on %s; results incomplete\n", ctx->device->name.c_str());
+        profiler.dispatches.clear();
+        profiler.next_query = 0;
+        return;
+    }
+
+    const uint32_t query_count = profiler.next_query;
+    if (query_count == 0) {
+        return;
+    }
+
+    std::vector<uint64_t> timestamps(query_count);
+    VK_CHECK(ctx->device->device.getQueryPoolResults(profiler.query_pool,
+                                                     0,
+                                                     query_count,
+                                                     query_count * sizeof(uint64_t),
+                                                     timestamps.data(),
+                                                     sizeof(uint64_t),
+                                                     vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait),
+             "get profiling timestamps");
+
+    double timestamp_period = ctx->device->properties.limits.timestampPeriod;
+
+    struct vk_profiling_pipeline_summary {
+        std::string name;
+        std::string phase;
+        double total_ns = 0.0;
+        uint32_t count = 0;
+        vk_pipeline pipeline;
+    };
+
+    std::map<std::pair<std::string, std::string>, vk_profiling_pipeline_summary> summary_map;
+    std::vector<double> dispatch_times_us;
+    dispatch_times_us.reserve(profiler.dispatches.size());
+
+    for (size_t i = 0; i < profiler.dispatches.size(); ++i) {
+        const auto & record = profiler.dispatches[i];
+        double duration_us = 0.0;
+        if (record.query_begin < query_count && record.query_end < query_count) {
+            uint64_t start = timestamps[record.query_begin];
+            uint64_t end   = timestamps[record.query_end];
+            double duration_ns = double(end - start) * timestamp_period;
+            duration_us = duration_ns / 1000.0;
+
+            auto key = std::make_pair(record.pipeline_name, record.phase);
+            auto & entry = summary_map[key];
+            entry.name = record.pipeline_name;
+            entry.phase = record.phase;
+            entry.total_ns += duration_ns;
+            entry.count += 1;
+            if (record.pipeline) {
+                entry.pipeline = record.pipeline;
+            }
+        } else {
+            GGML_LOG_WARN("ggml_vulkan: profiling query index out of range for %s\n", record.pipeline_name.c_str());
+        }
+        dispatch_times_us.push_back(duration_us);
+    }
+
+    if (!profiler.dispatches.empty()) {
+        GGML_LOG_INFO("ggml_vulkan: profiling dispatches for %s\n", ctx->device->name.c_str());
+    }
+
+    for (size_t i = 0; i < profiler.dispatches.size(); ++i) {
+        const auto & record = profiler.dispatches[i];
+        double duration_us = dispatch_times_us[i];
+        GGML_LOG_INFO("  dispatch %zu [%s] %s -> %.3f us (wg=%u,%u,%u)\n",
+                      i,
+                      record.phase.c_str(),
+                      record.pipeline_name.c_str(),
+                      duration_us,
+                      record.workgroups[0],
+                      record.workgroups[1],
+                      record.workgroups[2]);
+    }
+
+    std::vector<vk_profiling_pipeline_summary> summaries;
+    summaries.reserve(summary_map.size());
+    for (auto & kv : summary_map) {
+        summaries.push_back(kv.second);
+    }
+
+    std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary& a, const vk_profiling_pipeline_summary& b) {
+        return a.total_ns > b.total_ns;
+    });
+
+    for (const auto & entry : summaries) {
+        double avg_us = entry.count ? (entry.total_ns / entry.count) / 1000.0 : 0.0;
+        double total_us = entry.total_ns / 1000.0;
+
+        std::string stats_suffix;
+        if (entry.pipeline) {
+            ggml_vk_profiler_cache_pipeline_stats(ctx->device, entry.pipeline);
+            std::vector<std::pair<std::string, std::string>> stats;
+            for (const auto & stat : entry.pipeline->profiling_stats) {
+                if (ggml_vk_profiler_is_relevant_stat(stat.first)) {
+                    stats.emplace_back(stat.first, stat.second);
+                }
+            }
+            if (!stats.empty()) {
+                std::ostringstream stats_stream;
+                stats_stream << " stats: ";
+                for (size_t i = 0; i < stats.size(); ++i) {
+                    if (i != 0) {
+                        stats_stream << ", ";
+                    }
+                    stats_stream << stats[i].first << "=" << stats[i].second;
+                }
+                stats_suffix = stats_stream.str();
+            }
+        }
+
+        GGML_LOG_INFO("  summary [%s] %s dispatches=%u avg=%.3f us total=%.3f us%s\n",
+                      entry.phase.c_str(),
+                      entry.name.c_str(),
+                      entry.count,
+                      avg_us,
+                      total_us,
+                      stats_suffix.c_str());
+    }
+
+    if (vk_profiling_json_enabled && !profiler.dispatches.empty()) {
+        std::ostringstream json;
+        json << "{\n";
+        json << "  \"device\": \"" << ggml_vk_profiler_json_escape(ctx->device->name) << "\",\n";
+        json << "  \"timestamp_period_ns\": " << timestamp_period << ",\n";
+        json << "  \"dispatches\": [\n";
+        for (size_t i = 0; i < profiler.dispatches.size(); ++i) {
+            const auto & record = profiler.dispatches[i];
+            json << "    {\n";
+            json << "      \"pipeline\": \"" << ggml_vk_profiler_json_escape(record.pipeline_name) << "\",\n";
+            json << "      \"phase\": \"" << ggml_vk_profiler_json_escape(record.phase) << "\",\n";
+            std::ostringstream time_stream;
+            time_stream.setf(std::ios::fixed);
+            time_stream << std::setprecision(3) << dispatch_times_us[i];
+            json << "      \"time_us\": " << time_stream.str() << ",\n";
+            json << "      \"workgroups\": [" << record.workgroups[0] << ", " << record.workgroups[1] << ", " << record.workgroups[2] << "],\n";
+            json << "      \"executables\": {";
+            bool first = true;
+            if (record.pipeline) {
+                ggml_vk_profiler_cache_pipeline_stats(ctx->device, record.pipeline);
+                for (const auto & stat : record.pipeline->profiling_stats) {
+                    if (!ggml_vk_profiler_is_relevant_stat(stat.first)) {
+                        continue;
+                    }
+                    if (!first) {
+                        json << ", ";
+                    }
+                    json << "\"" << ggml_vk_profiler_json_escape(stat.first) << "\": \"" << ggml_vk_profiler_json_escape(stat.second) << "\"";
+                    first = false;
+                }
+            }
+            json << "}\n";
+            json << "    }";
+            if (i + 1 < profiler.dispatches.size()) {
+                json << ",";
+            }
+            json << "\n";
+        }
+        json << "  ]\n";
+        json << "}\n";
+        GGML_LOG_INFO("%s", json.str().c_str());
+    }
+
+    profiler.dispatches.clear();
+    profiler.next_query = 0;
+}
+
 struct ggml_backend_vk_context {
     std::string name;
 
@@ -1454,6 +1810,8 @@ struct ggml_backend_vk_context {
     // number of additional consecutive nodes that are being fused with the
     // node currently being processed
     int num_additional_fused_ops {};
+
+    std::unique_ptr<vk_profiling_state> profiling;
 };
 
 static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
@@ -1536,6 +1894,8 @@ static bool vk_instance_initialized = false;
 static vk_instance_t vk_instance;
 
 static bool vk_perf_logger_enabled = false;
+static bool vk_profiling_enabled = false;
+static bool vk_profiling_json_enabled = false;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
@@ -4574,6 +4934,19 @@ static void ggml_vk_instance_init() {
 
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
 
+    const char * profiling_env = getenv("GGML_VK_PROFILING");
+    if (profiling_env != nullptr) {
+        vk_profiling_enabled = true;
+        std::string profiling_value = profiling_env;
+        std::string profiling_value_lower = profiling_value;
+        std::transform(profiling_value_lower.begin(), profiling_value_lower.end(), profiling_value_lower.begin(),
+            [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        vk_profiling_json_enabled = profiling_value_lower.find("json") != std::string::npos;
+    } else {
+        vk_profiling_enabled = false;
+        vk_profiling_json_enabled = false;
+    }
+
     // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
     VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
 
@@ -5223,7 +5596,41 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
                                 0,
                                 { descriptor_set },
                                 {});
+    bool profile_dispatch = false;
+    uint32_t query_begin = 0;
+    uint32_t query_end = 0;
+    vk_profiling_state * profiler_state = nullptr;
+    if (vk_profiling_enabled && ctx->profiling && ctx->profiling->timestamps_supported && ctx->profiling->query_pool &&
+        !ctx->profiling->overflowed && ggml_vk_profiler_matches_pipeline(pipeline->name)) {
+        profiler_state = ctx->profiling.get();
+        if (profiler_state->next_query + 1 < profiler_state->capacity) {
+            query_begin = profiler_state->next_query++;
+            query_end = profiler_state->next_query++;
+            profile_dispatch = true;
+
+            vk_profiling_dispatch_record record;
+            record.pipeline = pipeline;
+            record.pipeline_name = pipeline->name;
+            record.phase = ggml_vk_profiler_phase(pipeline->name);
+            record.query_begin = query_begin;
+            record.query_end = query_end;
+            record.elements = elements;
+            record.workgroups = { wg0, wg1, wg2 };
+            profiler_state->dispatches.push_back(std::move(record));
+        } else {
+            profiler_state->overflowed = true;
+        }
+    }
+
+    if (profile_dispatch) {
+        subctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eComputeShader, profiler_state->query_pool, query_begin);
+    }
+
     subctx->s->buffer.dispatch(wg0, wg1, wg2);
+
+    if (profile_dispatch) {
+        subctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eComputeShader, profiler_state->query_pool, query_end);
+    }
 }
 
 static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
@@ -11470,6 +11877,12 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->descriptor_pools.clear();
     ctx->descriptor_sets.clear();
 
+    if (ctx->profiling && ctx->profiling->query_pool) {
+        ctx->device->device.destroyQueryPool(ctx->profiling->query_pool);
+        ctx->profiling->query_pool = vk::QueryPool{};
+    }
+    ctx->profiling.reset();
+
     ctx->compute_cmd_pool.destroy(ctx->device->device);
     ctx->transfer_cmd_pool.destroy(ctx->device->device);
 }
@@ -11973,6 +12386,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
+    if (vk_profiling_enabled) {
+        uint32_t expected_dispatches = (uint32_t)(std::max(1, cgraph->n_nodes) * 6);
+        ggml_vk_profiler_begin_graph(ctx, expected_dispatches);
+    }
+
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
@@ -12110,6 +12528,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ctx->device->perf_logger->print_timings();
     }
 
+    if (vk_profiling_enabled) {
+        ggml_vk_profiler_end_graph(ctx);
+    }
+
     ggml_vk_graph_cleanup(ctx);
 
     return GGML_STATUS_SUCCESS;

From 64535ff5c380117662f810484cfe8c795b2507ca Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 7 Oct 2025 10:57:48 +0300
Subject: [PATCH 2/4] Fix Vulkan profiling context initialization

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 360 ++++++++++++++-------------
 1 file changed, 183 insertions(+), 177 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 673cb0f0286..bd275150662 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1535,6 +1535,138 @@ static void ggml_vk_profiler_cache_pipeline_stats(vk_device& device, vk_pipeline
     }
 }
 
+struct ggml_backend_vk_context {
+    std::string name;
+
+    vk_device device;
+
+    size_t semaphore_idx, event_idx;
+    ggml_vk_garbage_collector gc;
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials;
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};
+    // Set before op_add and unset after op_rms_norm to indicate that the add should
+    // write partial sums to accumulate the square of the vector components
+    bool do_add_rms_partials;
+
+    // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
+    vk_pipeline_struct * prealloc_y_last_pipeline_used {};
+    const ggml_tensor * prealloc_y_last_tensor_used {};
+
+    // Track which nodes have been used since the last sync, and whether they were written to
+    std::vector<const ggml_tensor *> unsynced_nodes_written;
+    std::vector<const ggml_tensor *> unsynced_nodes_read;
+    // Track which prealloc buffers have pending reads that need to be synchronized.
+    // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set),
+    // and set to true after the buffer contents are consumed.
+    bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
+
+    vk_buffer buffer_pool[MAX_VK_BUFFERS];
+
+    vk_context_ref compute_ctx;
+    vk_context_ref transfer_ctx;
+
+    std::vector<vk_context_ref> tensor_ctxs;
+
+    std::vector<vk::DescriptorPool> descriptor_pools;
+    std::vector<vk::DescriptorSet> descriptor_sets;
+    uint32_t descriptor_set_idx {};
+    uint32_t pipeline_descriptor_set_requirements {};
+
+    vk_command_pool compute_cmd_pool;
+    vk_command_pool transfer_cmd_pool;
+
+    // number of additional consecutive nodes that are being fused with the
+    // node currently being processed
+    int num_additional_fused_ops {};
+
+    std::unique_ptr<vk_profiling_state> profiling;
+};
+
+static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
+
+static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
+    if (tensor->view_src) {
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
+    }
+    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+}
+
+struct ggml_backend_vk_buffer_context {
+    vk_device_ref device;
+    vk_buffer dev_buffer;
+    std::string name;
+
+    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
+        device(device),
+        dev_buffer(dev_buffer),
+        name(name) {
+    }
+
+    ~ggml_backend_vk_buffer_context() {
+        ggml_vk_destroy_buffer(dev_buffer);
+    }
+};
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+static std::mutex log_mutex;
+
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { // removes buf_ref from the tracked totals and logs the release
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return; // buffer already destroyed or empty: nothing to account for
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    if (it != allocations.end()) {
+        total_device -= device ? it->second : 0; // read it->second only after the lookup is known to have succeeded
+        total_host -= device ? 0 : it->second;
+        VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
+struct vk_instance_t {
+    vk::Instance instance;
+
+    bool debug_utils_support = false;  // VK_EXT_debug_utils enabled
+    PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {};
+    PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {};
+    PFN_vkQueueEndDebugUtilsLabelEXT   pfn_vkQueueEndDebugUtilsLabelEXT   = {};
+    PFN_vkCmdBeginDebugUtilsLabelEXT   pfn_vkCmdBeginDebugUtilsLabelEXT   = {};
+    PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {};
+    PFN_vkCmdInsertDebugUtilsLabelEXT  pfn_vkCmdInsertDebugUtilsLabelEXT  = {};
+
+    std::vector<size_t> device_indices;
+    std::vector<bool>   device_supports_membudget;
+    vk_device devices[GGML_VK_MAX_DEVICES];
+};
+
+static bool vk_instance_initialized = false;
+static vk_instance_t vk_instance;
+
+static bool vk_perf_logger_enabled = false;
+static bool vk_profiling_enabled = false;
+static bool vk_profiling_json_enabled = false;
+
 static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t estimated_dispatches) {
     if (!vk_profiling_enabled) {
         return;
@@ -1545,43 +1677,49 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t
     }
 
     vk_profiling_state & profiler = *ctx->profiling;
-    profiler.dispatches.clear();
-    profiler.next_query = 0;
     profiler.overflowed = false;
-    profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0;
 
-    if (!profiler.timestamps_supported) {
-        if (!profiler.warned_no_timestamps) {
-            GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", ctx->device->name.c_str());
-            profiler.warned_no_timestamps = true;
+    if (!profiler.logged_features) {
+        profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0;
+
+        if (!profiler.timestamps_supported) {
+            if (!profiler.warned_no_timestamps) {
+                profiler.warned_no_timestamps = true;
+                GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n",
+                              ctx->device->name.c_str());
+            }
+            return;
         }
-        return;
-    }
 
-    uint32_t queries_needed = std::max<uint32_t>(256u, estimated_dispatches * 2u + 2u);
-    if (queries_needed > profiler.capacity) {
         if (profiler.query_pool) {
             ctx->device->device.destroyQueryPool(profiler.query_pool);
+            profiler.query_pool = {};
         }
-        vk::QueryPoolCreateInfo query_info;
-        query_info.queryType = vk::QueryType::eTimestamp;
-        query_info.queryCount = queries_needed;
-        profiler.query_pool = ctx->device->device.createQueryPool(query_info);
-        profiler.capacity = queries_needed;
-    }
 
-    if (profiler.query_pool) {
+        profiler.capacity = std::max<uint32_t>(estimated_dispatches * 2u, 256u);
+
+        vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity);
+        profiler.query_pool = ctx->device->device.createQueryPool(query_info);
         ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
-    }
+        profiler.logged_features = true;
 
-    if (!profiler.logged_features) {
-        GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamps %s, pipeline executable properties %s)%s\n",
+        GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamp support: %s, pipeline stats: %s%s)\n",
                       ctx->device->name.c_str(),
-                      profiler.timestamps_supported ? "enabled" : "unavailable",
+                      profiler.timestamps_supported ? "available" : "unavailable",
                       ctx->device->pipeline_executable_properties_support ? "available" : "unavailable",
                       vk_profiling_json_enabled ? " [json output]" : "");
-        profiler.logged_features = true;
+    } else {
+        if (!profiler.timestamps_supported) {
+            return;
+        }
+
+        if (profiler.query_pool && profiler.next_query) {
+            ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
+        }
     }
+
+    profiler.next_query = 0;
+    profiler.dispatches.clear();
 }
 
 static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
@@ -1590,21 +1728,17 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
     }
 
     vk_profiling_state & profiler = *ctx->profiling;
-    if (!profiler.timestamps_supported || profiler.dispatches.empty() || !profiler.query_pool) {
-        profiler.dispatches.clear();
-        profiler.next_query = 0;
+
+    if (!profiler.timestamps_supported || !profiler.query_pool) {
         return;
     }
 
     if (profiler.overflowed) {
         GGML_LOG_WARN("ggml_vulkan: profiling query pool exhausted on %s; results incomplete\n", ctx->device->name.c_str());
-        profiler.dispatches.clear();
-        profiler.next_query = 0;
-        return;
     }
 
     const uint32_t query_count = profiler.next_query;
-    if (query_count == 0) {
+    if (query_count == 0 || profiler.dispatches.empty()) {
         return;
     }
 
@@ -1612,19 +1746,22 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
     VK_CHECK(ctx->device->device.getQueryPoolResults(profiler.query_pool,
                                                      0,
                                                      query_count,
-                                                     query_count * sizeof(uint64_t),
+                                                     sizeof(uint64_t) * query_count,
                                                      timestamps.data(),
                                                      sizeof(uint64_t),
-                                                     vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait),
-             "get profiling timestamps");
+                                                     vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait),
+             "getQueryPoolResults");
 
     double timestamp_period = ctx->device->properties.limits.timestampPeriod;
+    if (timestamp_period == 0.0) {
+        timestamp_period = 1.0;
+    }
 
     struct vk_profiling_pipeline_summary {
         std::string name;
         std::string phase;
-        double total_ns = 0.0;
-        uint32_t count = 0;
+        double total_ns {};
+        uint32_t count {};
         vk_pipeline pipeline;
     };
 
@@ -1632,13 +1769,13 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
     std::vector<double> dispatch_times_us;
     dispatch_times_us.reserve(profiler.dispatches.size());
 
-    for (size_t i = 0; i < profiler.dispatches.size(); ++i) {
-        const auto & record = profiler.dispatches[i];
+    for (const auto & record : profiler.dispatches) {
         double duration_us = 0.0;
-        if (record.query_begin < query_count && record.query_end < query_count) {
-            uint64_t start = timestamps[record.query_begin];
-            uint64_t end   = timestamps[record.query_end];
-            double duration_ns = double(end - start) * timestamp_period;
+
+        if (record.query_end < timestamps.size() && record.query_begin < timestamps.size()) {
+            const uint64_t start = timestamps[record.query_begin];
+            const uint64_t end   = timestamps[record.query_end];
+            const double duration_ns = double(end - start) * timestamp_period;
             duration_us = duration_ns / 1000.0;
 
             auto key = std::make_pair(record.pipeline_name, record.phase);
@@ -1653,6 +1790,7 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
         } else {
             GGML_LOG_WARN("ggml_vulkan: profiling query index out of range for %s\n", record.pipeline_name.c_str());
         }
+
         dispatch_times_us.push_back(duration_us);
     }
 
@@ -1662,7 +1800,7 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
 
     for (size_t i = 0; i < profiler.dispatches.size(); ++i) {
         const auto & record = profiler.dispatches[i];
-        double duration_us = dispatch_times_us[i];
+        const double duration_us = dispatch_times_us[i];
         GGML_LOG_INFO("  dispatch %zu [%s] %s -> %.3f us (wg=%u,%u,%u)\n",
                       i,
                       record.phase.c_str(),
@@ -1679,13 +1817,13 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
         summaries.push_back(kv.second);
     }
 
-    std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary& a, const vk_profiling_pipeline_summary& b) {
+    std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary & a, const vk_profiling_pipeline_summary & b) {
         return a.total_ns > b.total_ns;
     });
 
     for (const auto & entry : summaries) {
-        double avg_us = entry.count ? (entry.total_ns / entry.count) / 1000.0 : 0.0;
-        double total_us = entry.total_ns / 1000.0;
+        const double avg_us = entry.count ? (entry.total_ns / entry.count) / 1000.0 : 0.0;
+        const double total_us = entry.total_ns / 1000.0;
 
         std::string stats_suffix;
         if (entry.pipeline) {
@@ -1765,138 +1903,6 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
     profiler.next_query = 0;
 }
 
-struct ggml_backend_vk_context {
-    std::string name;
-
-    vk_device device;
-
-    size_t semaphore_idx, event_idx;
-    ggml_vk_garbage_collector gc;
-    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset;
-    vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials;
-    vk::Fence fence, almost_ready_fence;
-    bool almost_ready_fence_pending {};
-    // Set before op_add and unset after op_rms_norm to indicate that the add should
-    // write partial sums to accumulate the square of the vector components
-    bool do_add_rms_partials;
-
-    // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert.
-    vk_pipeline_struct * prealloc_y_last_pipeline_used {};
-    const ggml_tensor * prealloc_y_last_tensor_used {};
-
-    // Track which nodes have been used since the last sync, and whether they were written to
-    std::vector<const ggml_tensor *> unsynced_nodes_written;
-    std::vector<const ggml_tensor *> unsynced_nodes_read;
-    // Track which prealloc buffers have pending reads that need to be synchronized.
-    // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set),
-    // and set to true after the buffer contents are consumed.
-    bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
-
-    vk_buffer buffer_pool[MAX_VK_BUFFERS];
-
-    vk_context_ref compute_ctx;
-    vk_context_ref transfer_ctx;
-
-    std::vector<vk_context_ref> tensor_ctxs;
-
-    std::vector<vk::DescriptorPool> descriptor_pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    uint32_t descriptor_set_idx {};
-    uint32_t pipeline_descriptor_set_requirements {};
-
-    vk_command_pool compute_cmd_pool;
-    vk_command_pool transfer_cmd_pool;
-
-    // number of additional consecutive nodes that are being fused with the
-    // node currently being processed
-    int num_additional_fused_ops {};
-
-    std::unique_ptr<vk_profiling_state> profiling;
-};
-
-static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
-
-static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
-    if (tensor->view_src) {
-        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
-    }
-    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
-}
-
-struct ggml_backend_vk_buffer_context {
-    vk_device_ref device;
-    vk_buffer dev_buffer;
-    std::string name;
-
-    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
-        device(device),
-        dev_buffer(dev_buffer),
-        name(name) {
-    }
-
-    ~ggml_backend_vk_buffer_context() {
-        ggml_vk_destroy_buffer(dev_buffer);
-    }
-};
-
-#ifdef GGML_VULKAN_MEMORY_DEBUG
-static std::mutex log_mutex;
-
-void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
-    std::lock_guard<std::mutex> guard(log_mutex);
-    vk_buffer buf = buf_ref.lock();
-    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
-    const std::string type = device ? "device" : "host";
-    allocations[buf->buffer] = size;
-    total_device += device ? size : 0;
-    total_host += device ? 0 : size;
-    VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
-}
-
-void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
-    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
-        return;
-    }
-
-    std::lock_guard<std::mutex> guard(log_mutex);
-    vk_buffer buf = buf_ref.lock();
-    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
-    std::string type = device ? "device" : "host";
-    auto it = allocations.find(buf->buffer);
-    total_device -= device ? it->second : 0;
-    total_host -= device ? 0 : it->second;
-    if (it != allocations.end()) {
-        VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
-        allocations.erase(it);
-    } else {
-        VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
-    }
-}
-#endif // GGML_VULKAN_MEMORY_DEBUG
-
-struct vk_instance_t {
-    vk::Instance instance;
-
-    bool debug_utils_support = false;  // VK_EXT_debug_utils enabled
-    PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {};
-    PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {};
-    PFN_vkQueueEndDebugUtilsLabelEXT   pfn_vkQueueEndDebugUtilsLabelEXT   = {};
-    PFN_vkCmdBeginDebugUtilsLabelEXT   pfn_vkCmdBeginDebugUtilsLabelEXT   = {};
-    PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {};
-    PFN_vkCmdInsertDebugUtilsLabelEXT  pfn_vkCmdInsertDebugUtilsLabelEXT  = {};
-
-    std::vector<size_t> device_indices;
-    std::vector<bool>   device_supports_membudget;
-    vk_device devices[GGML_VK_MAX_DEVICES];
-};
-
-static bool vk_instance_initialized = false;
-static vk_instance_t vk_instance;
-
-static bool vk_perf_logger_enabled = false;
-static bool vk_profiling_enabled = false;
-static bool vk_profiling_json_enabled = false;
-
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;

From 879a789d034737a61b476aedfb32dfde27309d63 Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 7 Oct 2025 11:17:53 +0300
Subject: [PATCH 3/4] Fix Vulkan profiling build errors

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index bd275150662..87e70ca4769 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1693,7 +1693,7 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t
 
         if (profiler.query_pool) {
             ctx->device->device.destroyQueryPool(profiler.query_pool);
-            profiler.query_pool = {};
+            profiler.query_pool = vk::QueryPool{};
         }
 
         profiler.capacity = std::max<uint32_t>(estimated_dispatches * 2u, 256u);
@@ -1827,9 +1827,10 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
 
         std::string stats_suffix;
         if (entry.pipeline) {
-            ggml_vk_profiler_cache_pipeline_stats(ctx->device, entry.pipeline);
+            vk_pipeline pipeline = entry.pipeline;
+            ggml_vk_profiler_cache_pipeline_stats(ctx->device, pipeline);
             std::vector<std::pair<std::string, std::string>> stats;
-            for (const auto & stat : entry.pipeline->profiling_stats) {
+            for (const auto & stat : pipeline->profiling_stats) {
                 if (ggml_vk_profiler_is_relevant_stat(stat.first)) {
                     stats.emplace_back(stat.first, stat.second);
                 }
@@ -1875,8 +1876,9 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {
             json << "      \"executables\": {";
             bool first = true;
             if (record.pipeline) {
-                ggml_vk_profiler_cache_pipeline_stats(ctx->device, record.pipeline);
-                for (const auto & stat : record.pipeline->profiling_stats) {
+                vk_pipeline pipeline = record.pipeline;
+                ggml_vk_profiler_cache_pipeline_stats(ctx->device, pipeline);
+                for (const auto & stat : pipeline->profiling_stats) {
                     if (!ggml_vk_profiler_is_relevant_stat(stat.first)) {
                         continue;
                     }

From c2d82980aba6755a56192c923a22ec79227fd8d3 Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Tue, 7 Oct 2025 13:30:42 +0300
Subject: [PATCH 4/4] Handle dynamic Vulkan profiling query pool sizing

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 43 +++++++++++++++++-----------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 87e70ca4769..8caefdb28cb 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1679,6 +1679,11 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t
     vk_profiling_state & profiler = *ctx->profiling;
     profiler.overflowed = false;
 
+    const uint32_t min_dispatch_guess = std::max<uint32_t>(estimated_dispatches, 1u);
+    const uint32_t max_dispatch_guess = std::numeric_limits<uint32_t>::max() / 2u;
+    const uint32_t clamped_dispatch_guess = std::min(min_dispatch_guess, max_dispatch_guess);
+    const uint32_t required_queries = std::max<uint32_t>(clamped_dispatch_guess * 2u, 256u);
+
     if (!profiler.logged_features) {
         profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0;
 
@@ -1688,19 +1693,11 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t
                 GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n",
                               ctx->device->name.c_str());
             }
+            profiler.dispatches.clear();
+            profiler.next_query = 0;
             return;
         }
 
-        if (profiler.query_pool) {
-            ctx->device->device.destroyQueryPool(profiler.query_pool);
-            profiler.query_pool = vk::QueryPool{};
-        }
-
-        profiler.capacity = std::max<uint32_t>(estimated_dispatches * 2u, 256u);
-
-        vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity);
-        profiler.query_pool = ctx->device->device.createQueryPool(query_info);
-        ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
         profiler.logged_features = true;
 
         GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamp support: %s, pipeline stats: %s%s)\n",
@@ -1708,18 +1705,32 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t
                       profiler.timestamps_supported ? "available" : "unavailable",
                       ctx->device->pipeline_executable_properties_support ? "available" : "unavailable",
                       vk_profiling_json_enabled ? " [json output]" : "");
-    } else {
-        if (!profiler.timestamps_supported) {
-            return;
-        }
+    } else if (!profiler.timestamps_supported) {
+        profiler.dispatches.clear();
+        profiler.next_query = 0;
+        return;
+    }
 
-        if (profiler.query_pool && profiler.next_query) {
-            ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
+    if (!profiler.query_pool || required_queries > profiler.capacity) {
+        if (profiler.query_pool) {
+            ctx->device->device.destroyQueryPool(profiler.query_pool);
+            profiler.query_pool = vk::QueryPool{};
         }
+
+        profiler.capacity = required_queries;
+
+        vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity);
+        profiler.query_pool = ctx->device->device.createQueryPool(query_info);
+    }
+
+    if (profiler.query_pool) {
+        ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity);
     }
 
     profiler.next_query = 0;
     profiler.dispatches.clear();
+    const size_t dispatch_capacity_hint = std::min<size_t>(clamped_dispatch_guess, profiler.capacity / 2u);
+    profiler.dispatches.reserve(dispatch_capacity_hint);
 }
 
 static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {