From 8cfb0ecb98997a0b895e90b89156a86e63b2ce0f Mon Sep 17 00:00:00 2001 From: rasbid <104773487+rasbid@users.noreply.github.com> Date: Tue, 7 Oct 2025 10:32:59 +0300 Subject: [PATCH 1/4] Implement Vulkan matmul profiling instrumentation --- .../rx580-vulkan-instrumentation-plan.md | 105 +++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 422 ++++++++++++++++++ 2 files changed, 527 insertions(+) create mode 100644 docs/development/rx580-vulkan-instrumentation-plan.md diff --git a/docs/development/rx580-vulkan-instrumentation-plan.md b/docs/development/rx580-vulkan-instrumentation-plan.md new file mode 100644 index 00000000000..c4086948200 --- /dev/null +++ b/docs/development/rx580-vulkan-instrumentation-plan.md @@ -0,0 +1,105 @@ +# RX580 Vulkan Instrumentation Plan + +This document expands the instrumentation portion of the optimization plan for the AMD RX580 (GCN gfx803) Vulkan backend path in `ggml`. The focus is on capturing detailed GPU timings and pipeline statistics to identify the prefill (`mul_mat*`) and infill (`mul_mat_vec*`) bottlenecks. + +## Goals + +1. Provide accurate per-dispatch GPU timings for all critical matmul pipelines. +2. Collect pipeline statistics when `VK_KHR_pipeline_executable_properties` is supported to understand register, LDS, and instruction usage. +3. Keep instrumentation optional and low-overhead in production builds. +4. Surface the collected data in a format that is easy to analyze. + +## Workstreams + +### 1. Timestamp Query Instrumentation + +1. **Query pool management** + - Extend the Vulkan context (e.g., `ggml_vk_context`) to own one or more timestamp query pools sized for the maximum number of concurrent dispatches we might record in a command buffer. + - Ensure pools are created only when instrumentation is enabled, falling back to no-op implementations otherwise. + - Implement recycling logic so pools can be reset between frames without recreating them. + +2. 
**Command buffer integration** + - Update `ggml_vk_dispatch_pipeline()` to write timestamps immediately before and after each dispatch when instrumentation is active. + - Handle command buffers that are pre-recorded vs. dynamically recorded; ensure that timestamp commands are inserted alongside the existing pipeline barrier logic. + - Guard timestamp writes behind a check for `VkPhysicalDeviceLimits::timestampComputeAndGraphics` support to avoid validation errors on devices lacking compute timestamp support. + +3. **Result retrieval** + - After submission, collect timestamp results using `vkGetQueryPoolResults()` with `VK_QUERY_RESULT_64_BIT` to maintain precision. + - Convert timestamp differences to nanoseconds using the device's `timestampPeriod`. + - Aggregate results by pipeline name (e.g., `pipeline->name`) and by phase (prefill vs. infill) for easy reporting. + +### 2. Pipeline Executable Properties (PEP) Support + +1. **Capability detection** + - During device creation, probe for `VK_KHR_pipeline_executable_properties` and store the support flag in the device capabilities structure. + - Gate any PEP usage behind this flag so unsupported drivers do not incur additional calls. + +2. **Data collection** + - Add helper routines that call `vkGetPipelineExecutablePropertiesKHR` and `vkGetPipelineExecutableStatisticsKHR` for pipelines that are executed when instrumentation is enabled. + - Focus on collecting metrics relevant to matmul tuning, such as LDS usage, SGPR/VGPR counts, and instruction counts. + - Cache the results per pipeline to avoid repeated expensive queries. + +3. **Reporting** + - Integrate PEP data into the same reporting channel as timestamp results, clearly annotating pipelines with their resource usage stats. + - Provide a summary table in the logs or exported JSON to highlight potential register pressure or occupancy issues specific to the RX580. + +### 3. Configuration & UX + +1. 
**Runtime controls** + - Introduce an environment variable (e.g., `GGML_VK_PROFILING=1`) or a build-time option to toggle instrumentation. Default to disabled. + - When enabled, log a concise message describing which instrumentation features are active (timestamps, PEP). + +2. **Data output** + - Emit human-readable log lines summarizing per-dispatch timings and pipeline stats. + - Optionally generate a structured JSON blob that contains: + ```json + { + "device": "AMD Radeon RX580", + "timestamp_period_ns": 40.0, + "dispatches": [ + { + "pipeline": "mul_mat_q4_0_l", + "phase": "prefill", + "time_us": 123.4, + "executables": { + "LDSUsage": "32KB", + "VGPRs": 64, + "SGPRs": 96 + } + } + ] + } + ``` + - Ensure the logging respects existing verbosity settings to avoid flooding standard output during regular runs. + +3. **Validation & Testing** + - Add unit/integration tests in the Vulkan backend (where feasible) to confirm instrumentation paths do not crash when enabled/disabled. + - Run manual validation on an RX580: execute representative prefill and infill workloads, capture the logs/JSON, and verify that timings are recorded for all relevant pipelines. + +## Implementation Checklist + +- [x] Add instrumentation configuration flag and device capability storage. +- [x] Create timestamp query pools and wire them into `ggml_vk_dispatch_pipeline()`. +- [x] Implement result aggregation and logging/JSON export. +- [x] Hook up `VK_KHR_pipeline_executable_properties` data collection. +- [x] Document usage instructions for developers profiling the RX580 path. + +## Usage + +Set `GGML_VK_PROFILING=1` to enable the Vulkan profiler. The backend logs the active features (timestamps and pipeline executable +properties) and prints a per-dispatch breakdown for every `mul_mat*` and `mul_mat_vec*` kernel, followed by an aggregated summary +with the most relevant AMD statistics (VGPRs, SGPRs, LDS usage, etc.). 
Set `GGML_VK_PROFILING=json` to emit the same information +as a JSON blob in addition to the human-readable log. Unset the environment variable to return to the zero-overhead fast path. + +The output contains: + +- Individual dispatch timings with workgroup sizes for prefill (`mul_mat*`) and infill (`mul_mat_vec*`) pipelines. +- Aggregated averages and totals grouped by pipeline and phase, annotated with cached pipeline executable statistics when the + device supports `VK_KHR_pipeline_executable_properties`. +- Optional structured JSON mirroring the log content for downstream analysis. + +## Expected Outcomes + +- Developers can pinpoint the specific matmul kernels that dominate RX580 runtime, with precise GPU timings. +- Pipeline statistics illuminate whether occupancy, register pressure, or LDS saturation contribute to bottlenecks. +- Instrumentation remains optional, enabling routine builds to stay lightweight while providing deep insights when needed. diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ebbb412e55f..673cb0f0286 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -29,6 +29,9 @@ VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE #include #include #include +#include +#include +#include #if defined(_MSC_VER) # define NOMINMAX 1 @@ -129,6 +132,8 @@ struct vk_pipeline_struct { bool compiled {}; // number of registers used, extracted from pipeline executable properties uint32_t register_count {}; + bool profiling_stats_cached {}; + std::map profiling_stats; }; typedef std::shared_ptr vk_pipeline; @@ -1409,6 +1414,357 @@ class vk_perf_logger { std::map> flops; }; +struct vk_profiling_dispatch_record { + vk_pipeline pipeline; + std::string pipeline_name; + std::string phase; + uint32_t query_begin {}; + uint32_t query_end {}; + std::array elements {}; + std::array workgroups {}; +}; + +struct vk_profiling_state { + vk::QueryPool query_pool; + uint32_t capacity {}; + 
uint32_t next_query {}; + bool overflowed {}; + bool timestamps_supported {}; + bool logged_features {}; + bool warned_no_timestamps {}; + std::vector dispatches; +}; + +static bool ggml_vk_profiler_matches_pipeline(const std::string& name) { + return name.find("matmul") != std::string::npos || name.find("mul_mat") != std::string::npos; +} + +static std::string ggml_vk_profiler_phase(const std::string& name) { + if (name.find("mul_mat_vec") != std::string::npos) { + return "infill"; + } + if (ggml_vk_profiler_matches_pipeline(name)) { + return "prefill"; + } + return "other"; +} + +static bool ggml_vk_profiler_is_relevant_stat(const std::string& name) { + std::string lowered(name.size(), '\0'); + std::transform(name.begin(), name.end(), lowered.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + return lowered.find("vgpr") != std::string::npos || + lowered.find("sgpr") != std::string::npos || + lowered.find("lds") != std::string::npos || + lowered.find("instr") != std::string::npos || + lowered.find("register") != std::string::npos; +} + +static std::string ggml_vk_profiler_json_escape(const std::string& value) { + std::string escaped; + escaped.reserve(value.size()); + for (char c : value) { + switch (c) { + case '\\': escaped += "\\\\"; break; + case '\"': escaped += "\\\""; break; + case '\n': escaped += "\\n"; break; + case '\r': escaped += "\\r"; break; + case '\t': escaped += "\\t"; break; + default: + if (static_cast(c) < 0x20) { + char buffer[7]; + snprintf(buffer, sizeof(buffer), "\\u%04x", c & 0xff); + escaped += buffer; + } else { + escaped += c; + } + } + } + return escaped; +} + +static std::string ggml_vk_profiler_format_statistic(const vk::PipelineExecutableStatisticKHR & stat) { + switch (stat.format) { + case vk::PipelineExecutableStatisticFormatKHR::eBool32: + return stat.value.b32 ? 
"true" : "false"; + case vk::PipelineExecutableStatisticFormatKHR::eInt64: + return std::to_string(stat.value.i64); + case vk::PipelineExecutableStatisticFormatKHR::eUint64: + return std::to_string(stat.value.u64); + case vk::PipelineExecutableStatisticFormatKHR::eFloat64: { + std::ostringstream ss; + ss.setf(std::ios::fixed); + ss << std::setprecision(3) << stat.value.f64; + return ss.str(); + } + default: + return "unknown"; + } +} + +static void ggml_vk_profiler_cache_pipeline_stats(vk_device& device, vk_pipeline& pipeline) { + if (!pipeline) { + return; + } + if (!device->pipeline_executable_properties_support) { + if (pipeline->register_count && pipeline->profiling_stats.find("Register Count") == pipeline->profiling_stats.end()) { + pipeline->profiling_stats["Register Count"] = std::to_string(pipeline->register_count); + } + return; + } + if (!pipeline->profiling_stats_cached) { + try { + vk::PipelineInfoKHR pipeline_info; + pipeline_info.pipeline = pipeline->pipeline; + auto executables = device->device.getPipelineExecutablePropertiesKHR(pipeline_info); + for (uint32_t executable_index = 0; executable_index < executables.size(); ++executable_index) { + vk::PipelineExecutableInfoKHR executable_info; + executable_info.pipeline = pipeline->pipeline; + executable_info.executableIndex = executable_index; + auto statistics = device->device.getPipelineExecutableStatisticsKHR(executable_info); + for (const auto & stat : statistics) { + pipeline->profiling_stats[stat.name] = ggml_vk_profiler_format_statistic(stat); + } + } + } catch (const vk::SystemError& e) { + GGML_LOG_WARN("ggml_vulkan: failed to query pipeline executable statistics for %s: %s\n", pipeline->name.c_str(), e.what()); + } + pipeline->profiling_stats_cached = true; + } + if (pipeline->register_count && pipeline->profiling_stats.find("Register Count") == pipeline->profiling_stats.end()) { + pipeline->profiling_stats["Register Count"] = std::to_string(pipeline->register_count); + } +} + +static void 
ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t estimated_dispatches) { + if (!vk_profiling_enabled) { + return; + } + + if (!ctx->profiling) { + ctx->profiling = std::make_unique(); + } + + vk_profiling_state & profiler = *ctx->profiling; + profiler.dispatches.clear(); + profiler.next_query = 0; + profiler.overflowed = false; + profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0; + + if (!profiler.timestamps_supported) { + if (!profiler.warned_no_timestamps) { + GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", ctx->device->name.c_str()); + profiler.warned_no_timestamps = true; + } + return; + } + + uint32_t queries_needed = std::max(256u, estimated_dispatches * 2u + 2u); + if (queries_needed > profiler.capacity) { + if (profiler.query_pool) { + ctx->device->device.destroyQueryPool(profiler.query_pool); + } + vk::QueryPoolCreateInfo query_info; + query_info.queryType = vk::QueryType::eTimestamp; + query_info.queryCount = queries_needed; + profiler.query_pool = ctx->device->device.createQueryPool(query_info); + profiler.capacity = queries_needed; + } + + if (profiler.query_pool) { + ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); + } + + if (!profiler.logged_features) { + GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamps %s, pipeline executable properties %s)%s\n", + ctx->device->name.c_str(), + profiler.timestamps_supported ? "enabled" : "unavailable", + ctx->device->pipeline_executable_properties_support ? "available" : "unavailable", + vk_profiling_json_enabled ? 
" [json output]" : ""); + profiler.logged_features = true; + } +} + +static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { + if (!vk_profiling_enabled || !ctx->profiling) { + return; + } + + vk_profiling_state & profiler = *ctx->profiling; + if (!profiler.timestamps_supported || profiler.dispatches.empty() || !profiler.query_pool) { + profiler.dispatches.clear(); + profiler.next_query = 0; + return; + } + + if (profiler.overflowed) { + GGML_LOG_WARN("ggml_vulkan: profiling query pool exhausted on %s; results incomplete\n", ctx->device->name.c_str()); + profiler.dispatches.clear(); + profiler.next_query = 0; + return; + } + + const uint32_t query_count = profiler.next_query; + if (query_count == 0) { + return; + } + + std::vector timestamps(query_count); + VK_CHECK(ctx->device->device.getQueryPoolResults(profiler.query_pool, + 0, + query_count, + query_count * sizeof(uint64_t), + timestamps.data(), + sizeof(uint64_t), + vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), + "get profiling timestamps"); + + double timestamp_period = ctx->device->properties.limits.timestampPeriod; + + struct vk_profiling_pipeline_summary { + std::string name; + std::string phase; + double total_ns = 0.0; + uint32_t count = 0; + vk_pipeline pipeline; + }; + + std::map, vk_profiling_pipeline_summary> summary_map; + std::vector dispatch_times_us; + dispatch_times_us.reserve(profiler.dispatches.size()); + + for (size_t i = 0; i < profiler.dispatches.size(); ++i) { + const auto & record = profiler.dispatches[i]; + double duration_us = 0.0; + if (record.query_begin < query_count && record.query_end < query_count) { + uint64_t start = timestamps[record.query_begin]; + uint64_t end = timestamps[record.query_end]; + double duration_ns = double(end - start) * timestamp_period; + duration_us = duration_ns / 1000.0; + + auto key = std::make_pair(record.pipeline_name, record.phase); + auto & entry = summary_map[key]; + entry.name = record.pipeline_name; + entry.phase = 
record.phase; + entry.total_ns += duration_ns; + entry.count += 1; + if (record.pipeline) { + entry.pipeline = record.pipeline; + } + } else { + GGML_LOG_WARN("ggml_vulkan: profiling query index out of range for %s\n", record.pipeline_name.c_str()); + } + dispatch_times_us.push_back(duration_us); + } + + if (!profiler.dispatches.empty()) { + GGML_LOG_INFO("ggml_vulkan: profiling dispatches for %s\n", ctx->device->name.c_str()); + } + + for (size_t i = 0; i < profiler.dispatches.size(); ++i) { + const auto & record = profiler.dispatches[i]; + double duration_us = dispatch_times_us[i]; + GGML_LOG_INFO(" dispatch %zu [%s] %s -> %.3f us (wg=%u,%u,%u)\n", + i, + record.phase.c_str(), + record.pipeline_name.c_str(), + duration_us, + record.workgroups[0], + record.workgroups[1], + record.workgroups[2]); + } + + std::vector summaries; + summaries.reserve(summary_map.size()); + for (auto & kv : summary_map) { + summaries.push_back(kv.second); + } + + std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary& a, const vk_profiling_pipeline_summary& b) { + return a.total_ns > b.total_ns; + }); + + for (const auto & entry : summaries) { + double avg_us = entry.count ? 
(entry.total_ns / entry.count) / 1000.0 : 0.0; + double total_us = entry.total_ns / 1000.0; + + std::string stats_suffix; + if (entry.pipeline) { + ggml_vk_profiler_cache_pipeline_stats(ctx->device, entry.pipeline); + std::vector> stats; + for (const auto & stat : entry.pipeline->profiling_stats) { + if (ggml_vk_profiler_is_relevant_stat(stat.first)) { + stats.emplace_back(stat.first, stat.second); + } + } + if (!stats.empty()) { + std::ostringstream stats_stream; + stats_stream << " stats: "; + for (size_t i = 0; i < stats.size(); ++i) { + if (i != 0) { + stats_stream << ", "; + } + stats_stream << stats[i].first << "=" << stats[i].second; + } + stats_suffix = stats_stream.str(); + } + } + + GGML_LOG_INFO(" summary [%s] %s dispatches=%u avg=%.3f us total=%.3f us%s\n", + entry.phase.c_str(), + entry.name.c_str(), + entry.count, + avg_us, + total_us, + stats_suffix.c_str()); + } + + if (vk_profiling_json_enabled && !profiler.dispatches.empty()) { + std::ostringstream json; + json << "{\n"; + json << " \"device\": \"" << ggml_vk_profiler_json_escape(ctx->device->name) << "\",\n"; + json << " \"timestamp_period_ns\": " << timestamp_period << ",\n"; + json << " \"dispatches\": [\n"; + for (size_t i = 0; i < profiler.dispatches.size(); ++i) { + const auto & record = profiler.dispatches[i]; + json << " {\n"; + json << " \"pipeline\": \"" << ggml_vk_profiler_json_escape(record.pipeline_name) << "\",\n"; + json << " \"phase\": \"" << ggml_vk_profiler_json_escape(record.phase) << "\",\n"; + std::ostringstream time_stream; + time_stream.setf(std::ios::fixed); + time_stream << std::setprecision(3) << dispatch_times_us[i]; + json << " \"time_us\": " << time_stream.str() << ",\n"; + json << " \"workgroups\": [" << record.workgroups[0] << ", " << record.workgroups[1] << ", " << record.workgroups[2] << "],\n"; + json << " \"executables\": {"; + bool first = true; + if (record.pipeline) { + ggml_vk_profiler_cache_pipeline_stats(ctx->device, record.pipeline); + for (const auto & 
stat : record.pipeline->profiling_stats) { + if (!ggml_vk_profiler_is_relevant_stat(stat.first)) { + continue; + } + if (!first) { + json << ", "; + } + json << "\"" << ggml_vk_profiler_json_escape(stat.first) << "\": \"" << ggml_vk_profiler_json_escape(stat.second) << "\""; + first = false; + } + } + json << "}\n"; + json << " }"; + if (i + 1 < profiler.dispatches.size()) { + json << ","; + } + json << "\n"; + } + json << " ]\n"; + json << "}\n"; + GGML_LOG_INFO("%s", json.str().c_str()); + } + + profiler.dispatches.clear(); + profiler.next_query = 0; +} + struct ggml_backend_vk_context { std::string name; @@ -1454,6 +1810,8 @@ struct ggml_backend_vk_context { // number of additional consecutive nodes that are being fused with the // node currently being processed int num_additional_fused_ops {}; + + std::unique_ptr profiling; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -1536,6 +1894,8 @@ static bool vk_instance_initialized = false; static vk_instance_t vk_instance; static bool vk_perf_logger_enabled = false; +static bool vk_profiling_enabled = false; +static bool vk_profiling_json_enabled = false; #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; @@ -4574,6 +4934,19 @@ static void ggml_vk_instance_init() { vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + const char * profiling_env = getenv("GGML_VK_PROFILING"); + if (profiling_env != nullptr) { + vk_profiling_enabled = true; + std::string profiling_value = profiling_env; + std::string profiling_value_lower = profiling_value; + std::transform(profiling_value_lower.begin(), profiling_value_lower.end(), profiling_value_lower.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + vk_profiling_json_enabled = profiling_value_lower.find("json") != std::string::npos; + } else { + vk_profiling_enabled = false; + vk_profiling_json_enabled = false; + } + // See 
https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers- VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance); @@ -5223,7 +5596,41 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& 0, { descriptor_set }, {}); + bool profile_dispatch = false; + uint32_t query_begin = 0; + uint32_t query_end = 0; + vk_profiling_state * profiler_state = nullptr; + if (vk_profiling_enabled && ctx->profiling && ctx->profiling->timestamps_supported && ctx->profiling->query_pool && + !ctx->profiling->overflowed && ggml_vk_profiler_matches_pipeline(pipeline->name)) { + profiler_state = ctx->profiling.get(); + if (profiler_state->next_query + 1 < profiler_state->capacity) { + query_begin = profiler_state->next_query++; + query_end = profiler_state->next_query++; + profile_dispatch = true; + + vk_profiling_dispatch_record record; + record.pipeline = pipeline; + record.pipeline_name = pipeline->name; + record.phase = ggml_vk_profiler_phase(pipeline->name); + record.query_begin = query_begin; + record.query_end = query_end; + record.elements = elements; + record.workgroups = { wg0, wg1, wg2 }; + profiler_state->dispatches.push_back(std::move(record)); + } else { + profiler_state->overflowed = true; + } + } + + if (profile_dispatch) { + subctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eComputeShader, profiler_state->query_pool, query_begin); + } + subctx->s->buffer.dispatch(wg0, wg1, wg2); + + if (profile_dispatch) { + subctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eComputeShader, profiler_state->query_pool, query_end); + } } static void ggml_vk_end_submission(vk_submission& s, std::vector wait_semaphores, std::vector signal_semaphores) { @@ -11470,6 +11877,12 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->descriptor_pools.clear(); ctx->descriptor_sets.clear(); + if (ctx->profiling && ctx->profiling->query_pool) { + 
ctx->device->device.destroyQueryPool(ctx->profiling->query_pool); + ctx->profiling->query_pool = vk::QueryPool{}; + } + ctx->profiling.reset(); + ctx->compute_cmd_pool.destroy(ctx->device->device); ctx->transfer_cmd_pool.destroy(ctx->device->device); } @@ -11973,6 +12386,11 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Reserve tensor context space for all nodes ctx->tensor_ctxs.resize(cgraph->n_nodes); + if (vk_profiling_enabled) { + uint32_t expected_dispatches = (uint32_t)(std::max(1, cgraph->n_nodes) * 6); + ggml_vk_profiler_begin_graph(ctx, expected_dispatches); + } + bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch @@ -12110,6 +12528,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->perf_logger->print_timings(); } + if (vk_profiling_enabled) { + ggml_vk_profiler_end_graph(ctx); + } + ggml_vk_graph_cleanup(ctx); return GGML_STATUS_SUCCESS; From 64535ff5c380117662f810484cfe8c795b2507ca Mon Sep 17 00:00:00 2001 From: rasbid <104773487+rasbid@users.noreply.github.com> Date: Tue, 7 Oct 2025 10:57:48 +0300 Subject: [PATCH 2/4] Fix Vulkan profiling context initialization --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 360 ++++++++++++++------------- 1 file changed, 183 insertions(+), 177 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 673cb0f0286..bd275150662 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1535,6 +1535,138 @@ static void ggml_vk_profiler_cache_pipeline_stats(vk_device& device, vk_pipeline } } +struct ggml_backend_vk_context { + std::string name; + + vk_device device; + + size_t semaphore_idx, event_idx; + ggml_vk_garbage_collector gc; + size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, 
prealloc_size_add_rms_partials_offset; + vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials; + vk::Fence fence, almost_ready_fence; + bool almost_ready_fence_pending {}; + // Set before op_add and unset after op_rms_norm to indicate that the add should + // write partial sums to accumulate the square of the vector components + bool do_add_rms_partials; + + // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. + vk_pipeline_struct * prealloc_y_last_pipeline_used {}; + const ggml_tensor * prealloc_y_last_tensor_used {}; + + // Track which nodes have been used since the last sync, and whether they were written to + std::vector unsynced_nodes_written; + std::vector unsynced_nodes_read; + // Track which prealloc buffers have pending reads that need to be synchronized. + // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set), + // and set to true after the buffer contents are consumed. 
+ bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync; + + vk_buffer buffer_pool[MAX_VK_BUFFERS]; + + vk_context_ref compute_ctx; + vk_context_ref transfer_ctx; + + std::vector tensor_ctxs; + + std::vector descriptor_pools; + std::vector descriptor_sets; + uint32_t descriptor_set_idx {}; + uint32_t pipeline_descriptor_set_requirements {}; + + vk_command_pool compute_cmd_pool; + vk_command_pool transfer_cmd_pool; + + // number of additional consecutive nodes that are being fused with the + // node currently being processed + int num_additional_fused_ops {}; + + std::unique_ptr profiling; +}; + +static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; +} + +struct ggml_backend_vk_buffer_context { + vk_device_ref device; + vk_buffer dev_buffer; + std::string name; + + ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : + device(device), + dev_buffer(dev_buffer), + name(name) { + } + + ~ggml_backend_vk_buffer_context() { + ggml_vk_destroy_buffer(dev_buffer); + } +}; + +#ifdef GGML_VULKAN_MEMORY_DEBUG +static std::mutex log_mutex; + +void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { + std::lock_guard guard(log_mutex); + vk_buffer buf = buf_ref.lock(); + const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); + const std::string type = device ? "device" : "host"; + allocations[buf->buffer] = size; + total_device += device ? size : 0; + total_host += device ? 0 : size; + VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". 
Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); +} + +void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { + if (buf_ref.expired() || buf_ref.lock()->size == 0) { + return; + } + + std::lock_guard guard(log_mutex); + vk_buffer buf = buf_ref.lock(); + const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); + std::string type = device ? "device" : "host"; + auto it = allocations.find(buf->buffer); + total_device -= device ? it->second : 0; + total_host -= device ? 0 : it->second; + if (it != allocations.end()) { + VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); + allocations.erase(it); + } else { + VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer); + } +} +#endif // GGML_VULKAN_MEMORY_DEBUG + +struct vk_instance_t { + vk::Instance instance; + + bool debug_utils_support = false; // VK_EXT_debug_utils enabled + PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; + PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; + PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; + PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; + PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; + PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; + + std::vector device_indices; + std::vector device_supports_membudget; + vk_device devices[GGML_VK_MAX_DEVICES]; +}; + +static bool vk_instance_initialized = false; +static vk_instance_t vk_instance; + +static bool vk_perf_logger_enabled = false; +static bool vk_profiling_enabled = false; +static bool vk_profiling_json_enabled = false; + static void 
ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t estimated_dispatches) { if (!vk_profiling_enabled) { return; @@ -1545,43 +1677,49 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t } vk_profiling_state & profiler = *ctx->profiling; - profiler.dispatches.clear(); - profiler.next_query = 0; profiler.overflowed = false; - profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0; - if (!profiler.timestamps_supported) { - if (!profiler.warned_no_timestamps) { - GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", ctx->device->name.c_str()); - profiler.warned_no_timestamps = true; + if (!profiler.logged_features) { + profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0; + + if (!profiler.timestamps_supported) { + if (!profiler.warned_no_timestamps) { + profiler.warned_no_timestamps = true; + GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", + ctx->device->name.c_str()); + } + return; } - return; - } - uint32_t queries_needed = std::max(256u, estimated_dispatches * 2u + 2u); - if (queries_needed > profiler.capacity) { if (profiler.query_pool) { ctx->device->device.destroyQueryPool(profiler.query_pool); + profiler.query_pool = {}; } - vk::QueryPoolCreateInfo query_info; - query_info.queryType = vk::QueryType::eTimestamp; - query_info.queryCount = queries_needed; - profiler.query_pool = ctx->device->device.createQueryPool(query_info); - profiler.capacity = queries_needed; - } - if (profiler.query_pool) { + profiler.capacity = std::max(estimated_dispatches * 2u, 256u); + + vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity); + profiler.query_pool = ctx->device->device.createQueryPool(query_info); ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); - } + profiler.logged_features = true; - if 
(!profiler.logged_features) { - GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamps %s, pipeline executable properties %s)%s\n", + GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamp support: %s, pipeline stats: %s%s)\n", ctx->device->name.c_str(), - profiler.timestamps_supported ? "enabled" : "unavailable", + profiler.timestamps_supported ? "available" : "unavailable", ctx->device->pipeline_executable_properties_support ? "available" : "unavailable", vk_profiling_json_enabled ? " [json output]" : ""); - profiler.logged_features = true; + } else { + if (!profiler.timestamps_supported) { + return; + } + + if (profiler.query_pool && profiler.next_query) { + ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); + } } + + profiler.next_query = 0; + profiler.dispatches.clear(); } static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { @@ -1590,21 +1728,17 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { } vk_profiling_state & profiler = *ctx->profiling; - if (!profiler.timestamps_supported || profiler.dispatches.empty() || !profiler.query_pool) { - profiler.dispatches.clear(); - profiler.next_query = 0; + + if (!profiler.timestamps_supported || !profiler.query_pool) { return; } if (profiler.overflowed) { GGML_LOG_WARN("ggml_vulkan: profiling query pool exhausted on %s; results incomplete\n", ctx->device->name.c_str()); - profiler.dispatches.clear(); - profiler.next_query = 0; - return; } const uint32_t query_count = profiler.next_query; - if (query_count == 0) { + if (query_count == 0 || profiler.dispatches.empty()) { return; } @@ -1612,19 +1746,22 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { VK_CHECK(ctx->device->device.getQueryPoolResults(profiler.query_pool, 0, query_count, - query_count * sizeof(uint64_t), + sizeof(uint64_t) * query_count, timestamps.data(), sizeof(uint64_t), - vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), - 
"get profiling timestamps"); + vk::QueryResultFlagBits::e64), + "getQueryPoolResults"); double timestamp_period = ctx->device->properties.limits.timestampPeriod; + if (timestamp_period == 0.0) { + timestamp_period = 1.0; + } struct vk_profiling_pipeline_summary { std::string name; std::string phase; - double total_ns = 0.0; - uint32_t count = 0; + double total_ns {}; + uint32_t count {}; vk_pipeline pipeline; }; @@ -1632,13 +1769,13 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { std::vector dispatch_times_us; dispatch_times_us.reserve(profiler.dispatches.size()); - for (size_t i = 0; i < profiler.dispatches.size(); ++i) { - const auto & record = profiler.dispatches[i]; + for (const auto & record : profiler.dispatches) { double duration_us = 0.0; - if (record.query_begin < query_count && record.query_end < query_count) { - uint64_t start = timestamps[record.query_begin]; - uint64_t end = timestamps[record.query_end]; - double duration_ns = double(end - start) * timestamp_period; + + if (record.query_end < timestamps.size() && record.query_begin < timestamps.size()) { + const uint64_t start = timestamps[record.query_begin]; + const uint64_t end = timestamps[record.query_end]; + const double duration_ns = double(end - start) * timestamp_period; duration_us = duration_ns / 1000.0; auto key = std::make_pair(record.pipeline_name, record.phase); @@ -1653,6 +1790,7 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { } else { GGML_LOG_WARN("ggml_vulkan: profiling query index out of range for %s\n", record.pipeline_name.c_str()); } + dispatch_times_us.push_back(duration_us); } @@ -1662,7 +1800,7 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { for (size_t i = 0; i < profiler.dispatches.size(); ++i) { const auto & record = profiler.dispatches[i]; - double duration_us = dispatch_times_us[i]; + const double duration_us = dispatch_times_us[i]; GGML_LOG_INFO(" dispatch %zu [%s] %s -> %.3f us 
(wg=%u,%u,%u)\n", i, record.phase.c_str(), @@ -1679,13 +1817,13 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { summaries.push_back(kv.second); } - std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary& a, const vk_profiling_pipeline_summary& b) { + std::sort(summaries.begin(), summaries.end(), [](const vk_profiling_pipeline_summary & a, const vk_profiling_pipeline_summary & b) { return a.total_ns > b.total_ns; }); for (const auto & entry : summaries) { - double avg_us = entry.count ? (entry.total_ns / entry.count) / 1000.0 : 0.0; - double total_us = entry.total_ns / 1000.0; + const double avg_us = entry.count ? (entry.total_ns / entry.count) / 1000.0 : 0.0; + const double total_us = entry.total_ns / 1000.0; std::string stats_suffix; if (entry.pipeline) { @@ -1765,138 +1903,6 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { profiler.next_query = 0; } -struct ggml_backend_vk_context { - std::string name; - - vk_device device; - - size_t semaphore_idx, event_idx; - ggml_vk_garbage_collector gc; - size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset; - vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials; - vk::Fence fence, almost_ready_fence; - bool almost_ready_fence_pending {}; - // Set before op_add and unset after op_rms_norm to indicate that the add should - // write partial sums to accumulate the square of the vector components - bool do_add_rms_partials; - - // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. 
- vk_pipeline_struct * prealloc_y_last_pipeline_used {}; - const ggml_tensor * prealloc_y_last_tensor_used {}; - - // Track which nodes have been used since the last sync, and whether they were written to - std::vector unsynced_nodes_written; - std::vector unsynced_nodes_read; - // Track which prealloc buffers have pending reads that need to be synchronized. - // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set), - // and set to true after the buffer contents are consumed. - bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync; - - vk_buffer buffer_pool[MAX_VK_BUFFERS]; - - vk_context_ref compute_ctx; - vk_context_ref transfer_ctx; - - std::vector tensor_ctxs; - - std::vector descriptor_pools; - std::vector descriptor_sets; - uint32_t descriptor_set_idx {}; - uint32_t pipeline_descriptor_set_requirements {}; - - vk_command_pool compute_cmd_pool; - vk_command_pool transfer_cmd_pool; - - // number of additional consecutive nodes that are being fused with the - // node currently being processed - int num_additional_fused_ops {}; - - std::unique_ptr profiling; -}; - -static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - -static uint64_t vk_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; -} - -struct ggml_backend_vk_buffer_context { - vk_device_ref device; - vk_buffer dev_buffer; - std::string name; - - ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) : - device(device), - dev_buffer(dev_buffer), - name(name) { - } - - ~ggml_backend_vk_buffer_context() { - ggml_vk_destroy_buffer(dev_buffer); - } -}; - -#ifdef GGML_VULKAN_MEMORY_DEBUG -static std::mutex log_mutex; - -void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) { - std::lock_guard guard(log_mutex); - vk_buffer buf 
= buf_ref.lock(); - const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); - const std::string type = device ? "device" : "host"; - allocations[buf->buffer] = size; - total_device += device ? size : 0; - total_host += device ? 0 : size; - VK_LOG_MEMORY(buf->device->name << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); -} - -void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { - if (buf_ref.expired() || buf_ref.lock()->size == 0) { - return; - } - - std::lock_guard guard(log_mutex); - vk_buffer buf = buf_ref.lock(); - const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal); - std::string type = device ? "device" : "host"; - auto it = allocations.find(buf->buffer); - total_device -= device ? it->second : 0; - total_host -= device ? 0 : it->second; - if (it != allocations.end()) { - VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". 
Total device: " << format_size(total_device) << ", total host: " << format_size(total_host)); - allocations.erase(it); - } else { - VK_LOG_MEMORY("ERROR " << buf->device->name << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer); - } -} -#endif // GGML_VULKAN_MEMORY_DEBUG - -struct vk_instance_t { - vk::Instance instance; - - bool debug_utils_support = false; // VK_EXT_debug_utils enabled - PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; - PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; - PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; - PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; - PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; - PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; - - std::vector device_indices; - std::vector device_supports_membudget; - vk_device devices[GGML_VK_MAX_DEVICES]; -}; - -static bool vk_instance_initialized = false; -static vk_instance_t vk_instance; - -static bool vk_perf_logger_enabled = false; -static bool vk_profiling_enabled = false; -static bool vk_profiling_json_enabled = false; - #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; From 879a789d034737a61b476aedfb32dfde27309d63 Mon Sep 17 00:00:00 2001 From: rasbid <104773487+rasbid@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:17:53 +0300 Subject: [PATCH 3/4] Fix Vulkan profiling build errors --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index bd275150662..87e70ca4769 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1693,7 +1693,7 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t if (profiler.query_pool) { 
ctx->device->device.destroyQueryPool(profiler.query_pool); - profiler.query_pool = {}; + profiler.query_pool = vk::QueryPool{}; } profiler.capacity = std::max(estimated_dispatches * 2u, 256u); @@ -1827,9 +1827,10 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { std::string stats_suffix; if (entry.pipeline) { - ggml_vk_profiler_cache_pipeline_stats(ctx->device, entry.pipeline); + vk_pipeline pipeline = entry.pipeline; + ggml_vk_profiler_cache_pipeline_stats(ctx->device, pipeline); std::vector> stats; - for (const auto & stat : entry.pipeline->profiling_stats) { + for (const auto & stat : pipeline->profiling_stats) { if (ggml_vk_profiler_is_relevant_stat(stat.first)) { stats.emplace_back(stat.first, stat.second); } @@ -1875,8 +1876,9 @@ static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) { json << " \"executables\": {"; bool first = true; if (record.pipeline) { - ggml_vk_profiler_cache_pipeline_stats(ctx->device, record.pipeline); - for (const auto & stat : record.pipeline->profiling_stats) { + vk_pipeline pipeline = record.pipeline; + ggml_vk_profiler_cache_pipeline_stats(ctx->device, pipeline); + for (const auto & stat : pipeline->profiling_stats) { if (!ggml_vk_profiler_is_relevant_stat(stat.first)) { continue; } From c2d82980aba6755a56192c923a22ec79227fd8d3 Mon Sep 17 00:00:00 2001 From: rasbid <104773487+rasbid@users.noreply.github.com> Date: Tue, 7 Oct 2025 13:30:42 +0300 Subject: [PATCH 4/4] Handle dynamic Vulkan profiling query pool sizing --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 43 +++++++++++++++++----------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 87e70ca4769..8caefdb28cb 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1679,6 +1679,11 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t vk_profiling_state & profiler = 
*ctx->profiling; profiler.overflowed = false; + const uint32_t min_dispatch_guess = std::max(estimated_dispatches, 1u); + const uint32_t max_dispatch_guess = std::numeric_limits::max() / 2u; + const uint32_t clamped_dispatch_guess = std::min(min_dispatch_guess, max_dispatch_guess); + const uint32_t required_queries = std::max(clamped_dispatch_guess * 2u, 256u); + if (!profiler.logged_features) { profiler.timestamps_supported = ctx->device->properties.limits.timestampComputeAndGraphics != 0; @@ -1688,19 +1693,11 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t GGML_LOG_WARN("ggml_vulkan: device %s does not support compute timestamps; profiling disabled\n", ctx->device->name.c_str()); } + profiler.dispatches.clear(); + profiler.next_query = 0; return; } - if (profiler.query_pool) { - ctx->device->device.destroyQueryPool(profiler.query_pool); - profiler.query_pool = vk::QueryPool{}; - } - - profiler.capacity = std::max(estimated_dispatches * 2u, 256u); - - vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity); - profiler.query_pool = ctx->device->device.createQueryPool(query_info); - ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); profiler.logged_features = true; GGML_LOG_INFO("ggml_vulkan: profiling enabled for %s (timestamp support: %s, pipeline stats: %s%s)\n", @@ -1708,18 +1705,32 @@ static void ggml_vk_profiler_begin_graph(ggml_backend_vk_context * ctx, uint32_t profiler.timestamps_supported ? "available" : "unavailable", ctx->device->pipeline_executable_properties_support ? "available" : "unavailable", vk_profiling_json_enabled ? 
" [json output]" : ""); - } else { - if (!profiler.timestamps_supported) { - return; - } + } else if (!profiler.timestamps_supported) { + profiler.dispatches.clear(); + profiler.next_query = 0; + return; + } - if (profiler.query_pool && profiler.next_query) { - ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); + if (!profiler.query_pool || required_queries > profiler.capacity) { + if (profiler.query_pool) { + ctx->device->device.destroyQueryPool(profiler.query_pool); + profiler.query_pool = vk::QueryPool{}; } + + profiler.capacity = required_queries; + + vk::QueryPoolCreateInfo query_info({}, vk::QueryType::eTimestamp, profiler.capacity); + profiler.query_pool = ctx->device->device.createQueryPool(query_info); + } + + if (profiler.query_pool) { + ctx->device->device.resetQueryPool(profiler.query_pool, 0, profiler.capacity); } profiler.next_query = 0; profiler.dispatches.clear(); + const size_t dispatch_capacity_hint = std::min(clamped_dispatch_guess, profiler.capacity / 2u); + profiler.dispatches.reserve(dispatch_capacity_hint); } static void ggml_vk_profiler_end_graph(ggml_backend_vk_context * ctx) {