From 1308d286341016468a057bef0ddddec3acef4dee Mon Sep 17 00:00:00 2001 From: stainlu Date: Thu, 9 Apr 2026 02:05:47 +0800 Subject: [PATCH] Add GPU timestamp queries for per-pass GPU profiling Extend the RHI with QuerySet, timestamp query methods, and copy_buffer_to_buffer so the renderer can measure per-pass GPU time via wgpu timestamp queries. The CPU Profiler now distinguishes Cpu vs Gpu sections and GPU timings appear in the same frame summary. Co-Authored-By: Claude Sonnet 4.6 --- crates/euca-agent/src/routes/profile.rs | 3 +- crates/euca-core/src/lib.rs | 2 +- crates/euca-core/src/profiler.rs | 83 ++++++- crates/euca-render/src/deferred.rs | 2 + crates/euca-render/src/gpu.rs | 16 ++ crates/euca-render/src/hardware.rs | 10 + crates/euca-render/src/post_process.rs | 1 + crates/euca-render/src/prepass.rs | 1 + crates/euca-render/src/renderer.rs | 289 ++++++++++++++++++++---- crates/euca-render/src/ssr.rs | 1 + crates/euca-render/src/ui_overlay.rs | 1 + crates/euca-render/src/velocity.rs | 1 + crates/euca-render/src/volumetric.rs | 1 + crates/euca-rhi/src/lib.rs | 53 +++++ crates/euca-rhi/src/metal_backend.rs | 30 ++- crates/euca-rhi/src/types.rs | 19 ++ crates/euca-rhi/src/wgpu_backend.rs | 89 +++++++- crates/euca-rhi/tests/metal_smoke.rs | 1 + examples/metal_combined.rs | 1 + examples/metal_cubes.rs | 1 + examples/metal_fx_upscale.rs | 1 + examples/metal_mesh_stress.rs | 1 + examples/metal_stress.rs | 1 + 23 files changed, 559 insertions(+), 49 deletions(-) diff --git a/crates/euca-agent/src/routes/profile.rs b/crates/euca-agent/src/routes/profile.rs index 6a90dc8..80453bf 100644 --- a/crates/euca-agent/src/routes/profile.rs +++ b/crates/euca-agent/src/routes/profile.rs @@ -18,10 +18,11 @@ pub async fn profile(State(world): State) -> Json = profiler .frame_summary() .iter() - .map(|(name, us)| { + .map(|(name, us, kind)| { serde_json::json!({ "name": name, "us": (*us * 10.0).round() / 10.0, + "kind": format!("{kind:?}"), }) }) .collect(); diff --git 
a/crates/euca-core/src/lib.rs b/crates/euca-core/src/lib.rs index 9f51fc8..4a96595 100644 --- a/crates/euca-core/src/lib.rs +++ b/crates/euca-core/src/lib.rs @@ -12,7 +12,7 @@ mod time; pub use app::App; pub use platform::performance_core_count; pub use plugin::Plugin; -pub use profiler::{ProfileSection, Profiler, profiler_begin, profiler_end}; +pub use profiler::{ProfileSection, ProfileSectionKind, Profiler, profiler_begin, profiler_end}; pub use time::Time; /// Re-export `winit` for downstream crates that need window types. diff --git a/crates/euca-core/src/profiler.rs b/crates/euca-core/src/profiler.rs index eda94ef..a4cb856 100644 --- a/crates/euca-core/src/profiler.rs +++ b/crates/euca-core/src/profiler.rs @@ -4,12 +4,23 @@ use std::time::Instant; /// Maximum number of frame times retained for averaging. const MAX_FRAME_HISTORY: usize = 60; +/// Whether a profile section was measured on the CPU or GPU. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ProfileSectionKind { + /// Timed on the CPU via `std::time::Instant`. + Cpu, + /// Timed on the GPU via timestamp queries. + Gpu, +} + /// A recorded profile section within a single frame. pub struct ProfileSection { /// Human-readable section label. pub name: &'static str, /// Wall-clock duration of this section in microseconds. pub duration_us: f64, + /// Whether this section was timed on the CPU or GPU. + pub kind: ProfileSectionKind, } /// Built-in frame profiler that tracks per-section timings and rolling frame statistics. @@ -34,14 +45,26 @@ impl Profiler { } } - /// Return section names and durations (in microseconds) for the current frame. - pub fn frame_summary(&self) -> Vec<(&str, f64)> { + /// Return section names, durations (microseconds), and kind for the current frame. 
+ pub fn frame_summary(&self) -> Vec<(&str, f64, ProfileSectionKind)> { self.sections .iter() - .map(|s| (s.name, s.duration_us)) + .map(|s| (s.name, s.duration_us, s.kind)) .collect() } + /// Record a GPU-timed section (called after timestamp readback). + /// + /// GPU sections are distinguished from CPU sections in the summary so + /// callers can display them separately or side-by-side. + pub fn record_gpu_section(&mut self, name: &'static str, duration_us: f64) { + self.sections.push(ProfileSection { + name, + duration_us, + kind: ProfileSectionKind::Gpu, + }); + } + /// Average frame time in milliseconds over the last [`MAX_FRAME_HISTORY`] frames. /// /// Returns `0.0` when no frame times have been recorded yet. @@ -64,10 +87,18 @@ impl Profiler { 1000.0 / avg } - /// Finish the current frame: record total frame time from all sections and reset - /// the section list for the next frame. + /// Finish the current frame: record total CPU frame time and reset the + /// section list for the next frame. + /// + /// Only CPU sections contribute to the rolling frame-time average. GPU + /// sections run in parallel with CPU work and are reported separately. pub fn end_frame(&mut self) { - let total_us: f64 = self.sections.iter().map(|s| s.duration_us).sum(); + let total_us: f64 = self + .sections + .iter() + .filter(|s| s.kind == ProfileSectionKind::Cpu) + .map(|s| s.duration_us) + .sum(); let total_ms = total_us / 1000.0; if self.frame_times.len() == MAX_FRAME_HISTORY { @@ -103,6 +134,7 @@ pub fn profiler_end(profiler: &mut Profiler) { profiler.sections.push(ProfileSection { name, duration_us: elapsed.as_secs_f64() * 1_000_000.0, + kind: ProfileSectionKind::Cpu, }); } @@ -229,4 +261,43 @@ mod tests { // Outer should be >= inner since it wraps it. 
assert!(summary[1].1 >= summary[0].1); } + + #[test] + fn cpu_sections_tagged_as_cpu() { + let mut profiler = Profiler::new(); + profiler_begin(&mut profiler, "cpu_work"); + profiler_end(&mut profiler); + + let summary = profiler.frame_summary(); + assert_eq!(summary[0].2, ProfileSectionKind::Cpu); + } + + #[test] + fn gpu_section_recorded_and_tagged() { + let mut profiler = Profiler::new(); + profiler.record_gpu_section("shadow_pass", 123.4); + + let summary = profiler.frame_summary(); + assert_eq!(summary.len(), 1); + assert_eq!(summary[0].0, "shadow_pass"); + assert!((summary[0].1 - 123.4).abs() < f64::EPSILON); + assert_eq!(summary[0].2, ProfileSectionKind::Gpu); + } + + #[test] + fn gpu_sections_excluded_from_frame_time() { + let mut profiler = Profiler::new(); + + // Add a CPU section and a GPU section. + profiler_begin(&mut profiler, "cpu"); + thread::sleep(Duration::from_millis(1)); + profiler_end(&mut profiler); + profiler.record_gpu_section("gpu_pass", 5000.0); // 5ms in us + + profiler.end_frame(); + + // Frame time should only include the ~1ms CPU section, not the 5ms GPU section. 
+ let avg = profiler.avg_frame_time_ms(); + assert!(avg < 50.0, "avg should not include GPU section: {avg}"); + } } diff --git a/crates/euca-render/src/deferred.rs b/crates/euca-render/src/deferred.rs index 90cfd22..c5aa573 100644 --- a/crates/euca-render/src/deferred.rs +++ b/crates/euca-render/src/deferred.rs @@ -520,6 +520,7 @@ impl DeferredPipeline { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(&self.gbuffer_pipeline); @@ -546,6 +547,7 @@ impl DeferredPipeline { }, })], depth_stencil_attachment: None, + timestamp_writes: None, }, ); pass.set_pipeline(&self.lighting_pipeline); diff --git a/crates/euca-render/src/gpu.rs b/crates/euca-render/src/gpu.rs index 20d14b1..fcc67e7 100644 --- a/crates/euca-render/src/gpu.rs +++ b/crates/euca-render/src/gpu.rs @@ -108,6 +108,12 @@ impl GpuContext { required_features |= wgpu::Features::MULTI_DRAW_INDIRECT_COUNT; } + // GPU timestamp queries for per-pass GPU profiling. + if supported.contains(wgpu::Features::TIMESTAMP_QUERY) { + required_features |= wgpu::Features::TIMESTAMP_QUERY; + log::info!("GPU supports TIMESTAMP_QUERY — GPU pass timing enabled"); + } + // Bindless materials: texture binding arrays + non-uniform indexing. 
let bindless_features = wgpu::Features::TEXTURE_BINDING_ARRAY | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING; @@ -173,6 +179,8 @@ impl GpuContext { let unified_memory = survey.supports_unified_memory(); + let has_timestamp_query = required_features.contains(wgpu::Features::TIMESTAMP_QUERY); + let capabilities = Capabilities { unified_memory, multi_draw_indirect: has_multi_draw_indirect, @@ -184,6 +192,7 @@ impl GpuContext { max_bindings_per_bind_group: cap_max_bindings, max_binding_array_elements: cap_max_binding_array, device_name: adapter_info.name.clone(), + timestamp_query: has_timestamp_query, ..Default::default() }; @@ -248,6 +257,10 @@ impl GpuContext { required_features |= wgpu::Features::MULTI_DRAW_INDIRECT_COUNT; } + if supported.contains(wgpu::Features::TIMESTAMP_QUERY) { + required_features |= wgpu::Features::TIMESTAMP_QUERY; + } + let bindless_features = wgpu::Features::TEXTURE_BINDING_ARRAY | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING; if supported.contains(bindless_features) { @@ -305,6 +318,8 @@ impl GpuContext { let unified_memory = survey.supports_unified_memory(); + let has_timestamp_query_async = required_features.contains(wgpu::Features::TIMESTAMP_QUERY); + let capabilities = Capabilities { unified_memory, multi_draw_indirect: has_multi_draw_indirect, @@ -316,6 +331,7 @@ impl GpuContext { max_bindings_per_bind_group: cap_max_bindings, max_binding_array_elements: cap_max_binding_array, device_name: adapter_info.name.clone(), + timestamp_query: has_timestamp_query_async, ..Default::default() }; diff --git a/crates/euca-render/src/hardware.rs b/crates/euca-render/src/hardware.rs index 242111a..5e9711d 100644 --- a/crates/euca-render/src/hardware.rs +++ b/crates/euca-render/src/hardware.rs @@ -215,6 +215,16 @@ impl HardwareSurvey { self.selected().vendor == GpuVendor::Apple } + /// Whether the selected adapter supports GPU timestamp queries. 
+ /// + /// Timestamp queries enable per-pass GPU timing that is reported alongside + /// CPU profiler sections for a complete frame profile. + pub fn supports_timestamp_queries(&self) -> bool { + self.selected() + .features + .contains(wgpu::Features::TIMESTAMP_QUERY) + } + /// Get the selected adapter info. pub fn selected(&self) -> &AdapterInfo { &self.adapters[self.selected_adapter] diff --git a/crates/euca-render/src/post_process.rs b/crates/euca-render/src/post_process.rs index 06059c6..5586bca 100644 --- a/crates/euca-render/src/post_process.rs +++ b/crates/euca-render/src/post_process.rs @@ -990,6 +990,7 @@ fn run_fullscreen_pass( }, })], depth_stencil_attachment: None, + timestamp_writes: None, }, ); pass.set_pipeline(pipeline); diff --git a/crates/euca-render/src/prepass.rs b/crates/euca-render/src/prepass.rs index 748d1e2..4fa3780 100644 --- a/crates/euca-render/src/prepass.rs +++ b/crates/euca-render/src/prepass.rs @@ -335,6 +335,7 @@ impl PrepassPipeline { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(&self.pipeline); diff --git a/crates/euca-render/src/renderer.rs b/crates/euca-render/src/renderer.rs index b2fe66d..abd0765 100644 --- a/crates/euca-render/src/renderer.rs +++ b/crates/euca-render/src/renderer.rs @@ -311,6 +311,123 @@ const SHADOW_MAP_SIZE: u32 = 2048; const NUM_SHADOW_CASCADES: u32 = 3; const CASCADE_ORTHO_SIZES: [f32; 3] = [8.0, 20.0, 50.0]; +/// Maximum number of GPU-timed sections per frame. +/// +/// Each section uses two timestamp queries (start + end), so the query set +/// holds `MAX_GPU_TIMER_SECTIONS * 2` entries. +const MAX_GPU_TIMER_SECTIONS: u32 = 32; + +/// Per-pass GPU timing state using timestamp queries. +/// +/// Implements a 2-frame readback pipeline to avoid GPU pipeline stalls: +/// frame N writes timestamps and resolves them into `resolve_buffer`, then +/// copies to `readback_buffer`. 
Frame N+1 maps `readback_buffer` to read +/// the previous frame's timestamps while the current frame's queries land +/// in the resolve buffer. +struct GpuTimerState { + /// GPU query set holding raw timestamp values. + query_set: D::QuerySet, + /// GPU buffer that receives resolved u64 tick values + /// (QUERY_RESOLVE | COPY_SRC). + resolve_buffer: D::Buffer, + /// CPU-readable staging buffer (MAP_READ | COPY_DST). + readback_buffer: D::Buffer, + /// Sections being timed this frame: (name, query_start_index). + sections: Vec<(&'static str, u32)>, + /// Number of timestamps written this frame (each section uses 2). + timestamp_count: u32, + /// Nanoseconds per GPU timestamp tick. + timestamp_period_ns: f32, + /// Previous frame's section metadata, paired with the readback buffer + /// that should now contain their resolved timestamps. + prev_sections: Vec<(&'static str, u32)>, + /// Previous frame's timestamp count. + prev_timestamp_count: u32, +} + +impl GpuTimerState { + /// Try to create GPU timer state. Returns `None` if timestamps are unsupported. 
+ fn new(rhi: &D) -> Option { + let query_count = MAX_GPU_TIMER_SECTIONS * 2; + let query_set = rhi.create_query_set(query_count)?; + let buf_size = (query_count as u64) * std::mem::size_of::() as u64; + + let resolve_buffer = rhi.create_buffer(&euca_rhi::BufferDesc { + label: Some("GPU Timer Resolve"), + size: buf_size, + usage: euca_rhi::BufferUsages::QUERY_RESOLVE | euca_rhi::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + + let readback_buffer = rhi.create_buffer(&euca_rhi::BufferDesc { + label: Some("GPU Timer Readback"), + size: buf_size, + usage: euca_rhi::BufferUsages::MAP_READ | euca_rhi::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + let timestamp_period_ns = rhi.timestamp_period_ns(); + + Some(Self { + query_set, + resolve_buffer, + readback_buffer, + sections: Vec::with_capacity(MAX_GPU_TIMER_SECTIONS as usize), + timestamp_count: 0, + timestamp_period_ns, + prev_sections: Vec::new(), + prev_timestamp_count: 0, + }) + } + + /// Register a section to be timed this frame. + /// + /// Returns the query index pair (begin, end) for use in + /// `RenderPassTimestampWrites`, or `None` if the section limit is reached. + fn begin_section(&mut self, name: &'static str) -> Option<(u32, u32)> { + if self.timestamp_count + 2 > MAX_GPU_TIMER_SECTIONS * 2 { + return None; + } + let begin_idx = self.timestamp_count; + let end_idx = begin_idx + 1; + self.sections.push((name, begin_idx)); + self.timestamp_count += 2; + Some((begin_idx, end_idx)) + } + + /// Resolve this frame's queries and copy to the readback buffer. + /// + /// Must be called before submitting the encoder so the resolve and copy + /// commands are part of the same submission. 
+ fn resolve_and_copy(&self, rhi: &D, encoder: &mut D::CommandEncoder) { + if self.timestamp_count == 0 { + return; + } + rhi.resolve_query_set( + encoder, + &self.query_set, + 0..self.timestamp_count, + &self.resolve_buffer, + ); + let byte_count = (self.timestamp_count as u64) * std::mem::size_of::() as u64; + rhi.copy_buffer_to_buffer( + encoder, + &self.resolve_buffer, + 0, + &self.readback_buffer, + 0, + byte_count, + ); + } + + /// Rotate frame state: move current sections to `prev_*` for next frame's readback. + fn end_frame(&mut self) { + self.prev_sections = std::mem::take(&mut self.sections); + self.prev_timestamp_count = self.timestamp_count; + self.timestamp_count = 0; + } +} + /// The main PBR forward renderer. /// /// Owns all GPU pipeline state, uploaded meshes, materials, textures, and @@ -453,6 +570,8 @@ pub struct Renderer>, /// SSGI pass (screen-space global illumination with temporal accumulation). ssgi_pass: crate::ssgi::SsgiPass, /// Previous frame HDR color for SSGI temporal reprojection. @@ -1182,6 +1301,7 @@ impl Renderer { metalfx_reset_history: false, start_time: std::time::Instant::now(), water_pipeline, + gpu_timer: GpuTimerState::new(rhi), ssgi_pass: crate::ssgi::SsgiPass::new(rhi, surface_w, surface_h), prev_color_texture: { rhi.create_texture(&euca_rhi::TextureDesc { @@ -1260,6 +1380,37 @@ impl Renderer { } } + /// Read back the previous frame's GPU timestamps and record them in the profiler. + /// + /// Call this once per frame, before `end_frame()`, to populate the profiler + /// with GPU-side pass timings. Does nothing when timestamp queries are + /// unsupported or no previous frame data is available. 
+ pub fn read_gpu_timings(&self, rhi: &D, profiler: &mut euca_core::Profiler) { + let timer = match &self.gpu_timer { + Some(t) if t.prev_timestamp_count > 0 => t, + _ => return, + }; + + let timestamps = + rhi.read_timestamp_buffer(&timer.readback_buffer, timer.prev_timestamp_count); + if timestamps.is_empty() { + return; + } + + let period_ns = timer.timestamp_period_ns; + for &(name, begin_idx) in &timer.prev_sections { + let end_idx = begin_idx + 1; + if (end_idx as usize) < timestamps.len() { + let begin_tick = timestamps[begin_idx as usize]; + let end_tick = timestamps[end_idx as usize]; + if end_tick >= begin_tick { + let duration_us = (end_tick - begin_tick) as f64 * (period_ns as f64) / 1000.0; + profiler.record_gpu_section(name, duration_us); + } + } + } + } + /// Grow the main instance buffer and rebuild its bind group if `count` /// exceeds the current capacity. Returns `true` if the buffer was grown. fn ensure_instance_capacity(&mut self, rhi: &D, count: usize) -> bool { @@ -2067,6 +2218,21 @@ impl Renderer { encoder: &mut D::CommandEncoder, ) { let rhi: &D = gpu; + + // ── GPU timestamp setup ── + // Register sections for GPU timing; allocate query indices up front. + // Post-process timing is not attached here because PostProcessStack + // runs many internal sub-passes; per-pass timing can be threaded + // through PostProcessStack::execute in a future enhancement. + let shadow_ts = self + .gpu_timer + .as_mut() + .and_then(|t| t.begin_section("gpu:shadow")); + let pbr_ts = self + .gpu_timer + .as_mut() + .and_then(|t| t.begin_section("gpu:pbr")); + let vp = camera.view_projection_matrix(gpu.aspect_ratio()); let light_vp = Self::light_vp(light); let (opaque_cmds, transparent_cmds, water_cmds) = @@ -2115,6 +2281,22 @@ impl Renderer { self.ensure_shadow_instance_capacity(rhi, shadow_instances.len()); self.shadow_instance_buffer.write(&**gpu, &shadow_instances); } + // Attach shadow timestamps: begin on the first cascade, end on the last. 
+ let is_first_cascade = cascade_idx == 0; + let is_last_cascade = cascade_idx == CASCADE_ORTHO_SIZES.len() - 1; + let shadow_ts_writes = shadow_ts.and_then(|(begin_idx, end_idx)| { + self.gpu_timer + .as_ref() + .map(|t| euca_rhi::RenderPassTimestampWrites { + query_set: &t.query_set, + beginning_of_pass_write_index: if is_first_cascade { + Some(begin_idx) + } else { + None + }, + end_of_pass_write_index: if is_last_cascade { Some(end_idx) } else { None }, + }) + }); let mut pass = rhi.begin_render_pass( encoder, &euca_rhi::RenderPassDesc { @@ -2128,6 +2310,7 @@ impl Renderer { }), stencil_ops: None, }), + timestamp_writes: shadow_ts_writes, }, ); pass.set_pipeline(&self.shadow_pipeline); @@ -2316,10 +2499,23 @@ impl Renderer { self.metalfx_low_res_depth_view.as_ref().unwrap(), ) } else { - (&self.msaa_hdr_view, Some(resolve_target), &self.depth_texture) + ( + &self.msaa_hdr_view, + Some(resolve_target), + &self.depth_texture, + ) }; { + let pbr_ts_writes = pbr_ts.and_then(|(begin_idx, end_idx)| { + self.gpu_timer + .as_ref() + .map(|t| euca_rhi::RenderPassTimestampWrites { + query_set: &t.query_set, + beginning_of_pass_write_index: Some(begin_idx), + end_of_pass_write_index: Some(end_idx), + }) + }); let mut pass = rhi.begin_render_pass( encoder, &euca_rhi::RenderPassDesc { @@ -2349,6 +2545,7 @@ impl Renderer { }), stencil_ops: None, }), + timestamp_writes: pbr_ts_writes, }, ); pass.set_pipeline(&self.sky_pipeline); @@ -2540,43 +2737,43 @@ impl Renderer { &self.metalfx_output, ) { - let (jitter_x, jitter_y) = (camera.jitter[0], camera.jitter[1]); - rhi.encode_metalfx_upscale( - encoder, - upscaler.as_ref(), - low_color, - low_depth, - &self.velocity_textures.velocity_texture, - output_tex, - jitter_x, - jitter_y, - self.metalfx_reset_history, - ); - self.metalfx_reset_history = false; + let (jitter_x, jitter_y) = (camera.jitter[0], camera.jitter[1]); + rhi.encode_metalfx_upscale( + encoder, + upscaler.as_ref(), + low_color, + low_depth, + 
&self.velocity_textures.velocity_texture, + output_tex, + jitter_x, + jitter_y, + self.metalfx_reset_history, + ); + self.metalfx_reset_history = false; - // Blit MetalFX output into the post-process ping buffer so - // downstream passes (TAA, motion blur, DoF) read the upscaled image. - let (sw, sh) = rhi.surface_size(); - rhi.copy_texture_to_texture( - encoder, - &euca_rhi::TexelCopyTextureInfo { - texture: output_tex, - mip_level: 0, - origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 }, - aspect: euca_rhi::TextureAspect::All, - }, - &euca_rhi::TexelCopyTextureInfo { - texture: self.post_process_stack.ping_texture(), - mip_level: 0, - origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 }, - aspect: euca_rhi::TextureAspect::All, - }, - euca_rhi::Extent3d { - width: sw, - height: sh, - depth_or_array_layers: 1, - }, - ); + // Blit MetalFX output into the post-process ping buffer so + // downstream passes (TAA, motion blur, DoF) read the upscaled image. + let (sw, sh) = rhi.surface_size(); + rhi.copy_texture_to_texture( + encoder, + &euca_rhi::TexelCopyTextureInfo { + texture: output_tex, + mip_level: 0, + origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 }, + aspect: euca_rhi::TextureAspect::All, + }, + &euca_rhi::TexelCopyTextureInfo { + texture: self.post_process_stack.ping_texture(), + mip_level: 0, + origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 }, + aspect: euca_rhi::TextureAspect::All, + }, + euca_rhi::Extent3d { + width: sw, + height: sh, + depth_or_array_layers: 1, + }, + ); } // GPU compute particles: update (compute dispatch) then draw (render pass). @@ -2610,6 +2807,7 @@ impl Renderer { stencil_ops: None, }, ), + timestamp_writes: None, }, ); for system in &self.gpu_particle_systems { @@ -2769,6 +2967,8 @@ impl Renderer { self.frame_count = self.frame_count.wrapping_add(1); // Post-processing via the modular stack. + // (Note: the post-process stack is not GPU-timed yet; only the shadow and PBR + // passes register timestamp writes in this function.) 
{ let proj = camera.projection_matrix(gpu.aspect_ratio()); let inv_projection = proj.inverse().to_cols_array_2d(); @@ -2782,6 +2982,18 @@ impl Renderer { &projection, ); } + + // ── GPU timestamp resolve ── + // Resolve this frame's queries and copy to readback buffer. This must + // happen before the encoder is submitted by the caller. + if let Some(ref timer) = self.gpu_timer { + timer.resolve_and_copy(rhi, encoder); + } + + // Rotate GPU timer frame state so next frame reads this frame's data. + if let Some(ref mut timer) = self.gpu_timer { + timer.end_frame(); + } } /// Filter out occluded draw commands using the HZB from the previous frame. @@ -2985,8 +3197,7 @@ impl Renderer { | euca_rhi::TextureUsages::TEXTURE_BINDING, view_formats: &[], }); - let output_view = - rhi.create_texture_view(&output, &euca_rhi::TextureViewDesc::default()); + let output_view = rhi.create_texture_view(&output, &euca_rhi::TextureViewDesc::default()); // Create the MetalFX temporal scaler (panics on unsupported hardware). 
let upscaler = rhi.create_temporal_upscaler( diff --git a/crates/euca-render/src/ssr.rs b/crates/euca-render/src/ssr.rs index 43bd4b0..e0d4b47 100644 --- a/crates/euca-render/src/ssr.rs +++ b/crates/euca-render/src/ssr.rs @@ -279,6 +279,7 @@ impl SsrPass { }, })], depth_stencil_attachment: None, + timestamp_writes: None, }, ); pass.set_pipeline(&self.pipeline); diff --git a/crates/euca-render/src/ui_overlay.rs b/crates/euca-render/src/ui_overlay.rs index ec50314..20af1d9 100644 --- a/crates/euca-render/src/ui_overlay.rs +++ b/crates/euca-render/src/ui_overlay.rs @@ -173,6 +173,7 @@ impl UiOverlayRenderer { }, })], depth_stencil_attachment: None, // no depth testing for UI + timestamp_writes: None, }, ); diff --git a/crates/euca-render/src/velocity.rs b/crates/euca-render/src/velocity.rs index 0474b35..53dfd98 100644 --- a/crates/euca-render/src/velocity.rs +++ b/crates/euca-render/src/velocity.rs @@ -367,6 +367,7 @@ impl VelocityPipeline { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(&self.pipeline); diff --git a/crates/euca-render/src/volumetric.rs b/crates/euca-render/src/volumetric.rs index 92f6f75..151921d 100644 --- a/crates/euca-render/src/volumetric.rs +++ b/crates/euca-render/src/volumetric.rs @@ -425,6 +425,7 @@ impl VolumetricFogPass { }, })], depth_stencil_attachment: None, + timestamp_writes: None, }, ); pass.set_pipeline(&self.composite_pipeline); diff --git a/crates/euca-rhi/src/lib.rs b/crates/euca-rhi/src/lib.rs index ba050f2..baa2854 100644 --- a/crates/euca-rhi/src/lib.rs +++ b/crates/euca-rhi/src/lib.rs @@ -49,6 +49,7 @@ pub trait RenderDevice: 'static { where Self: 'a; type SurfaceTexture; + type QuerySet: 'static; // -- Capabilities -- @@ -106,6 +107,16 @@ pub trait RenderDevice: 'static { size: Option, ); + fn copy_buffer_to_buffer( + &self, + encoder: &mut Self::CommandEncoder, + src: &Self::Buffer, + src_offset: u64, + dst: &Self::Buffer, + dst_offset: u64, + size: u64, + ); + fn copy_texture_to_texture( 
&self, encoder: &mut Self::CommandEncoder, @@ -147,6 +158,48 @@ pub trait RenderDevice: 'static { w as f32 / h as f32 } + // -- Timestamp queries -- + + /// Create a timestamp query set with `count` slots. + /// + /// Returns `None` if the backend does not support timestamp queries. + #[allow(unused_variables)] + fn create_query_set(&self, count: u32) -> Option { + None + } + + /// Resolve timestamp queries into a destination buffer for CPU readback. + /// + /// Copies the raw u64 tick values for indices in `range` into `dest` at + /// offset 0. No-op on backends that do not support timestamp queries. + #[allow(unused_variables)] + fn resolve_query_set( + &self, + encoder: &mut Self::CommandEncoder, + query_set: &Self::QuerySet, + range: std::ops::Range, + dest: &Self::Buffer, + ) { + } + + /// Nanoseconds per GPU timestamp tick. + /// + /// Used to convert raw tick differences into wall-clock durations. + /// Returns `1.0` by default (backends override with the actual period). + fn timestamp_period_ns(&self) -> f32 { + 1.0 + } + + /// Read resolved timestamp data from a MAP_READ buffer. + /// + /// Returns the raw u64 tick values for `count` timestamps. This may + /// block briefly while the GPU finishes writing the buffer. Returns + /// an empty vec if the buffer is not ready or timestamps are unsupported. + #[allow(unused_variables)] + fn read_timestamp_buffer(&self, buffer: &Self::Buffer, count: u32) -> Vec { + Vec::new() + } + /// Encode a MetalFX temporal upscale pass. /// /// Default implementation is a no-op (non-Metal backends). 
`MetalDevice` diff --git a/crates/euca-rhi/src/metal_backend.rs b/crates/euca-rhi/src/metal_backend.rs index c10a7cf..813e893 100644 --- a/crates/euca-rhi/src/metal_backend.rs +++ b/crates/euca-rhi/src/metal_backend.rs @@ -844,6 +844,7 @@ impl RenderDevice for MetalDevice { type RenderPass<'a> = MetalRenderPass<'a>; type ComputePass<'a> = MetalComputePass<'a>; type SurfaceTexture = MetalSurfaceTexture; + type QuerySet = (); fn capabilities(&self) -> &Capabilities { &self.capabilities @@ -1422,6 +1423,31 @@ impl RenderDevice for MetalDevice { } } + fn copy_buffer_to_buffer( + &self, + encoder: &mut MetalCommandEncoder, + src: &MetalBuffer, + src_offset: u64, + dst: &MetalBuffer, + dst_offset: u64, + size: u64, + ) { + unsafe { + let blit = encoder + .command_buffer + .blitCommandEncoder() + .expect("Failed to create Metal blit encoder for buffer copy"); + blit.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size( + &src.0, + src_offset as usize, + &dst.0, + dst_offset as usize, + size as usize, + ); + blit.endEncoding(); + } + } + fn copy_texture_to_texture( &self, encoder: &mut MetalCommandEncoder, @@ -1584,7 +1610,9 @@ impl RenderDevice for MetalDevice { reset: bool, ) { if let Some(scaler) = upscaler.downcast_ref::() { - scaler.encode(encoder, color, depth, motion, output, jitter_x, jitter_y, reset); + scaler.encode( + encoder, color, depth, motion, output, jitter_x, jitter_y, reset, + ); } else { log::warn!("encode_metalfx_upscale: upscaler is not a MetalFXUpscaler — skipping"); } diff --git a/crates/euca-rhi/src/types.rs b/crates/euca-rhi/src/types.rs index 264c8f2..a4796b4 100644 --- a/crates/euca-rhi/src/types.rs +++ b/crates/euca-rhi/src/types.rs @@ -270,6 +270,7 @@ bitflags! { UNIFORM = 1 << 6; STORAGE = 1 << 7; INDIRECT = 1 << 8; + QUERY_RESOLVE = 1 << 9; } bitflags! { @@ -855,10 +856,25 @@ pub struct RenderPassDepthStencilAttachment<'a, D: RenderDevice + ?Sized> { pub stencil_ops: Option>, } +/// Timestamp query writes for a render or compute pass. 
+/// +/// When attached to a pass descriptor, the GPU writes timestamps at the +/// beginning and/or end of the pass into the given query set indices. +pub struct RenderPassTimestampWrites<'a, D: RenderDevice + ?Sized> { + /// The query set that receives the timestamp values. + pub query_set: &'a D::QuerySet, + /// Query index for the beginning-of-pass timestamp, or `None` to skip. + pub beginning_of_pass_write_index: Option, + /// Query index for the end-of-pass timestamp, or `None` to skip. + pub end_of_pass_write_index: Option, +} + pub struct RenderPassDesc<'a, D: RenderDevice + ?Sized> { pub label: Option<&'a str>, pub color_attachments: &'a [Option>], pub depth_stencil_attachment: Option>, + /// Optional GPU timestamp writes for per-pass timing. + pub timestamp_writes: Option>, } // --------------------------------------------------------------------------- @@ -930,6 +946,8 @@ pub struct Capabilities { /// Whether memoryless render targets are supported (tile memory only, /// saves ~20% bandwidth for transient G-buffer attachments). pub memoryless_render_targets: bool, + /// Whether GPU timestamp queries are supported for per-pass timing. 
+ pub timestamp_query: bool, } impl Default for Capabilities { @@ -948,6 +966,7 @@ impl Default for Capabilities { apple_silicon: false, max_buffer_length: 256 * 1024 * 1024, // 256 MiB conservative default memoryless_render_targets: false, + timestamp_query: false, } } } diff --git a/crates/euca-rhi/src/wgpu_backend.rs b/crates/euca-rhi/src/wgpu_backend.rs index d5ca921..4665e27 100644 --- a/crates/euca-rhi/src/wgpu_backend.rs +++ b/crates/euca-rhi/src/wgpu_backend.rs @@ -414,6 +414,9 @@ impl From for wgpu::BufferUsages { if u.contains(BufferUsages::INDIRECT) { out |= Self::INDIRECT; } + if u.contains(BufferUsages::QUERY_RESOLVE) { + out |= Self::QUERY_RESOLVE; + } out } } @@ -646,6 +649,7 @@ impl RenderDevice for WgpuDevice { type RenderPass<'a> = wgpu::RenderPass<'a>; type ComputePass<'a> = wgpu::ComputePass<'a>; type SurfaceTexture = wgpu::SurfaceTexture; + type QuerySet = wgpu::QuerySet; fn capabilities(&self) -> &Capabilities { &self.capabilities @@ -925,11 +929,20 @@ impl RenderDevice for WgpuDevice { } }); + let ts_writes = desc + .timestamp_writes + .as_ref() + .map(|tw| wgpu::RenderPassTimestampWrites { + query_set: tw.query_set, + beginning_of_pass_write_index: tw.beginning_of_pass_write_index, + end_of_pass_write_index: tw.end_of_pass_write_index, + }); + encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: desc.label, color_attachments: &color_attachments, depth_stencil_attachment: depth_stencil, - timestamp_writes: None, + timestamp_writes: ts_writes, occlusion_query_set: None, }) } @@ -955,6 +968,18 @@ impl RenderDevice for WgpuDevice { encoder.clear_buffer(buffer, offset, size); } + fn copy_buffer_to_buffer( + &self, + encoder: &mut wgpu::CommandEncoder, + src: &wgpu::Buffer, + src_offset: u64, + dst: &wgpu::Buffer, + dst_offset: u64, + size: u64, + ) { + encoder.copy_buffer_to_buffer(src, src_offset, dst, dst_offset, size); + } + fn copy_texture_to_texture( &self, encoder: &mut wgpu::CommandEncoder, @@ -1032,6 +1057,68 @@ impl 
RenderDevice for WgpuDevice { fn surface_size(&self) -> (u32, u32) { (self.surface_config.width, self.surface_config.height) } + + fn create_query_set(&self, count: u32) -> Option { + if !self.capabilities.timestamp_query { + return None; + } + Some(self.device.create_query_set(&wgpu::QuerySetDescriptor { + label: Some("GPU Timestamp Queries"), + ty: wgpu::QueryType::Timestamp, + count, + })) + } + + fn resolve_query_set( + &self, + encoder: &mut wgpu::CommandEncoder, + query_set: &wgpu::QuerySet, + range: std::ops::Range, + dest: &wgpu::Buffer, + ) { + encoder.resolve_query_set(query_set, range, dest, 0); + } + + fn timestamp_period_ns(&self) -> f32 { + self.queue.get_timestamp_period() + } + + fn read_timestamp_buffer(&self, buffer: &wgpu::Buffer, count: u32) -> Vec { + if count == 0 { + return Vec::new(); + } + let byte_len = (count as u64) * std::mem::size_of::() as u64; + let slice = buffer.slice(..byte_len); + + // Request mapping and synchronously wait for the GPU to finish. + // This blocks briefly but is acceptable for profiling code paths. + let (sender, receiver) = std::sync::mpsc::channel(); + slice.map_async(wgpu::MapMode::Read, move |result| { + let _ = sender.send(result); + }); + // Poll until the GPU work (including the buffer copy) completes. 
+ loop { + match self.device.poll(wgpu::PollType::Poll) { + Ok(status) if status.is_queue_empty() => break, + Err(_) => return Vec::new(), + _ => std::thread::yield_now(), + } + } + + match receiver.recv() { + Ok(Ok(())) => { + let data = slice.get_mapped_range(); + let timestamps: Vec = data + .chunks_exact(8) + .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap())) + .collect(); + drop(data); + buffer.unmap(); + timestamps + } + _ => Vec::new(), + } + } } // =========================================================================== diff --git a/crates/euca-rhi/tests/metal_smoke.rs b/crates/euca-rhi/tests/metal_smoke.rs index c12cf48..fab330c 100644 --- a/crates/euca-rhi/tests/metal_smoke.rs +++ b/crates/euca-rhi/tests/metal_smoke.rs @@ -267,6 +267,7 @@ fn metal_offscreen_render_pass() { }, })], depth_stencil_attachment: None, + timestamp_writes: None, }, ); pass.set_pipeline(&pipeline); diff --git a/examples/metal_combined.rs b/examples/metal_combined.rs index dbb95a0..95001f2 100644 --- a/examples/metal_combined.rs +++ b/examples/metal_combined.rs @@ -394,6 +394,7 @@ fn main() { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(pipeline); diff --git a/examples/metal_cubes.rs b/examples/metal_cubes.rs index f0fb861..620d19f 100644 --- a/examples/metal_cubes.rs +++ b/examples/metal_cubes.rs @@ -323,6 +323,7 @@ fn main() { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(pipeline); diff --git a/examples/metal_fx_upscale.rs b/examples/metal_fx_upscale.rs index 2b5dbbd..5de1fea 100644 --- a/examples/metal_fx_upscale.rs +++ b/examples/metal_fx_upscale.rs @@ -449,6 +449,7 @@ fn main() { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(pipeline); diff --git a/examples/metal_mesh_stress.rs b/examples/metal_mesh_stress.rs index de3446c..65e48a7 100644 --- a/examples/metal_mesh_stress.rs +++ b/examples/metal_mesh_stress.rs @@ -406,6 +406,7 @@ fn main() { }), stencil_ops: None, }), + 
timestamp_writes: None, }, ); pass.set_pipeline(pipeline); diff --git a/examples/metal_stress.rs b/examples/metal_stress.rs index d412d07..0c6f1ab 100644 --- a/examples/metal_stress.rs +++ b/examples/metal_stress.rs @@ -409,6 +409,7 @@ fn main() { }), stencil_ops: None, }), + timestamp_writes: None, }, ); pass.set_pipeline(pipeline);