From 1308d286341016468a057bef0ddddec3acef4dee Mon Sep 17 00:00:00 2001
From: stainlu <stainlu@grimo.ai>
Date: Thu, 9 Apr 2026 02:05:47 +0800
Subject: [PATCH] Add GPU timestamp queries for per-pass GPU profiling

Extend the RHI with QuerySet, timestamp query methods, and
copy_buffer_to_buffer so the renderer can measure per-pass GPU time
via wgpu timestamp queries. The CPU Profiler now distinguishes
Cpu vs Gpu sections and GPU timings appear in the same frame summary.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/euca-agent/src/routes/profile.rs |   3 +-
 crates/euca-core/src/lib.rs             |   2 +-
 crates/euca-core/src/profiler.rs        |  83 ++++++-
 crates/euca-render/src/deferred.rs      |   2 +
 crates/euca-render/src/gpu.rs           |  16 ++
 crates/euca-render/src/hardware.rs      |  10 +
 crates/euca-render/src/post_process.rs  |   1 +
 crates/euca-render/src/prepass.rs       |   1 +
 crates/euca-render/src/renderer.rs      | 289 ++++++++++++++++++++----
 crates/euca-render/src/ssr.rs           |   1 +
 crates/euca-render/src/ui_overlay.rs    |   1 +
 crates/euca-render/src/velocity.rs      |   1 +
 crates/euca-render/src/volumetric.rs    |   1 +
 crates/euca-rhi/src/lib.rs              |  53 +++++
 crates/euca-rhi/src/metal_backend.rs    |  30 ++-
 crates/euca-rhi/src/types.rs            |  19 ++
 crates/euca-rhi/src/wgpu_backend.rs     |  89 +++++++-
 crates/euca-rhi/tests/metal_smoke.rs    |   1 +
 examples/metal_combined.rs              |   1 +
 examples/metal_cubes.rs                 |   1 +
 examples/metal_fx_upscale.rs            |   1 +
 examples/metal_mesh_stress.rs           |   1 +
 examples/metal_stress.rs                |   1 +
 23 files changed, 559 insertions(+), 49 deletions(-)
diff --git a/crates/euca-agent/src/routes/profile.rs b/crates/euca-agent/src/routes/profile.rs
index 6a90dc8..80453bf 100644
--- a/crates/euca-agent/src/routes/profile.rs
+++ b/crates/euca-agent/src/routes/profile.rs
@@ -18,10 +18,11 @@ pub async fn profile(State(world): State<SharedWorld>) -> Json<serde_json::Value
         let sections: Vec<serde_json::Value> = profiler
             .frame_summary()
             .iter()
-            .map(|(name, us)| {
+            .map(|(name, us, kind)| {
                 serde_json::json!({
                     "name": name,
                     "us": (*us * 10.0).round() / 10.0,
+                    "kind": format!("{kind:?}"),
                 })
             })
             .collect();
diff --git a/crates/euca-core/src/lib.rs b/crates/euca-core/src/lib.rs
index 9f51fc8..4a96595 100644
--- a/crates/euca-core/src/lib.rs
+++ b/crates/euca-core/src/lib.rs
@@ -12,7 +12,7 @@ mod time;
 pub use app::App;
 pub use platform::performance_core_count;
 pub use plugin::Plugin;
-pub use profiler::{ProfileSection, Profiler, profiler_begin, profiler_end};
+pub use profiler::{ProfileSection, ProfileSectionKind, Profiler, profiler_begin, profiler_end};
 pub use time::Time;
 
 /// Re-export `winit` for downstream crates that need window types.
diff --git a/crates/euca-core/src/profiler.rs b/crates/euca-core/src/profiler.rs
index eda94ef..a4cb856 100644
--- a/crates/euca-core/src/profiler.rs
+++ b/crates/euca-core/src/profiler.rs
@@ -4,12 +4,23 @@ use std::time::Instant;
 /// Maximum number of frame times retained for averaging.
 const MAX_FRAME_HISTORY: usize = 60;
 
+/// Whether a profile section was measured on the CPU or GPU.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum ProfileSectionKind {
+    /// Timed on the CPU via `std::time::Instant`.
+    Cpu,
+    /// Timed on the GPU via timestamp queries.
+    Gpu,
+}
+
 /// A recorded profile section within a single frame.
 pub struct ProfileSection {
     /// Human-readable section label.
     pub name: &'static str,
     /// Wall-clock duration of this section in microseconds.
     pub duration_us: f64,
+    /// Whether this section was timed on the CPU or GPU.
+    pub kind: ProfileSectionKind,
 }
 
 /// Built-in frame profiler that tracks per-section timings and rolling frame statistics.
@@ -34,14 +45,26 @@ impl Profiler {
         }
     }
 
-    /// Return section names and durations (in microseconds) for the current frame.
-    pub fn frame_summary(&self) -> Vec<(&str, f64)> {
+    /// Return section names, durations (microseconds), and kind for the current frame.
+    pub fn frame_summary(&self) -> Vec<(&str, f64, ProfileSectionKind)> {
         self.sections
             .iter()
-            .map(|s| (s.name, s.duration_us))
+            .map(|s| (s.name, s.duration_us, s.kind))
             .collect()
     }
 
+    /// Record a GPU-timed section (called after timestamp readback).
+    ///
+    /// GPU sections are distinguished from CPU sections in the summary so
+    /// callers can display them separately or side-by-side.
+    pub fn record_gpu_section(&mut self, name: &'static str, duration_us: f64) {
+        self.sections.push(ProfileSection {
+            name,
+            duration_us,
+            kind: ProfileSectionKind::Gpu,
+        });
+    }
+
     /// Average frame time in milliseconds over the last [`MAX_FRAME_HISTORY`] frames.
     ///
     /// Returns `0.0` when no frame times have been recorded yet.
@@ -64,10 +87,18 @@ impl Profiler {
         1000.0 / avg
     }
 
-    /// Finish the current frame: record total frame time from all sections and reset
-    /// the section list for the next frame.
+    /// Finish the current frame: record total CPU frame time and reset the
+    /// section list for the next frame.
+    ///
+    /// Only CPU sections contribute to the rolling frame-time average. GPU
+    /// sections run in parallel with CPU work and are reported separately.
     pub fn end_frame(&mut self) {
-        let total_us: f64 = self.sections.iter().map(|s| s.duration_us).sum();
+        let total_us: f64 = self
+            .sections
+            .iter()
+            .filter(|s| s.kind == ProfileSectionKind::Cpu)
+            .map(|s| s.duration_us)
+            .sum();
         let total_ms = total_us / 1000.0;
 
         if self.frame_times.len() == MAX_FRAME_HISTORY {
@@ -103,6 +134,7 @@ pub fn profiler_end(profiler: &mut Profiler) {
     profiler.sections.push(ProfileSection {
         name,
         duration_us: elapsed.as_secs_f64() * 1_000_000.0,
+        kind: ProfileSectionKind::Cpu,
     });
 }
 
@@ -229,4 +261,43 @@ mod tests {
         // Outer should be >= inner since it wraps it.
         assert!(summary[1].1 >= summary[0].1);
     }
+
+    #[test]
+    fn cpu_sections_tagged_as_cpu() {
+        let mut profiler = Profiler::new();
+        profiler_begin(&mut profiler, "cpu_work");
+        profiler_end(&mut profiler);
+
+        let summary = profiler.frame_summary();
+        assert_eq!(summary[0].2, ProfileSectionKind::Cpu);
+    }
+
+    #[test]
+    fn gpu_section_recorded_and_tagged() {
+        let mut profiler = Profiler::new();
+        profiler.record_gpu_section("shadow_pass", 123.4);
+
+        let summary = profiler.frame_summary();
+        assert_eq!(summary.len(), 1);
+        assert_eq!(summary[0].0, "shadow_pass");
+        assert!((summary[0].1 - 123.4).abs() < f64::EPSILON);
+        assert_eq!(summary[0].2, ProfileSectionKind::Gpu);
+    }
+
+    #[test]
+    fn gpu_sections_excluded_from_frame_time() {
+        let mut profiler = Profiler::new();
+
+        // Add a CPU section and a GPU section.
+        profiler_begin(&mut profiler, "cpu");
+        thread::sleep(Duration::from_millis(1));
+        profiler_end(&mut profiler);
+        profiler.record_gpu_section("gpu_pass", 1_000_000.0); // 1 s in us
+
+        profiler.end_frame();
+
+        // Only the ~1ms CPU section may count; including the 1s GPU section would fail this.
+        let avg = profiler.avg_frame_time_ms();
+        assert!(avg < 500.0, "avg should not include GPU section: {avg}");
+    }
 }
diff --git a/crates/euca-render/src/deferred.rs b/crates/euca-render/src/deferred.rs
index 90cfd22..c5aa573 100644
--- a/crates/euca-render/src/deferred.rs
+++ b/crates/euca-render/src/deferred.rs
@@ -520,6 +520,7 @@ impl<D: RenderDevice> DeferredPipeline<D> {
                     }),
                     stencil_ops: None,
                 }),
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&self.gbuffer_pipeline);
@@ -546,6 +547,7 @@ impl<D: RenderDevice> DeferredPipeline<D> {
                     },
                 })],
                 depth_stencil_attachment: None,
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&self.lighting_pipeline);
diff --git a/crates/euca-render/src/gpu.rs b/crates/euca-render/src/gpu.rs
index 20d14b1..fcc67e7 100644
--- a/crates/euca-render/src/gpu.rs
+++ b/crates/euca-render/src/gpu.rs
@@ -108,6 +108,12 @@ impl GpuContext {
             required_features |= wgpu::Features::MULTI_DRAW_INDIRECT_COUNT;
         }
 
+        // GPU timestamp queries for per-pass GPU profiling.
+        if supported.contains(wgpu::Features::TIMESTAMP_QUERY) {
+            required_features |= wgpu::Features::TIMESTAMP_QUERY;
+            log::info!("GPU supports TIMESTAMP_QUERY — GPU pass timing enabled");
+        }
+
         // Bindless materials: texture binding arrays + non-uniform indexing.
         let bindless_features = wgpu::Features::TEXTURE_BINDING_ARRAY
             | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING;
@@ -173,6 +179,8 @@ impl GpuContext {
 
         let unified_memory = survey.supports_unified_memory();
 
+        let has_timestamp_query = required_features.contains(wgpu::Features::TIMESTAMP_QUERY);
+
         let capabilities = Capabilities {
             unified_memory,
             multi_draw_indirect: has_multi_draw_indirect,
@@ -184,6 +192,7 @@ impl GpuContext {
             max_bindings_per_bind_group: cap_max_bindings,
             max_binding_array_elements: cap_max_binding_array,
             device_name: adapter_info.name.clone(),
+            timestamp_query: has_timestamp_query,
             ..Default::default()
         };
 
@@ -248,6 +257,10 @@ impl GpuContext {
             required_features |= wgpu::Features::MULTI_DRAW_INDIRECT_COUNT;
         }
 
+        if supported.contains(wgpu::Features::TIMESTAMP_QUERY) {
+            required_features |= wgpu::Features::TIMESTAMP_QUERY;
+        }
+
         let bindless_features = wgpu::Features::TEXTURE_BINDING_ARRAY
             | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING;
         if supported.contains(bindless_features) {
@@ -305,6 +318,8 @@ impl GpuContext {
 
         let unified_memory = survey.supports_unified_memory();
 
+        let has_timestamp_query_async = required_features.contains(wgpu::Features::TIMESTAMP_QUERY);
+
         let capabilities = Capabilities {
             unified_memory,
             multi_draw_indirect: has_multi_draw_indirect,
@@ -316,6 +331,7 @@ impl GpuContext {
             max_bindings_per_bind_group: cap_max_bindings,
             max_binding_array_elements: cap_max_binding_array,
             device_name: adapter_info.name.clone(),
+            timestamp_query: has_timestamp_query_async,
             ..Default::default()
         };
 
diff --git a/crates/euca-render/src/hardware.rs b/crates/euca-render/src/hardware.rs
index 242111a..5e9711d 100644
--- a/crates/euca-render/src/hardware.rs
+++ b/crates/euca-render/src/hardware.rs
@@ -215,6 +215,16 @@ impl HardwareSurvey {
         self.selected().vendor == GpuVendor::Apple
     }
 
+    /// Whether the selected adapter supports GPU timestamp queries.
+    ///
+    /// Timestamp queries enable per-pass GPU timing that is reported alongside
+    /// CPU profiler sections for a complete frame profile.
+    pub fn supports_timestamp_queries(&self) -> bool {
+        self.selected()
+            .features
+            .contains(wgpu::Features::TIMESTAMP_QUERY)
+    }
+
     /// Get the selected adapter info.
     pub fn selected(&self) -> &AdapterInfo {
         &self.adapters[self.selected_adapter]
diff --git a/crates/euca-render/src/post_process.rs b/crates/euca-render/src/post_process.rs
index 06059c6..5586bca 100644
--- a/crates/euca-render/src/post_process.rs
+++ b/crates/euca-render/src/post_process.rs
@@ -990,6 +990,7 @@ fn run_fullscreen_pass<D: euca_rhi::RenderDevice>(
                 },
             })],
             depth_stencil_attachment: None,
+            timestamp_writes: None,
         },
     );
     pass.set_pipeline(pipeline);
diff --git a/crates/euca-render/src/prepass.rs b/crates/euca-render/src/prepass.rs
index 748d1e2..4fa3780 100644
--- a/crates/euca-render/src/prepass.rs
+++ b/crates/euca-render/src/prepass.rs
@@ -335,6 +335,7 @@ impl<D: euca_rhi::RenderDevice> PrepassPipeline<D> {
                     }),
                     stencil_ops: None,
                 }),
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&self.pipeline);
diff --git a/crates/euca-render/src/renderer.rs b/crates/euca-render/src/renderer.rs
index b2fe66d..abd0765 100644
--- a/crates/euca-render/src/renderer.rs
+++ b/crates/euca-render/src/renderer.rs
@@ -311,6 +311,123 @@ const SHADOW_MAP_SIZE: u32 = 2048;
 const NUM_SHADOW_CASCADES: u32 = 3;
 const CASCADE_ORTHO_SIZES: [f32; 3] = [8.0, 20.0, 50.0];
 
+/// Maximum number of GPU-timed sections per frame.
+///
+/// Each section uses two timestamp queries (start + end), so the query set
+/// holds `MAX_GPU_TIMER_SECTIONS * 2` entries.
+const MAX_GPU_TIMER_SECTIONS: u32 = 32;
+
+/// Per-pass GPU timing state using timestamp queries.
+///
+/// Implements a 2-frame readback pipeline to avoid GPU pipeline stalls:
+/// frame N writes timestamps and resolves them into `resolve_buffer`, then
+/// copies to `readback_buffer`. Frame N+1 maps `readback_buffer` to read
+/// the previous frame's timestamps while the current frame's queries land
+/// in the resolve buffer.
+struct GpuTimerState<D: RenderDevice> {
+    /// GPU query set holding raw timestamp values.
+    query_set: D::QuerySet,
+    /// GPU buffer that receives resolved u64 tick values
+    /// (QUERY_RESOLVE | COPY_SRC).
+    resolve_buffer: D::Buffer,
+    /// CPU-readable staging buffer (MAP_READ | COPY_DST).
+    readback_buffer: D::Buffer,
+    /// Sections being timed this frame: (name, query_start_index).
+    sections: Vec<(&'static str, u32)>,
+    /// Number of timestamps written this frame (each section uses 2).
+    timestamp_count: u32,
+    /// Nanoseconds per GPU timestamp tick.
+    timestamp_period_ns: f32,
+    /// Previous frame's section metadata, paired with the readback buffer
+    /// that should now contain their resolved timestamps.
+    prev_sections: Vec<(&'static str, u32)>,
+    /// Previous frame's timestamp count.
+    prev_timestamp_count: u32,
+}
+
+impl<D: RenderDevice> GpuTimerState<D> {
+    /// Try to create GPU timer state. Returns `None` if timestamps are unsupported.
+    fn new(rhi: &D) -> Option<Self> {
+        let query_count = MAX_GPU_TIMER_SECTIONS * 2;
+        let query_set = rhi.create_query_set(query_count)?;
+        let buf_size = (query_count as u64) * std::mem::size_of::<u64>() as u64;
+
+        let resolve_buffer = rhi.create_buffer(&euca_rhi::BufferDesc {
+            label: Some("GPU Timer Resolve"),
+            size: buf_size,
+            usage: euca_rhi::BufferUsages::QUERY_RESOLVE | euca_rhi::BufferUsages::COPY_SRC,
+            mapped_at_creation: false,
+        });
+
+        let readback_buffer = rhi.create_buffer(&euca_rhi::BufferDesc {
+            label: Some("GPU Timer Readback"),
+            size: buf_size,
+            usage: euca_rhi::BufferUsages::MAP_READ | euca_rhi::BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        let timestamp_period_ns = rhi.timestamp_period_ns();
+
+        Some(Self {
+            query_set,
+            resolve_buffer,
+            readback_buffer,
+            sections: Vec::with_capacity(MAX_GPU_TIMER_SECTIONS as usize),
+            timestamp_count: 0,
+            timestamp_period_ns,
+            prev_sections: Vec::new(),
+            prev_timestamp_count: 0,
+        })
+    }
+
+    /// Register a section to be timed this frame.
+    ///
+    /// Returns the query index pair (begin, end) for use in
+    /// `RenderPassTimestampWrites`, or `None` if the section limit is reached.
+    fn begin_section(&mut self, name: &'static str) -> Option<(u32, u32)> {
+        if self.timestamp_count + 2 > MAX_GPU_TIMER_SECTIONS * 2 {
+            return None;
+        }
+        let begin_idx = self.timestamp_count;
+        let end_idx = begin_idx + 1;
+        self.sections.push((name, begin_idx));
+        self.timestamp_count += 2;
+        Some((begin_idx, end_idx))
+    }
+
+    /// Resolve this frame's queries and copy to the readback buffer.
+    ///
+    /// Must be called before submitting the encoder so the resolve and copy
+    /// commands are part of the same submission.
+    fn resolve_and_copy(&self, rhi: &D, encoder: &mut D::CommandEncoder) {
+        if self.timestamp_count == 0 {
+            return;
+        }
+        rhi.resolve_query_set(
+            encoder,
+            &self.query_set,
+            0..self.timestamp_count,
+            &self.resolve_buffer,
+        );
+        let byte_count = (self.timestamp_count as u64) * std::mem::size_of::<u64>() as u64;
+        rhi.copy_buffer_to_buffer(
+            encoder,
+            &self.resolve_buffer,
+            0,
+            &self.readback_buffer,
+            0,
+            byte_count,
+        );
+    }
+
+    /// Rotate frame state: current sections become `prev_*`; both Vec allocations are reused.
+    fn end_frame(&mut self) {
+        std::mem::swap(&mut self.prev_sections, &mut self.sections);
+        self.sections.clear();
+        self.prev_timestamp_count = std::mem::replace(&mut self.timestamp_count, 0);
+    }
+}
+
 /// The main PBR forward renderer.
 ///
 /// Owns all GPU pipeline state, uploaded meshes, materials, textures, and
@@ -453,6 +570,8 @@ pub struct Renderer<D: euca_rhi::RenderDevice = euca_rhi::wgpu_backend::WgpuDevi
     start_time: std::time::Instant,
     /// Water render pipeline (alpha blend, no depth write).
     water_pipeline: D::RenderPipeline,
+    /// Optional GPU timestamp timer. `None` when the backend lacks timestamp queries.
+    gpu_timer: Option<GpuTimerState<D>>,
     /// SSGI pass (screen-space global illumination with temporal accumulation).
     ssgi_pass: crate::ssgi::SsgiPass<D>,
     /// Previous frame HDR color for SSGI temporal reprojection.
@@ -1182,6 +1301,7 @@ impl<D: RenderDevice> Renderer<D> {
             metalfx_reset_history: false,
             start_time: std::time::Instant::now(),
             water_pipeline,
+            gpu_timer: GpuTimerState::new(rhi),
             ssgi_pass: crate::ssgi::SsgiPass::new(rhi, surface_w, surface_h),
             prev_color_texture: {
                 rhi.create_texture(&euca_rhi::TextureDesc {
@@ -1260,6 +1380,37 @@ impl<D: RenderDevice> Renderer<D> {
         }
     }
 
+    /// Read back the previous frame's GPU timestamps and record them in the profiler.
+    ///
+    /// Call once per frame, before `Profiler::end_frame()` clears the section list.
+    /// May block briefly in `read_timestamp_buffer`. Does nothing when timestamp
+    /// queries are unsupported or no previous frame data is available.
+    pub fn read_gpu_timings(&self, rhi: &D, profiler: &mut euca_core::Profiler) {
+        let timer = match &self.gpu_timer {
+            Some(t) if t.prev_timestamp_count > 0 => t,
+            _ => return,
+        };
+
+        let timestamps =
+            rhi.read_timestamp_buffer(&timer.readback_buffer, timer.prev_timestamp_count);
+        if timestamps.is_empty() {
+            return;
+        }
+
+        let period_ns = timer.timestamp_period_ns;
+        for &(name, begin_idx) in &timer.prev_sections {
+            let end_idx = begin_idx + 1;
+            if (end_idx as usize) < timestamps.len() {
+                let begin_tick = timestamps[begin_idx as usize];
+                let end_tick = timestamps[end_idx as usize];
+                if end_tick >= begin_tick {
+                    let duration_us = (end_tick - begin_tick) as f64 * (period_ns as f64) / 1000.0;
+                    profiler.record_gpu_section(name, duration_us);
+                }
+            }
+        }
+    }
+
     /// Grow the main instance buffer and rebuild its bind group if `count`
     /// exceeds the current capacity. Returns `true` if the buffer was grown.
     fn ensure_instance_capacity(&mut self, rhi: &D, count: usize) -> bool {
@@ -2067,6 +2218,21 @@ impl<D: RenderDevice> Renderer<D> {
         encoder: &mut D::CommandEncoder,
     ) {
         let rhi: &D = gpu;
+
+        // ── GPU timestamp setup ──
+        // Register sections for GPU timing; allocate query indices up front.
+        // Post-process timing is not attached here because PostProcessStack
+        // runs many internal sub-passes; per-pass timing can be threaded
+        // through PostProcessStack::execute in a future enhancement.
+        let shadow_ts = self
+            .gpu_timer
+            .as_mut()
+            .and_then(|t| t.begin_section("gpu:shadow"));
+        let pbr_ts = self
+            .gpu_timer
+            .as_mut()
+            .and_then(|t| t.begin_section("gpu:pbr"));
+
         let vp = camera.view_projection_matrix(gpu.aspect_ratio());
         let light_vp = Self::light_vp(light);
         let (opaque_cmds, transparent_cmds, water_cmds) =
@@ -2115,6 +2281,22 @@ impl<D: RenderDevice> Renderer<D> {
                 self.ensure_shadow_instance_capacity(rhi, shadow_instances.len());
                 self.shadow_instance_buffer.write(&**gpu, &shadow_instances);
             }
+            // Attach shadow timestamps: begin on the first cascade, end on the last.
+            let is_first_cascade = cascade_idx == 0;
+            let is_last_cascade = cascade_idx == CASCADE_ORTHO_SIZES.len() - 1;
+            let shadow_ts_writes = shadow_ts.and_then(|(begin_idx, end_idx)| {
+                self.gpu_timer
+                    .as_ref()
+                    .map(|t| euca_rhi::RenderPassTimestampWrites {
+                        query_set: &t.query_set,
+                        beginning_of_pass_write_index: if is_first_cascade {
+                            Some(begin_idx)
+                        } else {
+                            None
+                        },
+                        end_of_pass_write_index: if is_last_cascade { Some(end_idx) } else { None },
+                    })
+            });
             let mut pass = rhi.begin_render_pass(
                 encoder,
                 &euca_rhi::RenderPassDesc {
@@ -2128,6 +2310,7 @@ impl<D: RenderDevice> Renderer<D> {
                         }),
                         stencil_ops: None,
                     }),
+                    timestamp_writes: shadow_ts_writes,
                 },
             );
             pass.set_pipeline(&self.shadow_pipeline);
@@ -2316,10 +2499,23 @@ impl<D: RenderDevice> Renderer<D> {
                 self.metalfx_low_res_depth_view.as_ref().unwrap(),
             )
         } else {
-            (&self.msaa_hdr_view, Some(resolve_target), &self.depth_texture)
+            (
+                &self.msaa_hdr_view,
+                Some(resolve_target),
+                &self.depth_texture,
+            )
         };
 
         {
+            let pbr_ts_writes = pbr_ts.and_then(|(begin_idx, end_idx)| {
+                self.gpu_timer
+                    .as_ref()
+                    .map(|t| euca_rhi::RenderPassTimestampWrites {
+                        query_set: &t.query_set,
+                        beginning_of_pass_write_index: Some(begin_idx),
+                        end_of_pass_write_index: Some(end_idx),
+                    })
+            });
             let mut pass = rhi.begin_render_pass(
                 encoder,
                 &euca_rhi::RenderPassDesc {
@@ -2349,6 +2545,7 @@ impl<D: RenderDevice> Renderer<D> {
                         }),
                         stencil_ops: None,
                     }),
+                    timestamp_writes: pbr_ts_writes,
                 },
             );
             pass.set_pipeline(&self.sky_pipeline);
@@ -2540,43 +2737,43 @@ impl<D: RenderDevice> Renderer<D> {
                 &self.metalfx_output,
             )
         {
-                let (jitter_x, jitter_y) = (camera.jitter[0], camera.jitter[1]);
-                rhi.encode_metalfx_upscale(
-                    encoder,
-                    upscaler.as_ref(),
-                    low_color,
-                    low_depth,
-                    &self.velocity_textures.velocity_texture,
-                    output_tex,
-                    jitter_x,
-                    jitter_y,
-                    self.metalfx_reset_history,
-                );
-                self.metalfx_reset_history = false;
+            let (jitter_x, jitter_y) = (camera.jitter[0], camera.jitter[1]);
+            rhi.encode_metalfx_upscale(
+                encoder,
+                upscaler.as_ref(),
+                low_color,
+                low_depth,
+                &self.velocity_textures.velocity_texture,
+                output_tex,
+                jitter_x,
+                jitter_y,
+                self.metalfx_reset_history,
+            );
+            self.metalfx_reset_history = false;
 
-                // Blit MetalFX output into the post-process ping buffer so
-                // downstream passes (TAA, motion blur, DoF) read the upscaled image.
-                let (sw, sh) = rhi.surface_size();
-                rhi.copy_texture_to_texture(
-                    encoder,
-                    &euca_rhi::TexelCopyTextureInfo {
-                        texture: output_tex,
-                        mip_level: 0,
-                        origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 },
-                        aspect: euca_rhi::TextureAspect::All,
-                    },
-                    &euca_rhi::TexelCopyTextureInfo {
-                        texture: self.post_process_stack.ping_texture(),
-                        mip_level: 0,
-                        origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 },
-                        aspect: euca_rhi::TextureAspect::All,
-                    },
-                    euca_rhi::Extent3d {
-                        width: sw,
-                        height: sh,
-                        depth_or_array_layers: 1,
-                    },
-                );
+            // Blit MetalFX output into the post-process ping buffer so
+            // downstream passes (TAA, motion blur, DoF) read the upscaled image.
+            let (sw, sh) = rhi.surface_size();
+            rhi.copy_texture_to_texture(
+                encoder,
+                &euca_rhi::TexelCopyTextureInfo {
+                    texture: output_tex,
+                    mip_level: 0,
+                    origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 },
+                    aspect: euca_rhi::TextureAspect::All,
+                },
+                &euca_rhi::TexelCopyTextureInfo {
+                    texture: self.post_process_stack.ping_texture(),
+                    mip_level: 0,
+                    origin: euca_rhi::Origin3d { x: 0, y: 0, z: 0 },
+                    aspect: euca_rhi::TextureAspect::All,
+                },
+                euca_rhi::Extent3d {
+                    width: sw,
+                    height: sh,
+                    depth_or_array_layers: 1,
+                },
+            );
         }
 
         // GPU compute particles: update (compute dispatch) then draw (render pass).
@@ -2610,6 +2807,7 @@ impl<D: RenderDevice> Renderer<D> {
                                 stencil_ops: None,
                             },
                         ),
+                        timestamp_writes: None,
                     },
                 );
                 for system in &self.gpu_particle_systems {
@@ -2769,6 +2967,8 @@ impl<D: RenderDevice> Renderer<D> {
         self.frame_count = self.frame_count.wrapping_add(1);
 
         // Post-processing via the modular stack.
+        // (Note: post-processing is not GPU-timed yet; PostProcessStack runs many
+        //  internal sub-passes and receives no timestamp writes from this function.)
         {
             let proj = camera.projection_matrix(gpu.aspect_ratio());
             let inv_projection = proj.inverse().to_cols_array_2d();
@@ -2782,6 +2982,18 @@ impl<D: RenderDevice> Renderer<D> {
                 &projection,
             );
         }
+
+        // ── GPU timestamp resolve ──
+        // Resolve this frame's queries and copy to readback buffer. This must
+        // happen before the encoder is submitted by the caller.
+        if let Some(ref timer) = self.gpu_timer {
+            timer.resolve_and_copy(rhi, encoder);
+        }
+
+        // Rotate GPU timer frame state so next frame reads this frame's data.
+        if let Some(ref mut timer) = self.gpu_timer {
+            timer.end_frame();
+        }
     }
 
     /// Filter out occluded draw commands using the HZB from the previous frame.
@@ -2985,8 +3197,7 @@ impl Renderer<euca_rhi::metal_backend::MetalDevice> {
                 | euca_rhi::TextureUsages::TEXTURE_BINDING,
             view_formats: &[],
         });
-        let output_view =
-            rhi.create_texture_view(&output, &euca_rhi::TextureViewDesc::default());
+        let output_view = rhi.create_texture_view(&output, &euca_rhi::TextureViewDesc::default());
 
         // Create the MetalFX temporal scaler (panics on unsupported hardware).
         let upscaler = rhi.create_temporal_upscaler(
diff --git a/crates/euca-render/src/ssr.rs b/crates/euca-render/src/ssr.rs
index 43bd4b0..e0d4b47 100644
--- a/crates/euca-render/src/ssr.rs
+++ b/crates/euca-render/src/ssr.rs
@@ -279,6 +279,7 @@ impl<D: euca_rhi::RenderDevice> SsrPass<D> {
                     },
                 })],
                 depth_stencil_attachment: None,
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&self.pipeline);
diff --git a/crates/euca-render/src/ui_overlay.rs b/crates/euca-render/src/ui_overlay.rs
index ec50314..20af1d9 100644
--- a/crates/euca-render/src/ui_overlay.rs
+++ b/crates/euca-render/src/ui_overlay.rs
@@ -173,6 +173,7 @@ impl<D: euca_rhi::RenderDevice> UiOverlayRenderer<D> {
                     },
                 })],
                 depth_stencil_attachment: None, // no depth testing for UI
+                timestamp_writes: None,
             },
         );
 
diff --git a/crates/euca-render/src/velocity.rs b/crates/euca-render/src/velocity.rs
index 0474b35..53dfd98 100644
--- a/crates/euca-render/src/velocity.rs
+++ b/crates/euca-render/src/velocity.rs
@@ -367,6 +367,7 @@ impl<D: euca_rhi::RenderDevice> VelocityPipeline<D> {
                     }),
                     stencil_ops: None,
                 }),
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&self.pipeline);
diff --git a/crates/euca-render/src/volumetric.rs b/crates/euca-render/src/volumetric.rs
index 92f6f75..151921d 100644
--- a/crates/euca-render/src/volumetric.rs
+++ b/crates/euca-render/src/volumetric.rs
@@ -425,6 +425,7 @@ impl<D: RenderDevice> VolumetricFogPass<D> {
                         },
                     })],
                     depth_stencil_attachment: None,
+                    timestamp_writes: None,
                 },
             );
             pass.set_pipeline(&self.composite_pipeline);
diff --git a/crates/euca-rhi/src/lib.rs b/crates/euca-rhi/src/lib.rs
index ba050f2..baa2854 100644
--- a/crates/euca-rhi/src/lib.rs
+++ b/crates/euca-rhi/src/lib.rs
@@ -49,6 +49,7 @@ pub trait RenderDevice: 'static {
     where
         Self: 'a;
     type SurfaceTexture;
+    type QuerySet: 'static;
 
     // -- Capabilities --
 
@@ -106,6 +107,16 @@ pub trait RenderDevice: 'static {
         size: Option<u64>,
     );
 
+    fn copy_buffer_to_buffer(
+        &self,
+        encoder: &mut Self::CommandEncoder,
+        src: &Self::Buffer,
+        src_offset: u64,
+        dst: &Self::Buffer,
+        dst_offset: u64,
+        size: u64,
+    );
+
     fn copy_texture_to_texture(
         &self,
         encoder: &mut Self::CommandEncoder,
@@ -147,6 +158,48 @@ pub trait RenderDevice: 'static {
         w as f32 / h as f32
     }
 
+    // -- Timestamp queries --
+
+    /// Create a timestamp query set with `count` slots.
+    ///
+    /// Returns `None` if the backend does not support timestamp queries.
+    #[allow(unused_variables)]
+    fn create_query_set(&self, count: u32) -> Option<Self::QuerySet> {
+        None
+    }
+
+    /// Resolve timestamp queries into a destination buffer for CPU readback.
+    ///
+    /// Writes the raw u64 tick values for indices in `range` into `dest` at
+    /// offset 0; `dest` needs `QUERY_RESOLVE` usage. No-op on unsupported backends.
+    #[allow(unused_variables)]
+    fn resolve_query_set(
+        &self,
+        encoder: &mut Self::CommandEncoder,
+        query_set: &Self::QuerySet,
+        range: std::ops::Range<u32>,
+        dest: &Self::Buffer,
+    ) {
+    }
+
+    /// Nanoseconds per GPU timestamp tick.
+    ///
+    /// Used to convert raw tick differences into wall-clock durations.
+    /// Returns `1.0` by default (backends override with the actual period).
+    fn timestamp_period_ns(&self) -> f32 {
+        1.0
+    }
+
+    /// Read resolved timestamp data from a MAP_READ buffer.
+    ///
+    /// Returns the raw u64 tick values for `count` timestamps. Backends may
+    /// block until the GPU has finished writing the buffer. Returns an empty
+    /// vec if the readback fails or timestamps are unsupported (the default).
+    #[allow(unused_variables)]
+    fn read_timestamp_buffer(&self, buffer: &Self::Buffer, count: u32) -> Vec<u64> {
+        Vec::new()
+    }
+
     /// Encode a MetalFX temporal upscale pass.
     ///
     /// Default implementation is a no-op (non-Metal backends). `MetalDevice`
diff --git a/crates/euca-rhi/src/metal_backend.rs b/crates/euca-rhi/src/metal_backend.rs
index c10a7cf..813e893 100644
--- a/crates/euca-rhi/src/metal_backend.rs
+++ b/crates/euca-rhi/src/metal_backend.rs
@@ -844,6 +844,7 @@ impl RenderDevice for MetalDevice {
     type RenderPass<'a> = MetalRenderPass<'a>;
     type ComputePass<'a> = MetalComputePass<'a>;
     type SurfaceTexture = MetalSurfaceTexture;
+    type QuerySet = ();
 
     fn capabilities(&self) -> &Capabilities {
         &self.capabilities
@@ -1422,6 +1423,31 @@ impl RenderDevice for MetalDevice {
         }
     }
 
+    fn copy_buffer_to_buffer(
+        &self,
+        encoder: &mut MetalCommandEncoder,
+        src: &MetalBuffer,
+        src_offset: u64,
+        dst: &MetalBuffer,
+        dst_offset: u64,
+        size: u64,
+    ) {
+        unsafe {
+            let blit = encoder
+                .command_buffer
+                .blitCommandEncoder()
+                .expect("Failed to create Metal blit encoder for buffer copy");
+            blit.copyFromBuffer_sourceOffset_toBuffer_destinationOffset_size(
+                &src.0,
+                src_offset as usize,
+                &dst.0,
+                dst_offset as usize,
+                size as usize,
+            );
+            blit.endEncoding();
+        }
+    }
+
     fn copy_texture_to_texture(
         &self,
         encoder: &mut MetalCommandEncoder,
@@ -1584,7 +1610,9 @@ impl RenderDevice for MetalDevice {
         reset: bool,
     ) {
         if let Some(scaler) = upscaler.downcast_ref::<MetalFXUpscaler>() {
-            scaler.encode(encoder, color, depth, motion, output, jitter_x, jitter_y, reset);
+            scaler.encode(
+                encoder, color, depth, motion, output, jitter_x, jitter_y, reset,
+            );
         } else {
             log::warn!("encode_metalfx_upscale: upscaler is not a MetalFXUpscaler — skipping");
         }
diff --git a/crates/euca-rhi/src/types.rs b/crates/euca-rhi/src/types.rs
index 264c8f2..a4796b4 100644
--- a/crates/euca-rhi/src/types.rs
+++ b/crates/euca-rhi/src/types.rs
@@ -270,6 +270,7 @@ bitflags! {
     UNIFORM       = 1 << 6;
     STORAGE       = 1 << 7;
     INDIRECT      = 1 << 8;
+    QUERY_RESOLVE = 1 << 9;
 }
 
 bitflags! {
@@ -855,10 +856,25 @@ pub struct RenderPassDepthStencilAttachment<'a, D: RenderDevice + ?Sized> {
     pub stencil_ops: Option<Operations<u32>>,
 }
 
+/// Timestamp query writes for a render or compute pass.
+///
+/// When attached to a pass descriptor, the GPU writes timestamps at the
+/// beginning and/or end of the pass into the given query set indices.
+pub struct RenderPassTimestampWrites<'a, D: RenderDevice + ?Sized> {
+    /// The query set that receives the timestamp values.
+    pub query_set: &'a D::QuerySet,
+    /// Query index for the beginning-of-pass timestamp, or `None` to skip.
+    pub beginning_of_pass_write_index: Option<u32>,
+    /// Query index for the end-of-pass timestamp, or `None` to skip.
+    pub end_of_pass_write_index: Option<u32>,
+}
+
 pub struct RenderPassDesc<'a, D: RenderDevice + ?Sized> {
     pub label: Option<&'a str>,
     pub color_attachments: &'a [Option<RenderPassColorAttachment<'a, D>>],
     pub depth_stencil_attachment: Option<RenderPassDepthStencilAttachment<'a, D>>,
+    /// Optional GPU timestamp writes for per-pass timing.
+    pub timestamp_writes: Option<RenderPassTimestampWrites<'a, D>>,
 }
 
 // ---------------------------------------------------------------------------
@@ -930,6 +946,8 @@ pub struct Capabilities {
     /// Whether memoryless render targets are supported (tile memory only,
     /// saves ~20% bandwidth for transient G-buffer attachments).
     pub memoryless_render_targets: bool,
+    /// Whether GPU timestamp queries are supported for per-pass timing.
+    pub timestamp_query: bool,
 }
 
 impl Default for Capabilities {
@@ -948,6 +966,7 @@ impl Default for Capabilities {
             apple_silicon: false,
             max_buffer_length: 256 * 1024 * 1024, // 256 MiB conservative default
             memoryless_render_targets: false,
+            timestamp_query: false,
         }
     }
 }
diff --git a/crates/euca-rhi/src/wgpu_backend.rs b/crates/euca-rhi/src/wgpu_backend.rs
index d5ca921..4665e27 100644
--- a/crates/euca-rhi/src/wgpu_backend.rs
+++ b/crates/euca-rhi/src/wgpu_backend.rs
@@ -414,6 +414,9 @@ impl From<BufferUsages> for wgpu::BufferUsages {
         if u.contains(BufferUsages::INDIRECT) {
             out |= Self::INDIRECT;
         }
+        if u.contains(BufferUsages::QUERY_RESOLVE) {
+            out |= Self::QUERY_RESOLVE;
+        }
         out
     }
 }
@@ -646,6 +649,7 @@ impl RenderDevice for WgpuDevice {
     type RenderPass<'a> = wgpu::RenderPass<'a>;
     type ComputePass<'a> = wgpu::ComputePass<'a>;
     type SurfaceTexture = wgpu::SurfaceTexture;
+    type QuerySet = wgpu::QuerySet;
 
     fn capabilities(&self) -> &Capabilities {
         &self.capabilities
@@ -925,11 +929,20 @@ impl RenderDevice for WgpuDevice {
             }
         });
 
+        let ts_writes = desc
+            .timestamp_writes
+            .as_ref()
+            .map(|tw| wgpu::RenderPassTimestampWrites {
+                query_set: tw.query_set,
+                beginning_of_pass_write_index: tw.beginning_of_pass_write_index,
+                end_of_pass_write_index: tw.end_of_pass_write_index,
+            });
+
         encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
             label: desc.label,
             color_attachments: &color_attachments,
             depth_stencil_attachment: depth_stencil,
-            timestamp_writes: None,
+            timestamp_writes: ts_writes,
             occlusion_query_set: None,
         })
     }
@@ -955,6 +968,18 @@ impl RenderDevice for WgpuDevice {
         encoder.clear_buffer(buffer, offset, size);
     }
 
+    fn copy_buffer_to_buffer(
+        &self,
+        encoder: &mut wgpu::CommandEncoder,
+        src: &wgpu::Buffer,
+        src_offset: u64,
+        dst: &wgpu::Buffer,
+        dst_offset: u64,
+        size: u64,
+    ) {
+        encoder.copy_buffer_to_buffer(src, src_offset, dst, dst_offset, size);
+    }
+
     fn copy_texture_to_texture(
         &self,
         encoder: &mut wgpu::CommandEncoder,
@@ -1032,6 +1057,68 @@ impl RenderDevice for WgpuDevice {
     fn surface_size(&self) -> (u32, u32) {
         (self.surface_config.width, self.surface_config.height)
     }
+
+    fn create_query_set(&self, count: u32) -> Option<wgpu::QuerySet> {
+        if !self.capabilities.timestamp_query {
+            return None;
+        }
+        Some(self.device.create_query_set(&wgpu::QuerySetDescriptor {
+            label: Some("GPU Timestamp Queries"),
+            ty: wgpu::QueryType::Timestamp,
+            count,
+        }))
+    }
+
+    fn resolve_query_set(
+        &self,
+        encoder: &mut wgpu::CommandEncoder,
+        query_set: &wgpu::QuerySet,
+        range: std::ops::Range<u32>,
+        dest: &wgpu::Buffer,
+    ) {
+        encoder.resolve_query_set(query_set, range, dest, 0);
+    }
+
+    fn timestamp_period_ns(&self) -> f32 {
+        self.queue.get_timestamp_period()
+    }
+
+    fn read_timestamp_buffer(&self, buffer: &wgpu::Buffer, count: u32) -> Vec<u64> {
+        if count == 0 {
+            return Vec::new();
+        }
+        let byte_len = (count as u64) * std::mem::size_of::<u64>() as u64;
+        let slice = buffer.slice(..byte_len);
+
+        // Request mapping and synchronously wait for the GPU to finish.
+        // This stalls until the queue is empty, which is acceptable for profiling paths.
+        let (sender, receiver) = std::sync::mpsc::channel();
+        slice.map_async(wgpu::MapMode::Read, move |result| {
+            let _ = sender.send(result);
+        });
+        // Poll until the GPU work (including the buffer copy) completes.
+        loop {
+            match self.device.poll(wgpu::PollType::Poll) {
+                Ok(status) if status.is_queue_empty() => break,
+                Err(_) => return Vec::new(),
+                _ => std::thread::yield_now(),
+            }
+        }
+
+        match receiver.recv() {
+            Ok(Ok(())) => {
+                let data = slice.get_mapped_range();
+                let timestamps: Vec<u64> = data
+                    .chunks_exact(8)
+                    .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
+                    .collect();
+                drop(data);
+                buffer.unmap();
+                timestamps
+            }
+            _ => Vec::new(),
+        }
+    }
 }
 
 // ===========================================================================
diff --git a/crates/euca-rhi/tests/metal_smoke.rs b/crates/euca-rhi/tests/metal_smoke.rs
index c12cf48..fab330c 100644
--- a/crates/euca-rhi/tests/metal_smoke.rs
+++ b/crates/euca-rhi/tests/metal_smoke.rs
@@ -267,6 +267,7 @@ fn metal_offscreen_render_pass() {
                     },
                 })],
                 depth_stencil_attachment: None,
+                timestamp_writes: None,
             },
         );
         pass.set_pipeline(&pipeline);
diff --git a/examples/metal_combined.rs b/examples/metal_combined.rs
index dbb95a0..95001f2 100644
--- a/examples/metal_combined.rs
+++ b/examples/metal_combined.rs
@@ -394,6 +394,7 @@ fn main() {
                                     }),
                                     stencil_ops: None,
                                 }),
+                                timestamp_writes: None,
                             },
                         );
                         pass.set_pipeline(pipeline);
diff --git a/examples/metal_cubes.rs b/examples/metal_cubes.rs
index f0fb861..620d19f 100644
--- a/examples/metal_cubes.rs
+++ b/examples/metal_cubes.rs
@@ -323,6 +323,7 @@ fn main() {
                                     }),
                                     stencil_ops: None,
                                 }),
+                                timestamp_writes: None,
                             },
                         );
                         pass.set_pipeline(pipeline);
diff --git a/examples/metal_fx_upscale.rs b/examples/metal_fx_upscale.rs
index 2b5dbbd..5de1fea 100644
--- a/examples/metal_fx_upscale.rs
+++ b/examples/metal_fx_upscale.rs
@@ -449,6 +449,7 @@ fn main() {
                                     }),
                                     stencil_ops: None,
                                 }),
+                                timestamp_writes: None,
                             },
                         );
                         pass.set_pipeline(pipeline);
diff --git a/examples/metal_mesh_stress.rs b/examples/metal_mesh_stress.rs
index de3446c..65e48a7 100644
--- a/examples/metal_mesh_stress.rs
+++ b/examples/metal_mesh_stress.rs
@@ -406,6 +406,7 @@ fn main() {
                                     }),
                                     stencil_ops: None,
                                 }),
+                                timestamp_writes: None,
                             },
                         );
                         pass.set_pipeline(pipeline);
diff --git a/examples/metal_stress.rs b/examples/metal_stress.rs
index d412d07..0c6f1ab 100644
--- a/examples/metal_stress.rs
+++ b/examples/metal_stress.rs
@@ -409,6 +409,7 @@ fn main() {
                                     }),
                                     stencil_ops: None,
                                 }),
+                                timestamp_writes: None,
                             },
                         );
                         pass.set_pipeline(pipeline);