Skip to content

Commit d7e6a0d

Browse files
Perf: parallel extract workers + download largest wheels first
Two optimizations to reduce pipeline stalls:

1. Sort queue by size descending — start downloading the biggest wheels (torch 873MB, nvidia-cudnn 674MB) first, so extraction can begin sooner and pipeline better with the remaining downloads.
2. Multiple extract workers (4) — instead of one extract worker processing wheels serially, spawn N workers pulling from the same channel. Each gets extract_threads/N rayon threads. This prevents small wheels from queuing behind large ones.

Cold start: 34.9s → 32.6s (health), 43.7s → 41.8s (inference)
Warm start: 5.3s → 4.6s

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a88c2f2 commit d7e6a0d

2 files changed

Lines changed: 123 additions & 86 deletions

File tree

crates/zs-fast-wheel/src/daemon.rs

Lines changed: 104 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,18 @@ pub struct DaemonConfig {
2929
pub extract_threads: usize,
3030
}
3131

32+
impl DaemonConfig {
33+
/// Number of parallel extract workers. Each worker pulls wheels from the
34+
/// channel and extracts independently, preventing small wheels from queuing
35+
/// behind large ones. Each worker gets `extract_threads / workers` rayon threads.
36+
pub fn extract_workers(&self) -> usize {
37+
// 4 workers is a good default: allows 4 wheels to extract simultaneously,
38+
// each with ~6-7 threads on a 26-core machine.
39+
// Minimum 1, cap at extract_threads (no point having more workers than threads).
40+
4.min(self.extract_threads)
41+
}
42+
}
43+
3244
impl Default for DaemonConfig {
3345
fn default() -> Self {
3446
Self {
@@ -121,10 +133,11 @@ impl DaemonEngine {
121133
.pool_max_idle_per_host(config.parallel_downloads)
122134
.build()?;
123135

124-
// Channel: downloaded wheels flow from download workers → extract worker.
125-
// Small capacity (4) provides backpressure — if extraction is slow,
126-
// downloads pause rather than filling disk with temp files.
127-
let (tx, rx) = tokio::sync::mpsc::channel::<DownloadedWheel>(4);
136+
// Channel: downloaded wheels flow from download workers → extract workers.
137+
// Capacity = 2 * extract_workers provides enough buffering for workers to
138+
// stay busy while providing backpressure to avoid filling disk with temp files.
139+
let num_workers = config.extract_workers();
140+
let (tx, rx) = tokio::sync::mpsc::channel::<DownloadedWheel>(num_workers * 2);
128141

129142
let tmp_dir = tempfile::tempdir().context("failed to create temp dir")?;
130143
let tmp_path = tmp_dir.path().to_path_buf();
@@ -191,76 +204,95 @@ impl DaemonEngine {
191204
drop(tx);
192205

193206
// === Extract stage ===
194-
// Single blocking loop: receives downloaded wheels, extracts each immediately.
195-
// Extraction uses all extract_threads for parallelism within a single wheel.
196-
let site_packages = config.site_packages.clone();
197-
let ext_threads = config.extract_threads;
198-
let stats = self.stats.clone();
199-
let completion = self.completion.clone();
200-
let queue = self.queue.clone();
201-
let total_wheels = self.total_wheels;
202-
203-
let extract_handle = tokio::task::spawn_blocking(move || {
204-
let rx = rx;
205-
// blocking_recv in a loop — channel closes when all downloads finish
206-
let mut rx = rx;
207-
while let Some(downloaded) = rx.blocking_recv() {
208-
let dist = downloaded.spec.distribution.clone();
209-
let extract_start = Instant::now();
210-
211-
let result = extract::extract_wheel_atomic(
212-
&downloaded.path,
213-
&site_packages,
214-
&dist,
215-
ext_threads,
216-
true,
217-
&stats,
218-
);
219-
220-
let (lock, cvar) = &*completion;
221-
222-
match result {
223-
Ok(()) => {
224-
let elapsed = extract_start.elapsed();
225-
tracing::info!(
226-
"[{dist}] extracted in {:.1}s",
227-
elapsed.as_secs_f64()
228-
);
229-
230-
{
231-
let mut q = queue.lock().unwrap();
232-
q.mark_done(&dist);
207+
// Multiple extract workers pull from the same channel, extracting different
208+
// wheels in parallel. Each worker gets a share of the total extract threads.
209+
// This prevents small wheels from queuing behind large ones.
210+
let num_extract_workers = config.extract_workers();
211+
let threads_per_worker = (config.extract_threads / num_extract_workers).max(1);
212+
let rx = Arc::new(tokio::sync::Mutex::new(rx));
213+
214+
let mut extract_handles = Vec::new();
215+
for worker_id in 0..num_extract_workers {
216+
let site_packages = config.site_packages.clone();
217+
let stats = self.stats.clone();
218+
let completion = self.completion.clone();
219+
let queue = self.queue.clone();
220+
let total_wheels = self.total_wheels;
221+
let rx = rx.clone();
222+
223+
let handle = tokio::task::spawn_blocking(move || {
224+
loop {
225+
// Lock channel briefly to receive next wheel
226+
let downloaded = {
227+
let mut rx = rx.blocking_lock();
228+
rx.blocking_recv()
229+
};
230+
let downloaded = match downloaded {
231+
Some(d) => d,
232+
None => break, // channel closed
233+
};
234+
235+
let dist = downloaded.spec.distribution.clone();
236+
let extract_start = Instant::now();
237+
238+
tracing::debug!("[{dist}] extract worker {worker_id} starting");
239+
240+
let result = extract::extract_wheel_atomic(
241+
&downloaded.path,
242+
&site_packages,
243+
&dist,
244+
threads_per_worker,
245+
true,
246+
&stats,
247+
);
248+
249+
let (lock, cvar) = &*completion;
250+
251+
match result {
252+
Ok(()) => {
253+
let elapsed = extract_start.elapsed();
254+
tracing::info!(
255+
"[{dist}] extracted in {:.1}s (worker {worker_id})",
256+
elapsed.as_secs_f64()
257+
);
258+
259+
{
260+
let mut q = queue.lock().unwrap();
261+
q.mark_done(&dist);
262+
}
263+
264+
let mut state = lock.lock().unwrap();
265+
state.done.insert(dist);
266+
if state.done.len() + state.failed.len() >= total_wheels {
267+
state.all_finished = true;
268+
}
269+
cvar.notify_all();
233270
}
234-
235-
let mut state = lock.lock().unwrap();
236-
state.done.insert(dist);
237-
if state.done.len() + state.failed.len() >= total_wheels {
238-
state.all_finished = true;
271+
Err(e) => {
272+
let err_msg = format!("{e:#}");
273+
tracing::error!("[{dist}] extraction failed: {err_msg}");
274+
275+
{
276+
let mut q = queue.lock().unwrap();
277+
q.mark_failed(&dist);
278+
}
279+
280+
let mut state = lock.lock().unwrap();
281+
state.failed.insert(dist, err_msg);
282+
if state.done.len() + state.failed.len() >= total_wheels {
283+
state.all_finished = true;
284+
}
285+
cvar.notify_all();
239286
}
240-
cvar.notify_all();
241287
}
242-
Err(e) => {
243-
let err_msg = format!("{e:#}");
244-
tracing::error!("[{dist}] extraction failed: {err_msg}");
245-
246-
{
247-
let mut q = queue.lock().unwrap();
248-
q.mark_failed(&dist);
249-
}
250288

251-
let mut state = lock.lock().unwrap();
252-
state.failed.insert(dist, err_msg);
253-
if state.done.len() + state.failed.len() >= total_wheels {
254-
state.all_finished = true;
255-
}
256-
cvar.notify_all();
257-
}
289+
// Clean up temp file
290+
let _ = std::fs::remove_file(&downloaded.path);
258291
}
292+
});
259293

260-
// Clean up temp file
261-
let _ = std::fs::remove_file(&downloaded.path);
262-
}
263-
});
294+
extract_handles.push(handle);
295+
}
264296

265297
// === Wait for download failures ===
266298
// Collect download errors and mark them as failed
@@ -277,8 +309,10 @@ impl DaemonEngine {
277309
}
278310
}
279311

280-
// Wait for extract worker to finish
281-
extract_handle.await?;
312+
// Wait for all extract workers to finish
313+
for handle in extract_handles {
314+
handle.await?;
315+
}
282316

283317
// Mark all finished
284318
{

crates/zs-fast-wheel/src/queue.rs

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ use crate::manifest::WheelSpec;
44

55
/// Priority queue for wheel installation scheduling.
66
///
7-
/// Default order: small wheels first, large last.
7+
/// Default order: large wheels first (start downloading big wheels early so
8+
/// extraction can pipeline with remaining downloads).
89
/// Supports demand-driven reprioritization via `prioritize()`.
910
pub struct InstallQueue {
1011
/// Wheels not yet started, ordered by priority
@@ -17,9 +18,10 @@ pub struct InstallQueue {
1718

1819
impl InstallQueue {
1920
/// Create a new queue from a list of wheel specs.
20-
/// Sorts by size ascending (small wheels first).
21+
/// Sorts by size descending (large wheels first — download big wheels early
22+
/// so extraction starts sooner and pipelines better with remaining downloads).
2123
pub fn new(mut wheels: Vec<WheelSpec>) -> Self {
22-
wheels.sort_by_key(|w| w.size);
24+
wheels.sort_by_key(|w| std::cmp::Reverse(w.size));
2325
Self {
2426
pending: wheels.into(),
2527
in_progress: HashSet::new(),
@@ -99,7 +101,7 @@ mod tests {
99101
}
100102

101103
#[test]
102-
fn test_sorts_by_size_ascending() {
104+
fn test_sorts_by_size_descending() {
103105
let wheels = vec![
104106
make_wheel("torch", 900_000_000),
105107
make_wheel("six", 12_000),
@@ -108,11 +110,11 @@ mod tests {
108110
let mut queue = InstallQueue::new(wheels);
109111

110112
let first = queue.next().unwrap();
111-
assert_eq!(first.distribution, "six");
113+
assert_eq!(first.distribution, "torch");
112114
let second = queue.next().unwrap();
113115
assert_eq!(second.distribution, "numpy");
114116
let third = queue.next().unwrap();
115-
assert_eq!(third.distribution, "torch");
117+
assert_eq!(third.distribution, "six");
116118
assert!(queue.next().is_none());
117119
}
118120

@@ -125,9 +127,10 @@ mod tests {
125127
];
126128
let mut queue = InstallQueue::new(wheels);
127129

128-
queue.prioritize("torch");
130+
// six is last (smallest) — prioritize moves it to front
131+
queue.prioritize("six");
129132
let first = queue.next().unwrap();
130-
assert_eq!(first.distribution, "torch");
133+
assert_eq!(first.distribution, "six");
131134
}
132135

133136
#[test]
@@ -139,13 +142,13 @@ mod tests {
139142
let mut queue = InstallQueue::new(wheels);
140143

141144
let first = queue.next().unwrap();
142-
assert_eq!(first.distribution, "six");
143-
queue.mark_done("six");
145+
assert_eq!(first.distribution, "torch"); // largest first
146+
queue.mark_done("torch");
144147

145148
// Prioritizing a done package should be a no-op
146-
queue.prioritize("six");
149+
queue.prioritize("torch");
147150
let second = queue.next().unwrap();
148-
assert_eq!(second.distribution, "torch");
151+
assert_eq!(second.distribution, "six");
149152
}
150153

151154
#[test]
@@ -157,13 +160,13 @@ mod tests {
157160
];
158161
let mut queue = InstallQueue::new(wheels);
159162

160-
let first = queue.next().unwrap(); // six is now in_progress
161-
assert_eq!(first.distribution, "six");
163+
let first = queue.next().unwrap(); // torch is now in_progress (largest first)
164+
assert_eq!(first.distribution, "torch");
162165

163166
// Prioritizing an in-progress package should be a no-op
164-
queue.prioritize("six");
167+
queue.prioritize("torch");
165168
let second = queue.next().unwrap();
166-
assert_eq!(second.distribution, "numpy"); // not six again
169+
assert_eq!(second.distribution, "numpy"); // not torch again
167170
}
168171

169172
#[test]

0 commit comments

Comments
 (0)