From 3d8584a5e3b161b7455000368ed247c66ffeda07 Mon Sep 17 00:00:00 2001
From: Nick Mitchell
Date: Mon, 23 Feb 2026 15:52:14 -0500
Subject: [PATCH] feat(local): add model pool unload to release GPU memory

When running multiple models sequentially (e.g. multi-model benchmarks),
previously loaded models remained in VRAM, causing OOM errors. Adds
ModelPool::unload_all() and exposes it as spnl::model_pool::unload_all()
so callers can free GPU memory between model runs.

Co-Authored-By: Claude Opus 4.6
Signed-off-by: Nick Mitchell
---
 spnl/src/generate/backend/mistralrs/loader.rs | 6 ++++++
 spnl/src/generate/backend/mistralrs/mod.rs    | 6 ++++++
 spnl/src/lib.rs                               | 9 +++++++++
 3 files changed, 21 insertions(+)

diff --git a/spnl/src/generate/backend/mistralrs/loader.rs b/spnl/src/generate/backend/mistralrs/loader.rs
index f261595a..4eca3b65 100644
--- a/spnl/src/generate/backend/mistralrs/loader.rs
+++ b/spnl/src/generate/backend/mistralrs/loader.rs
@@ -185,6 +185,12 @@ impl ModelPool {
         }
     }

+    /// Unload all models, releasing GPU memory.
+    pub async fn unload_all(&self) {
+        let mut models = self.models.write().await;
+        models.clear();
+    }
+
     /// Get or load a model
     pub async fn get_or_load(&self, model_name: &str) -> anyhow::Result> {
         // Check if model is already loaded
diff --git a/spnl/src/generate/backend/mistralrs/mod.rs b/spnl/src/generate/backend/mistralrs/mod.rs
index 29175d6b..0d612fa6 100644
--- a/spnl/src/generate/backend/mistralrs/mod.rs
+++ b/spnl/src/generate/backend/mistralrs/mod.rs
@@ -271,6 +271,12 @@ pub async fn generate_completion(
     Ok(Query::Par(final_results))
 }

+/// Unload all models from the global pool, releasing GPU memory.
+/// Call between benchmark runs to avoid accumulating models in VRAM.
+pub async fn unload_all_models() {
+    get_model_pool().unload_all().await
+}
+
 /// Generate multiple completions for the same input (Repeat operation)
 pub async fn generate_chat(
     spec: Repeat,
diff --git a/spnl/src/lib.rs b/spnl/src/lib.rs
index 2b9e74c7..f2c931c2 100644
--- a/spnl/src/lib.rs
+++ b/spnl/src/lib.rs
@@ -30,3 +30,12 @@ pub mod gce;

 #[cfg(feature = "vllm")]
 pub mod vllm;
+
+/// Model pool management. Only available with the `local` feature.
+#[cfg(feature = "local")]
+pub mod model_pool {
+    /// Unload all models from the global pool, releasing GPU memory.
+    pub async fn unload_all() {
+        crate::generate::backend::mistralrs::unload_all_models().await
+    }
+}