From cdc2ba2022b8f380dbd3c8b41c8e4c14d34049e0 Mon Sep 17 00:00:00 2001 From: Nick Hainke Date: Wed, 17 Jun 2026 12:31:52 +0200 Subject: [PATCH] Fix GPU OOM on models larger than VRAM by reducing layer count on load failure When a large model (e.g. Gemma 4 26B-A4B at 18 GB) fails to load because llama.cpp tries to allocate all layers on a small GPU (3.7 GB VRAM), the load itself throws before the probe loop can shed layers. Reduce the layer count until loading is successful. --- ltengine/src/llm.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs index 056b9b5..0b69e53 100644 --- a/ltengine/src/llm.rs +++ b/ltengine/src/llm.rs @@ -93,10 +93,23 @@ impl LLM { let (model, gpu_layers) = if use_gpu { let mut n_gpu = 9999u32; let model = loop { - let model = LlamaModel::load_from_file( + let model = match LlamaModel::load_from_file( &backend, &model_path, &LlamaModelParams::default().with_n_gpu_layers(n_gpu), - ).with_context(|| "Unable to load model")?; + ) { + Ok(m) => m, + Err(_) => { + // Load failed (likely GPU OOM before probe). On the first failure + // jump to 64 (covers most models); after that halve to converge fast. + let next = if n_gpu >= 9999 { 64 } else { n_gpu / 2 }; + eprintln!("ltengine: model load failed at {} GPU layers, retrying with {}", n_gpu, next); + n_gpu = next; + if n_gpu == 0 { + return Err(anyhow::anyhow!("Unable to load model even with 0 GPU layers")); + } + continue; + } + }; // Probe: create a minimal context and decode one token to confirm // the GPU has enough VRAM for compute scratch buffers.