From cdc2ba2022b8f380dbd3c8b41c8e4c14d34049e0 Mon Sep 17 00:00:00 2001
From: Nick Hainke <vincent@systemli.org>
Date: Wed, 17 Jun 2026 12:31:52 +0200
Subject: [PATCH] Fix GPU OOM on models larger than VRAM by reducing layer
 count on load failure

When a large model (e.g. Gemma 4 26B-A4B at 18 GB) fails to load because
llama.cpp tries to allocate all layers on a small GPU (3.7 GB VRAM), the
load itself fails before the probe loop can shed layers. Retry the load
with a reduced GPU layer count until it succeeds.
---
 ltengine/src/llm.rs | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/ltengine/src/llm.rs b/ltengine/src/llm.rs
index 056b9b5..0b69e53 100644
--- a/ltengine/src/llm.rs
+++ b/ltengine/src/llm.rs
@@ -93,10 +93,23 @@ impl LLM {
         let (model, gpu_layers) = if use_gpu {
             let mut n_gpu = 9999u32;
             let model = loop {
-                let model = LlamaModel::load_from_file(
+                let model = match LlamaModel::load_from_file(
                     &backend, &model_path,
                     &LlamaModelParams::default().with_n_gpu_layers(n_gpu),
-                ).with_context(|| "Unable to load model")?;
+                ) {
+                    Ok(m) => m,
+                    Err(_) => {
+                        // Load failed (likely GPU OOM before probe). On the first failure
+                        // jump to 64 (covers most models); after that halve to converge fast.
+                        let next = if n_gpu >= 9999 { 64 } else { n_gpu / 2 };
+                        eprintln!("ltengine: model load failed at {} GPU layers, retrying with {}", n_gpu, next);
+                        n_gpu = next;
+                        if n_gpu == 0 {
+                            return Err(anyhow::anyhow!("Unable to load model even with 0 GPU layers"));
+                        }
+                        continue;
+                    }
+                };
 
                 // Probe: create a minimal context and decode one token to confirm
                 // the GPU has enough VRAM for compute scratch buffers.