Commit ce77e2e

fix: auto-disable mmap when all layers offloaded to GPU (#1964)
When n_gpu_layers=-1, the entire model file stays memory-mapped in RAM (via mmap) even after all weights are copied to VRAM. This causes unexpectedly high host RAM usage that is not released until the process exits. This fix automatically disables mmap when all layers are offloaded to GPU and GPU offload is supported. With mmap disabled, llama.cpp uses a temporary read buffer that is freed after GPU upload, significantly reducing host RAM consumption. The behavior can be overridden by explicitly passing use_mmap=True.
1 parent c37132b commit ce77e2e
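For context, a minimal usage sketch of the new default behavior (the model path below is a placeholder and not part of this commit; per the commit message, an explicit use_mmap=True keeps the old memory-mapped behavior):

from llama_cpp import Llama

# Full GPU offload: mmap is now disabled automatically, so host RAM is
# released once the weights have been uploaded to VRAM.
llm = Llama(model_path="./models/example.gguf", n_gpu_layers=-1)

# Explicit opt-in keeps the previous memory-mapped behavior.
llm_mmap = Llama(model_path="./models/example.gguf", n_gpu_layers=-1, use_mmap=True)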

File tree (3 files changed: +94, -1 lines)

llama_cpp/llama.py
llama_cpp/server/settings.py
tests/test_mmap_gpu_offload.py

llama_cpp/llama.py

Lines changed: 17 additions & 0 deletions
@@ -242,6 +242,23 @@ def __init__(
         ) # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
+
+        # When all layers are offloaded to GPU (n_gpu_layers == -1), disable mmap
+        # to prevent the memory-mapped model file from staying resident in RAM.
+        # With mmap enabled, the entire model file remains in the page cache even
+        # after weights are copied to VRAM. Disabling mmap causes llama.cpp to use
+        # a temporary read buffer that is freed after GPU upload.
+        # See: https://github.com/abetlen/llama-cpp-python/issues/1964
+        if n_gpu_layers == -1 and use_mmap and llama_cpp.llama_supports_gpu_offload():
+            if self.verbose:
+                print(
+                    "Automatically disabling mmap because all layers are offloaded "
+                    "to GPU (n_gpu_layers=-1). This reduces host RAM usage. "
+                    "Set use_mmap=True explicitly to override this behavior.",
+                    file=sys.stderr,
+                )
+            use_mmap = False
+
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
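One way to observe the effect of this change is to check the process's resident set size after loading the model. The sketch below uses psutil, which is an external dependency and not part of this commit; the model path is a placeholder.

import os

import psutil
from llama_cpp import Llama

proc = psutil.Process(os.getpid())

# With n_gpu_layers=-1 the guard above turns mmap off, so the model file is
# read through a temporary buffer instead of staying resident in the page cache.
llm = Llama(model_path="./models/example.gguf", n_gpu_layers=-1)
print(f"RSS after load: {proc.memory_info().rss / 2**30:.2f} GiB")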

llama_cpp/server/settings.py

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ class ModelSettings(BaseSettings):
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_supports_mmap(),
-        description="Use mmap.",
+        description="Use mmap. When n_gpu_layers is -1 (full GPU offload), mmap is automatically "
+        "disabled to reduce host RAM usage unless explicitly set to True.",
     )
     use_mlock: bool = Field(
         default=llama_cpp.llama_supports_mlock(),
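A sketch of how the field above is consumed when configuring the server programmatically (values are illustrative; assumes the server extras are installed so that ModelSettings can be imported):

from llama_cpp.server.settings import ModelSettings

# Full offload: the Llama constructor will auto-disable mmap for this model.
settings = ModelSettings(model="./models/example.gguf", n_gpu_layers=-1)

# Explicitly requesting mmap keeps it enabled despite full offload.
settings_mmap = ModelSettings(model="./models/example.gguf", n_gpu_layers=-1, use_mmap=True)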

tests/test_mmap_gpu_offload.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Tests for automatic mmap disabling when all layers are offloaded to GPU.
+
+See: https://github.com/abetlen/llama-cpp-python/issues/1964
+"""
+
+import sys
+from unittest.mock import MagicMock
+from dataclasses import dataclass, field
+
+# Stub the native C library so tests can run without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
+
+@dataclass
+class MockModelParams:
+    """Mimics the relevant fields of llama_model_params for testing."""
+    n_gpu_layers: int = 0
+    use_mmap: bool = True
+
+
+def _apply_mmap_logic(n_gpu_layers: int, use_mmap: bool, gpu_offload_supported: bool) -> bool:
+    """Replicate the mmap auto-disable logic from Llama.__init__."""
+    if n_gpu_layers == -1 and use_mmap and gpu_offload_supported:
+        return False
+    return use_mmap
+
+
+def test_mmap_disabled_when_all_layers_offloaded():
+    """When n_gpu_layers=-1 and GPU offload is supported, use_mmap should be set to False."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_kept_when_partial_offload():
+    """When n_gpu_layers is not -1, use_mmap should remain True."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_kept_when_no_gpu_support():
+    """When GPU offload is not supported, use_mmap should remain True even with n_gpu_layers=-1."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=False)
+    assert result is True
+
+
+def test_mmap_kept_when_zero_gpu_layers():
+    """When n_gpu_layers=0, use_mmap should remain True (CPU-only inference)."""
+    result = _apply_mmap_logic(n_gpu_layers=0, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_respects_explicit_false():
+    """When user explicitly sets use_mmap=False, it should stay False regardless."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=False, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_disabled_applies_to_params():
+    """Verify the logic correctly updates a MockModelParams object."""
+    params = MockModelParams(n_gpu_layers=-1, use_mmap=True)
+    params.use_mmap = _apply_mmap_logic(
+        n_gpu_layers=params.n_gpu_layers,
+        use_mmap=params.use_mmap,
+        gpu_offload_supported=True,
+    )
+    assert params.use_mmap is False
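Note that these tests exercise a standalone replica of the guard (_apply_mmap_logic) rather than Llama.__init__ itself, and the MagicMock stubs at the top let them run under plain pytest without a compiled llama.cpp backend or a GPU present.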
