Commit ce77e2e

fix: auto-disable mmap when all layers offloaded to GPU (#1964)
When n_gpu_layers=-1, the entire model file stays memory-mapped in RAM (via mmap) even after all weights are copied to VRAM. This causes unexpectedly high host RAM usage that is not released until the process exits. This fix automatically disables mmap when all layers are offloaded to GPU and GPU offload is supported. With mmap disabled, llama.cpp uses a temporary read buffer that is freed after GPU upload, significantly reducing host RAM consumption. The behavior can be overridden by explicitly passing use_mmap=True.
1 parent c37132b commit ce77e2e
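For context, a minimal usage sketch of the new default behavior (the model path below is a placeholder and not part of this commit; per the commit message, an explicit use_mmap=True keeps the old memory-mapped behavior):

from llama_cpp import Llama

# Full GPU offload: mmap is now disabled automatically, so host RAM is
# released once the weights have been uploaded to VRAM.
llm = Llama(model_path="./models/example.gguf", n_gpu_layers=-1)

# Explicit opt-in keeps the previous memory-mapped behavior.
llm_mmap = Llama(model_path="./models/example.gguf", n_gpu_layers=-1, use_mmap=True)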

File tree (3 files changed: +94, -1 lines)

llama_cpp/llama.py
llama_cpp/server/settings.py
tests/test_mmap_gpu_offload.py

llama_cpp/llama.py

Lines changed: 17 additions & 0 deletions
@@ -242,6 +242,23 @@ def __init__(
         ) # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
+
+        # When all layers are offloaded to GPU (n_gpu_layers == -1), disable mmap
+        # to prevent the memory-mapped model file from staying resident in RAM.
+        # With mmap enabled, the entire model file remains in the page cache even
+        # after weights are copied to VRAM. Disabling mmap causes llama.cpp to use
+        # a temporary read buffer that is freed after GPU upload.
+        # See: https://github.com/abetlen/llama-cpp-python/issues/1964
+        if n_gpu_layers == -1 and use_mmap and llama_cpp.llama_supports_gpu_offload():
+            if self.verbose:
+                print(
+                    "Automatically disabling mmap because all layers are offloaded "
+                    "to GPU (n_gpu_layers=-1). This reduces host RAM usage. "
+                    "Set use_mmap=True explicitly to override this behavior.",
+                    file=sys.stderr,
+                )
+            use_mmap = False
+
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
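One way to observe the effect of this change is to check the process's resident set size after loading the model. The sketch below uses psutil, which is an external dependency and not part of this commit; the model path is a placeholder.

import os

import psutil
from llama_cpp import Llama

proc = psutil.Process(os.getpid())

# With n_gpu_layers=-1 the guard above turns mmap off, so the model file is
# read through a temporary buffer instead of staying resident in the page cache.
llm = Llama(model_path="./models/example.gguf", n_gpu_layers=-1)
print(f"RSS after load: {proc.memory_info().rss / 2**30:.2f} GiB")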

llama_cpp/server/settings.py

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ class ModelSettings(BaseSettings):
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_supports_mmap(),
-        description="Use mmap.",
+        description="Use mmap. When n_gpu_layers is -1 (full GPU offload), mmap is automatically "
+        "disabled to reduce host RAM usage unless explicitly set to True.",
     )
     use_mlock: bool = Field(
         default=llama_cpp.llama_supports_mlock(),
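A sketch of how the field above is consumed when configuring the server programmatically (values are illustrative; assumes the server extras are installed so that ModelSettings can be imported):

from llama_cpp.server.settings import ModelSettings

# Full offload: the Llama constructor will auto-disable mmap for this model.
settings = ModelSettings(model="./models/example.gguf", n_gpu_layers=-1)

# Explicitly requesting mmap keeps it enabled despite full offload.
settings_mmap = ModelSettings(model="./models/example.gguf", n_gpu_layers=-1, use_mmap=True)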

tests/test_mmap_gpu_offload.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Tests for automatic mmap disabling when all layers are offloaded to GPU.
+
+See: https://github.com/abetlen/llama-cpp-python/issues/1964
+"""
+
+import sys
+from unittest.mock import MagicMock
+from dataclasses import dataclass, field
+
+# Stub the native C library so tests can run without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
+
+@dataclass
+class MockModelParams:
+    """Mimics the relevant fields of llama_model_params for testing."""
+    n_gpu_layers: int = 0
+    use_mmap: bool = True
+
+
+def _apply_mmap_logic(n_gpu_layers: int, use_mmap: bool, gpu_offload_supported: bool) -> bool:
+    """Replicate the mmap auto-disable logic from Llama.__init__."""
+    if n_gpu_layers == -1 and use_mmap and gpu_offload_supported:
+        return False
+    return use_mmap
+
+
+def test_mmap_disabled_when_all_layers_offloaded():
+    """When n_gpu_layers=-1 and GPU offload is supported, use_mmap should be set to False."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_kept_when_partial_offload():
+    """When n_gpu_layers is not -1, use_mmap should remain True."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_kept_when_no_gpu_support():
+    """When GPU offload is not supported, use_mmap should remain True even with n_gpu_layers=-1."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=False)
+    assert result is True
+
+
+def test_mmap_kept_when_zero_gpu_layers():
+    """When n_gpu_layers=0, use_mmap should remain True (CPU-only inference)."""
+    result = _apply_mmap_logic(n_gpu_layers=0, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_respects_explicit_false():
+    """When user explicitly sets use_mmap=False, it should stay False regardless."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=False, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_disabled_applies_to_params():
+    """Verify the logic correctly updates a MockModelParams object."""
+    params = MockModelParams(n_gpu_layers=-1, use_mmap=True)
+    params.use_mmap = _apply_mmap_logic(
+        n_gpu_layers=params.n_gpu_layers,
+        use_mmap=params.use_mmap,
+        gpu_offload_supported=True,
+    )
+    assert params.use_mmap is False
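Note that these tests exercise a standalone replica of the guard (_apply_mmap_logic) rather than Llama.__init__ itself, and the MagicMock stubs at the top let them run under plain pytest without a compiled llama.cpp backend or a GPU present.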
