
Commit 64b087c

Author: Ralf Waldukat (committed)

Fix flash_attn default to match upstream AUTO behavior

Critical fixes from code review:
- server/settings.py: Change flash_attn default from False to None (AUTO). Upstream llama.cpp defaults to LLAMA_FLASH_ATTN_TYPE_AUTO; the server was incorrectly forcing DISABLED, blocking the optimization for models that need it.
- llama_cpp.py: Consistent stub style (pass -> ...) for llama_max_tensor_buft_overrides.
- CMakeLists.txt: Document the version workaround for the mtmd build.

1 parent 77b13a4 commit 64b087c

File tree: 3 files changed, +5 −2 lines changed

CMakeLists.txt (3 additions, 0 deletions)

@@ -154,6 +154,9 @@ if (LLAMA_BUILD)
     endif()

     # Set version for mtmd (required by upstream CMakeLists.txt)
+    # NOTE: This is a workaround for mtmd build requirements.
+    # Version is set to 0.0.0 for local builds. If upstream adds version
+    # compatibility checks, this may need to match llama.cpp version.
     if (NOT DEFINED LLAMA_BUILD_NUMBER)
         set(LLAMA_BUILD_NUMBER 0)
     endif()

llama_cpp/llama_cpp.py (1 addition, 1 deletion)

@@ -1400,7 +1400,7 @@ def llama_supports_rpc() -> bool: ...
 @ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
 def llama_max_tensor_buft_overrides() -> int:
     """Get maximum number of tensor buffer type overrides"""
-    pass
+    ...


 # LLAMA_API enum llama_params_fit_status llama_params_fit(
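The `pass` → `...` change is stylistic but meaningful: in this binding layer the decorated function body is never executed, because the decorator replaces it with a dispatcher into the shared library, and `...` signals "intentional stub" more clearly than `pass`. A minimal sketch of the pattern, with a hypothetical `ctypes_function_sketch` decorator standing in for the library's real `@ctypes_function`:

```python
import ctypes
from typing import Callable

def ctypes_function_sketch(name: str, argtypes: list, restype):
    """Hypothetical stand-in for llama-cpp-python's @ctypes_function.

    The decorated stub's body is never run; the decorator returns a
    wrapper that would dispatch to the C function in the shared library.
    """
    def decorator(stub: Callable) -> Callable:
        def wrapper(*args):
            # A real binding would do: getattr(lib, name)(*args).
            # Here we just demonstrate that the stub body is unreachable.
            return f"dispatch {name}{args}"
        wrapper.__doc__ = stub.__doc__  # keep the stub's docstring
        return wrapper
    return decorator

@ctypes_function_sketch("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
def llama_max_tensor_buft_overrides() -> int:
    """Get maximum number of tensor buffer type overrides"""
    ...  # stub body; "..." marks it as intentionally empty
```

Because the body is unreachable, `...` (like in the `llama_supports_rpc` stub visible in the hunk header) is the idiomatic placeholder.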

llama_cpp/server/settings.py (1 addition, 1 deletion)

@@ -104,7 +104,7 @@ class ModelSettings(BaseSettings):
         default=True, description="Whether to offload kqv to the GPU."
     )
     flash_attn: Optional[bool] = Field(
-        default=False,
+        default=None,
         description="Use flash attention. None=auto, True=enabled, False=disabled.",
     )
     # Sampling Params
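The point of the tri-state default can be sketched as a small mapping from the server setting onto upstream's flash-attention mode. The integer values below are illustrative stand-ins, not the actual `LLAMA_FLASH_ATTN_TYPE_*` enum values from llama.cpp:

```python
from typing import Optional

# Illustrative stand-ins for llama.cpp's LLAMA_FLASH_ATTN_TYPE_* enum;
# the real constants are defined in the C API.
FLASH_ATTN_AUTO = -1
FLASH_ATTN_DISABLED = 0
FLASH_ATTN_ENABLED = 1

def resolve_flash_attn(flash_attn: Optional[bool]) -> int:
    """Map the server's tri-state setting onto the upstream mode:
    None -> AUTO (let llama.cpp decide), True -> ENABLED, False -> DISABLED."""
    if flash_attn is None:
        return FLASH_ATTN_AUTO
    return FLASH_ATTN_ENABLED if flash_attn else FLASH_ATTN_DISABLED

# With default=None the server no longer forces DISABLED:
print(resolve_flash_attn(None))   # AUTO: upstream picks per model
print(resolve_flash_attn(False))  # explicit disable is still possible
```

With the old `default=False`, every model silently landed in the DISABLED branch; `default=None` restores upstream's AUTO path while keeping explicit `True`/`False` overrides available.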

0 commit comments