
Commit 98dde6f

Merge pull request #10 from XyLearningProgramming/bugfix/oom
Allow more memory usage to avoid OOM
2 parents 7ea9394 + de73260 commit 98dde6f

2 files changed (+3, -4 lines)

deploy/helm/values.yaml

Lines changed: 2 additions & 3 deletions

```diff
@@ -79,12 +79,11 @@ env: {}
 
 # Resource requests and limits for the container.
 # See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
-# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) on 1-CPU / 1 GB VPS nodes.
-# Previous values for Q8_0 (805 MB): limits cpu=3/mem=800Mi, requests cpu=50m/mem=32Mi
+# Tuned for Qwen3-0.6B-Q4_K_M (484 MB) + n_ctx=8192 KV cache (~448 MB) on 1-CPU / 1 GB VPS nodes.
 resources:
   limits:
     cpu: 1
-    memory: 700Mi
+    memory: 1Gi
   requests:
     cpu: 200m
     memory: 600Mi
```
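The arithmetic behind the bump is straightforward: the old 700Mi limit cannot hold both the model weights and the enlarged KV cache. A quick sanity check, using only the figures quoted in the values.yaml comments and ignoring overhead from llama.cpp compute buffers and the Python runtime:

```python
# Rough memory budget behind the new 1Gi limit (figures from the diff comments).
model_mib = 484      # Qwen3-0.6B-Q4_K_M weights
kv_cache_mib = 448   # estimated KV cache at n_ctx=8192
total_mib = model_mib + kv_cache_mib

print(total_mib)         # 932: already over the old 700Mi limit
print(total_mib < 1024)  # True: fits under the new 1Gi (1024 MiB) limit
```

With runtime overhead on top, 932 MiB leaves little headroom, which is consistent with keeping the request at 600Mi and raising only the limit on a 1 GB node.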

slm_server/config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -62,7 +62,7 @@ class Settings(BaseSettings):
         description="Owner label for /models list. Set SLM_MODEL_OWNER to override.",
     )
     n_ctx: int = Field(
-        4096, description="Maximum context window (input + generated tokens)."
+        8192, description="Maximum context window (input + generated tokens)."
     )
     n_threads: int = Field(
         2, description="Number of OpenMP threads llama‑cpp will spawn."
```
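Where does the ~448 MB KV-cache figure come from? A back-of-envelope sketch: the KV cache stores keys and values for every layer, KV head, and context position. Note the model geometry below (28 layers, 8 KV heads, head_dim 128) is an assumption about Qwen3-0.6B, not something stated in the diff, and 1 byte per element corresponds to a quantized (e.g. q8_0) KV cache; an f16 cache would be twice as large.

```python
def kv_cache_bytes(n_layers: int, n_kv_heads: int, head_dim: int,
                   n_ctx: int, bytes_per_elem: int) -> int:
    # Keys and values (factor 2) for every layer, KV head, head dim, and token.
    return 2 * n_layers * n_kv_heads * head_dim * n_ctx * bytes_per_elem

# Assumed Qwen3-0.6B geometry; 1 byte/elem models a q8_0-style KV cache.
size = kv_cache_bytes(n_layers=28, n_kv_heads=8, head_dim=128,
                      n_ctx=8192, bytes_per_elem=1)
print(size / 2**20)  # 448.0 MiB, matching the "~448 MB" comment
```

Doubling n_ctx from 4096 to 8192 doubles this term, which is why the context bump and the memory-limit bump land in the same commit.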

0 commit comments