Commit 2c2ddb5

feat: update example script for 70B model inference with disk KV cache support
- Changed model from Qwen2.5-0.5B to Qwen2.5-72B for enhanced performance.
- Implemented disk-based KV cache to prevent OOM issues on 8 GB VRAM (see the sketch below).
- Updated user prompt in the example to reflect a new question.
- Removed outdated comments and added new ones for clarity.
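The disk-based KV cache named in the second bullet can be illustrated with a minimal sketch. This says nothing about rabbitllm's internals: the `DiskKVCache` class, its method names, and the memmap layout below are assumptions made purely for illustration, not the library's API.

```python
# Conceptual sketch of a disk-backed KV cache: per-layer K/V tensors live in
# memory-mapped files on disk, so VRAM only holds the slice attention needs.
# All names and the tensor layout here are hypothetical.
import os

import numpy as np
import torch


class DiskKVCache:
    """Keeps per-layer K/V tensors in memory-mapped files instead of VRAM."""

    def __init__(self, cache_dir, num_layers, num_kv_heads, head_dim,
                 max_seq_len, dtype=np.float16):
        self.layers = []
        for i in range(num_layers):
            path = os.path.join(cache_dir, f"layer_{i:03d}.kv")
            # One file per layer: [2 (K and V), max_seq_len, kv_heads, head_dim]
            self.layers.append(np.memmap(
                path, dtype=dtype, mode="w+",
                shape=(2, max_seq_len, num_kv_heads, head_dim)))
        self.seq_len = 0  # tokens written so far

    def append(self, layer_idx, k, v):
        # k, v: [new_tokens, kv_heads, head_dim], typically on the GPU.
        n = k.shape[0]
        mm = self.layers[layer_idx]
        mm[0, self.seq_len:self.seq_len + n] = k.detach().to("cpu", torch.float16).numpy()
        mm[1, self.seq_len:self.seq_len + n] = v.detach().to("cpu", torch.float16).numpy()

    def advance(self, n):
        # Call once per decode step, after every layer has appended n tokens.
        self.seq_len += n

    def get(self, layer_idx, device):
        # Page the cached prefix back onto the GPU only when attention needs it.
        mm = self.layers[layer_idx]
        k = torch.from_numpy(np.array(mm[0, :self.seq_len])).to(device)
        v = torch.from_numpy(np.array(mm[1, :self.seq_len])).to(device)
        return k, v
```

The trade is bandwidth for capacity: each decode step pays a host/disk round trip per layer, which is one reason the diff below also caps max_seq_len at 512.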
1 parent 155642f commit 2c2ddb5

1 file changed

Lines changed: 13 additions & 12 deletions

File tree

example.py

@@ -1,32 +1,33 @@
 #!/usr/bin/env python3
 """
-RabbitLLM example — minimal inference script.
-
-Run: python example.py
-Or: uv run python example.py
-
-Uses a small model (Qwen2.5-0.5B) for fast testing. For larger models or long
-context, see scripts/quickstart.py and the Configuration section in README.
+70B+ inference without quantization, with a disk-based KV cache (avoids OOM on 8 GB VRAM).
 """
 
+import tempfile
 import warnings
 
 import torch
 from rabbitllm import AutoModel
 
 with warnings.catch_warnings():
     warnings.filterwarnings("ignore", message=".*CUDA.*unknown error.*", category=UserWarning)
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# Directory for the KV cache (on disk, not on the GPU)
+kv_cache_dir = tempfile.mkdtemp(prefix="rabbitllm_kv_")
+# For persistent use: kv_cache_dir = "./kv_cache"
 
 model = AutoModel.from_pretrained(
-    "Qwen/Qwen2.5-0.5B-Instruct",
+    "Qwen/Qwen2.5-72B-Instruct",
     device=device,
-    compression="4bit",
+    compression=None,           # no quantization, full precision
+    kv_cache_dir=kv_cache_dir,  # KV cache on disk → avoids OOM on 8 GB
+    max_seq_len=512,            # adjust if you need longer context
 )
 
 messages = [
     {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "What is 2 + 2? Answer briefly."},
+    {"role": "user", "content": "What is the capital of France? Answer in one sentence."},
 ]
 
 input_text = model.tokenizer.apply_chat_template(
@@ -53,4 +54,4 @@
 )
 
 input_len = tokens["input_ids"].shape[1]
-print(model.tokenizer.decode(output.sequences[0][input_len:], skip_special_tokens=True))
+print(model.tokenizer.decode(output.sequences[0][input_len:], skip_special_tokens=True))
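For a sense of scale, here is a back-of-the-envelope estimate of the cache this configuration produces. The layer/head/dim values are assumptions based on Qwen2.5-72B's published configuration (80 layers, grouped-query attention with 8 KV heads, head dimension 128); verify them against the model's config.json.

```python
# KV cache footprint estimate; the model shape values below are assumed,
# not read from the actual config.json.
layers, kv_heads, head_dim = 80, 8, 128
bytes_per_token = 2 * layers * kv_heads * head_dim * 2  # K and V, fp16 = 2 bytes
print(f"{bytes_per_token / 1024:.0f} KiB per token")             # 320 KiB
print(f"{bytes_per_token * 512 / 2**20:.0f} MiB at 512 tokens")  # 160 MiB
```

About 160 MiB at max_seq_len=512: trivial on disk, but a meaningful slice of an 8 GB card once weights and activations are also resident, which is exactly the OOM the commit message says it avoids.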
