-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_tutorial_pages.py
More file actions
309 lines (231 loc) · 6.97 KB
/
create_tutorial_pages.py
File metadata and controls
309 lines (231 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python3.11
"""Generate missing tutorial pages for llcuda v2.2.0 Kaggle notebooks."""
import os
from pathlib import Path

# Base directory
# Output tree is rooted next to this script: <script dir>/docs/tutorials.
DOCS_DIR = Path(__file__).parent / "docs"
TUTORIALS_DIR = DOCS_DIR / "tutorials"
# Create the output directory eagerly; exist_ok=True makes reruns idempotent.
TUTORIALS_DIR.mkdir(parents=True, exist_ok=True)
# Tutorial definitions based on llcuda v2.2.0 notebooks
# Each entry describes one generated page: display metadata (num/slug/title/
# description/time/level/prereq/vram) plus the full markdown body in
# 'content'.  create_tutorial() wraps 'content' with a header and footer and
# writes it to TUTORIALS_DIR as "<num>-<slug>.md".
TUTORIALS = [
    # Tutorial 01: minimal end-to-end quickstart on a single T4.
    {
        "num": "01",
        "slug": "quickstart",
        "title": "Quick Start",
        "description": "Get started with llcuda in 5 minutes",
        "time": "5 min",
        "level": "Beginner",
        "prereq": "None",
        "vram": "3-5 GB (single T4)",
        # NOTE: plain (non-f) triple-quoted string — braces below are literal.
        "content": """Get started with llcuda v2.2.0 in just 5 minutes on Kaggle dual T4 GPUs.
## Overview
This tutorial covers the essentials:
- Installing llcuda v2.2.0
- Downloading a GGUF model
- Starting the llama-server
- Making your first chat completion
- Cleaning up resources
## Prerequisites
- **Kaggle account** with GPU quota
- **Accelerator**: GPU T4 × 2
- **Internet**: Enabled for package installation
## Step 1: Install llcuda
```bash
pip install llcuda
```
On first import, llcuda will auto-download the 961 MB binary package containing llama.cpp build 7760 with FlashAttention.
## Step 2: Import and Initialize
```python
from llcuda.server import ServerManager, ServerConfig
# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
print(f"GPU 0: {torch.cuda.get_device_name(0)}")
if torch.cuda.device_count() > 1:
print(f"GPU 1: {torch.cuda.get_device_name(1)}")
```
## Step 3: Download a Model
```python
# Download Gemma 2-2B Q4_K_M from HuggingFace
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(
repo_id="unsloth/gemma-2-2b-it-GGUF",
filename="gemma-2-2b-it-Q4_K_M.gguf"
)
print(f"Model downloaded to: {model_path}")
```
## Step 4: Start the Server
```python
# Configure for single GPU
config = ServerConfig(
model_path=model_path,
n_gpu_layers=99, # Offload all layers to GPU
context_size=4096,
flash_attn=True
)
# Start server
server = ServerManager()
server.start_with_config(config)
print("Server started successfully!")
```
## Step 5: Make Your First Request
```python
from llcuda.api.client import LlamaCppClient
# Create client
client = LlamaCppClient(base_url="http://localhost:8080")
# Chat completion
response = client.create_chat_completion(
messages=[
{"role": "user", "content": "Explain quantum computing in simple terms"}
],
max_tokens=200
)
print(response["choices"][0]["message"]["content"])
print(f"Tokens/sec: {response['usage']['tokens_per_sec']:.1f}")
```
## Step 6: Cleanup
```python
# Stop server
server.stop()
print("Server stopped")
```
## Expected Performance
On Kaggle dual T4 with Gemma 2-2B Q4_K_M:
- **Speed**: ~60 tokens/sec
- **Latency**: ~500ms
- **VRAM**: ~3-4 GB
## Next Steps
- [02 - Server Setup](02-server-setup.md) - Deep dive into server configuration
- [03 - Multi-GPU Inference](03-multi-gpu.md) - Use both T4 GPUs
- [API Reference](../api/server.md) - Complete ServerManager API
## Open in Kaggle
[](https://www.kaggle.com/code/waqasm86/01-quickstart-llcuda-v2-2-0)
""",
    },
    # Tutorial 02: deeper dive into server configuration and lifecycle.
    {
        "num": "02",
        "slug": "server-setup",
        "title": "Server Setup",
        "description": "Deep dive into llama-server configuration",
        "time": "15 min",
        "level": "Beginner",
        "prereq": "Complete notebook 01",
        "vram": "5-8 GB (single T4)",
        "content": """Complete guide to configuring and managing the llama-server lifecycle.
## Overview
Learn about:
- ServerConfig parameter reference
- Server lifecycle (start → ready → stop)
- Health checking and monitoring
- Log access and debugging
- Multiple server configurations
## ServerConfig Parameters
```python
from llcuda.server import ServerConfig
config = ServerConfig(
model_path="model.gguf", # Required: Path to GGUF model
n_gpu_layers=99, # GPU layers (99 = all)
context_size=4096, # Context window
n_batch=2048, # Batch size
flash_attn=True, # Enable FlashAttention
tensor_split=None, # Single GPU (default)
host="127.0.0.1", # Server host
port=8080, # Server port
)
```
## Server Lifecycle
```python
from llcuda.server import ServerManager
server = ServerManager()
# Start server
server.start_with_config(config)
# Check if running
if server.is_running():
print("Server is running")
# Get server URL
print(f"Server URL: {server.get_base_url()}")
# Wait for ready
server.wait_until_ready(timeout=30)
# Get logs
logs = server.get_logs()
print(logs)
# Stop server
server.stop()
```
## Health Checking
```python
import requests
# Check server health
response = requests.get("http://localhost:8080/health")
print(response.json())
# Output:
# {"status": "ok", "slots_idle": 1, "slots_processing": 0}
```
## Advanced Configuration
### Multi-GPU Setup
```python
config = ServerConfig(
model_path="model.gguf",
tensor_split="0.5,0.5", # Split 50/50 across 2 GPUs
split_mode="layer", # Layer-wise splitting
n_gpu_layers=99,
flash_attn=True
)
```
### Memory Optimization
```python
config = ServerConfig(
model_path="model.gguf",
context_size=2048, # Smaller context
n_batch=512, # Smaller batch
n_gpu_layers=99,
flash_attn=True
)
```
## Debugging
```python
# Enable verbose logging
server.start_with_config(config, verbose=True)
# Access logs
logs = server.get_logs()
for line in logs.split('\\n')[-20:]: # Last 20 lines
print(line)
```
## Next Steps
- [03 - Multi-GPU Inference](03-multi-gpu.md) - Use dual T4 GPUs
- [API Reference](../api/server.md) - Complete ServerManager API
- [Troubleshooting](../guides/troubleshooting.md) - Common issues
## Open in Kaggle
[](https://www.kaggle.com/code/waqasm86/02-llama-server-setup-llcuda-v2-2-0)
""",
    },
]
def create_tutorial(tutorial):
    """Create a single tutorial markdown file under TUTORIALS_DIR.

    Args:
        tutorial: dict with keys 'num', 'slug', 'title', 'description',
            'level', 'time', 'prereq', 'vram' and 'content' (the markdown
            body of the page).

    Side effects:
        Writes "<num>-<slug>.md" into TUTORIALS_DIR and prints a
        confirmation line.
    """
    filename = f"{tutorial['num']}-{tutorial['slug']}.md"
    filepath = TUTORIALS_DIR / filename
    # Assemble the page: metadata header, horizontal rules around the
    # tutorial body, then a shared footer.
    content = f"""# {tutorial['title']}
{tutorial['description']}
**Level**: {tutorial['level']}
**Time**: {tutorial['time']}
**Prerequisites**: {tutorial['prereq']}
**VRAM Required**: {tutorial['vram']}
---
{tutorial['content']}
---
**Questions?** [Open an issue on GitHub](https://github.com/llcuda/llcuda/issues)
"""
    # Explicit utf-8: the content contains non-ASCII characters (✓, ×, →)
    # and the platform's default locale encoding may not accept them.
    filepath.write_text(content, encoding="utf-8")
    # Fix: report the actual generated file name (the original printed a
    # literal "(unknown)" placeholder instead of the filename).
    print(f"✓ Created {filename}")
def main():
    """Generate every tutorial page declared in TUTORIALS."""
    print("Generating llcuda v2.2.0 tutorial pages...")
    print()
    # Emit each page in declaration order; create_tutorial prints per-file
    # confirmation lines as it goes.
    for tutorial_spec in TUTORIALS:
        create_tutorial(tutorial_spec)
    print()
    print(f"✓ Generated {len(TUTORIALS)} tutorial pages in {TUTORIALS_DIR}")


if __name__ == "__main__":
    main()