forked from ShaerWare/AI_Secretary_System
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstart_vllm.sh
More file actions
executable file
·29 lines (23 loc) · 888 Bytes
/
start_vllm.sh
File metadata and controls
executable file
·29 lines (23 loc) · 888 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# Launch vLLM on GPU 1 (RTX 3060, 12GB) serving Llama-3.1-8B GPTQ INT4.
#
# IMPORTANT: CUDA_DEVICE_ORDER=PCI_BUS_ID so GPU indices follow PCI bus
# order (otherwise CUDA orders devices by compute capability and index 1
# may not be the RTX 3060).
set -euo pipefail

export CUDA_DEVICE_ORDER=PCI_BUS_ID
export CUDA_VISIBLE_DEVICES=1

echo "=========================================="
echo " vLLM Server - Llama-3.1-8B GPTQ"
echo " GPU: RTX 3060 (12GB)"
echo " Port: 11434"
echo "=========================================="

# Activate the virtualenv; fail fast with a clear message if it is missing
# (previously an absent venv was ignored and 'vllm' ran from the wrong env).
readonly VENV_ACTIVATE="${HOME}/vllm_env/venv/bin/activate"
if [[ ! -f "$VENV_ACTIVATE" ]]; then
  echo "ERROR: virtualenv not found at $VENV_ACTIVATE" >&2
  exit 1
fi
# Some activate scripts reference unset vars (e.g. PS1); relax -u around it.
set +u
# shellcheck disable=SC1090
source "$VENV_ACTIVATE"
set -u

# GPU sanity check. Deliberately non-fatal: stderr is suppressed and failure
# only prints a warning — vllm itself will report a proper error if no GPU.
python -c "import torch; print(f'GPU: {torch.cuda.get_device_name(0)}')" 2>/dev/null || echo "GPU check failed"

vllm serve "fbaldassarri/meta-llama_Llama-3.1-8B-Instruct-auto_gptq-int4-gs128-sym" \
  --gpu-memory-utilization 0.70 \
  --max-model-len 4096 \
  --quantization gptq \
  --dtype float16 \
  --max-num-seqs 32 \
  --port 11434 \
  --enforce-eager \
  --trust-remote-code