#!/usr/bin/env python3
r"""
AMD AI Toolkit - HIP GPU Example

Working configuration for AMD RX 6700 XT (gfx1031).

Usage:
    .\run_hip_model.ps1 hip_example.py

Requirements:
    - AMD HIP SDK 5.7 installed
    - llama-cpp-python built with HIP support
    - A GGUF model in the models/ directory
"""
import os
import sys
import time
# Switch stdout to line buffering so every completed line is written out
# immediately instead of sitting in a block buffer.
sys.stdout.reconfigure(line_buffering=True)

# Directory containing this script; the models/ folder is resolved against it
# rather than against the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Pick a GGUF model from the models/ directory next to the script.
MODELS_DIR = os.path.join(SCRIPT_DIR, "models")
MODEL_PATH = None
# isdir() rather than exists(): if "models" is a regular file, os.listdir()
# would raise instead of falling through to the help message below.
if os.path.isdir(MODELS_DIR):
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # choice reproducible when several models are present.
    for entry in sorted(os.listdir(MODELS_DIR)):
        candidate = os.path.join(MODELS_DIR, entry)
        # Compare the extension case-insensitively (e.g. "MODEL.GGUF") and
        # skip anything that is not a regular file.
        if entry.lower().endswith(".gguf") and os.path.isfile(candidate):
            MODEL_PATH = candidate
            break

# Without a model there is nothing to run: explain how to get one and exit
# with a non-zero status.
if MODEL_PATH is None:
    print("No GGUF model found in models/ directory")
    print("Download a model and place it in:", MODELS_DIR)
    print()
    print("Example:")
    print(" wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
    sys.exit(1)
# Startup banner.
print("=" * 60, flush=True)
print(" AMD AI Toolkit - HIP GPU Example", flush=True)
print(" AMD Radeon RX 6700 XT (gfx1031)", flush=True)
print("=" * 60, flush=True)
print(flush=True)

# Imported at this point (not at the top of the file), so the banner above is
# printed before llama_cpp is loaded. The original ordering is preserved.
from llama_cpp import Llama

print(f"Model: {os.path.basename(MODEL_PATH)}", flush=True)
print("Loading with full GPU offload...", flush=True)

# Time the load with perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump if the system time
# is adjusted while the model is loading.
start = time.perf_counter()
model = Llama(
    model_path=MODEL_PATH,
    # -1 corresponds to the "full GPU offload" announced above
    # (presumably "all layers" - confirm against the llama-cpp-python docs).
    n_gpu_layers=-1,
    n_ctx=512,
    main_gpu=0,
    # Single-entry split: presumably places the whole model on one device
    # (main_gpu) - TODO confirm.
    tensor_split=[1.0],
    verbose=True,
)
load_time = time.perf_counter() - start

print(flush=True)
print(f"Model loaded in {load_time:.2f}s", flush=True)
print(flush=True)
# Run one short completion and report its throughput.
prompt = "The AMD Radeon RX 6700 XT graphics card is"
print(f"Prompt: {prompt}", flush=True)
print(flush=True)
print("Generating...", flush=True)

# perf_counter() is the monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and may be coarse or jump.
start = time.perf_counter()
output = model(prompt, max_tokens=50, temperature=0.7)
gen_time = time.perf_counter() - start

# The call returns a dict: generated text under choices[0]["text"], and the
# number of generated tokens under usage["completion_tokens"].
response = output["choices"][0]["text"]
tokens = output["usage"]["completion_tokens"]
# Guard the division so a zero measured interval cannot raise
# ZeroDivisionError and abort the summary.
tokens_per_sec = tokens / gen_time if gen_time > 0 else 0.0

print(flush=True)
print(f"Response: {response}", flush=True)
print(flush=True)
print("=" * 60, flush=True)
print(f" Generated {tokens} tokens in {gen_time:.2f}s", flush=True)
print(f" Speed: {tokens_per_sec:.2f} tokens/sec", flush=True)
print("=" * 60, flush=True)
print(flush=True)
# NOTE(review): this line is printed unconditionally once generation returns;
# nothing in this script checks that layers were actually offloaded to the GPU.
print("HIP GPU acceleration is WORKING!", flush=True)