Skip to content

Commit e101280

Browse files
authored
Merge pull request #13 from GetSoloTech/vLLM
V llm
2 parents 2bc2cba + b9fc052 commit e101280

15 files changed

Lines changed: 1029 additions & 253 deletions

File tree

setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
"GPUtil",
2020
"psutil",
2121
"requests",
22-
"tabulate",
22+
"rich",
23+
"huggingface_hub",
24+
"llama-cpp-python",
25+
"pydantic",
2326
],
2427
extras_require={
2528
"dev": ["pytest", "black", "isort"],

solo_server/cli.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
import typer
from solo_server.commands import run, stop, status, benchmark, download_hf as download
from solo_server.main import setup

# Top-level Typer application that exposes every solo-server CLI command.
app = typer.Typer()

# Commands
# Each sub-command lives in its own module; registering via app.command()(fn)
# keeps the command name equal to the function name.
app.command()(run.run)
app.command()(stop.stop)
app.command()(status.status)
app.command()(download.download)
app.command()(benchmark.benchmark)
app.command()(setup)

if __name__ == "__main__":
    app()

solo_server/commands/benchmark.py

Lines changed: 146 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,152 @@
1+
import time
2+
from typing import List, Tuple
3+
from datetime import datetime
4+
from llama_cpp import Llama
5+
from pydantic import BaseModel, Field, field_validator
6+
from rich.console import Console
7+
from rich.panel import Panel
8+
from rich.progress import track
19
import typer
2-
import subprocess
10+
import requests
311
import json
412

5-
def benchmark(model: str, output_format: str = "md"):
6-
"""
7-
Runs Llama-Bench with specified parameters.
8-
"""
9-
typer.echo(f"⏳ Running Llama-Bench for model: {model}")
13+
console = Console()
1014

11-
command = [
12-
"./llama-bench",
13-
"-m", model,
14-
"-o", output_format
15-
]
15+
class Message(BaseModel):
    """A single chat message attached to a benchmark response."""
    # Speaker role, e.g. "assistant" for model output or "system" for summaries.
    role: str
    # The message text (model completion or an averaged-stats summary line).
    content: str
1618

19+
class LlamaResponse(BaseModel):
    """Normalized result of one benchmark run, independent of the backend used."""
    # Model name (ollama/vllm) or model file path (llama.cpp).
    model: str
    # Timestamp when this result object was built.
    created_at: datetime
    # Generated content wrapped as a chat message.
    message: Message
    # Completion flag; set to True for finished runs.
    done: bool
    # load_duration + eval_duration, in seconds.
    total_duration: float
    # Model load time in seconds; stays 0.0 for server backends.
    load_duration: float = 0.0
    # Approximate token count (whitespace-split words of the output).
    eval_count: int
    # Generation time, in seconds.
    eval_duration: float
28+
29+
def load_model(model_path: str) -> Tuple[Llama, float]:
    """Load a llama.cpp model from disk and return it with the load time in seconds."""
    console.print(Panel.fit(f"[cyan]Loading model: {model_path}[/]", title="[bold magenta]Solo Server[/]"))
    t0 = time.time()
    llm = Llama(model_path=model_path)
    return llm, time.time() - t0
35+
36+
def api_response(model: str, prompt: str, url: str, server_type: str = None) -> dict:
    """POST a completion request to a local model server and return the parsed JSON.

    Args:
        model: Model name to request.
        prompt: Prompt text to complete.
        url: Full endpoint URL (ollama /api/generate or vllm /v1/completions).
        server_type: "ollama" enables ollama-specific payload tweaks.

    Returns:
        The server's JSON response as a dict, guaranteed to contain an
        "eval_duration" key, or {"error": <message>} if the request failed.
    """
    payload = {
        "model": model,
        "prompt": prompt,
    }

    if server_type == "ollama":
        payload["model"] = model.lower()
        payload["stream"] = False
    headers = {"Content-Type": "application/json"}
    start_time = time.time()
    try:
        response = requests.post(url, data=json.dumps(payload), headers=headers)
        response.raise_for_status()
        data = response.json()
        # Add eval_duration if not present, measured as wall-clock request time.
        if "eval_duration" not in data:
            elapsed = time.time() - start_time
            # BUGFIX: ollama reports durations in nanoseconds and callers convert
            # them with * 1e-9, so the fallback must use the same unit; other
            # backends are read in plain seconds.
            data["eval_duration"] = elapsed * 1e9 if server_type == "ollama" else elapsed
        return data
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
57+
58+
def run_benchmark(server_type: str, model: object, model_name: str, prompt: str, load_duration: float) -> LlamaResponse:
    """Run one prompt against the chosen backend and normalize the result.

    Args:
        server_type: "llama.cpp", "ollama", or "vllm".
        model: Loaded Llama instance for llama.cpp; ignored for server backends.
        model_name: Model identifier passed to server backends.
        prompt: Prompt text to benchmark.
        load_duration: Model load time (seconds) for llama.cpp runs.

    Returns:
        A LlamaResponse with content and timing stats; on an HTTP failure the
        content is empty and durations are 0.0 instead of raising.
    """
    content = ""
    eval_duration = 0.0
    if server_type == "llama.cpp":
        start_time = time.time()
        response = model(prompt, stop=["\n"], echo=False)
        eval_duration = time.time() - start_time
        content = response["choices"][0]["text"]
    else:
        url = "http://localhost:11434/api/generate" if server_type == "ollama" else "http://localhost:8000/v1/completions"
        response = api_response(model_name, prompt, url, server_type)

        if "error" in response:
            # BUGFIX: a failed request returns {"error": ...}; previously the
            # code fell through and crashed with KeyError on response["choices"].
            console.print(f"[red]Request failed: {response['error']}[/]")
        elif server_type == "vllm":
            if "choices" in response and "message" in response["choices"][0]:
                content = response["choices"][0]["message"]["content"]
            else:
                content = response["choices"][0]["text"]
            eval_duration = response.get("eval_duration", 0.0)
        else:
            content = response.get("response", "")
            load_duration = response.get("load_duration", 0.0) * 1e-9  # Convert nanoseconds to seconds
            eval_duration = response.get("eval_duration", 0.0) * 1e-9  # Convert nanoseconds to seconds

    message = Message(role="assistant", content=content)

    return LlamaResponse(
        model=model_name,
        created_at=datetime.now(),
        message=message,
        done=True,
        load_duration=load_duration,
        total_duration=load_duration + eval_duration,
        eval_count=len(content.split()),
        eval_duration=eval_duration,
    )
92+
93+
def inference_stats(model_response: LlamaResponse):
    """Pretty-print throughput and timing statistics for a single benchmark run."""
    def _rate(tokens: int, seconds: float) -> float:
        # Guard against division by zero when a duration was not measured.
        return tokens / seconds if seconds else 0.0

    response_ts = _rate(model_response.eval_count, model_response.eval_duration)
    total_ts = _rate(model_response.eval_count, model_response.total_duration)

    body = (
        f"[bold magenta]{model_response.model}[/]\n"
        f"[green]Response:[/] {response_ts:.2f} tokens/s\n"
        f"[blue]Total:[/] {total_ts:.2f} tokens/s\n\n"
        f"[yellow]Stats:[/]\n"
        f" - Response tokens: {model_response.eval_count}\n"
        f" - Model load time: {model_response.load_duration:.2f}s\n"
        f" - Response time: {model_response.eval_duration:.2f}s\n"
        f" - Total time: {model_response.total_duration:.2f}s"
    )
    console.print(Panel.fit(body, title="[bold cyan]Benchmark Results[/]"))
111+
112+
def average_stats(responses: List[LlamaResponse]):
    """Aggregate several run results into one averaged LlamaResponse and print it."""
    if not responses:
        console.print("[red]No stats to average.[/]")
        return

    count = len(responses)
    summary = Message(role="system", content=f"Average stats across {len(responses)} runs")
    averaged = LlamaResponse(
        model=responses[0].model,
        created_at=datetime.now(),
        message=summary,
        done=True,
        total_duration=sum(r.total_duration for r in responses) / count,
        load_duration=sum(r.load_duration for r in responses) / count,
        # Integer division keeps eval_count an int, matching the model field.
        eval_count=sum(r.eval_count for r in responses) // count,
        eval_duration=sum(r.eval_duration for r in responses) / count,
    )
    inference_stats(averaged)
128+
129+
def benchmark(
    server_type: str = typer.Option(None, "-s", help="Type of server (e.g., ollama, vllm, llama.cpp)."),
    model_name: str = typer.Option(None, "-m", help="Name of the model."),
    prompts: List[str] = typer.Option(["Why is the sky blue?", "Write a report on the financials of Apple Inc.",
                                       "Tell me about San Francisco"], "-p", help="List of prompts to use for benchmarking."),
):
    """Benchmark a model across several prompts and print per-run and average stats."""
    valid_servers = ("ollama", "vllm", "llama.cpp")
    if not server_type:
        server_type = typer.prompt("Enter server type (ollama, vllm, llama.cpp)")
    # BUGFIX: previously an unrecognized or mis-cased server type (e.g. "Ollama")
    # silently fell through to the vllm endpoint; normalize and validate instead.
    server_type = server_type.strip().lower()
    if server_type not in valid_servers:
        console.print(f"[red]Unknown server type: {server_type}. Expected one of: {', '.join(valid_servers)}[/]")
        raise typer.Exit(code=1)
    if not model_name:
        model_name = typer.prompt("Enter model name")

    console.print(f"\n[bold cyan]Starting Solo Server Benchmark for {server_type} with model {model_name}...[/]")

    model = None
    load_duration = 0.0
    if server_type == "llama.cpp":
        # Only llama.cpp loads the model in-process; servers load it themselves.
        model, load_duration = load_model(model_name)
    responses: List[LlamaResponse] = []
    for prompt in track(prompts, description="[cyan]Running benchmarks..."):
        response = run_benchmark(server_type, model, model_name, prompt, load_duration)
        responses.append(response)
        inference_stats(response)

    average_stats(responses)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import typer
2+
from huggingface_hub import snapshot_download
3+
from rich.console import Console
4+
import os
5+
import json
6+
from solo_server.config import CONFIG_PATH
7+
import subprocess
8+
9+
console = Console()
10+
11+
def download(model: str) -> None:
    """
    Downloads a Hugging Face model.

    Args:
        model: Hugging Face repo id, e.g. "org/model-name".

    Raises:
        typer.Exit: with code 1 when the download fails, so shell scripts can
        detect the failure (previously errors were printed but the command
        still exited 0).
    """
    console.print(f"🚀 Downloading model: [bold]{model}[/bold]...")
    try:
        model_path = snapshot_download(repo_id=model)
        console.print(f"✅ Model downloaded successfully: [bold]{model_path}[/bold]")
    except KeyboardInterrupt:
        # Handle the user's Ctrl-C before the generic handler for clarity.
        console.print("❌ Download cancelled by user.", style="bold red")
        raise typer.Exit(code=1)
    except Exception as e:
        console.print(f"❌ Failed to download model: {e}", style="bold red")
        raise typer.Exit(code=1)

solo_server/commands/status.py

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,50 @@
11
import typer
22
import subprocess
33
from solo_server.utils.hardware import display_hardware_info
4-
from tabulate import tabulate
4+
from rich.console import Console
5+
from rich.table import Table
56
import json
67

7-
app = typer.Typer()
8+
console = Console()
89

9-
@app.command()
1010
def status():
1111
"""Check running models and system status."""
1212
display_hardware_info(typer)
1313

14-
# Check for running solo container
15-
container_result = subprocess.run(["docker", "ps", "-f", "name=solo", "--format", "{{json .}}"],
14+
# First check if docker is running
15+
try:
16+
subprocess.run(["docker", "ps"], capture_output=True, check=True)
17+
except subprocess.CalledProcessError:
18+
typer.echo("\n❌ Solo server not running. Please start solo-server first.")
19+
return
20+
21+
# Check for running solo containers
22+
container_result = subprocess.run(["docker", "ps", "-f", "name=solo*", "--format", "{{json .}}"],
1623
capture_output=True, text=True, check=True)
1724

18-
if container_result.stdout.strip():
19-
# Container is running, show available models
20-
typer.echo("\n🔍 Available Models:")
21-
models_result = subprocess.run(["docker", "exec", "solo", "ollama", "list"],
22-
capture_output=True, text=True, check=True)
23-
models = []
24-
for line in models_result.stdout.strip().split('\n'):
25-
parts = line.split()
26-
if len(parts) >= 7:
27-
size = f"{parts[2]} {parts[3]}"
28-
modified = f"{parts[4]} {parts[5]} {parts[6]}"
29-
models.append([parts[0], parts[1], size, modified])
25+
# if container_result.stdout.strip():
26+
# # Container is running, show available models
27+
# typer.echo("\n🔍 Available Models:")
28+
# models_result = subprocess.run(["docker", "exec", "solo-ollama", "ollama", "list"],
29+
# capture_output=True, text=True, check=True)
30+
# models = []
31+
# for line in models_result.stdout.strip().split('\n'):
32+
# parts = line.split()
33+
# if len(parts) >= 7:
34+
# size = f"{parts[2]} {parts[3]}"
35+
# modified = f"{parts[4]} {parts[5]} {parts[6]}"
36+
# models.append([parts[0], parts[1], size, modified])
3037

31-
if models:
32-
print(tabulate(models, headers=['NAME', 'ID', 'SIZE', 'MODIFIED'], tablefmt='grid'))
38+
# if models:
39+
# table = Table(title="Available Models")
40+
# table.add_column("NAME", justify="left")
41+
# table.add_column("ID", justify="left")
42+
# table.add_column("SIZE", justify="left")
43+
# table.add_column("MODIFIED", justify="left")
44+
45+
# for model in models:
46+
# table.add_row(*model)
47+
# console.print(table)
3348

3449
# Show running containers section (will be empty if none running)
3550
typer.echo("\n🔍 Running Containers:")
@@ -43,4 +58,11 @@ def status():
4358
container['Ports']
4459
])
4560

46-
print(tabulate(containers, headers=['NAME', 'STATUS', 'PORTS'], tablefmt='grid'))
61+
if containers:
62+
table = Table(title="Running Containers")
63+
table.add_column("NAME", justify="left")
64+
table.add_column("STATUS", justify="left")
65+
table.add_column("PORTS", justify="left")
66+
for container in containers:
67+
table.add_row(*container)
68+
console.print(table)

solo_server/commands/stop.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,18 @@ def stop(name: str = ""):
55
"""
66
Stops the Ollama Docker container and any running models.
77
"""
8-
typer.echo("🛑 Stopping Solo Server...")
8+
9+
# Check if docker is running
10+
try:
11+
subprocess.run(["docker", "info"],
12+
check=True,
13+
stdout=subprocess.PIPE,
14+
stderr=subprocess.PIPE)
15+
except subprocess.CalledProcessError:
16+
typer.echo("\n✅ Solo server is already stopped (Docker is not running)\n")
17+
return
18+
19+
typer.echo("Stopping Solo Server...")
920

1021
try:
1122
# Stop the Docker container

solo_server/config/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import os
2+
3+
import os

# Per-user configuration location for solo-server.
CONFIG_DIR = os.path.expanduser('~/.solo_server')
CONFIG_PATH = os.path.join(CONFIG_DIR, 'config.json')

# exist_ok avoids the check-then-create race (LBYL) of the previous
# `if not os.path.exists(...)` guard when two processes import concurrently.
os.makedirs(CONFIG_DIR, exist_ok=True)

0 commit comments

Comments
 (0)