|
| 1 | +import time |
| 2 | +from typing import List, Tuple |
| 3 | +from datetime import datetime |
| 4 | +from llama_cpp import Llama |
| 5 | +from pydantic import BaseModel, Field, field_validator |
| 6 | +from rich.console import Console |
| 7 | +from rich.panel import Panel |
| 8 | +from rich.progress import track |
1 | 9 | import typer |
2 | | -import subprocess |
| 10 | +import requests |
3 | 11 | import json |
4 | 12 |
|
5 | | -def benchmark(model: str, output_format: str = "md"): |
6 | | - """ |
7 | | - Runs Llama-Bench with specified parameters. |
8 | | - """ |
9 | | - typer.echo(f"⏳ Running Llama-Bench for model: {model}") |
| 13 | +console = Console() |
10 | 14 |
|
11 | | - command = [ |
12 | | - "./llama-bench", |
13 | | - "-m", model, |
14 | | - "-o", output_format |
15 | | - ] |
class Message(BaseModel):
    """A single chat message exchanged with the model."""

    # Speaker of the message, e.g. "assistant" or "system".
    role: str
    # Raw text content of the message.
    content: str
16 | 18 |
|
class LlamaResponse(BaseModel):
    """Normalized benchmark result for one model invocation."""

    # Name of the model that produced the response.
    model: str
    # Timestamp at which this result object was created.
    created_at: datetime
    # The generated message (role + content).
    message: Message
    # Whether generation completed.
    done: bool
    # Total wall-clock time in seconds (load + evaluation).
    total_duration: float
    # Model load time in seconds; defaults to 0.0 for remote servers.
    load_duration: float = 0.0
    # NOTE(review): populated upstream via a whitespace word count, not true tokens.
    eval_count: int
    # Generation time in seconds.
    eval_duration: float
| 28 | + |
def load_model(model_path: str) -> Tuple[Llama, float]:
    """Load a llama.cpp model from disk and measure how long it takes.

    Args:
        model_path: Filesystem path to the model file.

    Returns:
        A tuple of (loaded ``Llama`` instance, load duration in seconds).
    """
    console.print(Panel.fit(f"[cyan]Loading model: {model_path}[/]", title="[bold magenta]Solo Server[/]"))
    # perf_counter is monotonic; time.time() can jump (e.g. NTP sync)
    # and corrupt the measured duration.
    start_time = time.perf_counter()
    model = Llama(model_path=model_path)
    load_duration = time.perf_counter() - start_time
    return model, load_duration
| 35 | + |
def api_response(model: str, prompt: str, url: str, server_type: str = None, timeout: float = 120.0) -> dict:
    """Send a single completion request to a local inference server.

    Args:
        model: Model name to request.
        prompt: Prompt text to complete.
        url: Full endpoint URL of the server.
        server_type: Server flavor; "ollama" triggers ollama-specific payload tweaks.
        timeout: Request timeout in seconds. Fixes the original hang risk:
            ``requests`` has no default timeout, so a stalled server would
            block the benchmark forever.

    Returns:
        The decoded JSON response, guaranteed to contain "eval_duration"
        (wall-clock seconds as a fallback), or ``{"error": ...}`` on failure.
    """
    payload = {
        "model": model,
        "prompt": prompt,
    }

    if server_type == "ollama":
        # Ollama expects lowercase model names; disable streaming so one
        # response object carries the full completion and timing stats.
        payload["model"] = model.lower()
        payload["stream"] = False
    headers = {"Content-Type": "application/json"}
    # Monotonic clock: immune to system clock adjustments mid-request.
    start_time = time.perf_counter()
    try:
        # json= serializes the payload and is equivalent to the original
        # data=json.dumps(payload) with the explicit JSON content type.
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        # Add eval_duration if not present
        if "eval_duration" not in data:
            data["eval_duration"] = time.perf_counter() - start_time
        return data
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
| 57 | + |
def run_benchmark(server_type: str, model: object, model_name: str, prompt: str, load_duration: float) -> LlamaResponse:
    """Run one prompt against the chosen backend and normalize the result.

    Args:
        server_type: One of "llama.cpp", "ollama", or "vllm".
        model: Loaded ``Llama`` instance (llama.cpp only; unused otherwise).
        model_name: Model identifier to report / send to the server.
        prompt: Prompt text to benchmark.
        load_duration: Model load time in seconds (llama.cpp path;
            overwritten from the server response on the ollama path).

    Returns:
        A ``LlamaResponse`` with all timings in seconds.
    """
    content = ""
    # Default so an error response still produces a well-formed result.
    eval_duration = 0.0
    if server_type == "llama.cpp":
        start_time = time.time()
        response = model(prompt, stop=["\n"], echo=False)
        eval_duration = time.time() - start_time
        content = response["choices"][0]["text"]
    else:
        url = "http://localhost:11434/api/generate" if server_type == "ollama" else "http://localhost:8000/v1/completions"
        response = api_response(model_name, prompt, url, server_type)

        if server_type == "vllm":
            # Guard against error responses ({"error": ...}) which carry no
            # "choices" key — the original code raised KeyError here.
            choices = response.get("choices")
            if choices:
                first = choices[0]
                # Chat-style responses nest text under "message"; plain
                # completion responses expose it as "text".
                if "message" in first:
                    content = first["message"]["content"]
                else:
                    content = first.get("text", "")
            eval_duration = response.get("eval_duration", 0.0)
        else:
            content = response.get("response", "")
            load_duration = response.get("load_duration", 0.0) * 1e-9  # Convert nanoseconds to seconds
            eval_duration = response.get("eval_duration", 0.0) * 1e-9  # Convert nanoseconds to seconds

    message = Message(role="assistant", content=content)

    return LlamaResponse(
        model=model_name,
        created_at=datetime.now(),
        message=message,
        done=True,
        load_duration=load_duration,
        total_duration=load_duration + eval_duration,
        # NOTE(review): whitespace word count used as a token-count proxy.
        eval_count=len(content.split()),
        eval_duration=eval_duration,
    )
| 92 | + |
def inference_stats(model_response: LlamaResponse):
    """Render throughput and timing stats for one benchmark run."""

    def _rate(tokens: int, seconds: float) -> float:
        # Avoid ZeroDivisionError for runs that recorded no duration.
        return tokens / seconds if seconds else 0.0

    response_ts = _rate(model_response.eval_count, model_response.eval_duration)
    total_ts = _rate(model_response.eval_count, model_response.total_duration)

    report_lines = [
        f"[bold magenta]{model_response.model}[/]",
        f"[green]Response:[/] {response_ts:.2f} tokens/s",
        f"[blue]Total:[/] {total_ts:.2f} tokens/s",
        "",
        "[yellow]Stats:[/]",
        f" - Response tokens: {model_response.eval_count}",
        f" - Model load time: {model_response.load_duration:.2f}s",
        f" - Response time: {model_response.eval_duration:.2f}s",
        f" - Total time: {model_response.total_duration:.2f}s",
    ]
    console.print(Panel.fit("\n".join(report_lines), title="[bold cyan]Benchmark Results[/]"))
| 111 | + |
def average_stats(responses: List[LlamaResponse]):
    """Summarize a set of runs as one synthetic averaged result and print it."""
    count = len(responses)
    if count == 0:
        console.print("[red]No stats to average.[/]")
        return

    def _mean(values: List[float]) -> float:
        return sum(values) / count

    summary = LlamaResponse(
        model=responses[0].model,
        created_at=datetime.now(),
        message=Message(role="system", content=f"Average stats across {count} runs"),
        done=True,
        total_duration=_mean([r.total_duration for r in responses]),
        load_duration=_mean([r.load_duration for r in responses]),
        # Integer division keeps the token count a whole number.
        eval_count=sum(r.eval_count for r in responses) // count,
        eval_duration=_mean([r.eval_duration for r in responses]),
    )
    inference_stats(summary)
| 128 | + |
def benchmark(
    server_type: str = typer.Option(None, "-s", help="Type of server (e.g., ollama, vllm, llama.cpp)."),
    model_name: str = typer.Option(None, "-m", help="Name of the model."),
    prompts: List[str] = typer.Option(["Why is the sky blue?", "Write a report on the financials of Apple Inc.",
                                       "Tell me about San Francisco"], "-p", help="List of prompts to use for benchmarking."),
):
    """Benchmark a local inference server across a list of prompts."""
    # Fall back to interactive prompts for anything not given on the CLI.
    server_type = server_type or typer.prompt("Enter server type (ollama, vllm, llama.cpp)")
    model_name = model_name or typer.prompt("Enter model name")

    console.print(f"\n[bold cyan]Starting Solo Server Benchmark for {server_type} with model {model_name}...[/]")

    # Only llama.cpp runs in-process and needs an up-front model load;
    # remote servers report their own load time per response.
    model, load_duration = None, 0.0
    if server_type == "llama.cpp":
        model, load_duration = load_model(model_name)

    results: List[LlamaResponse] = []
    for current_prompt in track(prompts, description="[cyan]Running benchmarks..."):
        run = run_benchmark(server_type, model, model_name, current_prompt, load_duration)
        results.append(run)
        inference_stats(run)

    average_stats(results)
0 commit comments