# Source: Orpheus-FastAPI/runpod.toml at main (DevNexsler/Orpheus-FastAPI on GitHub)
# RunPod project configuration for the Orpheus-FastAPI TTS server.

# Project name. This is a root-level key, so it must stay above the first
# table header.
name = "orpheus-fastapi-tts"

[project]
# Placeholder identifier, not a real UUID; consider generating a unique one
# if needed.
uuid = "orpheus-fastapi-tts-uuid"

# No base image is set here; it is meant to be defined in the Dockerfile.
# Example of what it could look like:
# base_image = "runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel"

# GPU types listed for this project, and how many GPUs are requested.
gpu_types = [
    "NVIDIA RTX A4000",
    "NVIDIA RTX A4500",
    "NVIDIA RTX A5000",
    "NVIDIA GeForce RTX 3090",
    "NVIDIA GeForce RTX 4090",
    "NVIDIA RTX A6000",
    "NVIDIA A100 80GB PCIe",
]
gpu_count = 1

# Optional: only relevant when a persistent volume is needed.
volume_mount_path = "/runpod-volume"

# Exposed ports, in order:
#   5005/http - Orpheus TTS
#   8000/http - RunPod health/API (default)
#   22/tcp    - SSH
ports = "5005/http, 8000/http, 22/tcp"

# In GB; adjust based on model size and dependencies.
container_disk_size_gb = 30

[project.env_vars]
# Environment variables for the project. Values are kept as strings on
# purpose: environment variables are always strings.

# Adjust as needed.
POD_INACTIVITY_TIMEOUT = "60"

ORPHEUS_HOST = "0.0.0.0"
# Must match the port exposed for Orpheus TTS in [project].ports (5005).
# Port 8000 is labelled there as RunPod's default health/API port, so the
# previous value of "8000" contradicted that mapping.
# NOTE(review): confirm against the Dockerfile/handler before deploying.
ORPHEUS_PORT = "5005"

# IMPORTANT: ensure this points to your LLM backend.
# NOTE(review): "llama-cpp-server" looks like a container/service hostname;
# verify that it resolves in the target environment.
ORPHEUS_API_URL = "http://llama-cpp-server:5006/v1/completions"

# Backend request and generation settings.
ORPHEUS_API_TIMEOUT = "120"
ORPHEUS_MAX_TOKENS = "8192"
ORPHEUS_TEMPERATURE = "0.6"
ORPHEUS_TOP_P = "0.9"
ORPHEUS_SAMPLE_RATE = "24000"

# Uncomment and set if required by your LLM server.
# ORPHEUS_MODEL_NAME = "model_name_if_needed_by_backend"

[build]
# How the Docker image is built.

# Build context is the Orpheus-FastAPI directory.
context = "."

# Dockerfile for the RunPod image. It is not visible from this file; make
# sure it exists in the build context.
dockerfile = "Dockerfile.runpod"

# [runtime]
# # Command to start the Orpheus server within the container
# # command = ["python", "-u", "app.py"]
# command = ["python", "-u", "-c", "import os; import sys; import torch; print('---PYTHON DEBUG SCRIPT STARTED---', flush=True); print(f'Python version: {sys.version}', flush=True); print(f'Torch version: {torch.__version__}', flush=True); print(f'CUDA available: {torch.cuda.is_available()}', flush=True); print(f'Current PWD: {os.getcwd()}', flush=True); print('---Listing /app:---', flush=True); print(os.listdir('/app'), flush=True); print('---PYTHON DEBUG SCRIPT FINISHED INITIAL PRINTS, NOW SLEEPING---', flush=True); import time; time.sleep(600)"]

[serverless]
# Serverless worker configuration. Worker counts are example values; adjust
# as needed.

# 0 enables scale-to-zero.
min_workers = 0
max_workers = 3

# Seconds before an idle worker is scaled down.
idle_timeout = 300

# Adjust based on how many concurrent requests a single worker can handle.
concurrency_modifier = 1.0

[api]
# API configuration for the RunPod serverless handler.

# Path to the handler script within the container.
handler_path = "src/handler.py"

# Name of the handler function in handler.py.
handler_name = "tts_handler"

# [network] # This section is usually not needed when using the serverless handler pattern
# # Network configuration (using RunPod defaults generally)
# # http_port = 8000 # RunPod default API port
# # tcp_ports = ["5005"] # Explicitly map Orpheus port if needed beyond standard proxy