From fe25f732bbc35ee25d212f0997cb68f47afe0b0a Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 16:48:24 +0100 Subject: [PATCH 01/24] chore: add flash agent skill co-located with source code Adds flash/SKILL.md rewritten around the unified Endpoint class API. Replaces the old skill in runpod/skills which documented the deprecated 8-class resource hierarchy. Co-locating the skill ensures it stays in sync with the codebase. Discoverable via `npx skills add runpod/flash`. --- flash/SKILL.md | 588 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 588 insertions(+) create mode 100644 flash/SKILL.md diff --git a/flash/SKILL.md b/flash/SKILL.md new file mode 100644 index 00000000..d6bf9379 --- /dev/null +++ b/flash/SKILL.md @@ -0,0 +1,588 @@ +--- +name: flash +description: Complete knowledge of runpod-flash - the Endpoint class, CLI, deployment, architecture. + Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuType", "GpuGroup". +user-invocable: true +allowed-tools: Read, Grep, Glob, Bash +--- + +# Runpod Flash + +**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on RunPod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer. + +- **Package**: `pip install runpod-flash` +- **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...` +- **CLI**: `flash` +- **Python**: >=3.10, <3.15 + +## Getting Started + +### 1. Install Flash + +```bash +pip install runpod-flash +``` + +### 2. Authenticate + +Either log in via browser (recommended): + +```bash +flash login +``` + +Or set your API key manually. 
Get a key from [RunPod account settings](https://docs.runpod.io/get-started/api-keys): + +```bash +export RUNPOD_API_KEY=your_api_key_here +``` + +Or save in a `.env` file (Flash auto-loads via `python-dotenv`): + +```bash +echo "RUNPOD_API_KEY=your_api_key_here" > .env +``` + +### 3. Write and run a remote function + +```python +import asyncio +from runpod_flash import Endpoint, GpuType + +@Endpoint(name="my-first-worker", gpu=GpuType.ANY, dependencies=["torch"]) +async def gpu_task(data): + import torch + tensor = torch.tensor(data, device="cuda") + return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)} + +async def main(): + result = await gpu_task([1, 2, 3, 4, 5]) + print(result) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second. + +### 4. Or create a Flash API project + +```bash +flash init my_project +cd my_project +pip install -r requirements.txt +# Edit .env and add your RUNPOD_API_KEY +flash run # Start local FastAPI server at localhost:8888 +``` + +API explorer available at `http://localhost:8888/docs`. + +### 5. Build and deploy to production + +```bash +flash build # Scan endpoints, package artifact +flash build --exclude torch,torchvision # Exclude packages in base image (500MB limit) +flash deploy new production # Create deployment environment +flash deploy send production # Upload and deploy +flash deploy list # List environments +flash deploy info production # Show details +flash deploy delete production # Tear down +``` + +## The Endpoint Class: Four Modes + +The `Endpoint` class is the single entry point for all Flash functionality. It replaces the old 8-class resource hierarchy (`LiveServerless`, `CpuLiveServerless`, etc.) which still works but emits `DeprecationWarning`. + +### Mode 1: Queue-Based Decorator (QB) + +One function = one endpoint = own workers. Best for batch processing, long-running tasks, automatic retries. 
+ +```python +from runpod_flash import Endpoint, GpuType + +@Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"]) +async def gpu_hello(input_data: dict) -> dict: + import torch + gpu_available = torch.cuda.is_available() + gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU" + return { + "message": input_data.get("message", "Hello from GPU worker!"), + "gpu": {"available": gpu_available, "name": gpu_name}, + } +``` + +QB returns a `JobOutput` with `.output`, `.error`, `.status` fields: + +```python +result = await gpu_hello({"message": "test"}) +# result.output contains the return dict +``` + +### Mode 2: Load-Balanced Decorator (LB) + +Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. + +```python +from runpod_flash import Endpoint + +api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3)) + +@api.post("/process") +async def process(input_data: dict) -> dict: + from datetime import datetime + return {"status": "success", "echo": input_data, "timestamp": datetime.now().isoformat()} + +@api.get("/health") +async def health() -> dict: + return {"status": "healthy"} +``` + +LB returns the dict directly (no `JobOutput` wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`. + +### Mode 3: External Image Client + +Deploy a pre-built Docker image and call it as a client. No `@decorator` -- the Endpoint provisions the image and provides QB and LB client methods. + +```python +from runpod_flash import Endpoint, GpuGroup + +vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24) + +# LB-style calls +result = await vllm.post("/v1/completions", {"prompt": "hello"}) +models = await vllm.get("/v1/models") + +# QB-style calls +job = await vllm.run({"prompt": "hello"}) +await job.wait() +print(job.output) +``` + +### Mode 4: Existing Endpoint Client + +Connect to an already-deployed endpoint by ID. No provisioning. 
+ +```python +from runpod_flash import Endpoint + +ep = Endpoint(id="abc123") + +# QB-style +job = await ep.runsync({"prompt": "hello"}) +print(job.output) + +# LB-style +result = await ep.post("/v1/completions", {"prompt": "hello"}) +``` + +## Constructor Parameters + +```python +Endpoint( + name: str = None, # Endpoint name (required unless id= is set) + *, + id: str = None, # Connect to existing endpoint (client mode) + gpu: GpuGroup | GpuType | list = None, # GPU type(s) -- mutually exclusive with cpu + cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu + workers: int | tuple[int, int] = None, # Max workers (int) or (min, max) tuple. Default: (0, 1) + idle_timeout: int = 60, # Seconds before scale-down + dependencies: list[str] = None, # pip packages to install + system_dependencies: list[str] = None, # apt-get packages to install + accelerate_downloads: bool = True, # CDN download acceleration + volume: NetworkVolume = None, # Persistent storage + datacenter: DataCenter = DataCenter.EU_RO_1, # Data center location + env: dict[str, str] = None, # Environment variables + gpu_count: int = 1, # GPUs per worker + execution_timeout_ms: int = 0, # Execution timeout (0 = no limit) + flashboot: bool = True, # FlashBoot for fast cold starts + image: str = None, # Docker image (external image mode, mutually exclusive with id) + scaler_type: ServerlessScalerType = None, # QUEUE_DELAY (QB default) or REQUEST_COUNT (LB default) + scaler_value: int = 4, # Scaler parameter + template: PodTemplate = None, # Pod template overrides +) +``` + +**Mutual exclusions:** +- `gpu` and `cpu` cannot both be set +- `id` and `image` cannot both be set +- `name` or `id` is required + +**Defaults:** +- If neither `gpu` nor `cpu` is set (and not client mode), defaults to `gpu=GpuGroup.ANY` +- `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5. 
+ +## EndpointJob + +Returned by `Endpoint.run()` and `Endpoint.runsync()` in client mode (image= or id=). + +```python +job = await ep.run({"prompt": "hello"}) + +# Properties +job.id # "job-abc123" +job.output # Result payload (after COMPLETED) +job.error # Error message (after FAILED) +job.done # True if terminal status (COMPLETED, FAILED, CANCELLED, TIMED_OUT) + +# Methods +await job.status() # Poll, update internal state, return status string +await job.wait(timeout=60) # Poll until terminal status (exponential backoff) +await job.cancel() # Cancel the job +``` + +## GPU & CPU Types + +### GPU Groups (GpuGroup enum) + +VRAM-class groups that map to one or more specific GPU models: + +| Group | VRAM | GPUs | +|-------|------|------| +| `GpuGroup.ANY` | Any | Any available (not for production) | +| `GpuGroup.AMPERE_16` | 16GB | RTX A4000, RTX A4500, RTX 4000 Ada, RTX 2000 Ada | +| `GpuGroup.AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 | +| `GpuGroup.ADA_24` | 24GB | RTX 4090 | +| `GpuGroup.ADA_32_PRO` | 32GB | RTX 5090 | +| `GpuGroup.AMPERE_48` | 48GB | A40, RTX A6000 | +| `GpuGroup.ADA_48_PRO` | 48GB | RTX 6000 Ada, L40, L40S | +| `GpuGroup.AMPERE_80` | 80GB | A100 80GB PCIe, A100-SXM4-80GB | +| `GpuGroup.ADA_80_PRO` | 80GB | H100 PCIe, H100 80GB HBM3, H100 NVL | +| `GpuGroup.HOPPER_141` | 141GB | H200 | + +### GPU Types (GpuType enum) + +Specific GPU models for exact hardware selection: + +`NVIDIA_GEFORCE_RTX_4090`, `NVIDIA_GEFORCE_RTX_5090`, `NVIDIA_RTX_6000_ADA_GENERATION`, `NVIDIA_H100_80GB_HBM3`, `NVIDIA_RTX_A4000`, `NVIDIA_RTX_A4500`, `NVIDIA_RTX_4000_ADA_GENERATION`, `NVIDIA_RTX_2000_ADA_GENERATION`, `NVIDIA_RTX_A5000`, `NVIDIA_L4`, `NVIDIA_GEFORCE_RTX_3090`, `NVIDIA_A40`, `NVIDIA_RTX_A6000`, `NVIDIA_A100_80GB_PCIe`, `NVIDIA_A100_SXM4_80GB`, `NVIDIA_H200` + +Usage: `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` or `gpu=[GpuType.NVIDIA_A100_80GB_PCIe, GpuType.NVIDIA_A100_SXM4_80GB]` + +### CPU Instance Types (CpuInstanceType enum) + +Format: 
`CPU{generation}{type}_{vcpu}_{memory_gb}`. Can also use string shorthand: `cpu="cpu3c-1-2"`. + +| Instance Type | Gen | Type | vCPU | RAM | Max Disk | +|--------------|-----|------|------|-----|----------| +| `CPU3G_1_4` | 3rd | General | 1 | 4GB | 10GB | +| `CPU3G_2_8` | 3rd | General | 2 | 8GB | 20GB | +| `CPU3G_4_16` | 3rd | General | 4 | 16GB | 40GB | +| `CPU3G_8_32` | 3rd | General | 8 | 32GB | 80GB | +| `CPU3C_1_2` | 3rd | Compute | 1 | 2GB | 10GB | +| `CPU3C_2_4` | 3rd | Compute | 2 | 4GB | 20GB | +| `CPU3C_4_8` | 3rd | Compute | 4 | 8GB | 40GB | +| `CPU3C_8_16` | 3rd | Compute | 8 | 16GB | 80GB | +| `CPU5C_1_2` | 5th | Compute | 1 | 2GB | 15GB | +| `CPU5C_2_4` | 5th | Compute | 2 | 4GB | 30GB | +| `CPU5C_4_8` | 5th | Compute | 4 | 8GB | 60GB | +| `CPU5C_8_16` | 5th | Compute | 8 | 16GB | 120GB | + +## Cloudpickle Scoping Rules + +Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access: +- Function parameters +- Local variables defined inside the function +- Imports done inside the function +- Built-in Python functions + +They CANNOT access: module-level imports, global variables, external functions/classes. + +```python +# WRONG - external references +import torch +@Endpoint(name="worker", gpu=GpuGroup.ADA_24) +async def bad(data): + return torch.tensor(data) # torch not accessible remotely + +# CORRECT - everything inside, dependencies declared +@Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"]) +async def good(data): + import torch + return torch.tensor(data) +``` + +All pip packages must be listed in `dependencies=[]`. System packages go in `system_dependencies=[]`. + +## CLI Commands + +### flash login + +```bash +flash login [--no-open] [--timeout SECONDS] +``` + +Authenticate via browser. Opens RunPod console for authorization, saves credentials locally. 
+ +### flash init + +```bash +flash init [project_name] +``` + +Creates a project with three template workers: +- `gpu_worker.py` -- QB GPU endpoint using `@Endpoint` decorator +- `cpu_worker.py` -- QB CPU endpoint using `@Endpoint` decorator +- `lb_worker.py` -- LB CPU endpoint with `@api.post` and `@api.get` routes + +### flash run + +```bash +flash run [--host HOST] [--port PORT] +``` + +Starts a local FastAPI dev server at `localhost:8888` with auto-generated routes for all discovered endpoints. API explorer at `/docs`. + +| Option | Default | Description | +|--------|---------|-------------| +| `--host` | `localhost` | Server host (or `FLASH_HOST` env) | +| `--port` | `8888` | Server port (or `FLASH_PORT` env) | + +### flash build + +```bash +flash build [--exclude PACKAGES] [--keep-build] [--preview] +``` + +Scans `@Endpoint` decorators, groups by resource config, creates `flash_manifest.json`, installs dependencies for Linux x86_64, packages into `.flash/artifact.tar.gz`. + +| Option | Description | +|--------|-------------| +| `--exclude pkg1,pkg2` | Skip packages already in base Docker image | +| `--keep-build` | Don't delete `.flash/.build/` after packaging | +| `--preview` | Build then run in local Docker containers | + +**500MB deployment limit** -- use `--exclude` for packages in base image: + +```bash +flash build --exclude torch,torchvision,torchaudio +``` + +**`--preview` mode**: Creates Docker containers per resource config, starts mothership on `localhost:8000`, enables end-to-end local testing. + +### flash deploy + +```bash +flash deploy new ENV_NAME [--app-name NAME] # Create environment +flash deploy send ENV_NAME [--app-name NAME] # Deploy archive +flash deploy list [--app-name NAME] # List environments +flash deploy info ENV_NAME [--app-name NAME] # Show details +flash deploy delete ENV_NAME [--app-name NAME] # Delete (double confirmation) +``` + +`flash deploy send` requires `flash build` to have been run first. 
+ +### flash undeploy + +```bash +flash undeploy list # List all deployed resources +flash undeploy NAME # Undeploy specific resource +``` + +### flash env / flash app + +```bash +flash env list|create|get|delete # Environment management +flash app list|get # App management +``` + +## Common Patterns + +### QB GPU Endpoint + +```python +from runpod_flash import Endpoint, GpuGroup + +@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, workers=(0, 3), dependencies=["torch"]) +async def inference(data: dict) -> dict: + import torch + tensor = torch.tensor(data["values"], device="cuda") + return {"result": tensor.sum().item()} +``` + +### QB CPU Endpoint + +```python +from runpod_flash import Endpoint + +@Endpoint(name="cpu_worker", cpu="cpu3c-1-2") +async def cpu_hello(input_data: dict) -> dict: + import platform + from datetime import datetime + return { + "message": input_data.get("message", "Hello from CPU worker!"), + "timestamp": datetime.now().isoformat(), + "python_version": platform.python_version(), + } +``` + +### LB HTTP API + +```python +from runpod_flash import Endpoint, GpuGroup + +api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5)) + +@api.get("/health") +async def health(): + return {"status": "ok"} + +@api.post("/compute") +async def compute(request: dict) -> dict: + return {"result": request} +``` + +### External Image Deployment + +```python +from runpod_flash import Endpoint, GpuGroup + +vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO) +result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"}) +``` + +### Hybrid GPU/CPU Pipeline + +```python +from runpod_flash import Endpoint, GpuGroup + +@Endpoint(name="preprocessor", cpu="cpu5c-4-8", dependencies=["pandas"]) +async def preprocess(data): + import pandas as pd + return pd.DataFrame(data).to_dict("records") + +@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, dependencies=["torch"]) +async def inference(data): + 
import torch + tensor = torch.tensor(data, device="cuda") + return {"result": tensor.sum().item()} + +async def pipeline(raw_data): + clean = await preprocess(raw_data) + return await inference(clean) +``` + +### Parallel Execution + +```python +import asyncio + +results = await asyncio.gather( + process_item(item1), + process_item(item2), + process_item(item3), +) +``` + +### NetworkVolume + +```python +from runpod_flash import Endpoint, GpuGroup, NetworkVolume, DataCenter + +volume = NetworkVolume(name="model-storage", size=100, dataCenterId=DataCenter.EU_RO_1) + +@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, volume=volume) +async def worker(data: dict) -> dict: + ... +``` + +### PodTemplate + +```python +from runpod_flash import Endpoint, GpuGroup, PodTemplate + +template = PodTemplate(containerDiskInGb=100) + +@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, template=template) +async def worker(data: dict) -> dict: + ... +``` + +## Error Handling + +### Queue-Based (QB) Resources + +```python +job_output = await my_function(data) +if job_output.error: + print(f"Failed: {job_output.error}") +else: + result = job_output.output +``` + +`JobOutput` fields: `id`, `status`, `output`, `error`, `started_at`, `ended_at` + +### Load-Balanced (LB) Resources + +```python +try: + result = await my_function(data) # Returns dict directly +except Exception as e: + print(f"Error: {e}") +``` + +### EndpointJob (Client Mode) + +```python +job = await ep.run({"prompt": "hello"}) +await job.wait(timeout=120) +if job.error: + print(f"Failed: {job.error}") +else: + print(job.output) +``` + +### Runtime Exceptions + +``` +FlashRuntimeError (base) + RemoteExecutionError # Remote function failed + SerializationError # cloudpickle serialization failed + GraphQLError # GraphQL base error + GraphQLMutationError # Mutation failed + GraphQLQueryError # Query failed + ManifestError # Invalid/missing manifest + ManifestServiceUnavailableError # State Manager unreachable +``` + +## 
Architecture Overview + +### Deployment Architecture + +**Mothership Pattern**: Coordinator endpoint + distributed child endpoints. + +1. `flash build` scans code, creates manifest + archive +2. `flash deploy send` uploads archive, provisions resources +3. Mothership boots, reconciles desired vs current state +4. Child endpoints query State Manager GraphQL for service discovery (peer-to-peer) +5. Functions route locally or remotely based on manifest + +### How Endpoint Resolves to Internal Classes + +The `Endpoint` class automatically selects the right internal resource class based on: +- **QB vs LB**: Inferred from usage (direct `@Endpoint` decorator = QB, `.get()`/`.post()` routes = LB) +- **GPU vs CPU**: From `gpu=` or `cpu=` parameter +- **Live vs Deploy**: From runtime environment (`flash run` = live, `flash deploy` = deploy classes) + +This means 8 internal classes are selected automatically -- users never need to pick one. + +### Cross-Endpoint Routing + +Functions on different endpoints can call each other transparently: +1. `ProductionWrapper` intercepts calls +2. `ServiceRegistry` looks up function in manifest +3. Local function? Execute directly +4. Remote function? Serialize args (cloudpickle), POST to remote endpoint + +**Serialization**: cloudpickle + base64, max 10MB payload. Pass URLs/paths instead of large data. + +## Common Gotchas + +1. **External scope in decorated functions** -- Most common error. All imports and logic must be inside the function body. +2. **Forgetting `await`** -- All remote functions must be awaited. +3. **Undeclared dependencies** -- Must be in `dependencies=[]` parameter. +4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly. +5. **Large serialization** -- Max 10MB. Pass URLs/paths, not large data objects. +6. **Imports at module level** -- Import inside decorated functions, not at top of file. +7. **Bundle too large (>500MB)** -- Use `--exclude` for packages in base Docker image. +8. 
**Endpoints accumulate** -- Clean up with `flash undeploy list` / `flash undeploy NAME`. +9. **Mixing decorator patterns** -- Cannot use `@Endpoint(...)` as direct decorator AND register routes (`.get()`/`.post()`) on the same instance. +10. **Client mode restrictions** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. Cannot use `@ep.post("/path")` to register routes on a client. From 9f6bfb2be3a69572b5f7e372644f3ed86a50341e Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 16:59:48 +0100 Subject: [PATCH 02/24] fix: use correct "Runpod" casing in skill --- flash/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index d6bf9379..9874e6c9 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -8,7 +8,7 @@ allowed-tools: Read, Grep, Glob, Bash # Runpod Flash -**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on RunPod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer. +**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on Runpod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer. - **Package**: `pip install runpod-flash` - **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...` @@ -31,7 +31,7 @@ Either log in via browser (recommended): flash login ``` -Or set your API key manually. Get a key from [RunPod account settings](https://docs.runpod.io/get-started/api-keys): +Or set your API key manually. Get a key from [Runpod account settings](https://docs.runpod.io/get-started/api-keys): ```bash export RUNPOD_API_KEY=your_api_key_here @@ -310,7 +310,7 @@ All pip packages must be listed in `dependencies=[]`. 
System packages go in `sys flash login [--no-open] [--timeout SECONDS] ``` -Authenticate via browser. Opens RunPod console for authorization, saves credentials locally. +Authenticate via browser. Opens Runpod console for authorization, saves credentials locally. ### flash init From 4a8c55551c24f244bdda1bbea52271b383aa1ddb Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 18:17:28 +0100 Subject: [PATCH 03/24] chore: trim flash skill from 588 to 264 lines Remove content an agent doesn't need: architecture internals, full enum listings, verbose CLI option tables, redundant code patterns. Keep: constructor params, four modes, cloudpickle rules, gotchas. Point agents to source files for enum details they can read themselves. --- flash/SKILL.md | 492 +++++++++---------------------------------------- 1 file changed, 84 insertions(+), 408 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 9874e6c9..cec1b732 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -8,7 +8,7 @@ allowed-tools: Read, Grep, Glob, Bash # Runpod Flash -**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on Runpod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer. +**runpod-flash** (v1.6.0) -- Python SDK for distributed inference and serving on Runpod serverless. - **Package**: `pip install runpod-flash` - **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...` @@ -17,33 +17,13 @@ allowed-tools: Read, Grep, Glob, Bash ## Getting Started -### 1. Install Flash - ```bash pip install runpod-flash +flash login # Authenticate via browser (recommended) +# Or: export RUNPOD_API_KEY=... or add to .env file ``` -### 2. Authenticate - -Either log in via browser (recommended): - -```bash -flash login -``` - -Or set your API key manually. 
Get a key from [Runpod account settings](https://docs.runpod.io/get-started/api-keys): - -```bash -export RUNPOD_API_KEY=your_api_key_here -``` - -Or save in a `.env` file (Flash auto-loads via `python-dotenv`): - -```bash -echo "RUNPOD_API_KEY=your_api_key_here" > .env -``` - -### 3. Write and run a remote function +Minimal example: ```python import asyncio @@ -55,47 +35,25 @@ async def gpu_task(data): tensor = torch.tensor(data, device="cuda") return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)} -async def main(): - result = await gpu_task([1, 2, 3, 4, 5]) - print(result) - -if __name__ == "__main__": - asyncio.run(main()) +asyncio.run(gpu_task([1, 2, 3, 4, 5])) ``` First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second. -### 4. Or create a Flash API project - -```bash -flash init my_project -cd my_project -pip install -r requirements.txt -# Edit .env and add your RUNPOD_API_KEY -flash run # Start local FastAPI server at localhost:8888 -``` - -API explorer available at `http://localhost:8888/docs`. - -### 5. Build and deploy to production +Create a project with templates: ```bash -flash build # Scan endpoints, package artifact -flash build --exclude torch,torchvision # Exclude packages in base image (500MB limit) -flash deploy new production # Create deployment environment -flash deploy send production # Upload and deploy -flash deploy list # List environments -flash deploy info production # Show details -flash deploy delete production # Tear down +flash init my_project && cd my_project +flash run # Local FastAPI dev server at localhost:8888/docs ``` ## The Endpoint Class: Four Modes -The `Endpoint` class is the single entry point for all Flash functionality. It replaces the old 8-class resource hierarchy (`LiveServerless`, `CpuLiveServerless`, etc.) which still works but emits `DeprecationWarning`. +The `Endpoint` class is the single entry point. It replaces the old 8-class hierarchy (`LiveServerless`, etc.) 
which still works but emits `DeprecationWarning`. ### Mode 1: Queue-Based Decorator (QB) -One function = one endpoint = own workers. Best for batch processing, long-running tasks, automatic retries. +One function = one endpoint = own workers. Best for batch, long-running tasks, automatic retries. Returns `JobOutput` with `.output`, `.error`, `.status`. ```python from runpod_flash import Endpoint, GpuType @@ -103,24 +61,16 @@ from runpod_flash import Endpoint, GpuType @Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"]) async def gpu_hello(input_data: dict) -> dict: import torch - gpu_available = torch.cuda.is_available() - gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU" - return { - "message": input_data.get("message", "Hello from GPU worker!"), - "gpu": {"available": gpu_available, "name": gpu_name}, - } -``` - -QB returns a `JobOutput` with `.output`, `.error`, `.status` fields: + gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU" + return {"message": input_data.get("message", "Hello!"), "gpu": gpu_name} -```python result = await gpu_hello({"message": "test"}) # result.output contains the return dict ``` ### Mode 2: Load-Balanced Decorator (LB) -Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. +Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. Returns dict directly (no wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`. 
```python from runpod_flash import Endpoint @@ -130,315 +80,140 @@ api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3)) @api.post("/process") async def process(input_data: dict) -> dict: from datetime import datetime - return {"status": "success", "echo": input_data, "timestamp": datetime.now().isoformat()} + return {"echo": input_data, "timestamp": datetime.now().isoformat()} @api.get("/health") async def health() -> dict: return {"status": "healthy"} ``` -LB returns the dict directly (no `JobOutput` wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`. - ### Mode 3: External Image Client -Deploy a pre-built Docker image and call it as a client. No `@decorator` -- the Endpoint provisions the image and provides QB and LB client methods. +Deploy a pre-built Docker image, call it as a client. Returns `EndpointJob` (see below). ```python from runpod_flash import Endpoint, GpuGroup vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24) -# LB-style calls -result = await vllm.post("/v1/completions", {"prompt": "hello"}) -models = await vllm.get("/v1/models") - -# QB-style calls -job = await vllm.run({"prompt": "hello"}) +result = await vllm.post("/v1/completions", {"prompt": "hello"}) # LB-style +job = await vllm.run({"prompt": "hello"}) # QB-style await job.wait() print(job.output) ``` ### Mode 4: Existing Endpoint Client -Connect to an already-deployed endpoint by ID. No provisioning. +Connect to an already-deployed endpoint by ID. No provisioning. Returns `EndpointJob`. 
```python -from runpod_flash import Endpoint - ep = Endpoint(id="abc123") - -# QB-style job = await ep.runsync({"prompt": "hello"}) print(job.output) - -# LB-style -result = await ep.post("/v1/completions", {"prompt": "hello"}) ``` +**EndpointJob** (returned by `.run()` / `.runsync()` in client modes): properties `.id`, `.output`, `.error`, `.done`; methods `await job.status()`, `await job.wait(timeout=60)`, `await job.cancel()`. + ## Constructor Parameters ```python Endpoint( - name: str = None, # Endpoint name (required unless id= is set) + name: str = None, # Required unless id= is set *, id: str = None, # Connect to existing endpoint (client mode) gpu: GpuGroup | GpuType | list = None, # GPU type(s) -- mutually exclusive with cpu cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu - workers: int | tuple[int, int] = None, # Max workers (int) or (min, max) tuple. Default: (0, 1) + workers: int | tuple[int, int] = None, # (min, max) tuple or just max. 
Default: (0, 1) idle_timeout: int = 60, # Seconds before scale-down dependencies: list[str] = None, # pip packages to install - system_dependencies: list[str] = None, # apt-get packages to install + system_dependencies: list[str] = None, # apt-get packages accelerate_downloads: bool = True, # CDN download acceleration - volume: NetworkVolume = None, # Persistent storage - datacenter: DataCenter = DataCenter.EU_RO_1, # Data center location + volume: NetworkVolume = None, # Persistent storage (NetworkVolume(name=..., size=100, dataCenterId=DataCenter.EU_RO_1)) + datacenter: DataCenter = DataCenter.EU_RO_1, env: dict[str, str] = None, # Environment variables gpu_count: int = 1, # GPUs per worker - execution_timeout_ms: int = 0, # Execution timeout (0 = no limit) - flashboot: bool = True, # FlashBoot for fast cold starts + execution_timeout_ms: int = 0, # 0 = no limit + flashboot: bool = True, # Fast cold starts image: str = None, # Docker image (external image mode, mutually exclusive with id) - scaler_type: ServerlessScalerType = None, # QUEUE_DELAY (QB default) or REQUEST_COUNT (LB default) - scaler_value: int = 4, # Scaler parameter - template: PodTemplate = None, # Pod template overrides + scaler_type: ServerlessScalerType = None, # QUEUE_DELAY (QB) or REQUEST_COUNT (LB) + scaler_value: int = 4, + template: PodTemplate = None, # Pod overrides (e.g. PodTemplate(containerDiskInGb=100)) ) ``` -**Mutual exclusions:** -- `gpu` and `cpu` cannot both be set -- `id` and `image` cannot both be set -- `name` or `id` is required - -**Defaults:** -- If neither `gpu` nor `cpu` is set (and not client mode), defaults to `gpu=GpuGroup.ANY` +- `gpu` and `cpu` are mutually exclusive. `id` and `image` are mutually exclusive. +- If neither `gpu` nor `cpu` is set (non-client), defaults to `gpu=GpuGroup.ANY`. - `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5. 
-## EndpointJob +## GPU & CPU Types -Returned by `Endpoint.run()` and `Endpoint.runsync()` in client mode (image= or id=). +### GpuGroup (by VRAM class) -```python -job = await ep.run({"prompt": "hello"}) - -# Properties -job.id # "job-abc123" -job.output # Result payload (after COMPLETED) -job.error # Error message (after FAILED) -job.done # True if terminal status (COMPLETED, FAILED, CANCELLED, TIMED_OUT) - -# Methods -await job.status() # Poll, update internal state, return status string -await job.wait(timeout=60) # Poll until terminal status (exponential backoff) -await job.cancel() # Cancel the job -``` +| Group | VRAM | GPUs | +|-------|------|------| +| `ANY` | Any | Any available (not for production) | +| `AMPERE_16` | 16GB | RTX A4000/A4500 | +| `AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 | +| `ADA_24` | 24GB | RTX 4090 | +| `ADA_32_PRO` | 32GB | RTX 5090 | +| `AMPERE_48` | 48GB | A40, RTX A6000 | +| `ADA_48_PRO` | 48GB | RTX 6000 Ada | +| `AMPERE_80` | 80GB | A100 | +| `ADA_80_PRO` | 80GB | H100 | +| `HOPPER_141` | 141GB | H200 | -## GPU & CPU Types +For exact GPU selection, use `GpuType` enum (e.g. `GpuType.NVIDIA_GEFORCE_RTX_4090`). See `src/runpod_flash/core/resources/gpu.py` for full list. -### GPU Groups (GpuGroup enum) +### CPU Instance Types -VRAM-class groups that map to one or more specific GPU models: +Format: `cpu{gen}{type}-{vcpu}-{memory}`. Use string shorthand (`cpu="cpu3c-1-2"`) or `CpuInstanceType` enum. 
-| Group | VRAM | GPUs | -|-------|------|------| -| `GpuGroup.ANY` | Any | Any available (not for production) | -| `GpuGroup.AMPERE_16` | 16GB | RTX A4000, RTX A4500, RTX 4000 Ada, RTX 2000 Ada | -| `GpuGroup.AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 | -| `GpuGroup.ADA_24` | 24GB | RTX 4090 | -| `GpuGroup.ADA_32_PRO` | 32GB | RTX 5090 | -| `GpuGroup.AMPERE_48` | 48GB | A40, RTX A6000 | -| `GpuGroup.ADA_48_PRO` | 48GB | RTX 6000 Ada, L40, L40S | -| `GpuGroup.AMPERE_80` | 80GB | A100 80GB PCIe, A100-SXM4-80GB | -| `GpuGroup.ADA_80_PRO` | 80GB | H100 PCIe, H100 80GB HBM3, H100 NVL | -| `GpuGroup.HOPPER_141` | 141GB | H200 | - -### GPU Types (GpuType enum) - -Specific GPU models for exact hardware selection: - -`NVIDIA_GEFORCE_RTX_4090`, `NVIDIA_GEFORCE_RTX_5090`, `NVIDIA_RTX_6000_ADA_GENERATION`, `NVIDIA_H100_80GB_HBM3`, `NVIDIA_RTX_A4000`, `NVIDIA_RTX_A4500`, `NVIDIA_RTX_4000_ADA_GENERATION`, `NVIDIA_RTX_2000_ADA_GENERATION`, `NVIDIA_RTX_A5000`, `NVIDIA_L4`, `NVIDIA_GEFORCE_RTX_3090`, `NVIDIA_A40`, `NVIDIA_RTX_A6000`, `NVIDIA_A100_80GB_PCIe`, `NVIDIA_A100_SXM4_80GB`, `NVIDIA_H200` - -Usage: `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` or `gpu=[GpuType.NVIDIA_A100_80GB_PCIe, GpuType.NVIDIA_A100_SXM4_80GB]` - -### CPU Instance Types (CpuInstanceType enum) - -Format: `CPU{generation}{type}_{vcpu}_{memory_gb}`. Can also use string shorthand: `cpu="cpu3c-1-2"`. 
- -| Instance Type | Gen | Type | vCPU | RAM | Max Disk | -|--------------|-----|------|------|-----|----------| -| `CPU3G_1_4` | 3rd | General | 1 | 4GB | 10GB | -| `CPU3G_2_8` | 3rd | General | 2 | 8GB | 20GB | -| `CPU3G_4_16` | 3rd | General | 4 | 16GB | 40GB | -| `CPU3G_8_32` | 3rd | General | 8 | 32GB | 80GB | -| `CPU3C_1_2` | 3rd | Compute | 1 | 2GB | 10GB | -| `CPU3C_2_4` | 3rd | Compute | 2 | 4GB | 20GB | -| `CPU3C_4_8` | 3rd | Compute | 4 | 8GB | 40GB | -| `CPU3C_8_16` | 3rd | Compute | 8 | 16GB | 80GB | -| `CPU5C_1_2` | 5th | Compute | 1 | 2GB | 15GB | -| `CPU5C_2_4` | 5th | Compute | 2 | 4GB | 30GB | -| `CPU5C_4_8` | 5th | Compute | 4 | 8GB | 60GB | -| `CPU5C_8_16` | 5th | Compute | 8 | 16GB | 120GB | +Families: `cpu3g` (general, 4GB/vCPU), `cpu3c` (compute, 2GB/vCPU), `cpu5c` (5th gen compute, 2GB/vCPU). Each from 1 to 8 vCPUs. See `src/runpod_flash/core/resources/cpu.py` for full list. ## Cloudpickle Scoping Rules Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access: -- Function parameters -- Local variables defined inside the function -- Imports done inside the function -- Built-in Python functions +- Function parameters, local variables, imports done **inside** the function, built-ins They CANNOT access: module-level imports, global variables, external functions/classes. ```python -# WRONG - external references +# WRONG import torch @Endpoint(name="worker", gpu=GpuGroup.ADA_24) async def bad(data): return torch.tensor(data) # torch not accessible remotely -# CORRECT - everything inside, dependencies declared +# CORRECT @Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"]) async def good(data): import torch return torch.tensor(data) ``` -All pip packages must be listed in `dependencies=[]`. System packages go in `system_dependencies=[]`. +All pip packages must be in `dependencies=[]`. System packages in `system_dependencies=[]`. 
## CLI Commands -### flash login - -```bash -flash login [--no-open] [--timeout SECONDS] -``` - -Authenticate via browser. Opens Runpod console for authorization, saves credentials locally. - -### flash init - -```bash -flash init [project_name] -``` - -Creates a project with three template workers: -- `gpu_worker.py` -- QB GPU endpoint using `@Endpoint` decorator -- `cpu_worker.py` -- QB CPU endpoint using `@Endpoint` decorator -- `lb_worker.py` -- LB CPU endpoint with `@api.post` and `@api.get` routes - -### flash run - -```bash -flash run [--host HOST] [--port PORT] -``` - -Starts a local FastAPI dev server at `localhost:8888` with auto-generated routes for all discovered endpoints. API explorer at `/docs`. - -| Option | Default | Description | -|--------|---------|-------------| -| `--host` | `localhost` | Server host (or `FLASH_HOST` env) | -| `--port` | `8888` | Server port (or `FLASH_PORT` env) | - -### flash build - -```bash -flash build [--exclude PACKAGES] [--keep-build] [--preview] -``` - -Scans `@Endpoint` decorators, groups by resource config, creates `flash_manifest.json`, installs dependencies for Linux x86_64, packages into `.flash/artifact.tar.gz`. - -| Option | Description | -|--------|-------------| -| `--exclude pkg1,pkg2` | Skip packages already in base Docker image | -| `--keep-build` | Don't delete `.flash/.build/` after packaging | -| `--preview` | Build then run in local Docker containers | - -**500MB deployment limit** -- use `--exclude` for packages in base image: - ```bash -flash build --exclude torch,torchvision,torchaudio -``` - -**`--preview` mode**: Creates Docker containers per resource config, starts mothership on `localhost:8000`, enables end-to-end local testing. 
- -### flash deploy - -```bash -flash deploy new [--app-name NAME] # Create environment -flash deploy send [--app-name NAME] # Deploy archive -flash deploy list [--app-name NAME] # List environments -flash deploy info [--app-name NAME] # Show details -flash deploy delete [--app-name NAME] # Delete (double confirmation) -``` - -`flash deploy send` requires `flash build` to have been run first. - -### flash undeploy - -```bash -flash undeploy list # List all deployed resources -flash undeploy # Undeploy specific resource -``` - -### flash env / flash app - -```bash -flash env list|create|get|delete # Environment management -flash app list|get # App management -``` +flash login # Authenticate via browser +flash init [project_name] # Create project from templates +flash run [--host HOST] [--port PORT] # Dev server at localhost:8888 +flash build [--exclude pkg1,pkg2] [--preview] # Package artifact (500MB limit) +flash deploy new|send|list|info|delete # Deployment lifecycle +flash undeploy list # List deployed resources +flash undeploy # Remove specific resource +flash env list|create|get|delete # Environment management +flash app list|get # App management +``` + +Key notes: +- `flash build --exclude torch,torchvision,torchaudio` -- exclude packages already in base Docker image to stay under 500MB limit +- `flash build --preview` -- run in local Docker containers for end-to-end testing +- `flash deploy send` requires `flash build` first ## Common Patterns -### QB GPU Endpoint - -```python -from runpod_flash import Endpoint, GpuGroup - -@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, workers=(0, 3), dependencies=["torch"]) -async def inference(data: dict) -> dict: - import torch - tensor = torch.tensor(data["values"], device="cuda") - return {"result": tensor.sum().item()} -``` - -### QB CPU Endpoint - -```python -from runpod_flash import Endpoint - -@Endpoint(name="cpu_worker", cpu="cpu3c-1-2") -async def cpu_hello(input_data: dict) -> dict: - import platform - from 
datetime import datetime - return { - "message": input_data.get("message", "Hello from CPU worker!"), - "timestamp": datetime.now().isoformat(), - "python_version": platform.python_version(), - } -``` - -### LB HTTP API - -```python -from runpod_flash import Endpoint - -api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5)) - -@api.get("/health") -async def health(): - return {"status": "ok"} - -@api.post("/compute") -async def compute(request: dict) -> dict: - return {"result": request} -``` - -### External Image Deployment - -```python -from runpod_flash import Endpoint, GpuGroup - -vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO) -result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"}) -``` - ### Hybrid GPU/CPU Pipeline ```python @@ -460,129 +235,30 @@ async def pipeline(raw_data): return await inference(clean) ``` -### Parallel Execution +### External Image ```python -import asyncio - -results = await asyncio.gather( - process_item(item1), - process_item(item2), - process_item(item3), -) -``` - -### NetworkVolume - -```python -from runpod_flash import Endpoint, GpuGroup, NetworkVolume, DataCenter - -volume = NetworkVolume(name="model-storage", size=100, dataCenterId=DataCenter.EU_RO_1) - -@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, volume=volume) -async def worker(data: dict) -> dict: - ... -``` - -### PodTemplate - -```python -from runpod_flash import Endpoint, GpuGroup, PodTemplate - -template = PodTemplate(containerDiskInGb=100) - -@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, template=template) -async def worker(data: dict) -> dict: - ... 
+vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO) +result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"}) ``` ## Error Handling -### Queue-Based (QB) Resources - -```python -job_output = await my_function(data) -if job_output.error: - print(f"Failed: {job_output.error}") -else: - result = job_output.output -``` - -`JobOutput` fields: `id`, `status`, `output`, `error`, `started_at`, `ended_at` - -### Load-Balanced (LB) Resources - -```python -try: - result = await my_function(data) # Returns dict directly -except Exception as e: - print(f"Error: {e}") -``` - -### EndpointJob (Client Mode) - -```python -job = await ep.run({"prompt": "hello"}) -await job.wait(timeout=120) -if job.error: - print(f"Failed: {job.error}") -else: - print(job.output) -``` - -### Runtime Exceptions - -``` -FlashRuntimeError (base) - RemoteExecutionError # Remote function failed - SerializationError # cloudpickle serialization failed - GraphQLError # GraphQL base error - GraphQLMutationError # Mutation failed - GraphQLQueryError # Query failed - ManifestError # Invalid/missing manifest - ManifestServiceUnavailableError # State Manager unreachable -``` - -## Architecture Overview - -### Deployment Architecture - -**Mothership Pattern**: Coordinator endpoint + distributed child endpoints. - -1. `flash build` scans code, creates manifest + archive -2. `flash deploy send` uploads archive, provisions resources -3. Mothership boots, reconciles desired vs current state -4. Child endpoints query State Manager GraphQL for service discovery (peer-to-peer) -5. 
Functions route locally or remotely based on manifest - -### How Endpoint Resolves to Internal Classes - -The `Endpoint` class automatically selects the right internal resource class based on: -- **QB vs LB**: Inferred from usage (direct `@Endpoint` decorator = QB, `.get()`/`.post()` routes = LB) -- **GPU vs CPU**: From `gpu=` or `cpu=` parameter -- **Live vs Deploy**: From runtime environment (`flash run` = live, `flash deploy` = deploy classes) - -This means 8 internal classes are selected automatically -- users never need to pick one. - -### Cross-Endpoint Routing - -Functions on different endpoints can call each other transparently: -1. `ProductionWrapper` intercepts calls -2. `ServiceRegistry` looks up function in manifest -3. Local function? Execute directly -4. Remote function? Serialize args (cloudpickle), POST to remote endpoint +- **QB**: Returns `JobOutput` -- check `result.error` for failures, `result.output` for data +- **LB**: Returns dict directly -- use try/except +- **Client mode**: `EndpointJob` -- check `job.error` after `await job.wait()` +- **Serialization limit**: cloudpickle + base64, max 10MB. Pass URLs/paths for large data. -**Serialization**: cloudpickle + base64, max 10MB payload. Pass URLs/paths instead of large data. +Exception hierarchy: `FlashRuntimeError` > `RemoteExecutionError`, `SerializationError`, `GraphQLError` > `GraphQLMutationError`/`GraphQLQueryError`, `ManifestError`. ## Common Gotchas -1. **External scope in decorated functions** -- Most common error. All imports and logic must be inside the function body. +1. **External scope in decorated functions** -- #1 error. All imports and logic must be inside the function body. 2. **Forgetting `await`** -- All remote functions must be awaited. -3. **Undeclared dependencies** -- Must be in `dependencies=[]` parameter. +3. **Undeclared dependencies** -- Must be in `dependencies=[]`. 4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly. -5. 
**Large serialization** -- Max 10MB. Pass URLs/paths, not large data objects. -6. **Imports at module level** -- Import inside decorated functions, not at top of file. -7. **Bundle too large (>500MB)** -- Use `--exclude` for packages in base Docker image. -8. **Endpoints accumulate** -- Clean up with `flash undeploy list` / `flash undeploy `. -9. **Mixing decorator patterns** -- Cannot use `@Endpoint(...)` as direct decorator AND register routes (`.get()`/`.post()`) on the same instance. -10. **Client mode restrictions** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. Cannot use `@ep.post("/path")` to register routes on a client. +5. **Large payloads** -- Max 10MB serialization. Pass URLs, not data. +6. **Bundle too large (>500MB)** -- Use `flash build --exclude` for packages in base image. +7. **Mixing patterns** -- Cannot use `@Endpoint(...)` as decorator AND `.get()`/`.post()` on same instance. +8. **Client vs decorator** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. +9. **Endpoints accumulate** -- Clean up with `flash undeploy`. From c96a25d7e86c9f48317dfb9447746cb8570995f5 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 18:19:07 +0100 Subject: [PATCH 04/24] chore: remove deprecated class mention from skill --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index cec1b732..3e119133 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -49,7 +49,7 @@ flash run # Local FastAPI dev server at localhost:8888/docs ## The Endpoint Class: Four Modes -The `Endpoint` class is the single entry point. It replaces the old 8-class hierarchy (`LiveServerless`, etc.) which still works but emits `DeprecationWarning`. +The `Endpoint` class is the single entry point for all Flash functionality. 
### Mode 1: Queue-Based Decorator (QB) From 16a0a2d44eb6af2d6fcb0470f990bc5eebae9a7f Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:39:41 +0100 Subject: [PATCH 05/24] chore: rewrite flash skill for v1.7.0 endpoint API - replace v1.6.0 content with eval-tested v1.7.0 skill - remove non-existent flash login command - fix GpuType.ANY to GpuGroup (GpuType has no ANY member) - consolidate from four modes to three (QB, LB, client) - all examples use Endpoint class exclusively - scored 18/18 on eval assertions across 3 test prompts --- flash/SKILL.md | 325 ++++++++++++++++++++++--------------------------- 1 file changed, 144 insertions(+), 181 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 3e119133..35caed31 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -1,264 +1,227 @@ --- name: flash -description: Complete knowledge of runpod-flash - the Endpoint class, CLI, deployment, architecture. - Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuType", "GpuGroup". +description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU". user-invocable: true allowed-tools: Read, Grep, Glob, Bash --- -# Runpod Flash +# Runpod Flash (v1.7.0) -**runpod-flash** (v1.6.0) -- Python SDK for distributed inference and serving on Runpod serverless. +Python SDK for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything. 
-- **Package**: `pip install runpod-flash` -- **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...` -- **CLI**: `flash` -- **Python**: >=3.10, <3.15 +`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/` -## Getting Started +## Endpoint: Three Modes -```bash -pip install runpod-flash -flash login # Authenticate via browser (recommended) -# Or: export RUNPOD_API_KEY=... or add to .env file -``` - -Minimal example: - -```python -import asyncio -from runpod_flash import Endpoint, GpuType - -@Endpoint(name="my-first-worker", gpu=GpuType.ANY, dependencies=["torch"]) -async def gpu_task(data): - import torch - tensor = torch.tensor(data, device="cuda") - return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)} - -asyncio.run(gpu_task([1, 2, 3, 4, 5])) -``` - -First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second. - -Create a project with templates: +### Mode 1: Your Code (Queue-Based Decorator) -```bash -flash init my_project && cd my_project -flash run # Local FastAPI dev server at localhost:8888/docs -``` - -## The Endpoint Class: Four Modes - -The `Endpoint` class is the single entry point for all Flash functionality. - -### Mode 1: Queue-Based Decorator (QB) - -One function = one endpoint = own workers. Best for batch, long-running tasks, automatic retries. Returns `JobOutput` with `.output`, `.error`, `.status`. +One function = one endpoint with its own workers. 
```python -from runpod_flash import Endpoint, GpuType +from runpod_flash import Endpoint, GpuGroup -@Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"]) -async def gpu_hello(input_data: dict) -> dict: - import torch - gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU" - return {"message": input_data.get("message", "Hello!"), "gpu": gpu_name} +@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"]) +async def compute(data): + import torch # MUST import inside function (cloudpickle) + return {"sum": torch.tensor(data, device="cuda").sum().item()} -result = await gpu_hello({"message": "test"}) -# result.output contains the return dict +result = await compute([1, 2, 3]) ``` -### Mode 2: Load-Balanced Decorator (LB) +### Mode 2: Your Code (Load-Balanced Routes) -Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. Returns dict directly (no wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`. +Multiple HTTP routes share one pool of workers. ```python -from runpod_flash import Endpoint +from runpod_flash import Endpoint, GpuGroup -api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3)) +api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5), dependencies=["torch"]) -@api.post("/process") -async def process(input_data: dict) -> dict: - from datetime import datetime - return {"echo": input_data, "timestamp": datetime.now().isoformat()} +@api.post("/predict") +async def predict(data: list[float]): + import torch + return {"result": torch.tensor(data, device="cuda").sum().item()} @api.get("/health") -async def health() -> dict: - return {"status": "healthy"} +async def health(): + return {"status": "ok"} ``` -### Mode 3: External Image Client +### Mode 3: External Image (Client) -Deploy a pre-built Docker image, call it as a client. Returns `EndpointJob` (see below). 
+Deploy a pre-built Docker image and call it via HTTP. ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuGroup, PodTemplate + +server = Endpoint( + name="my-server", + image="my-org/my-image:latest", + gpu=GpuGroup.AMPERE_80, + workers=1, + env={"HF_TOKEN": "xxx"}, + template=PodTemplate(containerDiskInGb=100), +) -vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24) +# LB-style +result = await server.post("/v1/completions", {"prompt": "hello"}) +models = await server.get("/v1/models") -result = await vllm.post("/v1/completions", {"prompt": "hello"}) # LB-style -job = await vllm.run({"prompt": "hello"}) # QB-style +# QB-style +job = await server.run({"prompt": "hello"}) await job.wait() print(job.output) ``` -### Mode 4: Existing Endpoint Client - -Connect to an already-deployed endpoint by ID. No provisioning. Returns `EndpointJob`. +Connect to an existing endpoint by ID (no provisioning): ```python ep = Endpoint(id="abc123") -job = await ep.runsync({"prompt": "hello"}) +job = await ep.runsync({"input": "hello"}) print(job.output) ``` -**EndpointJob** (returned by `.run()` / `.runsync()` in client modes): properties `.id`, `.output`, `.error`, `.done`; methods `await job.status()`, `await job.wait(timeout=60)`, `await job.cancel()`. 
+## How Mode Is Determined + +| Parameters | Mode | +|-----------|------| +| `name=` only | Decorator (your code) | +| `image=` set | Client (deploys image, then HTTP calls) | +| `id=` set | Client (connects to existing, no provisioning) | -## Constructor Parameters +## Endpoint Constructor ```python Endpoint( - name: str = None, # Required unless id= is set - *, - id: str = None, # Connect to existing endpoint (client mode) - gpu: GpuGroup | GpuType | list = None, # GPU type(s) -- mutually exclusive with cpu - cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu - workers: int | tuple[int, int] = None, # (min, max) tuple or just max. Default: (0, 1) - idle_timeout: int = 60, # Seconds before scale-down - dependencies: list[str] = None, # pip packages to install - system_dependencies: list[str] = None, # apt-get packages - accelerate_downloads: bool = True, # CDN download acceleration - volume: NetworkVolume = None, # Persistent storage (NetworkVolume(name=..., size=100, dataCenterId=DataCenter.EU_RO_1)) - datacenter: DataCenter = DataCenter.EU_RO_1, - env: dict[str, str] = None, # Environment variables - gpu_count: int = 1, # GPUs per worker - execution_timeout_ms: int = 0, # 0 = no limit - flashboot: bool = True, # Fast cold starts - image: str = None, # Docker image (external image mode, mutually exclusive with id) - scaler_type: ServerlessScalerType = None, # QUEUE_DELAY (QB) or REQUEST_COUNT (LB) - scaler_value: int = 4, - template: PodTemplate = None, # Pod overrides (e.g. 
PodTemplate(containerDiskInGb=100)) + name="endpoint-name", # required (unless id= set) + id=None, # connect to existing endpoint + gpu=GpuGroup.AMPERE_80, # GPU type (default: ANY) + cpu=CpuInstanceType.CPU5C_4_8, # CPU type (mutually exclusive with gpu) + workers=3, # shorthand for (0, 3) + workers=(1, 5), # explicit (min, max) + idle_timeout=60, # seconds before scale-down (default: 60) + dependencies=["torch"], # pip packages for remote exec + system_dependencies=["ffmpeg"], # apt-get packages + image="org/image:tag", # pre-built Docker image (client mode) + env={"KEY": "val"}, # environment variables + volume=NetworkVolume(...), # persistent storage + gpu_count=1, # GPUs per worker + template=PodTemplate(containerDiskInGb=100), + flashboot=True, # fast cold starts ) ``` -- `gpu` and `cpu` are mutually exclusive. `id` and `image` are mutually exclusive. -- If neither `gpu` nor `cpu` is set (non-client), defaults to `gpu=GpuGroup.ANY`. -- `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5. - -## GPU & CPU Types - -### GpuGroup (by VRAM class) - -| Group | VRAM | GPUs | -|-------|------|------| -| `ANY` | Any | Any available (not for production) | -| `AMPERE_16` | 16GB | RTX A4000/A4500 | -| `AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 | -| `ADA_24` | 24GB | RTX 4090 | -| `ADA_32_PRO` | 32GB | RTX 5090 | -| `AMPERE_48` | 48GB | A40, RTX A6000 | -| `ADA_48_PRO` | 48GB | RTX 6000 Ada | -| `AMPERE_80` | 80GB | A100 | -| `ADA_80_PRO` | 80GB | H100 | -| `HOPPER_141` | 141GB | H200 | - -For exact GPU selection, use `GpuType` enum (e.g. `GpuType.NVIDIA_GEFORCE_RTX_4090`). See `src/runpod_flash/core/resources/gpu.py` for full list. +- `gpu=` and `cpu=` are mutually exclusive +- `workers=3` means `(0, 3)`. Default is `(0, 1)` +- `idle_timeout` default is **60 seconds** -### CPU Instance Types +## Cloudpickle Scoping (CRITICAL) -Format: `cpu{gen}{type}-{vcpu}-{memory}`. Use string shorthand (`cpu="cpu3c-1-2"`) or `CpuInstanceType` enum. 
- -Families: `cpu3g` (general, 4GB/vCPU), `cpu3c` (compute, 2GB/vCPU), `cpu5c` (5th gen compute, 2GB/vCPU). Each from 1 to 8 vCPUs. See `src/runpod_flash/core/resources/cpu.py` for full list. - -## Cloudpickle Scoping Rules - -Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access: -- Function parameters, local variables, imports done **inside** the function, built-ins - -They CANNOT access: module-level imports, global variables, external functions/classes. +Decorated functions are serialized. They can ONLY access: +- Parameters, local variables, imports inside the function, builtins ```python # WRONG import torch -@Endpoint(name="worker", gpu=GpuGroup.ADA_24) -async def bad(data): - return torch.tensor(data) # torch not accessible remotely +@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"]) +async def bad(x): + return torch.tensor(x) # NameError # CORRECT -@Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"]) -async def good(data): +@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"]) +async def good(x): import torch - return torch.tensor(data) + return torch.tensor(x) ``` -All pip packages must be in `dependencies=[]`. System packages in `system_dependencies=[]`. +## EndpointJob -## CLI Commands +Returned by `ep.run()` and `ep.runsync()` in client mode. 
-```bash -flash login # Authenticate via browser -flash init [project_name] # Create project from templates -flash run [--host HOST] [--port PORT] # Dev server at localhost:8888 -flash build [--exclude pkg1,pkg2] [--preview] # Package artifact (500MB limit) -flash deploy new|send|list|info|delete # Deployment lifecycle -flash undeploy list # List deployed resources -flash undeploy # Remove specific resource -flash env list|create|get|delete # Environment management -flash app list|get # App management +```python +job = await ep.run({"data": [1, 2, 3]}) +await job.wait(timeout=120) # poll until done +print(job.id, job.output, job.error, job.done) +await job.cancel() ``` -Key notes: -- `flash build --exclude torch,torchvision,torchaudio` -- exclude packages already in base Docker image to stay under 500MB limit -- `flash build --preview` -- run in local Docker containers for end-to-end testing -- `flash deploy send` requires `flash build` first +## GPU Types (GpuGroup) + +| Enum | GPU | VRAM | +|------|-----|------| +| `ANY` | any | varies | +| `AMPERE_16` | RTX A4000 | 16GB | +| `AMPERE_24` | RTX A5000/L4 | 24GB | +| `AMPERE_48` | A40/A6000 | 48GB | +| `AMPERE_80` | A100 | 80GB | +| `ADA_24` | RTX 4090 | 24GB | +| `ADA_32_PRO` | RTX 5090 | 32GB | +| `ADA_48_PRO` | RTX 6000 Ada | 48GB | +| `ADA_80_PRO` | H100 | 80GB | +| `HOPPER_141` | H200 | 141GB | + +## CPU Types (CpuInstanceType) + +Format: `CPU{gen}{type}_{vcpu}_{memory_gb}`. Example: `CPU5C_4_8` = 5th gen, compute, 4 vCPU, 8GB. 
+ +```python +from runpod_flash import Endpoint, CpuInstanceType + +@Endpoint(name="cpu-work", cpu=CpuInstanceType.CPU5C_4_8, workers=5, dependencies=["pandas"]) +async def process(data): + import pandas as pd + return pd.DataFrame(data).describe().to_dict() +``` ## Common Patterns -### Hybrid GPU/CPU Pipeline +### CPU + GPU Pipeline ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuGroup, CpuInstanceType -@Endpoint(name="preprocessor", cpu="cpu5c-4-8", dependencies=["pandas"]) -async def preprocess(data): +@Endpoint(name="preprocess", cpu=CpuInstanceType.CPU5C_4_8, workers=5, dependencies=["pandas"]) +async def preprocess(raw): import pandas as pd - return pd.DataFrame(data).to_dict("records") + return pd.DataFrame(raw).to_dict("records") -@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, dependencies=["torch"]) -async def inference(data): +@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"]) +async def infer(clean): import torch - tensor = torch.tensor(data, device="cuda") - return {"result": tensor.sum().item()} + t = torch.tensor([[v for v in r.values()] for r in clean], device="cuda") + return {"predictions": t.mean(dim=1).tolist()} -async def pipeline(raw_data): - clean = await preprocess(raw_data) - return await inference(clean) +async def pipeline(data): + return await infer(await preprocess(data)) ``` -### External Image +### Parallel Execution ```python -vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO) -result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"}) +import asyncio +results = await asyncio.gather(compute(a), compute(b), compute(c)) ``` -## Error Handling +## CLI + +| Command | Description | +|---------|-------------| +| `flash init [name]` | Create project template | +| `flash run [--auto-provision]` | Local dev server at localhost:8888 | +| `flash build [--exclude pkg1,pkg2]` | 
Package artifact (500MB limit) | +| `flash deploy new/send/list/info/delete <env>` | Deploy to production | +| `flash undeploy list/<name>` | Remove endpoints | -- **QB**: Returns `JobOutput` -- check `result.error` for failures, `result.output` for data -- **LB**: Returns dict directly -- use try/except -- **Client mode**: `EndpointJob` -- check `job.error` after `await job.wait()` -- **Serialization limit**: cloudpickle + base64, max 10MB. Pass URLs/paths for large data. +## Gotchas -Exception hierarchy: `FlashRuntimeError` > `RemoteExecutionError`, `SerializationError`, `GraphQLError` > `GraphQLMutationError`/`GraphQLQueryError`, `ManifestError`. +1. **Imports outside function** -- most common error. Everything inside the decorated function. +2. **Forgetting await** -- all decorated functions and client methods need `await`. +3. **Missing dependencies** -- must list in `dependencies=[]`. +4. **gpu/cpu are exclusive** -- pick one per Endpoint. +5. **idle_timeout is seconds** -- default 60s, not minutes. +6. **10MB payload limit** -- pass URLs, not large objects. +7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator. -## Common Gotchas +## Architecture (for codebase work) -1. **External scope in decorated functions** -- #1 error. All imports and logic must be inside the function body. -2. **Forgetting `await`** -- All remote functions must be awaited. -3. **Undeclared dependencies** -- Must be in `dependencies=[]`. -4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly. -5. **Large payloads** -- Max 10MB serialization. Pass URLs, not data. -6. **Bundle too large (>500MB)** -- Use `flash build --exclude` for packages in base image. -7. **Mixing patterns** -- Cannot use `@Endpoint(...)` as decorator AND `.get()`/`.post()` on same instance. -8. **Client vs decorator** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. -9. **Endpoints accumulate** -- Clean up with `flash undeploy`.
+Source: `src/runpod_flash/`. Entry: `endpoint.py` (Endpoint class) delegates to `client.py` (@remote, internal). Build scanner: `cli/commands/build_utils/scanner.py`. Runtime: `runtime/` (handlers, service registry, serialization). Resources: `core/resources/` (internal classes auto-selected by Endpoint). Dev: `make dev`, `make test-unit`, `make lint`, `make format`, `make index`. From 76483f264ec9a688bfc620482694735aee743d3b Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:43:27 +0100 Subject: [PATCH 06/24] chore: add auth section, restore flash login, remove architecture noise --- flash/SKILL.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 35caed31..67fef0e0 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -206,12 +206,19 @@ results = await asyncio.gather(compute(a), compute(b), compute(c)) | Command | Description | |---------|-------------| +| `flash login` | Authenticate via browser | | `flash init [name]` | Create project template | | `flash run [--auto-provision]` | Local dev server at localhost:8888 | | `flash build [--exclude pkg1,pkg2]` | Package artifact (500MB limit) | | `flash deploy new/send/list/info/delete <env>` | Deploy to production | | `flash undeploy list/<name>` | Remove endpoints | +## Authentication + +Two ways to authenticate: +- `flash login` -- browser-based OAuth (recommended) +- `RUNPOD_API_KEY` env var -- set in `.env` or export directly + ## Gotchas 1. **Imports outside function** -- most common error. Everything inside the decorated function. @@ -221,7 +228,3 @@ results = await asyncio.gather(compute(a), compute(b), compute(c)) 5. **idle_timeout is seconds** -- default 60s, not minutes. 6. **10MB payload limit** -- pass URLs, not large objects. 7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator. - -## Architecture (for codebase work) - -Source: `src/runpod_flash/`.
Entry: `endpoint.py` (Endpoint class) delegates to `client.py` (@remote, internal). Build scanner: `cli/commands/build_utils/scanner.py`. Runtime: `runtime/` (handlers, service registry, serialization). Resources: `core/resources/` (internal classes auto-selected by Endpoint). Dev: `make dev`, `make test-unit`, `make lint`, `make format`, `make index`. From 2fb095229aeb211baa580665f9ae36d28f95d727 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:44:47 +0100 Subject: [PATCH 07/24] chore: remove unnecessary allowed-tools from skill frontmatter --- flash/SKILL.md | 1 - 1 file changed, 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 67fef0e0..3e7f6f70 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -2,7 +2,6 @@ name: flash description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU". user-invocable: true -allowed-tools: Read, Grep, Glob, Bash --- # Runpod Flash (v1.7.0) From fcefce052d285a9c87d4a002c9c30ba073ca4607 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:45:44 +0100 Subject: [PATCH 08/24] chore: shorten skill description, move version out of title --- flash/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 3e7f6f70..be844118 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -1,12 +1,12 @@ --- name: flash -description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. 
Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU". +description: runpod-flash SDK and CLI for deploying AI workloads on RunPod serverless GPUs/CPUs. user-invocable: true --- -# Runpod Flash (v1.7.0) +# Runpod Flash -Python SDK for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything. +Python SDK (v1.7.0) for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything. `pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/` From b82da18949eaa7b4724cce9172055ceaa4634294 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:46:04 +0100 Subject: [PATCH 09/24] fix: use correct "Runpod" casing in skill --- flash/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index be844118..f90b27fe 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -1,12 +1,12 @@ --- name: flash -description: runpod-flash SDK and CLI for deploying AI workloads on RunPod serverless GPUs/CPUs. +description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serverless GPUs/CPUs. user-invocable: true --- # Runpod Flash -Python SDK (v1.7.0) for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything. +Python SDK (v1.7.0) for running AI workloads on Runpod serverless. One class -- `Endpoint` -- handles everything. 
`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/` From 0a40f3ec8a2809fc72e33e89e1429bdaf03c8631 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:48:27 +0100 Subject: [PATCH 10/24] chore: remove redundant intro, lead with install + imports --- flash/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index f90b27fe..579564ee 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -4,12 +4,12 @@ description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serve user-invocable: true --- -# Runpod Flash - -Python SDK (v1.7.0) for running AI workloads on Runpod serverless. One class -- `Endpoint` -- handles everything. +# Runpod Flash (v1.7.0) `pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/` +One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment. + ## Endpoint: Three Modes ### Mode 1: Your Code (Queue-Based Decorator) From f2f9cd52d0d732286f075ce720d31d6968dfba29 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:50:33 +0100 Subject: [PATCH 11/24] chore: remove repo-specific source path from skill --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 579564ee..cda31eb3 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -6,7 +6,7 @@ user-invocable: true # Runpod Flash (v1.7.0) -`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/` +`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment. 
From efda28f0e5f3b338d429bff37cca21c1dc350088 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:54:56 +0100 Subject: [PATCH 12/24] chore: add NetworkVolume, PodTemplate, flashboot, gpu_count details to skill --- flash/SKILL.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/flash/SKILL.md b/flash/SKILL.md index cda31eb3..37627e7d 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -113,6 +113,25 @@ Endpoint( - `gpu=` and `cpu=` are mutually exclusive - `workers=3` means `(0, 3)`. Default is `(0, 1)` - `idle_timeout` default is **60 seconds** +- `flashboot=True` (default) -- enables fast cold starts via snapshot restore +- `gpu_count` -- GPUs per worker (default 1), use >1 for multi-GPU models + +### NetworkVolume + +```python +NetworkVolume(name="my-vol", size=100) # size in GB, default 100 +``` + +### PodTemplate + +```python +PodTemplate( + containerDiskInGb=64, # container disk size (default 64) + dockerArgs="", # extra docker arguments + ports="", # exposed ports + startScript="", # script to run on start +) +``` ## Cloudpickle Scoping (CRITICAL) From 5dd2196612815394ad012611bef347c4f2fc31c6 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:56:41 +0100 Subject: [PATCH 13/24] chore: add full CpuInstanceType enum table to skill --- flash/SKILL.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 37627e7d..1c284a91 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -180,7 +180,20 @@ await job.cancel() ## CPU Types (CpuInstanceType) -Format: `CPU{gen}{type}_{vcpu}_{memory_gb}`. Example: `CPU5C_4_8` = 5th gen, compute, 4 vCPU, 8GB. 
+| Enum | vCPU | RAM | Max Disk | Type | +|------|------|-----|----------|------| +| `CPU3G_1_4` | 1 | 4GB | 10GB | General | +| `CPU3G_2_8` | 2 | 8GB | 20GB | General | +| `CPU3G_4_16` | 4 | 16GB | 40GB | General | +| `CPU3G_8_32` | 8 | 32GB | 80GB | General | +| `CPU3C_1_2` | 1 | 2GB | 10GB | Compute | +| `CPU3C_2_4` | 2 | 4GB | 20GB | Compute | +| `CPU3C_4_8` | 4 | 8GB | 40GB | Compute | +| `CPU3C_8_16` | 8 | 16GB | 80GB | Compute | +| `CPU5C_1_2` | 1 | 2GB | 15GB | Compute (5th gen) | +| `CPU5C_2_4` | 2 | 4GB | 30GB | Compute (5th gen) | +| `CPU5C_4_8` | 4 | 8GB | 60GB | Compute (5th gen) | +| `CPU5C_8_16` | 8 | 16GB | 120GB | Compute (5th gen) | ```python from runpod_flash import Endpoint, CpuInstanceType From ed44e42db5c6fcfb0a088e8f6396b75945bc0c06 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:57:16 +0100 Subject: [PATCH 14/24] chore: trim redundant cloudpickle wrong/correct example --- flash/SKILL.md | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 1c284a91..46b54d28 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -135,22 +135,7 @@ PodTemplate( ## Cloudpickle Scoping (CRITICAL) -Decorated functions are serialized. They can ONLY access: -- Parameters, local variables, imports inside the function, builtins - -```python -# WRONG -import torch -@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"]) -async def bad(x): - return torch.tensor(x) # NameError - -# CORRECT -@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"]) -async def good(x): - import torch - return torch.tensor(x) -``` +Decorated functions are serialized. They can ONLY access: parameters, local variables, imports inside the function, builtins. All imports must be inside the function body. 
## EndpointJob From db721ed5b09f778659c897a0ebd4819d9aae85a9 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 22:59:58 +0100 Subject: [PATCH 15/24] chore: remove redundant cloudpickle section, keep in gotchas --- flash/SKILL.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 46b54d28..82b14ff0 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -133,10 +133,6 @@ PodTemplate( ) ``` -## Cloudpickle Scoping (CRITICAL) - -Decorated functions are serialized. They can ONLY access: parameters, local variables, imports inside the function, builtins. All imports must be inside the function body. - ## EndpointJob Returned by `ep.run()` and `ep.runsync()` in client mode. From a7e5d7665f1fc75559cac756ad425712dbefffef Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:05:02 +0100 Subject: [PATCH 16/24] chore: remove redundant import from intro line --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 82b14ff0..536da103 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -6,7 +6,7 @@ user-invocable: true # Runpod Flash (v1.7.0) -`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 +`pip install runpod-flash` | Python >=3.10 One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment. 
From 7da03d4251ba8e87c783151de18e986e6b687ce7 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:05:49 +0100 Subject: [PATCH 17/24] chore: remove version from skill title --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 536da103..2aaa751d 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -4,7 +4,7 @@ description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serve user-invocable: true --- -# Runpod Flash (v1.7.0) +# Runpod Flash `pip install runpod-flash` | Python >=3.10 From b71449aa5fada3c2f390706c17043f6d58018238 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:07:49 +0100 Subject: [PATCH 18/24] chore: add local dev workflow context to skill intro --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 2aaa751d..0bae442d 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -8,7 +8,7 @@ user-invocable: true `pip install runpod-flash` | Python >=3.10 -One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment. +Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything. 
## Endpoint: Three Modes From ee6de571569b3bbb66f4fb7aaf2e2ad98a1ec479 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:09:06 +0100 Subject: [PATCH 19/24] chore: move CLI to top as code block with examples, remove old CLI/auth sections --- flash/SKILL.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 0bae442d..2e2b4d90 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -10,6 +10,24 @@ user-invocable: true Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything. +## CLI + +```bash +flash login # authenticate via browser (or set RUNPOD_API_KEY env var) +flash init my-project # scaffold a new project in ./my-project +flash run # start dev server at localhost:8888 +flash run --auto-provision # same, but pre-provision endpoints (no cold start) +flash build # package artifact for deployment (500MB limit) +flash build --exclude pkg1,pkg2 # exclude packages from build +flash deploy new staging # deploy to "staging" environment +flash deploy send staging # send latest build to "staging" +flash deploy list staging # list deployments in "staging" +flash deploy info staging # show deployment details +flash deploy delete staging # delete "staging" deployment +flash undeploy list # list all active endpoints +flash undeploy my-endpoint # remove a specific endpoint +``` + ## Endpoint: Three Modes ### Mode 1: Your Code (Queue-Based Decorator) @@ -214,23 +232,6 @@ import asyncio results = await asyncio.gather(compute(a), compute(b), compute(c)) ``` -## CLI - -| Command | Description | -|---------|-------------| -| `flash login` | Authenticate via browser | -| `flash init [name]` | Create project template | -| `flash run [--auto-provision]` | Local dev server at localhost:8888 | -| `flash build [--exclude pkg1,pkg2]` | Package 
artifact (500MB limit) | -| `flash deploy new/send/list/info/delete ` | Deploy to production | -| `flash undeploy list/` | Remove endpoints | - -## Authentication - -Two ways to authenticate: -- `flash login` -- browser-based OAuth (recommended) -- `RUNPOD_API_KEY` env var -- set in `.env` or export directly - ## Gotchas 1. **Imports outside function** -- most common error. Everything inside the decorated function. From a43d1c888d8f9e7bc3c222f7eba2a1f39e009d16 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:09:44 +0100 Subject: [PATCH 20/24] chore: add setup section with install and auth before CLI --- flash/SKILL.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 2e2b4d90..8493ac4e 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -6,16 +6,21 @@ user-invocable: true # Runpod Flash -`pip install runpod-flash` | Python >=3.10 - Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything. 
-## CLI +## Setup ```bash -flash login # authenticate via browser (or set RUNPOD_API_KEY env var) +pip install runpod-flash # requires Python >=3.10 +flash login # authenticate via browser +# OR: export RUNPOD_API_KEY=your_key # alternative: set API key directly flash init my-project # scaffold a new project in ./my-project -flash run # start dev server at localhost:8888 +``` + +## CLI + +```bash +flash run # start local dev server at localhost:8888 flash run --auto-provision # same, but pre-provision endpoints (no cold start) flash build # package artifact for deployment (500MB limit) flash build --exclude pkg1,pkg2 # exclude packages from build From 8d27d270d6cd632e29796da953747491a90b3242 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:10:47 +0100 Subject: [PATCH 21/24] chore: separate flash login and RUNPOD_API_KEY as distinct auth options --- flash/SKILL.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 8493ac4e..5a645aed 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -12,8 +12,13 @@ Write code locally, test with `flash run` (dev server at localhost:8888), and fl ```bash pip install runpod-flash # requires Python >=3.10 -flash login # authenticate via browser -# OR: export RUNPOD_API_KEY=your_key # alternative: set API key directly + +# auth option 1: browser-based login (saves token locally) +flash login + +# auth option 2: API key via environment variable +export RUNPOD_API_KEY=your_key + flash init my-project # scaffold a new project in ./my-project ``` From f81c03135519f033b9da27b618aa02e3b99de04e Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:12:22 +0100 Subject: [PATCH 22/24] chore: simplify endpoint intro line --- flash/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 5a645aed..d8fcb7a3 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -6,7 +6,7 @@ user-invocable: true # 
Runpod Flash -Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything. +Write code locally, test with `flash run` (dev server at localhost:8888), and Flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. `Endpoint` handles everything. ## Setup From ee8a866129eb9cd969af545564efd5f9f905e172 Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:20:14 +0100 Subject: [PATCH 23/24] chore: add multi-GPU list support, update examples to workers=5, add auto-switch gotcha --- flash/SKILL.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index d8fcb7a3..1851069c 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -47,7 +47,7 @@ One function = one endpoint with its own workers. ```python from runpod_flash import Endpoint, GpuGroup -@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"]) +@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=5, dependencies=["torch"]) async def compute(data): import torch # MUST import inside function (cloudpickle) return {"sum": torch.tensor(data, device="cuda").sum().item()} @@ -122,9 +122,10 @@ print(job.output) Endpoint( name="endpoint-name", # required (unless id= set) id=None, # connect to existing endpoint - gpu=GpuGroup.AMPERE_80, # GPU type (default: ANY) + gpu=GpuGroup.AMPERE_80, # single GPU type (default: ANY) + gpu=[GpuGroup.ADA_24, GpuGroup.AMPERE_80], # or list for auto-select by supply cpu=CpuInstanceType.CPU5C_4_8, # CPU type (mutually exclusive with gpu) - workers=3, # shorthand for (0, 3) + workers=5, # shorthand for (0, 5) workers=(1, 5), # explicit (min, max) idle_timeout=60, # seconds before scale-down (default: 60) dependencies=["torch"], # pip packages for remote exec @@ -139,7 +140,7 @@ Endpoint( ``` - `gpu=` and `cpu=` are mutually 
exclusive -- `workers=3` means `(0, 3)`. Default is `(0, 1)` +- `workers=5` means `(0, 5)`. Default is `(0, 1)` - `idle_timeout` default is **60 seconds** - `flashboot=True` (default) -- enables fast cold starts via snapshot restore - `gpu_count` -- GPUs per worker (default 1), use >1 for multi-GPU models @@ -225,7 +226,7 @@ async def preprocess(raw): import pandas as pd return pd.DataFrame(raw).to_dict("records") -@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"]) +@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=5, dependencies=["torch"]) async def infer(clean): import torch t = torch.tensor([[v for v in r.values()] for r in clean], device="cuda") @@ -251,3 +252,4 @@ results = await asyncio.gather(compute(a), compute(b), compute(c)) 5. **idle_timeout is seconds** -- default 60s, not minutes. 6. **10MB payload limit** -- pass URLs, not large objects. 7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator. +8. **Auto GPU switching requires workers >= 5** -- pass a list of GPU types (e.g. `gpu=[GpuGroup.ADA_24, GpuGroup.AMPERE_80]`) and set `workers=5` or higher. The platform only auto-switches GPU types based on supply when max workers is at least 5. 
From 6773ea53fe3a12cdf818adc4e4b66d693c53875d Mon Sep 17 00:00:00 2001 From: Tim Pietrusky Date: Thu, 5 Mar 2026 23:42:36 +0100 Subject: [PATCH 24/24] fix: correct fabricated CLI commands and add missing constructor param - Replace made-up `flash deploy new/send/list/info/delete` with actual `flash deploy --env`, `flash env list/create/get/delete` commands - Add `flash deploy --preview` for local Docker preview - Add `execution_timeout_ms` to Endpoint constructor --- flash/SKILL.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/flash/SKILL.md b/flash/SKILL.md index 1851069c..16d21eba 100644 --- a/flash/SKILL.md +++ b/flash/SKILL.md @@ -29,11 +29,13 @@ flash run # start local dev server at localhost:8 flash run --auto-provision # same, but pre-provision endpoints (no cold start) flash build # package artifact for deployment (500MB limit) flash build --exclude pkg1,pkg2 # exclude packages from build -flash deploy new staging # deploy to "staging" environment -flash deploy send staging # send latest build to "staging" -flash deploy list staging # list deployments in "staging" -flash deploy info staging # show deployment details -flash deploy delete staging # delete "staging" deployment +flash deploy # build + deploy (auto-selects env if only one) +flash deploy --env staging # build + deploy to "staging" environment +flash deploy --preview # build + launch local preview in Docker +flash env list # list deployment environments +flash env create staging # create "staging" environment +flash env get staging # show environment details + resources +flash env delete staging # delete environment + tear down resources flash undeploy list # list all active endpoints flash undeploy my-endpoint # remove a specific endpoint ``` @@ -136,6 +138,7 @@ Endpoint( gpu_count=1, # GPUs per worker template=PodTemplate(containerDiskInGb=100), flashboot=True, # fast cold starts + execution_timeout_ms=0, # max execution time (0 = unlimited) ) ```