From fe25f732bbc35ee25d212f0997cb68f47afe0b0a Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 16:48:24 +0100
Subject: [PATCH 01/24] chore: add flash agent skill co-located with source
 code

Adds flash/SKILL.md rewritten around the unified Endpoint class API.
Replaces the old skill in runpod/skills which documented the deprecated
8-class resource hierarchy. Co-locating the skill ensures it stays in
sync with the codebase. Discoverable via `npx skills add runpod/flash`.
---
 flash/SKILL.md | 588 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 588 insertions(+)
 create mode 100644 flash/SKILL.md

diff --git a/flash/SKILL.md b/flash/SKILL.md
new file mode 100644
index 00000000..d6bf9379
--- /dev/null
+++ b/flash/SKILL.md
@@ -0,0 +1,588 @@
+---
+name: flash
+description: Complete knowledge of runpod-flash - the Endpoint class, CLI, deployment, architecture.
+  Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuType", "GpuGroup".
+user-invocable: true
+allowed-tools: Read, Grep, Glob, Bash
+---
+
+# Runpod Flash
+
+**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on RunPod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer.
+
+- **Package**: `pip install runpod-flash`
+- **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...`
+- **CLI**: `flash`
+- **Python**: >=3.10, <3.15
+
+## Getting Started
+
+### 1. Install Flash
+
+```bash
+pip install runpod-flash
+```
+
+### 2. Authenticate
+
+Either log in via browser (recommended):
+
+```bash
+flash login
+```
+
+Or set your API key manually. Get a key from [RunPod account settings](https://docs.runpod.io/get-started/api-keys):
+
+```bash
+export RUNPOD_API_KEY=your_api_key_here
+```
+
+Or save in a `.env` file (Flash auto-loads via `python-dotenv`):
+
+```bash
+echo "RUNPOD_API_KEY=your_api_key_here" > .env
+```
+
+### 3. Write and run a remote function
+
+```python
+import asyncio
+from runpod_flash import Endpoint, GpuType
+
+@Endpoint(name="my-first-worker", gpu=GpuType.ANY, dependencies=["torch"])
+async def gpu_task(data):
+    import torch
+    tensor = torch.tensor(data, device="cuda")
+    return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)}
+
+async def main():
+    result = await gpu_task([1, 2, 3, 4, 5])
+    print(result)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second.
+
+### 4. Or create a Flash API project
+
+```bash
+flash init my_project
+cd my_project
+pip install -r requirements.txt
+# Edit .env and add your RUNPOD_API_KEY
+flash run                    # Start local FastAPI server at localhost:8888
+```
+
+API explorer available at `http://localhost:8888/docs`.
+
+### 5. Build and deploy to production
+
+```bash
+flash build                              # Scan endpoints, package artifact
+flash build --exclude torch,torchvision  # Exclude packages in base image (500MB limit)
+flash deploy new production              # Create deployment environment
+flash deploy send production             # Upload and deploy
+flash deploy list                        # List environments
+flash deploy info production             # Show details
+flash deploy delete production           # Tear down
+```
+
+## The Endpoint Class: Four Modes
+
+The `Endpoint` class is the single entry point for all Flash functionality. It replaces the old 8-class resource hierarchy (`LiveServerless`, `CpuLiveServerless`, etc.) which still works but emits `DeprecationWarning`.
+
+### Mode 1: Queue-Based Decorator (QB)
+
+One function = one endpoint = own workers. Best for batch processing, long-running tasks, automatic retries.
+
+```python
+from runpod_flash import Endpoint, GpuType
+
+@Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"])
+async def gpu_hello(input_data: dict) -> dict:
+    import torch
+    gpu_available = torch.cuda.is_available()
+    gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU"
+    return {
+        "message": input_data.get("message", "Hello from GPU worker!"),
+        "gpu": {"available": gpu_available, "name": gpu_name},
+    }
+```
+
+QB returns a `JobOutput` with `.output`, `.error`, `.status` fields:
+
+```python
+result = await gpu_hello({"message": "test"})
+# result.output contains the return dict
+```
+
+### Mode 2: Load-Balanced Decorator (LB)
+
+Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP.
+
+```python
+from runpod_flash import Endpoint
+
+api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3))
+
+@api.post("/process")
+async def process(input_data: dict) -> dict:
+    from datetime import datetime
+    return {"status": "success", "echo": input_data, "timestamp": datetime.now().isoformat()}
+
+@api.get("/health")
+async def health() -> dict:
+    return {"status": "healthy"}
+```
+
+LB returns the dict directly (no `JobOutput` wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`.
+
+### Mode 3: External Image Client
+
+Deploy a pre-built Docker image and call it as a client. No `@decorator` -- the Endpoint provisions the image and provides QB and LB client methods.
+
+```python
+from runpod_flash import Endpoint, GpuGroup
+
+vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24)
+
+# LB-style calls
+result = await vllm.post("/v1/completions", {"prompt": "hello"})
+models = await vllm.get("/v1/models")
+
+# QB-style calls
+job = await vllm.run({"prompt": "hello"})
+await job.wait()
+print(job.output)
+```
+
+### Mode 4: Existing Endpoint Client
+
+Connect to an already-deployed endpoint by ID. No provisioning.
+
+```python
+from runpod_flash import Endpoint
+
+ep = Endpoint(id="abc123")
+
+# QB-style
+job = await ep.runsync({"prompt": "hello"})
+print(job.output)
+
+# LB-style
+result = await ep.post("/v1/completions", {"prompt": "hello"})
+```
+
+## Constructor Parameters
+
+```python
+Endpoint(
+    name: str = None,                    # Endpoint name (required unless id= is set)
+    *,
+    id: str = None,                      # Connect to existing endpoint (client mode)
+    gpu: GpuGroup | GpuType | list = None,  # GPU type(s) -- mutually exclusive with cpu
+    cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu
+    workers: int | tuple[int, int] = None,    # Max workers (int) or (min, max) tuple. Default: (0, 1)
+    idle_timeout: int = 60,              # Seconds before scale-down
+    dependencies: list[str] = None,      # pip packages to install
+    system_dependencies: list[str] = None, # apt-get packages to install
+    accelerate_downloads: bool = True,   # CDN download acceleration
+    volume: NetworkVolume = None,        # Persistent storage
+    datacenter: DataCenter = DataCenter.EU_RO_1,  # Data center location
+    env: dict[str, str] = None,          # Environment variables
+    gpu_count: int = 1,                  # GPUs per worker
+    execution_timeout_ms: int = 0,       # Execution timeout (0 = no limit)
+    flashboot: bool = True,              # FlashBoot for fast cold starts
+    image: str = None,                   # Docker image (external image mode, mutually exclusive with id)
+    scaler_type: ServerlessScalerType = None,  # QUEUE_DELAY (QB default) or REQUEST_COUNT (LB default)
+    scaler_value: int = 4,               # Scaler parameter
+    template: PodTemplate = None,        # Pod template overrides
+)
+```
+
+**Mutual exclusions:**
+- `gpu` and `cpu` cannot both be set
+- `id` and `image` cannot both be set
+- `name` or `id` is required
+
+**Defaults:**
+- If neither `gpu` nor `cpu` is set (and not client mode), defaults to `gpu=GpuGroup.ANY`
+- `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5.
+
+## EndpointJob
+
+Returned by `Endpoint.run()` and `Endpoint.runsync()` in client mode (image= or id=).
+
+```python
+job = await ep.run({"prompt": "hello"})
+
+# Properties
+job.id        # "job-abc123"
+job.output    # Result payload (after COMPLETED)
+job.error     # Error message (after FAILED)
+job.done      # True if terminal status (COMPLETED, FAILED, CANCELLED, TIMED_OUT)
+
+# Methods
+await job.status()            # Poll, update internal state, return status string
+await job.wait(timeout=60)    # Poll until terminal status (exponential backoff)
+await job.cancel()            # Cancel the job
+```
+
+## GPU & CPU Types
+
+### GPU Groups (GpuGroup enum)
+
+VRAM-class groups that map to one or more specific GPU models:
+
+| Group | VRAM | GPUs |
+|-------|------|------|
+| `GpuGroup.ANY` | Any | Any available (not for production) |
+| `GpuGroup.AMPERE_16` | 16GB | RTX A4000, RTX A4500, RTX 4000 Ada, RTX 2000 Ada |
+| `GpuGroup.AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 |
+| `GpuGroup.ADA_24` | 24GB | RTX 4090 |
+| `GpuGroup.ADA_32_PRO` | 32GB | RTX 5090 |
+| `GpuGroup.AMPERE_48` | 48GB | A40, RTX A6000 |
+| `GpuGroup.ADA_48_PRO` | 48GB | RTX 6000 Ada, L40, L40S |
+| `GpuGroup.AMPERE_80` | 80GB | A100 80GB PCIe, A100-SXM4-80GB |
+| `GpuGroup.ADA_80_PRO` | 80GB | H100 PCIe, H100 80GB HBM3, H100 NVL |
+| `GpuGroup.HOPPER_141` | 141GB | H200 |
+
+### GPU Types (GpuType enum)
+
+Specific GPU models for exact hardware selection:
+
+`NVIDIA_GEFORCE_RTX_4090`, `NVIDIA_GEFORCE_RTX_5090`, `NVIDIA_RTX_6000_ADA_GENERATION`, `NVIDIA_H100_80GB_HBM3`, `NVIDIA_RTX_A4000`, `NVIDIA_RTX_A4500`, `NVIDIA_RTX_4000_ADA_GENERATION`, `NVIDIA_RTX_2000_ADA_GENERATION`, `NVIDIA_RTX_A5000`, `NVIDIA_L4`, `NVIDIA_GEFORCE_RTX_3090`, `NVIDIA_A40`, `NVIDIA_RTX_A6000`, `NVIDIA_A100_80GB_PCIe`, `NVIDIA_A100_SXM4_80GB`, `NVIDIA_H200`
+
+Usage: `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` or `gpu=[GpuType.NVIDIA_A100_80GB_PCIe, GpuType.NVIDIA_A100_SXM4_80GB]`
+
+### CPU Instance Types (CpuInstanceType enum)
+
+Format: `CPU{generation}{type}_{vcpu}_{memory_gb}`. Can also use string shorthand: `cpu="cpu3c-1-2"`.
+
+| Instance Type | Gen | Type | vCPU | RAM | Max Disk |
+|--------------|-----|------|------|-----|----------|
+| `CPU3G_1_4` | 3rd | General | 1 | 4GB | 10GB |
+| `CPU3G_2_8` | 3rd | General | 2 | 8GB | 20GB |
+| `CPU3G_4_16` | 3rd | General | 4 | 16GB | 40GB |
+| `CPU3G_8_32` | 3rd | General | 8 | 32GB | 80GB |
+| `CPU3C_1_2` | 3rd | Compute | 1 | 2GB | 10GB |
+| `CPU3C_2_4` | 3rd | Compute | 2 | 4GB | 20GB |
+| `CPU3C_4_8` | 3rd | Compute | 4 | 8GB | 40GB |
+| `CPU3C_8_16` | 3rd | Compute | 8 | 16GB | 80GB |
+| `CPU5C_1_2` | 5th | Compute | 1 | 2GB | 15GB |
+| `CPU5C_2_4` | 5th | Compute | 2 | 4GB | 30GB |
+| `CPU5C_4_8` | 5th | Compute | 4 | 8GB | 60GB |
+| `CPU5C_8_16` | 5th | Compute | 8 | 16GB | 120GB |
+
+## Cloudpickle Scoping Rules
+
+Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access:
+- Function parameters
+- Local variables defined inside the function
+- Imports done inside the function
+- Built-in Python functions
+
+They CANNOT access: module-level imports, global variables, external functions/classes.
+
+```python
+# WRONG - external references
+import torch
+@Endpoint(name="worker", gpu=GpuGroup.ADA_24)
+async def bad(data):
+    return torch.tensor(data)  # torch not accessible remotely
+
+# CORRECT - everything inside, dependencies declared
+@Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"])
+async def good(data):
+    import torch
+    return torch.tensor(data)
+```
+
+All pip packages must be listed in `dependencies=[]`. System packages go in `system_dependencies=[]`.
+
+## CLI Commands
+
+### flash login
+
+```bash
+flash login [--no-open] [--timeout SECONDS]
+```
+
+Authenticate via browser. Opens RunPod console for authorization, saves credentials locally.
+
+### flash init
+
+```bash
+flash init [project_name]
+```
+
+Creates a project with three template workers:
+- `gpu_worker.py` -- QB GPU endpoint using `@Endpoint` decorator
+- `cpu_worker.py` -- QB CPU endpoint using `@Endpoint` decorator
+- `lb_worker.py` -- LB CPU endpoint with `@api.post` and `@api.get` routes
+
+### flash run
+
+```bash
+flash run [--host HOST] [--port PORT]
+```
+
+Starts a local FastAPI dev server at `localhost:8888` with auto-generated routes for all discovered endpoints. API explorer at `/docs`.
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--host` | `localhost` | Server host (or `FLASH_HOST` env) |
+| `--port` | `8888` | Server port (or `FLASH_PORT` env) |
+
+### flash build
+
+```bash
+flash build [--exclude PACKAGES] [--keep-build] [--preview]
+```
+
+Scans `@Endpoint` decorators, groups by resource config, creates `flash_manifest.json`, installs dependencies for Linux x86_64, packages into `.flash/artifact.tar.gz`.
+
+| Option | Description |
+|--------|-------------|
+| `--exclude pkg1,pkg2` | Skip packages already in base Docker image |
+| `--keep-build` | Don't delete `.flash/.build/` after packaging |
+| `--preview` | Build then run in local Docker containers |
+
+**500MB deployment limit** -- use `--exclude` for packages in base image:
+
+```bash
+flash build --exclude torch,torchvision,torchaudio
+```
+
+**`--preview` mode**: Creates Docker containers per resource config, starts mothership on `localhost:8000`, enables end-to-end local testing.
+
+### flash deploy
+
+```bash
+flash deploy new <env_name> [--app-name NAME]   # Create environment
+flash deploy send <env_name> [--app-name NAME]   # Deploy archive
+flash deploy list [--app-name NAME]               # List environments
+flash deploy info <env_name> [--app-name NAME]    # Show details
+flash deploy delete <env_name> [--app-name NAME]  # Delete (double confirmation)
+```
+
+`flash deploy send` requires `flash build` to have been run first.
+
+### flash undeploy
+
+```bash
+flash undeploy list          # List all deployed resources
+flash undeploy <name>        # Undeploy specific resource
+```
+
+### flash env / flash app
+
+```bash
+flash env list|create|get|delete <name>   # Environment management
+flash app list|get <name>                 # App management
+```
+
+## Common Patterns
+
+### QB GPU Endpoint
+
+```python
+from runpod_flash import Endpoint, GpuGroup
+
+@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, workers=(0, 3), dependencies=["torch"])
+async def inference(data: dict) -> dict:
+    import torch
+    tensor = torch.tensor(data["values"], device="cuda")
+    return {"result": tensor.sum().item()}
+```
+
+### QB CPU Endpoint
+
+```python
+from runpod_flash import Endpoint
+
+@Endpoint(name="cpu_worker", cpu="cpu3c-1-2")
+async def cpu_hello(input_data: dict) -> dict:
+    import platform
+    from datetime import datetime
+    return {
+        "message": input_data.get("message", "Hello from CPU worker!"),
+        "timestamp": datetime.now().isoformat(),
+        "python_version": platform.python_version(),
+    }
+```
+
+### LB HTTP API
+
+```python
+from runpod_flash import Endpoint
+
+api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5))
+
+@api.get("/health")
+async def health():
+    return {"status": "ok"}
+
+@api.post("/compute")
+async def compute(request: dict) -> dict:
+    return {"result": request}
+```
+
+### External Image Deployment
+
+```python
+from runpod_flash import Endpoint, GpuGroup
+
+vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO)
+result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"})
+```
+
+### Hybrid GPU/CPU Pipeline
+
+```python
+from runpod_flash import Endpoint, GpuGroup
+
+@Endpoint(name="preprocessor", cpu="cpu5c-4-8", dependencies=["pandas"])
+async def preprocess(data):
+    import pandas as pd
+    return pd.DataFrame(data).to_dict("records")
+
+@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, dependencies=["torch"])
+async def inference(data):
+    import torch
+    tensor = torch.tensor(data, device="cuda")
+    return {"result": tensor.sum().item()}
+
+async def pipeline(raw_data):
+    clean = await preprocess(raw_data)
+    return await inference(clean)
+```
+
+### Parallel Execution
+
+```python
+import asyncio
+
+results = await asyncio.gather(
+    process_item(item1),
+    process_item(item2),
+    process_item(item3),
+)
+```
+
+### NetworkVolume
+
+```python
+from runpod_flash import Endpoint, GpuGroup, NetworkVolume, DataCenter
+
+volume = NetworkVolume(name="model-storage", size=100, dataCenterId=DataCenter.EU_RO_1)
+
+@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, volume=volume)
+async def worker(data: dict) -> dict:
+    ...
+```
+
+### PodTemplate
+
+```python
+from runpod_flash import Endpoint, GpuGroup, PodTemplate
+
+template = PodTemplate(containerDiskInGb=100)
+
+@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, template=template)
+async def worker(data: dict) -> dict:
+    ...
+```
+
+## Error Handling
+
+### Queue-Based (QB) Resources
+
+```python
+job_output = await my_function(data)
+if job_output.error:
+    print(f"Failed: {job_output.error}")
+else:
+    result = job_output.output
+```
+
+`JobOutput` fields: `id`, `status`, `output`, `error`, `started_at`, `ended_at`
+
+### Load-Balanced (LB) Resources
+
+```python
+try:
+    result = await my_function(data)  # Returns dict directly
+except Exception as e:
+    print(f"Error: {e}")
+```
+
+### EndpointJob (Client Mode)
+
+```python
+job = await ep.run({"prompt": "hello"})
+await job.wait(timeout=120)
+if job.error:
+    print(f"Failed: {job.error}")
+else:
+    print(job.output)
+```
+
+### Runtime Exceptions
+
+```
+FlashRuntimeError (base)
+  RemoteExecutionError      # Remote function failed
+  SerializationError        # cloudpickle serialization failed
+  GraphQLError              # GraphQL base error
+    GraphQLMutationError    # Mutation failed
+    GraphQLQueryError       # Query failed
+  ManifestError             # Invalid/missing manifest
+  ManifestServiceUnavailableError  # State Manager unreachable
+```
+
+## Architecture Overview
+
+### Deployment Architecture
+
+**Mothership Pattern**: Coordinator endpoint + distributed child endpoints.
+
+1. `flash build` scans code, creates manifest + archive
+2. `flash deploy send` uploads archive, provisions resources
+3. Mothership boots, reconciles desired vs current state
+4. Child endpoints query State Manager GraphQL for service discovery (peer-to-peer)
+5. Functions route locally or remotely based on manifest
+
+### How Endpoint Resolves to Internal Classes
+
+The `Endpoint` class automatically selects the right internal resource class based on:
+- **QB vs LB**: Inferred from usage (direct `@Endpoint` decorator = QB, `.get()`/`.post()` routes = LB)
+- **GPU vs CPU**: From `gpu=` or `cpu=` parameter
+- **Live vs Deploy**: From runtime environment (`flash run` = live, `flash deploy` = deploy classes)
+
+This means 8 internal classes are selected automatically -- users never need to pick one.
+
+### Cross-Endpoint Routing
+
+Functions on different endpoints can call each other transparently:
+1. `ProductionWrapper` intercepts calls
+2. `ServiceRegistry` looks up function in manifest
+3. Local function? Execute directly
+4. Remote function? Serialize args (cloudpickle), POST to remote endpoint
+
+**Serialization**: cloudpickle + base64, max 10MB payload. Pass URLs/paths instead of large data.
+
+## Common Gotchas
+
+1. **External scope in decorated functions** -- Most common error. All imports and logic must be inside the function body.
+2. **Forgetting `await`** -- All remote functions must be awaited.
+3. **Undeclared dependencies** -- Must be in `dependencies=[]` parameter.
+4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly.
+5. **Large serialization** -- Max 10MB. Pass URLs/paths, not large data objects.
+6. **Imports at module level** -- Import inside decorated functions, not at top of file.
+7. **Bundle too large (>500MB)** -- Use `--exclude` for packages in base Docker image.
+8. **Endpoints accumulate** -- Clean up with `flash undeploy list` / `flash undeploy <name>`.
+9. **Mixing decorator patterns** -- Cannot use `@Endpoint(...)` as direct decorator AND register routes (`.get()`/`.post()`) on the same instance.
+10. **Client mode restrictions** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. Cannot use `@ep.post("/path")` to register routes on a client.

From 9f6bfb2be3a69572b5f7e372644f3ed86a50341e Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 16:59:48 +0100
Subject: [PATCH 02/24] fix: use correct "Runpod" casing in skill

---
 flash/SKILL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index d6bf9379..9874e6c9 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -8,7 +8,7 @@ allowed-tools: Read, Grep, Glob, Bash
 
 # Runpod Flash
 
-**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on RunPod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer.
+**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on Runpod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer.
 
 - **Package**: `pip install runpod-flash`
 - **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...`
@@ -31,7 +31,7 @@ Either log in via browser (recommended):
 flash login
 ```
 
-Or set your API key manually. Get a key from [RunPod account settings](https://docs.runpod.io/get-started/api-keys):
+Or set your API key manually. Get a key from [Runpod account settings](https://docs.runpod.io/get-started/api-keys):
 
 ```bash
 export RUNPOD_API_KEY=your_api_key_here
@@ -310,7 +310,7 @@ All pip packages must be listed in `dependencies=[]`. System packages go in `sys
 flash login [--no-open] [--timeout SECONDS]
 ```
 
-Authenticate via browser. Opens RunPod console for authorization, saves credentials locally.
+Authenticate via browser. Opens Runpod console for authorization, saves credentials locally.
 
 ### flash init
 

From 4a8c55551c24f244bdda1bbea52271b383aa1ddb Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 18:17:28 +0100
Subject: [PATCH 03/24] chore: trim flash skill from 588 to 264 lines

Remove content an agent doesn't need: architecture internals, full
enum listings, verbose CLI option tables, redundant code patterns.
Keep: constructor params, four modes, cloudpickle rules, gotchas.
Point agents to source files for enum details they can read themselves.
---
 flash/SKILL.md | 492 +++++++++----------------------------------------
 1 file changed, 84 insertions(+), 408 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 9874e6c9..cec1b732 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -8,7 +8,7 @@ allowed-tools: Read, Grep, Glob, Bash
 
 # Runpod Flash
 
-**runpod-flash** (v1.6.0) is a Python SDK for distributed inference and serving on Runpod serverless. Write Python functions locally, configure with the `Endpoint` class, and Flash handles GPU/CPU provisioning, dependency management, and data transfer.
+**runpod-flash** (v1.6.0) -- Python SDK for distributed inference and serving on Runpod serverless.
 
 - **Package**: `pip install runpod-flash`
 - **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...`
@@ -17,33 +17,13 @@ allowed-tools: Read, Grep, Glob, Bash
 
 ## Getting Started
 
-### 1. Install Flash
-
 ```bash
 pip install runpod-flash
+flash login                  # Authenticate via browser (recommended)
+# Or: export RUNPOD_API_KEY=... or add to .env file
 ```
 
-### 2. Authenticate
-
-Either log in via browser (recommended):
-
-```bash
-flash login
-```
-
-Or set your API key manually. Get a key from [Runpod account settings](https://docs.runpod.io/get-started/api-keys):
-
-```bash
-export RUNPOD_API_KEY=your_api_key_here
-```
-
-Or save in a `.env` file (Flash auto-loads via `python-dotenv`):
-
-```bash
-echo "RUNPOD_API_KEY=your_api_key_here" > .env
-```
-
-### 3. Write and run a remote function
+Minimal example:
 
 ```python
 import asyncio
@@ -55,47 +35,25 @@ async def gpu_task(data):
     tensor = torch.tensor(data, device="cuda")
     return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)}
 
-async def main():
-    result = await gpu_task([1, 2, 3, 4, 5])
-    print(result)
-
-if __name__ == "__main__":
-    asyncio.run(main())
+print(asyncio.run(gpu_task([1, 2, 3, 4, 5])))
 ```
 
 First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second.
 
-### 4. Or create a Flash API project
-
-```bash
-flash init my_project
-cd my_project
-pip install -r requirements.txt
-# Edit .env and add your RUNPOD_API_KEY
-flash run                    # Start local FastAPI server at localhost:8888
-```
-
-API explorer available at `http://localhost:8888/docs`.
-
-### 5. Build and deploy to production
+Create a project with templates:
 
 ```bash
-flash build                              # Scan endpoints, package artifact
-flash build --exclude torch,torchvision  # Exclude packages in base image (500MB limit)
-flash deploy new production              # Create deployment environment
-flash deploy send production             # Upload and deploy
-flash deploy list                        # List environments
-flash deploy info production             # Show details
-flash deploy delete production           # Tear down
+flash init my_project && cd my_project
+flash run                    # Local FastAPI dev server at localhost:8888 (API explorer at /docs)
 ```
 
 ## The Endpoint Class: Four Modes
 
-The `Endpoint` class is the single entry point for all Flash functionality. It replaces the old 8-class resource hierarchy (`LiveServerless`, `CpuLiveServerless`, etc.) which still works but emits `DeprecationWarning`.
+The `Endpoint` class is the single entry point. It replaces the old 8-class hierarchy (`LiveServerless`, etc.) which still works but emits `DeprecationWarning`.
 
 ### Mode 1: Queue-Based Decorator (QB)
 
-One function = one endpoint = own workers. Best for batch processing, long-running tasks, automatic retries.
+One function = one endpoint = own workers. Best for batch, long-running tasks, automatic retries. Returns `JobOutput` with `.output`, `.error`, `.status`.
 
 ```python
 from runpod_flash import Endpoint, GpuType
@@ -103,24 +61,16 @@ from runpod_flash import Endpoint, GpuType
 @Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"])
 async def gpu_hello(input_data: dict) -> dict:
     import torch
-    gpu_available = torch.cuda.is_available()
-    gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU"
-    return {
-        "message": input_data.get("message", "Hello from GPU worker!"),
-        "gpu": {"available": gpu_available, "name": gpu_name},
-    }
-```
-
-QB returns a `JobOutput` with `.output`, `.error`, `.status` fields:
+    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU"
+    return {"message": input_data.get("message", "Hello!"), "gpu": gpu_name}
 
-```python
 result = await gpu_hello({"message": "test"})
 # result.output contains the return dict
 ```
 
 ### Mode 2: Load-Balanced Decorator (LB)
 
-Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP.
+Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. Returns dict directly (no wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`.
 
 ```python
 from runpod_flash import Endpoint
@@ -130,315 +80,140 @@ api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3))
 @api.post("/process")
 async def process(input_data: dict) -> dict:
     from datetime import datetime
-    return {"status": "success", "echo": input_data, "timestamp": datetime.now().isoformat()}
+    return {"echo": input_data, "timestamp": datetime.now().isoformat()}
 
 @api.get("/health")
 async def health() -> dict:
     return {"status": "healthy"}
 ```
 
-LB returns the dict directly (no `JobOutput` wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`.
-
 ### Mode 3: External Image Client
 
-Deploy a pre-built Docker image and call it as a client. No `@decorator` -- the Endpoint provisions the image and provides QB and LB client methods.
+Deploy a pre-built Docker image, call it as a client. Returns `EndpointJob` (see below).
 
 ```python
 from runpod_flash import Endpoint, GpuGroup
 
 vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24)
 
-# LB-style calls
-result = await vllm.post("/v1/completions", {"prompt": "hello"})
-models = await vllm.get("/v1/models")
-
-# QB-style calls
-job = await vllm.run({"prompt": "hello"})
+result = await vllm.post("/v1/completions", {"prompt": "hello"})  # LB-style
+job = await vllm.run({"prompt": "hello"})                         # QB-style
 await job.wait()
 print(job.output)
 ```
 
 ### Mode 4: Existing Endpoint Client
 
-Connect to an already-deployed endpoint by ID. No provisioning.
+Connect to an already-deployed endpoint by ID. No provisioning. Returns `EndpointJob`.
 
 ```python
-from runpod_flash import Endpoint
-
 ep = Endpoint(id="abc123")
-
-# QB-style
 job = await ep.runsync({"prompt": "hello"})
 print(job.output)
-
-# LB-style
-result = await ep.post("/v1/completions", {"prompt": "hello"})
 ```
 
+**EndpointJob** (returned by `.run()` / `.runsync()` in client modes): properties `.id`, `.output`, `.error`, `.done`; methods `await job.status()`, `await job.wait(timeout=60)`, `await job.cancel()`.
+
 ## Constructor Parameters
 
 ```python
 Endpoint(
-    name: str = None,                    # Endpoint name (required unless id= is set)
+    name: str = None,                    # Required unless id= is set
     *,
     id: str = None,                      # Connect to existing endpoint (client mode)
     gpu: GpuGroup | GpuType | list = None,  # GPU type(s) -- mutually exclusive with cpu
     cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu
-    workers: int | tuple[int, int] = None,    # Max workers (int) or (min, max) tuple. Default: (0, 1)
+    workers: int | tuple[int, int] = None,    # (min, max) tuple or just max. Default: (0, 1)
     idle_timeout: int = 60,              # Seconds before scale-down
     dependencies: list[str] = None,      # pip packages to install
-    system_dependencies: list[str] = None, # apt-get packages to install
+    system_dependencies: list[str] = None, # apt-get packages
     accelerate_downloads: bool = True,   # CDN download acceleration
-    volume: NetworkVolume = None,        # Persistent storage
-    datacenter: DataCenter = DataCenter.EU_RO_1,  # Data center location
+    volume: NetworkVolume = None,        # Persistent storage (NetworkVolume(name=..., size=100, dataCenterId=DataCenter.EU_RO_1))
+    datacenter: DataCenter = DataCenter.EU_RO_1,
     env: dict[str, str] = None,          # Environment variables
     gpu_count: int = 1,                  # GPUs per worker
-    execution_timeout_ms: int = 0,       # Execution timeout (0 = no limit)
-    flashboot: bool = True,              # FlashBoot for fast cold starts
+    execution_timeout_ms: int = 0,       # 0 = no limit
+    flashboot: bool = True,              # Fast cold starts
     image: str = None,                   # Docker image (external image mode, mutually exclusive with id)
-    scaler_type: ServerlessScalerType = None,  # QUEUE_DELAY (QB default) or REQUEST_COUNT (LB default)
-    scaler_value: int = 4,               # Scaler parameter
-    template: PodTemplate = None,        # Pod template overrides
+    scaler_type: ServerlessScalerType = None,  # QUEUE_DELAY (QB) or REQUEST_COUNT (LB)
+    scaler_value: int = 4,
+    template: PodTemplate = None,        # Pod overrides (e.g. PodTemplate(containerDiskInGb=100))
 )
 ```
 
-**Mutual exclusions:**
-- `gpu` and `cpu` cannot both be set
-- `id` and `image` cannot both be set
-- `name` or `id` is required
-
-**Defaults:**
-- If neither `gpu` nor `cpu` is set (and not client mode), defaults to `gpu=GpuGroup.ANY`
+- `gpu` and `cpu` are mutually exclusive. `id` and `image` are mutually exclusive.
+- If neither `gpu` nor `cpu` is set (non-client), defaults to `gpu=GpuGroup.ANY`.
 - `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5.
 
-## EndpointJob
+## GPU & CPU Types
 
-Returned by `Endpoint.run()` and `Endpoint.runsync()` in client mode (image= or id=).
+### GpuGroup (by VRAM class)
 
-```python
-job = await ep.run({"prompt": "hello"})
-
-# Properties
-job.id        # "job-abc123"
-job.output    # Result payload (after COMPLETED)
-job.error     # Error message (after FAILED)
-job.done      # True if terminal status (COMPLETED, FAILED, CANCELLED, TIMED_OUT)
-
-# Methods
-await job.status()            # Poll, update internal state, return status string
-await job.wait(timeout=60)    # Poll until terminal status (exponential backoff)
-await job.cancel()            # Cancel the job
-```
+| Group | VRAM | GPUs |
+|-------|------|------|
+| `ANY` | Any | Any available (not for production) |
+| `AMPERE_16` | 16GB | RTX A4000/A4500 |
+| `AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 |
+| `ADA_24` | 24GB | RTX 4090 |
+| `ADA_32_PRO` | 32GB | RTX 5090 |
+| `AMPERE_48` | 48GB | A40, RTX A6000 |
+| `ADA_48_PRO` | 48GB | RTX 6000 Ada |
+| `AMPERE_80` | 80GB | A100 |
+| `ADA_80_PRO` | 80GB | H100 |
+| `HOPPER_141` | 141GB | H200 |
 
-## GPU & CPU Types
+For exact GPU selection, use `GpuType` enum (e.g. `GpuType.NVIDIA_GEFORCE_RTX_4090`). See `src/runpod_flash/core/resources/gpu.py` for full list.
 
-### GPU Groups (GpuGroup enum)
+### CPU Instance Types
 
-VRAM-class groups that map to one or more specific GPU models:
+Format: `cpu{gen}{type}-{vcpu}-{memory}`. Use string shorthand (`cpu="cpu3c-1-2"`) or `CpuInstanceType` enum.
 
-| Group | VRAM | GPUs |
-|-------|------|------|
-| `GpuGroup.ANY` | Any | Any available (not for production) |
-| `GpuGroup.AMPERE_16` | 16GB | RTX A4000, RTX A4500, RTX 4000 Ada, RTX 2000 Ada |
-| `GpuGroup.AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 |
-| `GpuGroup.ADA_24` | 24GB | RTX 4090 |
-| `GpuGroup.ADA_32_PRO` | 32GB | RTX 5090 |
-| `GpuGroup.AMPERE_48` | 48GB | A40, RTX A6000 |
-| `GpuGroup.ADA_48_PRO` | 48GB | RTX 6000 Ada, L40, L40S |
-| `GpuGroup.AMPERE_80` | 80GB | A100 80GB PCIe, A100-SXM4-80GB |
-| `GpuGroup.ADA_80_PRO` | 80GB | H100 PCIe, H100 80GB HBM3, H100 NVL |
-| `GpuGroup.HOPPER_141` | 141GB | H200 |
-
-### GPU Types (GpuType enum)
-
-Specific GPU models for exact hardware selection:
-
-`NVIDIA_GEFORCE_RTX_4090`, `NVIDIA_GEFORCE_RTX_5090`, `NVIDIA_RTX_6000_ADA_GENERATION`, `NVIDIA_H100_80GB_HBM3`, `NVIDIA_RTX_A4000`, `NVIDIA_RTX_A4500`, `NVIDIA_RTX_4000_ADA_GENERATION`, `NVIDIA_RTX_2000_ADA_GENERATION`, `NVIDIA_RTX_A5000`, `NVIDIA_L4`, `NVIDIA_GEFORCE_RTX_3090`, `NVIDIA_A40`, `NVIDIA_RTX_A6000`, `NVIDIA_A100_80GB_PCIe`, `NVIDIA_A100_SXM4_80GB`, `NVIDIA_H200`
-
-Usage: `gpu=GpuType.NVIDIA_GEFORCE_RTX_4090` or `gpu=[GpuType.NVIDIA_A100_80GB_PCIe, GpuType.NVIDIA_A100_SXM4_80GB]`
-
-### CPU Instance Types (CpuInstanceType enum)
-
-Format: `CPU{generation}{type}_{vcpu}_{memory_gb}`. Can also use string shorthand: `cpu="cpu3c-1-2"`.
-
-| Instance Type | Gen | Type | vCPU | RAM | Max Disk |
-|--------------|-----|------|------|-----|----------|
-| `CPU3G_1_4` | 3rd | General | 1 | 4GB | 10GB |
-| `CPU3G_2_8` | 3rd | General | 2 | 8GB | 20GB |
-| `CPU3G_4_16` | 3rd | General | 4 | 16GB | 40GB |
-| `CPU3G_8_32` | 3rd | General | 8 | 32GB | 80GB |
-| `CPU3C_1_2` | 3rd | Compute | 1 | 2GB | 10GB |
-| `CPU3C_2_4` | 3rd | Compute | 2 | 4GB | 20GB |
-| `CPU3C_4_8` | 3rd | Compute | 4 | 8GB | 40GB |
-| `CPU3C_8_16` | 3rd | Compute | 8 | 16GB | 80GB |
-| `CPU5C_1_2` | 5th | Compute | 1 | 2GB | 15GB |
-| `CPU5C_2_4` | 5th | Compute | 2 | 4GB | 30GB |
-| `CPU5C_4_8` | 5th | Compute | 4 | 8GB | 60GB |
-| `CPU5C_8_16` | 5th | Compute | 8 | 16GB | 120GB |
+Families: `cpu3g` (general, 4GB/vCPU), `cpu3c` (compute, 2GB/vCPU), `cpu5c` (5th gen compute, 2GB/vCPU). Each from 1 to 8 vCPUs. See `src/runpod_flash/core/resources/cpu.py` for full list.
 
 ## Cloudpickle Scoping Rules
 
 Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access:
-- Function parameters
-- Local variables defined inside the function
-- Imports done inside the function
-- Built-in Python functions
+- Function parameters, local variables, imports done **inside** the function, built-ins
 
 They CANNOT access: module-level imports, global variables, external functions/classes.
 
 ```python
-# WRONG - external references
+# WRONG
 import torch
 @Endpoint(name="worker", gpu=GpuGroup.ADA_24)
 async def bad(data):
     return torch.tensor(data)  # torch not accessible remotely
 
-# CORRECT - everything inside, dependencies declared
+# CORRECT
 @Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"])
 async def good(data):
     import torch
     return torch.tensor(data)
 ```
 
-All pip packages must be listed in `dependencies=[]`. System packages go in `system_dependencies=[]`.
+All pip packages must be in `dependencies=[]`. System packages in `system_dependencies=[]`.
 
 ## CLI Commands
 
-### flash login
-
-```bash
-flash login [--no-open] [--timeout SECONDS]
-```
-
-Authenticate via browser. Opens Runpod console for authorization, saves credentials locally.
-
-### flash init
-
-```bash
-flash init [project_name]
-```
-
-Creates a project with three template workers:
-- `gpu_worker.py` -- QB GPU endpoint using `@Endpoint` decorator
-- `cpu_worker.py` -- QB CPU endpoint using `@Endpoint` decorator
-- `lb_worker.py` -- LB CPU endpoint with `@api.post` and `@api.get` routes
-
-### flash run
-
-```bash
-flash run [--host HOST] [--port PORT]
-```
-
-Starts a local FastAPI dev server at `localhost:8888` with auto-generated routes for all discovered endpoints. API explorer at `/docs`.
-
-| Option | Default | Description |
-|--------|---------|-------------|
-| `--host` | `localhost` | Server host (or `FLASH_HOST` env) |
-| `--port` | `8888` | Server port (or `FLASH_PORT` env) |
-
-### flash build
-
-```bash
-flash build [--exclude PACKAGES] [--keep-build] [--preview]
-```
-
-Scans `@Endpoint` decorators, groups by resource config, creates `flash_manifest.json`, installs dependencies for Linux x86_64, packages into `.flash/artifact.tar.gz`.
-
-| Option | Description |
-|--------|-------------|
-| `--exclude pkg1,pkg2` | Skip packages already in base Docker image |
-| `--keep-build` | Don't delete `.flash/.build/` after packaging |
-| `--preview` | Build then run in local Docker containers |
-
-**500MB deployment limit** -- use `--exclude` for packages in base image:
-
 ```bash
-flash build --exclude torch,torchvision,torchaudio
-```
-
-**`--preview` mode**: Creates Docker containers per resource config, starts mothership on `localhost:8000`, enables end-to-end local testing.
-
-### flash deploy
-
-```bash
-flash deploy new <env_name> [--app-name NAME]   # Create environment
-flash deploy send <env_name> [--app-name NAME]   # Deploy archive
-flash deploy list [--app-name NAME]               # List environments
-flash deploy info <env_name> [--app-name NAME]    # Show details
-flash deploy delete <env_name> [--app-name NAME]  # Delete (double confirmation)
-```
-
-`flash deploy send` requires `flash build` to have been run first.
-
-### flash undeploy
-
-```bash
-flash undeploy list          # List all deployed resources
-flash undeploy <name>        # Undeploy specific resource
-```
-
-### flash env / flash app
-
-```bash
-flash env list|create|get|delete <name>   # Environment management
-flash app list|get <name>                 # App management
-```
+flash login                                      # Authenticate via browser
+flash init [project_name]                        # Create project from templates
+flash run [--host HOST] [--port PORT]            # Dev server at localhost:8888
+flash build [--exclude pkg1,pkg2] [--preview]    # Package artifact (500MB limit)
+flash deploy new|send|list|info|delete <env>     # Deployment lifecycle
+flash undeploy list                              # List deployed resources
+flash undeploy <name>                            # Remove specific resource
+flash env list|create|get|delete <name>          # Environment management
+flash app list|get <name>                        # App management
+```
+
+Key notes:
+- `flash build --exclude torch,torchvision,torchaudio` -- exclude packages already in base Docker image to stay under 500MB limit
+- `flash build --preview` -- run in local Docker containers for end-to-end testing
+- `flash deploy send` requires `flash build` first
 
 ## Common Patterns
 
-### QB GPU Endpoint
-
-```python
-from runpod_flash import Endpoint, GpuGroup
-
-@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, workers=(0, 3), dependencies=["torch"])
-async def inference(data: dict) -> dict:
-    import torch
-    tensor = torch.tensor(data["values"], device="cuda")
-    return {"result": tensor.sum().item()}
-```
-
-### QB CPU Endpoint
-
-```python
-from runpod_flash import Endpoint
-
-@Endpoint(name="cpu_worker", cpu="cpu3c-1-2")
-async def cpu_hello(input_data: dict) -> dict:
-    import platform
-    from datetime import datetime
-    return {
-        "message": input_data.get("message", "Hello from CPU worker!"),
-        "timestamp": datetime.now().isoformat(),
-        "python_version": platform.python_version(),
-    }
-```
-
-### LB HTTP API
-
-```python
-from runpod_flash import Endpoint
-
-api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5))
-
-@api.get("/health")
-async def health():
-    return {"status": "ok"}
-
-@api.post("/compute")
-async def compute(request: dict) -> dict:
-    return {"result": request}
-```
-
-### External Image Deployment
-
-```python
-from runpod_flash import Endpoint, GpuGroup
-
-vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO)
-result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"})
-```
-
 ### Hybrid GPU/CPU Pipeline
 
 ```python
@@ -460,129 +235,30 @@ async def pipeline(raw_data):
     return await inference(clean)
 ```
 
-### Parallel Execution
+### External Image
 
 ```python
-import asyncio
-
-results = await asyncio.gather(
-    process_item(item1),
-    process_item(item2),
-    process_item(item3),
-)
-```
-
-### NetworkVolume
-
-```python
-from runpod_flash import Endpoint, GpuGroup, NetworkVolume, DataCenter
-
-volume = NetworkVolume(name="model-storage", size=100, dataCenterId=DataCenter.EU_RO_1)
-
-@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, volume=volume)
-async def worker(data: dict) -> dict:
-    ...
-```
-
-### PodTemplate
-
-```python
-from runpod_flash import Endpoint, GpuGroup, PodTemplate
-
-template = PodTemplate(containerDiskInGb=100)
-
-@Endpoint(name="worker", gpu=GpuGroup.AMPERE_80, template=template)
-async def worker(data: dict) -> dict:
-    ...
+vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO)
+result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"})
 ```
 
 ## Error Handling
 
-### Queue-Based (QB) Resources
-
-```python
-job_output = await my_function(data)
-if job_output.error:
-    print(f"Failed: {job_output.error}")
-else:
-    result = job_output.output
-```
-
-`JobOutput` fields: `id`, `status`, `output`, `error`, `started_at`, `ended_at`
-
-### Load-Balanced (LB) Resources
-
-```python
-try:
-    result = await my_function(data)  # Returns dict directly
-except Exception as e:
-    print(f"Error: {e}")
-```
-
-### EndpointJob (Client Mode)
-
-```python
-job = await ep.run({"prompt": "hello"})
-await job.wait(timeout=120)
-if job.error:
-    print(f"Failed: {job.error}")
-else:
-    print(job.output)
-```
-
-### Runtime Exceptions
-
-```
-FlashRuntimeError (base)
-  RemoteExecutionError      # Remote function failed
-  SerializationError        # cloudpickle serialization failed
-  GraphQLError              # GraphQL base error
-    GraphQLMutationError    # Mutation failed
-    GraphQLQueryError       # Query failed
-  ManifestError             # Invalid/missing manifest
-  ManifestServiceUnavailableError  # State Manager unreachable
-```
-
-## Architecture Overview
-
-### Deployment Architecture
-
-**Mothership Pattern**: Coordinator endpoint + distributed child endpoints.
-
-1. `flash build` scans code, creates manifest + archive
-2. `flash deploy send` uploads archive, provisions resources
-3. Mothership boots, reconciles desired vs current state
-4. Child endpoints query State Manager GraphQL for service discovery (peer-to-peer)
-5. Functions route locally or remotely based on manifest
-
-### How Endpoint Resolves to Internal Classes
-
-The `Endpoint` class automatically selects the right internal resource class based on:
-- **QB vs LB**: Inferred from usage (direct `@Endpoint` decorator = QB, `.get()`/`.post()` routes = LB)
-- **GPU vs CPU**: From `gpu=` or `cpu=` parameter
-- **Live vs Deploy**: From runtime environment (`flash run` = live, `flash deploy` = deploy classes)
-
-This means 8 internal classes are selected automatically -- users never need to pick one.
-
-### Cross-Endpoint Routing
-
-Functions on different endpoints can call each other transparently:
-1. `ProductionWrapper` intercepts calls
-2. `ServiceRegistry` looks up function in manifest
-3. Local function? Execute directly
-4. Remote function? Serialize args (cloudpickle), POST to remote endpoint
+- **QB**: Returns `JobOutput` -- check `result.error` for failures, `result.output` for data
+- **LB**: Returns dict directly -- use try/except
+- **Client mode**: `EndpointJob` -- check `job.error` after `await job.wait()`
+- **Serialization limit**: cloudpickle + base64, max 10MB. Pass URLs/paths for large data.
 
-**Serialization**: cloudpickle + base64, max 10MB payload. Pass URLs/paths instead of large data.
+Exception hierarchy: `FlashRuntimeError` > `RemoteExecutionError`, `SerializationError`, `GraphQLError` > `GraphQLMutationError`/`GraphQLQueryError`, `ManifestError`.
 
 ## Common Gotchas
 
-1. **External scope in decorated functions** -- Most common error. All imports and logic must be inside the function body.
+1. **External scope in decorated functions** -- #1 error. All imports and logic must be inside the function body.
 2. **Forgetting `await`** -- All remote functions must be awaited.
-3. **Undeclared dependencies** -- Must be in `dependencies=[]` parameter.
+3. **Undeclared dependencies** -- Must be in `dependencies=[]`.
 4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly.
-5. **Large serialization** -- Max 10MB. Pass URLs/paths, not large data objects.
-6. **Imports at module level** -- Import inside decorated functions, not at top of file.
-7. **Bundle too large (>500MB)** -- Use `--exclude` for packages in base Docker image.
-8. **Endpoints accumulate** -- Clean up with `flash undeploy list` / `flash undeploy <name>`.
-9. **Mixing decorator patterns** -- Cannot use `@Endpoint(...)` as direct decorator AND register routes (`.get()`/`.post()`) on the same instance.
-10. **Client mode restrictions** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators. Cannot use `@ep.post("/path")` to register routes on a client.
+5. **Large payloads** -- Max 10MB serialization. Pass URLs, not data.
+6. **Bundle too large (>500MB)** -- Use `flash build --exclude` for packages in base image.
+7. **Mixing patterns** -- Cannot use `@Endpoint(...)` as decorator AND `.get()`/`.post()` on same instance.
+8. **Client vs decorator** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators.
+9. **Endpoints accumulate** -- Clean up with `flash undeploy`.

From c96a25d7e86c9f48317dfb9447746cb8570995f5 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 18:19:07 +0100
Subject: [PATCH 04/24] chore: remove deprecated class mention from skill

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index cec1b732..3e119133 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -49,7 +49,7 @@ flash run                    # Local FastAPI dev server at localhost:8888/docs
 
 ## The Endpoint Class: Four Modes
 
-The `Endpoint` class is the single entry point. It replaces the old 8-class hierarchy (`LiveServerless`, etc.) which still works but emits `DeprecationWarning`.
+The `Endpoint` class is the single entry point for all Flash functionality.
 
 ### Mode 1: Queue-Based Decorator (QB)
 

From 16a0a2d44eb6af2d6fcb0470f990bc5eebae9a7f Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:39:41 +0100
Subject: [PATCH 05/24] chore: rewrite flash skill for v1.7.0 endpoint API

- replace v1.6.0 content with eval-tested v1.7.0 skill
- remove non-existent flash login command
- fix GpuType.ANY to GpuGroup (GpuType has no ANY member)
- consolidate from four modes to three (QB, LB, client)
- all examples use Endpoint class exclusively
- scored 18/18 on eval assertions across 3 test prompts
---
 flash/SKILL.md | 325 ++++++++++++++++++++++---------------------------
 1 file changed, 144 insertions(+), 181 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 3e119133..35caed31 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -1,264 +1,227 @@
 ---
 name: flash
-description: Complete knowledge of runpod-flash - the Endpoint class, CLI, deployment, architecture.
-  Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuType", "GpuGroup".
+description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU".
 user-invocable: true
 allowed-tools: Read, Grep, Glob, Bash
 ---
 
-# Runpod Flash
+# Runpod Flash (v1.7.0)
 
-**runpod-flash** (v1.6.0) -- Python SDK for distributed inference and serving on Runpod serverless.
+Python SDK for running AI workloads on Runpod serverless. One class -- `Endpoint` -- handles everything.
 
-- **Package**: `pip install runpod-flash`
-- **Import**: `from runpod_flash import Endpoint, GpuGroup, GpuType, ...`
-- **CLI**: `flash`
-- **Python**: >=3.10, <3.15
+`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/`
 
-## Getting Started
+## Endpoint: Three Modes
 
-```bash
-pip install runpod-flash
-flash login                  # Authenticate via browser (recommended)
-# Or: export RUNPOD_API_KEY=... or add to .env file
-```
-
-Minimal example:
-
-```python
-import asyncio
-from runpod_flash import Endpoint, GpuType
-
-@Endpoint(name="my-first-worker", gpu=GpuType.ANY, dependencies=["torch"])
-async def gpu_task(data):
-    import torch
-    tensor = torch.tensor(data, device="cuda")
-    return {"sum": tensor.sum().item(), "gpu": torch.cuda.get_device_name(0)}
-
-asyncio.run(gpu_task([1, 2, 3, 4, 5]))
-```
-
-First run takes ~1 minute (endpoint provisioning). Subsequent runs take ~1 second.
-
-Create a project with templates:
+### Mode 1: Your Code (Queue-Based Decorator)
 
-```bash
-flash init my_project && cd my_project
-flash run                    # Local FastAPI dev server at localhost:8888/docs
-```
-
-## The Endpoint Class: Four Modes
-
-The `Endpoint` class is the single entry point for all Flash functionality.
-
-### Mode 1: Queue-Based Decorator (QB)
-
-One function = one endpoint = own workers. Best for batch, long-running tasks, automatic retries. Returns `JobOutput` with `.output`, `.error`, `.status`.
+One function = one endpoint with its own workers.
 
 ```python
-from runpod_flash import Endpoint, GpuType
+from runpod_flash import Endpoint, GpuGroup
 
-@Endpoint(name="gpu_worker", gpu=GpuType.ANY, dependencies=["torch"])
-async def gpu_hello(input_data: dict) -> dict:
-    import torch
-    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU"
-    return {"message": input_data.get("message", "Hello!"), "gpu": gpu_name}
+@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"])
+async def compute(data):
+    import torch  # MUST import inside function (cloudpickle)
+    return {"sum": torch.tensor(data, device="cuda").sum().item()}
 
-result = await gpu_hello({"message": "test"})
-# result.output contains the return dict
+result = await compute([1, 2, 3])
 ```
 
-### Mode 2: Load-Balanced Decorator (LB)
+### Mode 2: Your Code (Load-Balanced Routes)
 
-Multiple routes, shared workers. Best for real-time APIs, low-latency HTTP. Returns dict directly (no wrapper). Supported methods: `GET`, `POST`, `PUT`, `DELETE`, `PATCH`. Reserved paths: `/execute`, `/ping`.
+Multiple HTTP routes share one pool of workers.
 
 ```python
-from runpod_flash import Endpoint
+from runpod_flash import Endpoint, GpuGroup
 
-api = Endpoint(name="lb_worker", cpu="cpu3c-1-2", workers=(1, 3))
+api = Endpoint(name="my-api", gpu=GpuGroup.ADA_24, workers=(1, 5), dependencies=["torch"])
 
-@api.post("/process")
-async def process(input_data: dict) -> dict:
-    from datetime import datetime
-    return {"echo": input_data, "timestamp": datetime.now().isoformat()}
+@api.post("/predict")
+async def predict(data: list[float]):
+    import torch
+    return {"result": torch.tensor(data, device="cuda").sum().item()}
 
 @api.get("/health")
-async def health() -> dict:
-    return {"status": "healthy"}
+async def health():
+    return {"status": "ok"}
 ```
 
-### Mode 3: External Image Client
+### Mode 3: External Image (Client)
 
-Deploy a pre-built Docker image, call it as a client. Returns `EndpointJob` (see below).
+Deploy a pre-built Docker image and call it via HTTP.
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuGroup, PodTemplate
+
+server = Endpoint(
+    name="my-server",
+    image="my-org/my-image:latest",
+    gpu=GpuGroup.AMPERE_80,
+    workers=1,
+    env={"HF_TOKEN": "xxx"},
+    template=PodTemplate(containerDiskInGb=100),
+)
 
-vllm = Endpoint(name="vllm", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_24)
+# LB-style
+result = await server.post("/v1/completions", {"prompt": "hello"})
+models = await server.get("/v1/models")
 
-result = await vllm.post("/v1/completions", {"prompt": "hello"})  # LB-style
-job = await vllm.run({"prompt": "hello"})                         # QB-style
+# QB-style
+job = await server.run({"prompt": "hello"})
 await job.wait()
 print(job.output)
 ```
 
-### Mode 4: Existing Endpoint Client
-
-Connect to an already-deployed endpoint by ID. No provisioning. Returns `EndpointJob`.
+Connect to an existing endpoint by ID (no provisioning):
 
 ```python
 ep = Endpoint(id="abc123")
-job = await ep.runsync({"prompt": "hello"})
+job = await ep.runsync({"input": "hello"})
 print(job.output)
 ```
 
-**EndpointJob** (returned by `.run()` / `.runsync()` in client modes): properties `.id`, `.output`, `.error`, `.done`; methods `await job.status()`, `await job.wait(timeout=60)`, `await job.cancel()`.
+## How Mode Is Determined
+
+| Parameters | Mode |
+|-----------|------|
+| `name=` only | Decorator (your code) |
+| `image=` set | Client (deploys image, then HTTP calls) |
+| `id=` set | Client (connects to existing, no provisioning) |
 
-## Constructor Parameters
+## Endpoint Constructor
 
 ```python
 Endpoint(
-    name: str = None,                    # Required unless id= is set
-    *,
-    id: str = None,                      # Connect to existing endpoint (client mode)
-    gpu: GpuGroup | GpuType | list = None,  # GPU type(s) -- mutually exclusive with cpu
-    cpu: str | CpuInstanceType | list = None, # CPU type(s) -- mutually exclusive with gpu
-    workers: int | tuple[int, int] = None,    # (min, max) tuple or just max. Default: (0, 1)
-    idle_timeout: int = 60,              # Seconds before scale-down
-    dependencies: list[str] = None,      # pip packages to install
-    system_dependencies: list[str] = None, # apt-get packages
-    accelerate_downloads: bool = True,   # CDN download acceleration
-    volume: NetworkVolume = None,        # Persistent storage (NetworkVolume(name=..., size=100, dataCenterId=DataCenter.EU_RO_1))
-    datacenter: DataCenter = DataCenter.EU_RO_1,
-    env: dict[str, str] = None,          # Environment variables
-    gpu_count: int = 1,                  # GPUs per worker
-    execution_timeout_ms: int = 0,       # 0 = no limit
-    flashboot: bool = True,              # Fast cold starts
-    image: str = None,                   # Docker image (external image mode, mutually exclusive with id)
-    scaler_type: ServerlessScalerType = None,  # QUEUE_DELAY (QB) or REQUEST_COUNT (LB)
-    scaler_value: int = 4,
-    template: PodTemplate = None,        # Pod overrides (e.g. PodTemplate(containerDiskInGb=100))
+    name="endpoint-name",                  # required (unless id= set)
+    id=None,                               # connect to existing endpoint
+    gpu=GpuGroup.AMPERE_80,               # GPU type (default: ANY)
+    # cpu=CpuInstanceType.CPU5C_4_8,      # CPU type -- pass instead of gpu= (mutually exclusive)
+    workers=3,                             # shorthand for (0, 3)
+    # workers=(1, 5),                      # or explicit (min, max) instead of the int form
+    idle_timeout=60,                       # seconds before scale-down (default: 60)
+    dependencies=["torch"],                # pip packages for remote exec
+    system_dependencies=["ffmpeg"],        # apt-get packages
+    image="org/image:tag",                 # pre-built Docker image (client mode)
+    env={"KEY": "val"},                    # environment variables
+    volume=NetworkVolume(...),             # persistent storage
+    gpu_count=1,                           # GPUs per worker
+    template=PodTemplate(containerDiskInGb=100),
+    flashboot=True,                        # fast cold starts
 )
 ```
 
-- `gpu` and `cpu` are mutually exclusive. `id` and `image` are mutually exclusive.
-- If neither `gpu` nor `cpu` is set (non-client), defaults to `gpu=GpuGroup.ANY`.
-- `workers=5` means `(0, 5)`. `workers=(2, 5)` means min 2, max 5.
-
-## GPU & CPU Types
-
-### GpuGroup (by VRAM class)
-
-| Group | VRAM | GPUs |
-|-------|------|------|
-| `ANY` | Any | Any available (not for production) |
-| `AMPERE_16` | 16GB | RTX A4000/A4500 |
-| `AMPERE_24` | 24GB | RTX A5000, L4, RTX 3090 |
-| `ADA_24` | 24GB | RTX 4090 |
-| `ADA_32_PRO` | 32GB | RTX 5090 |
-| `AMPERE_48` | 48GB | A40, RTX A6000 |
-| `ADA_48_PRO` | 48GB | RTX 6000 Ada |
-| `AMPERE_80` | 80GB | A100 |
-| `ADA_80_PRO` | 80GB | H100 |
-| `HOPPER_141` | 141GB | H200 |
-
-For exact GPU selection, use `GpuType` enum (e.g. `GpuType.NVIDIA_GEFORCE_RTX_4090`). See `src/runpod_flash/core/resources/gpu.py` for full list.
+- `gpu=` and `cpu=` are mutually exclusive
+- `workers=3` means `(0, 3)`. Default is `(0, 1)`
+- `idle_timeout` default is **60 seconds**
 
-### CPU Instance Types
+## Cloudpickle Scoping (CRITICAL)
 
-Format: `cpu{gen}{type}-{vcpu}-{memory}`. Use string shorthand (`cpu="cpu3c-1-2"`) or `CpuInstanceType` enum.
-
-Families: `cpu3g` (general, 4GB/vCPU), `cpu3c` (compute, 2GB/vCPU), `cpu5c` (5th gen compute, 2GB/vCPU). Each from 1 to 8 vCPUs. See `src/runpod_flash/core/resources/cpu.py` for full list.
-
-## Cloudpickle Scoping Rules
-
-Functions decorated with `@Endpoint(...)` are serialized with cloudpickle. They can ONLY access:
-- Function parameters, local variables, imports done **inside** the function, built-ins
-
-They CANNOT access: module-level imports, global variables, external functions/classes.
+Decorated functions are serialized. They can ONLY access:
+- Parameters, local variables, imports inside the function, builtins
 
 ```python
 # WRONG
 import torch
-@Endpoint(name="worker", gpu=GpuGroup.ADA_24)
-async def bad(data):
-    return torch.tensor(data)  # torch not accessible remotely
+@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"])
+async def bad(x):
+    return torch.tensor(x)  # NameError
 
 # CORRECT
-@Endpoint(name="worker", gpu=GpuGroup.ADA_24, dependencies=["torch"])
-async def good(data):
+@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"])
+async def good(x):
     import torch
-    return torch.tensor(data)
+    return torch.tensor(x)
 ```
 
-All pip packages must be in `dependencies=[]`. System packages in `system_dependencies=[]`.
+## EndpointJob
 
-## CLI Commands
+Returned by `ep.run()` and `ep.runsync()` in client mode.
 
-```bash
-flash login                                      # Authenticate via browser
-flash init [project_name]                        # Create project from templates
-flash run [--host HOST] [--port PORT]            # Dev server at localhost:8888
-flash build [--exclude pkg1,pkg2] [--preview]    # Package artifact (500MB limit)
-flash deploy new|send|list|info|delete <env>     # Deployment lifecycle
-flash undeploy list                              # List deployed resources
-flash undeploy <name>                            # Remove specific resource
-flash env list|create|get|delete <name>          # Environment management
-flash app list|get <name>                        # App management
+```python
+job = await ep.run({"data": [1, 2, 3]})
+await job.wait(timeout=120)        # poll until done
+print(job.id, job.output, job.error, job.done)
+await job.cancel()
 ```
 
-Key notes:
-- `flash build --exclude torch,torchvision,torchaudio` -- exclude packages already in base Docker image to stay under 500MB limit
-- `flash build --preview` -- run in local Docker containers for end-to-end testing
-- `flash deploy send` requires `flash build` first
+## GPU Types (GpuGroup)
+
+| Enum | GPU | VRAM |
+|------|-----|------|
+| `ANY` | any | varies |
+| `AMPERE_16` | RTX A4000 | 16GB |
+| `AMPERE_24` | RTX A5000/L4 | 24GB |
+| `AMPERE_48` | A40/A6000 | 48GB |
+| `AMPERE_80` | A100 | 80GB |
+| `ADA_24` | RTX 4090 | 24GB |
+| `ADA_32_PRO` | RTX 5090 | 32GB |
+| `ADA_48_PRO` | RTX 6000 Ada | 48GB |
+| `ADA_80_PRO` | H100 | 80GB |
+| `HOPPER_141` | H200 | 141GB |
+
+## CPU Types (CpuInstanceType)
+
+Format: `CPU{gen}{type}_{vcpu}_{memory_gb}`. Example: `CPU5C_4_8` = 5th gen, compute, 4 vCPU, 8GB.
+
+```python
+from runpod_flash import Endpoint, CpuInstanceType
+
+@Endpoint(name="cpu-work", cpu=CpuInstanceType.CPU5C_4_8, workers=5, dependencies=["pandas"])
+async def process(data):
+    import pandas as pd
+    return pd.DataFrame(data).describe().to_dict()
+```
 
 ## Common Patterns
 
-### Hybrid GPU/CPU Pipeline
+### CPU + GPU Pipeline
 
 ```python
-from runpod_flash import Endpoint, GpuGroup
+from runpod_flash import Endpoint, GpuGroup, CpuInstanceType
 
-@Endpoint(name="preprocessor", cpu="cpu5c-4-8", dependencies=["pandas"])
-async def preprocess(data):
+@Endpoint(name="preprocess", cpu=CpuInstanceType.CPU5C_4_8, workers=5, dependencies=["pandas"])
+async def preprocess(raw):
     import pandas as pd
-    return pd.DataFrame(data).to_dict("records")
+    return pd.DataFrame(raw).to_dict("records")
 
-@Endpoint(name="inference", gpu=GpuGroup.AMPERE_80, dependencies=["torch"])
-async def inference(data):
+@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"])
+async def infer(clean):
     import torch
-    tensor = torch.tensor(data, device="cuda")
-    return {"result": tensor.sum().item()}
+    t = torch.tensor([[v for v in r.values()] for r in clean], device="cuda")
+    return {"predictions": t.mean(dim=1).tolist()}
 
-async def pipeline(raw_data):
-    clean = await preprocess(raw_data)
-    return await inference(clean)
+async def pipeline(data):
+    return await infer(await preprocess(data))
 ```
 
-### External Image
+### Parallel Execution
 
 ```python
-vllm = Endpoint(name="vllm-server", image="vllm/vllm-openai:latest", gpu=GpuGroup.ADA_80_PRO)
-result = await vllm.post("/v1/completions", {"prompt": "hello", "model": "meta-llama/Llama-3-8B"})
+import asyncio
+results = await asyncio.gather(compute(a), compute(b), compute(c))
 ```
 
-## Error Handling
+## CLI
+
+| Command | Description |
+|---------|-------------|
+| `flash init [name]` | Create project template |
+| `flash run [--auto-provision]` | Local dev server at localhost:8888 |
+| `flash build [--exclude pkg1,pkg2]` | Package artifact (500MB limit) |
+| `flash deploy new/send/list/info/delete <env>` | Deploy to production |
+| `flash undeploy list/<name>` | Remove endpoints |
 
-- **QB**: Returns `JobOutput` -- check `result.error` for failures, `result.output` for data
-- **LB**: Returns dict directly -- use try/except
-- **Client mode**: `EndpointJob` -- check `job.error` after `await job.wait()`
-- **Serialization limit**: cloudpickle + base64, max 10MB. Pass URLs/paths for large data.
+## Gotchas
 
-Exception hierarchy: `FlashRuntimeError` > `RemoteExecutionError`, `SerializationError`, `GraphQLError` > `GraphQLMutationError`/`GraphQLQueryError`, `ManifestError`.
+1. **Imports outside function** -- most common error. Everything inside the decorated function.
+2. **Forgetting await** -- all decorated functions and client methods need `await`.
+3. **Missing dependencies** -- must list in `dependencies=[]`.
+4. **gpu/cpu are exclusive** -- pick one per Endpoint.
+5. **idle_timeout is seconds** -- default 60s, not minutes.
+6. **10MB payload limit** -- pass URLs, not large objects.
+7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator.
 
-## Common Gotchas
+## Architecture (for codebase work)
 
-1. **External scope in decorated functions** -- #1 error. All imports and logic must be inside the function body.
-2. **Forgetting `await`** -- All remote functions must be awaited.
-3. **Undeclared dependencies** -- Must be in `dependencies=[]`.
-4. **QB vs LB return types** -- QB returns `JobOutput` wrapper, LB returns dict directly.
-5. **Large payloads** -- Max 10MB serialization. Pass URLs, not data.
-6. **Bundle too large (>500MB)** -- Use `flash build --exclude` for packages in base image.
-7. **Mixing patterns** -- Cannot use `@Endpoint(...)` as decorator AND `.get()`/`.post()` on same instance.
-8. **Client vs decorator** -- `Endpoint(id=...)` and `Endpoint(image=...)` are clients, not decorators.
-9. **Endpoints accumulate** -- Clean up with `flash undeploy`.
+Source: `src/runpod_flash/`. Entry: `endpoint.py` (Endpoint class) delegates to `client.py` (@remote, internal). Build scanner: `cli/commands/build_utils/scanner.py`. Runtime: `runtime/` (handlers, service registry, serialization). Resources: `core/resources/` (internal classes auto-selected by Endpoint). Dev: `make dev`, `make test-unit`, `make lint`, `make format`, `make index`.

From 76483f264ec9a688bfc620482694735aee743d3b Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:43:27 +0100
Subject: [PATCH 06/24] chore: add auth section, restore flash login, remove
 architecture noise

---
 flash/SKILL.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 35caed31..67fef0e0 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -206,12 +206,19 @@ results = await asyncio.gather(compute(a), compute(b), compute(c))
 
 | Command | Description |
 |---------|-------------|
+| `flash login` | Authenticate via browser |
 | `flash init [name]` | Create project template |
 | `flash run [--auto-provision]` | Local dev server at localhost:8888 |
 | `flash build [--exclude pkg1,pkg2]` | Package artifact (500MB limit) |
 | `flash deploy new/send/list/info/delete <env>` | Deploy to production |
 | `flash undeploy list/<name>` | Remove endpoints |
 
+## Authentication
+
+Two ways to authenticate:
+- `flash login` -- browser-based OAuth (recommended)
+- `RUNPOD_API_KEY` env var -- set in `.env` or export directly
+
 ## Gotchas
 
 1. **Imports outside function** -- most common error. Everything inside the decorated function.
@@ -221,7 +228,3 @@ results = await asyncio.gather(compute(a), compute(b), compute(c))
 5. **idle_timeout is seconds** -- default 60s, not minutes.
 6. **10MB payload limit** -- pass URLs, not large objects.
 7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator.
-
-## Architecture (for codebase work)
-
-Source: `src/runpod_flash/`. Entry: `endpoint.py` (Endpoint class) delegates to `client.py` (@remote, internal). Build scanner: `cli/commands/build_utils/scanner.py`. Runtime: `runtime/` (handlers, service registry, serialization). Resources: `core/resources/` (internal classes auto-selected by Endpoint). Dev: `make dev`, `make test-unit`, `make lint`, `make format`, `make index`.

From 2fb095229aeb211baa580665f9ae36d28f95d727 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:44:47 +0100
Subject: [PATCH 07/24] chore: remove unnecessary allowed-tools from skill
 frontmatter

---
 flash/SKILL.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 67fef0e0..3e7f6f70 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -2,7 +2,6 @@
 name: flash
 description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU".
 user-invocable: true
-allowed-tools: Read, Grep, Glob, Bash
 ---
 
 # Runpod Flash (v1.7.0)

From fcefce052d285a9c87d4a002c9c30ba073ca4607 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:45:44 +0100
Subject: [PATCH 08/24] chore: shorten skill description, move version out of
 title

---
 flash/SKILL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 3e7f6f70..be844118 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -1,12 +1,12 @@
 ---
 name: flash
-description: Complete knowledge of the runpod-flash framework - SDK, CLI, architecture, deployment, and codebase. Use when working with runpod-flash code, writing Endpoint classes, configuring GPU/CPU endpoints, debugging deployments, or understanding the framework internals. Triggers on "flash", "runpod-flash", "Endpoint", "serverless", "deploy", "GpuGroup", "CpuInstanceType", "EndpointJob", "remote GPU".
+description: runpod-flash SDK and CLI for deploying AI workloads on RunPod serverless GPUs/CPUs.
 user-invocable: true
 ---
 
-# Runpod Flash (v1.7.0)
+# Runpod Flash
 
-Python SDK for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything.
+Python SDK (v1.7.0) for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything.
 
 `pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/`
 

From b82da18949eaa7b4724cce9172055ceaa4634294 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:46:04 +0100
Subject: [PATCH 09/24] fix: use correct "Runpod" casing in skill

---
 flash/SKILL.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index be844118..f90b27fe 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -1,12 +1,12 @@
 ---
 name: flash
-description: runpod-flash SDK and CLI for deploying AI workloads on RunPod serverless GPUs/CPUs.
+description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serverless GPUs/CPUs.
 user-invocable: true
 ---
 
 # Runpod Flash
 
-Python SDK (v1.7.0) for running AI workloads on RunPod serverless. One class -- `Endpoint` -- handles everything.
+Python SDK (v1.7.0) for running AI workloads on Runpod serverless. One class -- `Endpoint` -- handles everything.
 
 `pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/`
 

From 0a40f3ec8a2809fc72e33e89e1429bdaf03c8631 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:48:27 +0100
Subject: [PATCH 10/24] chore: remove redundant intro, lead with install +
 imports

---
 flash/SKILL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index f90b27fe..579564ee 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -4,12 +4,12 @@ description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serve
 user-invocable: true
 ---
 
-# Runpod Flash
-
-Python SDK (v1.7.0) for running AI workloads on Runpod serverless. One class -- `Endpoint` -- handles everything.
+# Runpod Flash (v1.7.0)
 
 `pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/`
 
+One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment.
+
 ## Endpoint: Three Modes
 
 ### Mode 1: Your Code (Queue-Based Decorator)

From f2f9cd52d0d732286f075ce720d31d6968dfba29 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:50:33 +0100
Subject: [PATCH 11/24] chore: remove repo-specific source path from skill

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 579564ee..cda31eb3 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -6,7 +6,7 @@ user-invocable: true
 
 # Runpod Flash (v1.7.0)
 
-`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10 | Source: `src/runpod_flash/`
+`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10
 
 One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment.
 

From efda28f0e5f3b338d429bff37cca21c1dc350088 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:54:56 +0100
Subject: [PATCH 12/24] chore: add NetworkVolume, PodTemplate, flashboot,
 gpu_count details to skill

---
 flash/SKILL.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index cda31eb3..37627e7d 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -113,6 +113,25 @@ Endpoint(
 - `gpu=` and `cpu=` are mutually exclusive
 - `workers=3` means `(0, 3)`. Default is `(0, 1)`
 - `idle_timeout` default is **60 seconds**
+- `flashboot=True` (default) -- enables fast cold starts via snapshot restore
+- `gpu_count` -- GPUs per worker (default 1), use >1 for multi-GPU models
+
+### NetworkVolume
+
+```python
+NetworkVolume(name="my-vol", size=100)  # size in GB, default 100
+```
+
+### PodTemplate
+
+```python
+PodTemplate(
+    containerDiskInGb=64,    # container disk size (default 64)
+    dockerArgs="",           # extra docker arguments
+    ports="",                # exposed ports
+    startScript="",          # script to run on start
+)
+```
 
 ## Cloudpickle Scoping (CRITICAL)
 

From 5dd2196612815394ad012611bef347c4f2fc31c6 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:56:41 +0100
Subject: [PATCH 13/24] chore: add full CpuInstanceType enum table to skill

---
 flash/SKILL.md | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 37627e7d..1c284a91 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -180,7 +180,20 @@ await job.cancel()
 
 ## CPU Types (CpuInstanceType)
 
-Format: `CPU{gen}{type}_{vcpu}_{memory_gb}`. Example: `CPU5C_4_8` = 5th gen, compute, 4 vCPU, 8GB.
+| Enum | vCPU | RAM | Max Disk | Type |
+|------|------|-----|----------|------|
+| `CPU3G_1_4` | 1 | 4GB | 10GB | General |
+| `CPU3G_2_8` | 2 | 8GB | 20GB | General |
+| `CPU3G_4_16` | 4 | 16GB | 40GB | General |
+| `CPU3G_8_32` | 8 | 32GB | 80GB | General |
+| `CPU3C_1_2` | 1 | 2GB | 10GB | Compute |
+| `CPU3C_2_4` | 2 | 4GB | 20GB | Compute |
+| `CPU3C_4_8` | 4 | 8GB | 40GB | Compute |
+| `CPU3C_8_16` | 8 | 16GB | 80GB | Compute |
+| `CPU5C_1_2` | 1 | 2GB | 15GB | Compute (5th gen) |
+| `CPU5C_2_4` | 2 | 4GB | 30GB | Compute (5th gen) |
+| `CPU5C_4_8` | 4 | 8GB | 60GB | Compute (5th gen) |
+| `CPU5C_8_16` | 8 | 16GB | 120GB | Compute (5th gen) |
 
 ```python
 from runpod_flash import Endpoint, CpuInstanceType

From ed44e42db5c6fcfb0a088e8f6396b75945bc0c06 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:57:16 +0100
Subject: [PATCH 14/24] chore: trim redundant cloudpickle wrong/correct example

---
 flash/SKILL.md | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 1c284a91..46b54d28 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -135,22 +135,7 @@ PodTemplate(
 
 ## Cloudpickle Scoping (CRITICAL)
 
-Decorated functions are serialized. They can ONLY access:
-- Parameters, local variables, imports inside the function, builtins
-
-```python
-# WRONG
-import torch
-@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"])
-async def bad(x):
-    return torch.tensor(x)  # NameError
-
-# CORRECT
-@Endpoint(name="w", gpu=GpuGroup.ADA_24, dependencies=["torch"])
-async def good(x):
-    import torch
-    return torch.tensor(x)
-```
+Decorated functions are serialized. They can ONLY access: parameters, local variables, imports inside the function, builtins. All imports must be inside the function body.
 
 ## EndpointJob
 

From db721ed5b09f778659c897a0ebd4819d9aae85a9 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 22:59:58 +0100
Subject: [PATCH 15/24] chore: remove redundant cloudpickle section, keep in
 gotchas

---
 flash/SKILL.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 46b54d28..82b14ff0 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -133,10 +133,6 @@ PodTemplate(
 )
 ```
 
-## Cloudpickle Scoping (CRITICAL)
-
-Decorated functions are serialized. They can ONLY access: parameters, local variables, imports inside the function, builtins. All imports must be inside the function body.
-
 ## EndpointJob
 
 Returned by `ep.run()` and `ep.runsync()` in client mode.

From a7e5d7665f1fc75559cac756ad425712dbefffef Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:05:02 +0100
Subject: [PATCH 16/24] chore: remove redundant import from intro line

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 82b14ff0..536da103 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -6,7 +6,7 @@ user-invocable: true
 
 # Runpod Flash (v1.7.0)
 
-`pip install runpod-flash` | `from runpod_flash import Endpoint, GpuGroup` | Python >=3.10
+`pip install runpod-flash` | Python >=3.10
 
 One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment.
 

From 7da03d4251ba8e87c783151de18e986e6b687ce7 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:05:49 +0100
Subject: [PATCH 17/24] chore: remove version from skill title

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 536da103..2aaa751d 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -4,7 +4,7 @@ description: runpod-flash SDK and CLI for deploying AI workloads on Runpod serve
 user-invocable: true
 ---
 
-# Runpod Flash (v1.7.0)
+# Runpod Flash
 
 `pip install runpod-flash` | Python >=3.10
 

From b71449aa5fada3c2f390706c17043f6d58018238 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:07:49 +0100
Subject: [PATCH 18/24] chore: add local dev workflow context to skill intro

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 2aaa751d..0bae442d 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -8,7 +8,7 @@ user-invocable: true
 
 `pip install runpod-flash` | Python >=3.10
 
-One class -- `Endpoint` -- handles everything: decorator mode, load-balanced routes, and external image deployment.
+Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything.
 
 ## Endpoint: Three Modes
 

From ee6de571569b3bbb66f4fb7aaf2e2ad98a1ec479 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:09:06 +0100
Subject: [PATCH 19/24] chore: move CLI to top as code block with examples,
 remove old CLI/auth sections

---
 flash/SKILL.md | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 0bae442d..2e2b4d90 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -10,6 +10,24 @@ user-invocable: true
 
 Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything.
 
+## CLI
+
+```bash
+flash login                              # authenticate via browser (or set RUNPOD_API_KEY env var)
+flash init my-project                    # scaffold a new project in ./my-project
+flash run                                # start dev server at localhost:8888
+flash run --auto-provision               # same, but pre-provision endpoints (no cold start)
+flash build                              # package artifact for deployment (500MB limit)
+flash build --exclude pkg1,pkg2          # exclude packages from build
+flash deploy new staging                 # deploy to "staging" environment
+flash deploy send staging                # send latest build to "staging"
+flash deploy list staging                # list deployments in "staging"
+flash deploy info staging                # show deployment details
+flash deploy delete staging              # delete "staging" deployment
+flash undeploy list                      # list all active endpoints
+flash undeploy my-endpoint               # remove a specific endpoint
+```
+
 ## Endpoint: Three Modes
 
 ### Mode 1: Your Code (Queue-Based Decorator)
@@ -214,23 +232,6 @@ import asyncio
 results = await asyncio.gather(compute(a), compute(b), compute(c))
 ```
 
-## CLI
-
-| Command | Description |
-|---------|-------------|
-| `flash login` | Authenticate via browser |
-| `flash init [name]` | Create project template |
-| `flash run [--auto-provision]` | Local dev server at localhost:8888 |
-| `flash build [--exclude pkg1,pkg2]` | Package artifact (500MB limit) |
-| `flash deploy new/send/list/info/delete <env>` | Deploy to production |
-| `flash undeploy list/<name>` | Remove endpoints |
-
-## Authentication
-
-Two ways to authenticate:
-- `flash login` -- browser-based OAuth (recommended)
-- `RUNPOD_API_KEY` env var -- set in `.env` or export directly
-
 ## Gotchas
 
 1. **Imports outside function** -- most common error. Everything inside the decorated function.

From a43d1c888d8f9e7bc3c222f7eba2a1f39e009d16 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:09:44 +0100
Subject: [PATCH 20/24] chore: add setup section with install and auth before
 CLI

---
 flash/SKILL.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 2e2b4d90..8493ac4e 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -6,16 +6,21 @@ user-invocable: true
 
 # Runpod Flash
 
-`pip install runpod-flash` | Python >=3.10
-
 Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything.
 
-## CLI
+## Setup
 
 ```bash
-flash login                              # authenticate via browser (or set RUNPOD_API_KEY env var)
+pip install runpod-flash                 # requires Python >=3.10
+flash login                              # authenticate via browser
+# OR: export RUNPOD_API_KEY=your_key     # alternative: set API key directly
 flash init my-project                    # scaffold a new project in ./my-project
-flash run                                # start dev server at localhost:8888
+```
+
+## CLI
+
+```bash
+flash run                                # start local dev server at localhost:8888
 flash run --auto-provision               # same, but pre-provision endpoints (no cold start)
 flash build                              # package artifact for deployment (500MB limit)
 flash build --exclude pkg1,pkg2          # exclude packages from build

From 8d27d270d6cd632e29796da953747491a90b3242 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:10:47 +0100
Subject: [PATCH 21/24] chore: separate flash login and RUNPOD_API_KEY as
 distinct auth options

---
 flash/SKILL.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 8493ac4e..5a645aed 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -12,8 +12,13 @@ Write code locally, test with `flash run` (dev server at localhost:8888), and fl
 
 ```bash
 pip install runpod-flash                 # requires Python >=3.10
-flash login                              # authenticate via browser
-# OR: export RUNPOD_API_KEY=your_key     # alternative: set API key directly
+
+# auth option 1: browser-based login (saves token locally)
+flash login
+
+# auth option 2: API key via environment variable
+export RUNPOD_API_KEY=your_key
+
 flash init my-project                    # scaffold a new project in ./my-project
 ```
 

From f81c03135519f033b9da27b618aa02e3b99de04e Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:12:22 +0100
Subject: [PATCH 22/24] chore: simplify endpoint intro line

---
 flash/SKILL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 5a645aed..d8fcb7a3 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -6,7 +6,7 @@ user-invocable: true
 
 # Runpod Flash
 
-Write code locally, test with `flash run` (dev server at localhost:8888), and flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. One class -- `Endpoint` -- handles everything.
+Write code locally, test with `flash run` (dev server at localhost:8888), and Flash automatically provisions and deploys to remote GPUs/CPUs in the cloud. `Endpoint` handles everything.
 
 ## Setup
 

From ee8a866129eb9cd969af545564efd5f9f905e172 Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:20:14 +0100
Subject: [PATCH 23/24] chore: add multi-GPU list support, update examples to
 workers=5, add auto-switch gotcha

---
 flash/SKILL.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index d8fcb7a3..1851069c 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -47,7 +47,7 @@ One function = one endpoint with its own workers.
 ```python
 from runpod_flash import Endpoint, GpuGroup
 
-@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"])
+@Endpoint(name="my-worker", gpu=GpuGroup.AMPERE_80, workers=5, dependencies=["torch"])
 async def compute(data):
     import torch  # MUST import inside function (cloudpickle)
     return {"sum": torch.tensor(data, device="cuda").sum().item()}
@@ -122,9 +122,10 @@ print(job.output)
 Endpoint(
     name="endpoint-name",                  # required (unless id= set)
     id=None,                               # connect to existing endpoint
-    gpu=GpuGroup.AMPERE_80,               # GPU type (default: ANY)
+    gpu=GpuGroup.AMPERE_80,               # single GPU type (default: ANY)
+    gpu=[GpuGroup.ADA_24, GpuGroup.AMPERE_80],  # or list for auto-select by supply
     cpu=CpuInstanceType.CPU5C_4_8,        # CPU type (mutually exclusive with gpu)
-    workers=3,                             # shorthand for (0, 3)
+    workers=5,                             # shorthand for (0, 5)
     workers=(1, 5),                        # explicit (min, max)
     idle_timeout=60,                       # seconds before scale-down (default: 60)
     dependencies=["torch"],                # pip packages for remote exec
@@ -139,7 +140,7 @@ Endpoint(
 ```
 
 - `gpu=` and `cpu=` are mutually exclusive
-- `workers=3` means `(0, 3)`. Default is `(0, 1)`
+- `workers=5` means `(0, 5)`. Default is `(0, 1)`
 - `idle_timeout` default is **60 seconds**
 - `flashboot=True` (default) -- enables fast cold starts via snapshot restore
 - `gpu_count` -- GPUs per worker (default 1), use >1 for multi-GPU models
@@ -225,7 +226,7 @@ async def preprocess(raw):
     import pandas as pd
     return pd.DataFrame(raw).to_dict("records")
 
-@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=3, dependencies=["torch"])
+@Endpoint(name="infer", gpu=GpuGroup.AMPERE_80, workers=5, dependencies=["torch"])
 async def infer(clean):
     import torch
     t = torch.tensor([[v for v in r.values()] for r in clean], device="cuda")
@@ -251,3 +252,4 @@ results = await asyncio.gather(compute(a), compute(b), compute(c))
 5. **idle_timeout is seconds** -- default 60s, not minutes.
 6. **10MB payload limit** -- pass URLs, not large objects.
 7. **Client vs decorator** -- `image=`/`id=` = client. Otherwise = decorator.
+8. **Auto GPU switching requires max workers >= 5** -- pass a list of GPU types (e.g. `gpu=[GpuGroup.ADA_24, GpuGroup.AMPERE_80]`) and set `workers=5` or higher (for the tuple form, a max of at least 5, e.g. `workers=(1, 5)`). The platform only auto-switches GPU types based on supply when max workers is at least 5.

From 6773ea53fe3a12cdf818adc4e4b66d693c53875d Mon Sep 17 00:00:00 2001
From: Tim Pietrusky <tim.pietrusky@runpod.io>
Date: Thu, 5 Mar 2026 23:42:36 +0100
Subject: [PATCH 24/24] fix: correct fabricated CLI commands and add missing
 constructor param

- Replace made-up `flash deploy new/send/list/info/delete` with actual
  `flash deploy --env`, `flash env list/create/get/delete` commands
- Add `flash deploy --preview` for local Docker preview
- Add `execution_timeout_ms` to Endpoint constructor
---
 flash/SKILL.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/flash/SKILL.md b/flash/SKILL.md
index 1851069c..16d21eba 100644
--- a/flash/SKILL.md
+++ b/flash/SKILL.md
@@ -29,11 +29,13 @@ flash run                                # start local dev server at localhost:8
 flash run --auto-provision               # same, but pre-provision endpoints (no cold start)
 flash build                              # package artifact for deployment (500MB limit)
 flash build --exclude pkg1,pkg2          # exclude packages from build
-flash deploy new staging                 # deploy to "staging" environment
-flash deploy send staging                # send latest build to "staging"
-flash deploy list staging                # list deployments in "staging"
-flash deploy info staging                # show deployment details
-flash deploy delete staging              # delete "staging" deployment
+flash deploy                             # build + deploy (auto-selects env if only one)
+flash deploy --env staging               # build + deploy to "staging" environment
+flash deploy --preview                   # build + launch local preview in Docker
+flash env list                           # list deployment environments
+flash env create staging                 # create "staging" environment
+flash env get staging                    # show environment details + resources
+flash env delete staging                 # delete environment + tear down resources
 flash undeploy list                      # list all active endpoints
 flash undeploy my-endpoint               # remove a specific endpoint
 ```
@@ -136,6 +138,7 @@ Endpoint(
     gpu_count=1,                           # GPUs per worker
     template=PodTemplate(containerDiskInGb=100),
     flashboot=True,                        # fast cold starts
+    execution_timeout_ms=0,                # max execution time in ms (0 = unlimited)
 )
 ```