feat(supervisor): verify warm-start delivery, cold-start silently lost dispatches (#3918)

myftija · web-flow · commit 002b8458d583 · 2026-06-16T14:14:53.000+01:00
### Problem

Firestarter's `didWarmStart: true` means the response was written to a
long-poll socket — not that the runner received it. A silently dead
poller (no FIN, e.g. a VM torn down mid-poll) leaves the dispatched run
stuck in `PENDING_EXECUTING` until the run engine's heartbeat redrive,
and each redrive burns a queue redelivery toward
`TASK_RUN_DEQUEUED_MAX_RETRIES`.

### Change

After a warm-start hit, the supervisor retains the `DequeuedMessage` in a
TimerWheel (default 10s) and then probes the existing `getLatestSnapshot`
API. If the run is still on the exact snapshot it was dequeued with, no
runner ever acted on the dispatch, so the supervisor falls back to the
regular cold-create path. Recovery: ~10s + cold start, no new APIs, no CLI
changes.

- **Double-start safe**: `startRunAttempt` runs under a per-run lock and
rejects stale snapshot ids with a 409, so a reviving runner and the
fallback workload can't both execute; the loser exits before running
anything.
- **Probe errors → do nothing**: healthy runners legitimately act late
during platform brownouts (nested attempt-start retries), so falling
back on uncertainty would stampede duplicates. The heartbeat redrive
stays as the backstop (also covers supervisor restarts dropping timers).
- **Off by default**: `TRIGGER_WARM_START_VERIFY_ENABLED` (+
`TRIGGER_WARM_START_VERIFY_DELAY_MS`, 1–60s, default 10s). Disabled =
complete no-op; the verifier is also only constructed when the supervisor
has a `warmStartUrl`. Works for all workload managers (compute/k8s/docker)
since it hooks the shared dequeue path.
- Emits `warmstart.verify` wide events (`outcome: delivered | fallback |
probe_error`), making the silent-loss rate directly measurable.
diff --git a/.server-changes/warm-start-delivery-verification.md b/.server-changes/warm-start-delivery-verification.md
@@ -0,0 +1,6 @@
+---
+area: supervisor
+type: feature
+---
+
+Verify warm-start dispatches were acted on and cold-start the run within seconds when a dispatch is silently lost (opt-in via TRIGGER_WARM_START_VERIFY_ENABLED).
diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts
@@ -79,6 +79,16 @@ const Env = z
     TRIGGER_CHECKPOINT_URL: z.string().optional(),
     TRIGGER_METADATA_URL: z.string().optional(),
 
+    // Warm-start delivery verification: after a warm-start hit, probe the
+    // platform and cold-start the run if no runner acted on the dispatch
+    TRIGGER_WARM_START_VERIFY_ENABLED: BoolEnv.default(false),
+    TRIGGER_WARM_START_VERIFY_DELAY_MS: z.coerce
+      .number()
+      .int()
+      .min(1_000)
+      .max(60_000)
+      .default(10_000),
+
     // Used by the resource monitor
     RESOURCE_MONITOR_ENABLED: BoolEnv.default(false),
     RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(),
diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts
@@ -27,6 +27,10 @@ import { PodCleaner } from "./services/podCleaner.js";
 import { FailedPodHandler } from "./services/failedPodHandler.js";
 import { getWorkerToken } from "./workerToken.js";
 import { OtlpTraceService } from "./services/otlpTraceService.js";
+import {
+  WarmStartVerificationService,
+  type WarmStartTimings,
+} from "./services/warmStartVerificationService.js";
 import { extractTraceparent, getRestoreRunnerId } from "./util.js";
 import { Redis } from "ioredis";
 import { BackpressureMonitor } from "./backpressure/backpressureMonitor.js";
@@ -63,6 +67,7 @@ class ManagedSupervisor {
   private readonly logger = new SimpleStructuredLogger("managed-supervisor");
   private readonly resourceMonitor: ResourceMonitor;
   private readonly checkpointClient?: CheckpointClient;
+  private readonly warmStartVerifier?: WarmStartVerificationService;
 
   private readonly podCleaner?: PodCleaner;
   private readonly failedPodHandler?: FailedPodHandler;
@@ -311,6 +316,19 @@ class ManagedSupervisor {
       });
     }
 
+    if (env.TRIGGER_WARM_START_VERIFY_ENABLED && this.warmStartUrl) {
+      this.logger.log("Warm-start delivery verification enabled", {
+        delayMs: env.TRIGGER_WARM_START_VERIFY_DELAY_MS,
+      });
+
+      this.warmStartVerifier = new WarmStartVerificationService({
+        workerClient: this.workerSession.httpClient,
+        delayMs: env.TRIGGER_WARM_START_VERIFY_DELAY_MS,
+        createWorkload: (message, timings) => this.createWorkload(message, timings),
+        wideEventOpts: this.wideEventOpts,
+      });
+    }
+
     this.workerSession.on("runNotification", async ({ time, run }) => {
       this.logger.verbose("runNotification", { time, run });
 
@@ -467,66 +485,24 @@ class ManagedSupervisor {
             if (didWarmStart) {
               setExtra(fromContext(), "path_taken", "warm_start");
               this.logger.debug("Warm start successful", { runId: message.run.id });
-              return;
-            }
-
-            setExtra(fromContext(), "path_taken", "cold_create");
-
-            const createStart = performance.now();
-            try {
-              if (!message.deployment.friendlyId) {
-                // mostly a type guard, deployments always exists for deployed environments
-                // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments.
-                throw new Error("Deployment is missing");
-              }
-
-              await this.workloadManager.create({
-                dequeuedAt: message.dequeuedAt,
+              // A hit only means the response was written to the long-poll
+              // socket, not that the runner received it. Schedule a delivery
+              // verification that cold-starts the run if nobody acts on it.
+              this.warmStartVerifier?.schedule(message, {
                 dequeueResponseMs,
                 pollingIntervalMs,
                 warmStartCheckMs,
-                envId: message.environment.id,
-                envType: message.environment.type,
-                image: message.image,
-                machine: message.run.machine,
-                orgId: message.organization.id,
-                projectId: message.project.id,
-                deploymentFriendlyId: message.deployment.friendlyId,
-                deploymentVersion: message.backgroundWorker.version,
-                runId: message.run.id,
-                runFriendlyId: message.run.friendlyId,
-                version: message.version,
-                nextAttemptNumber: message.run.attemptNumber,
-                snapshotId: message.snapshot.id,
-                snapshotFriendlyId: message.snapshot.friendlyId,
-                placementTags: message.placementTags,
-                traceContext: message.run.traceContext,
-                annotations: message.run.annotations,
-                hasPrivateLink: message.organization.hasPrivateLink,
               });
-              recordPhaseSince("workload_create", createStart, undefined);
-              workloadCreateDuration.observe(
-                { backend: this.workloadManagerBackend, outcome: "success" },
-                (performance.now() - createStart) / 1000
-              );
-
-              // Disabled for now
-              // this.resourceMonitor.blockResources({
-              //   cpu: message.run.machine.cpu,
-              //   memory: message.run.machine.memory,
-              // });
-            } catch (error) {
-              recordPhaseSince(
-                "workload_create",
-                createStart,
-                error instanceof Error ? error : new Error(String(error))
-              );
-              workloadCreateDuration.observe(
-                { backend: this.workloadManagerBackend, outcome: "error" },
-                (performance.now() - createStart) / 1000
-              );
-              this.logger.error("Failed to create workload", { error });
+              return;
             }
+
+            setExtra(fromContext(), "path_taken", "cold_create");
+
+            await this.createWorkload(message, {
+              dequeueResponseMs,
+              pollingIntervalMs,
+              warmStartCheckMs,
+            });
           }
         );
       }
@@ -561,6 +537,8 @@ class ManagedSupervisor {
 
   async onRunConnected({ run }: { run: { friendlyId: string } }) {
     this.logger.debug("Run connected", { run });
+    // The dispatched run reached a runner on this node - no fallback needed.
+    this.warmStartVerifier?.cancel(run.friendlyId);
     this.workerSession.subscribeToRunNotifications([run.friendlyId]);
   }
 
@@ -569,6 +547,72 @@ class ManagedSupervisor {
     this.workerSession.unsubscribeFromRunNotifications([run.friendlyId]);
   }
 
+  private async createWorkload(message: DequeuedMessage, timings: WarmStartTimings) {
+    const createStart = performance.now();
+    try {
+      if (!message.deployment.friendlyId) {
+        // mostly a type guard, deployments always exist for deployed environments
+        // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments.
+        throw new Error("Deployment is missing");
+      }
+
+      if (!message.image) {
+        // same type-guard situation as deployment above
+        throw new Error("Image is missing");
+      }
+
+      await this.workloadManager.create({
+        dequeuedAt: message.dequeuedAt,
+        dequeueResponseMs: timings.dequeueResponseMs,
+        pollingIntervalMs: timings.pollingIntervalMs,
+        warmStartCheckMs: timings.warmStartCheckMs,
+        envId: message.environment.id,
+        envType: message.environment.type,
+        image: message.image,
+        machine: message.run.machine,
+        orgId: message.organization.id,
+        projectId: message.project.id,
+        deploymentFriendlyId: message.deployment.friendlyId,
+        deploymentVersion: message.backgroundWorker.version,
+        runId: message.run.id,
+        runFriendlyId: message.run.friendlyId,
+        version: message.version,
+        nextAttemptNumber: message.run.attemptNumber,
+        snapshotId: message.snapshot.id,
+        snapshotFriendlyId: message.snapshot.friendlyId,
+        placementTags: message.placementTags,
+        traceContext: message.run.traceContext,
+        annotations: message.run.annotations,
+        hasPrivateLink: message.organization.hasPrivateLink,
+      });
+      recordPhaseSince("workload_create", createStart, undefined);
+      workloadCreateDuration.observe(
+        { backend: this.workloadManagerBackend, outcome: "success" },
+        (performance.now() - createStart) / 1000
+      );
+
+      // Disabled for now
+      // this.resourceMonitor.blockResources({
+      //   cpu: message.run.machine.cpu,
+      //   memory: message.run.machine.memory,
+      // });
+    } catch (error) {
+      recordPhaseSince(
+        "workload_create",
+        createStart,
+        error instanceof Error ? error : new Error(String(error))
+      );
+      workloadCreateDuration.observe(
+        { backend: this.workloadManagerBackend, outcome: "error" },
+        (performance.now() - createStart) / 1000
+      );
+      this.logger.error("Failed to create workload", {
+        runId: message.run.friendlyId,
+        error,
+      });
+    }
+  }
+
   private async tryWarmStart(
     dequeuedMessage: DequeuedMessage,
     traceparent: string | undefined
@@ -650,6 +694,9 @@ class ManagedSupervisor {
 
   async stop() {
     this.logger.log("Shutting down");
+    // Stop the verifier first: its timer can otherwise fire mid-shutdown and
+    // cold-create a workload on a node that is going down.
+    this.warmStartVerifier?.stop();
     await this.workloadServer.stop();
     await this.workerSession.stop();
 
diff --git a/apps/supervisor/src/services/warmStartVerificationService.test.ts b/apps/supervisor/src/services/warmStartVerificationService.test.ts
@@ -0,0 +1,132 @@
+import { describe, expect, it, vi } from "vitest";
+import { setTimeout as sleep } from "node:timers/promises";
+import { WarmStartVerificationService } from "./warmStartVerificationService.js";
+import type { DequeuedMessage } from "@trigger.dev/core/v3";
+import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers";
+
+// The TimerWheel ticks every 100ms, so a 1000ms delay (the env minimum)
+// fires within ~1.1s.
+const DELAY_MS = 1_000;
+// Long enough that a pending verification would certainly have fired.
+const SETTLE_MS = 1_600;
+
+const DEQUEUED_SNAPSHOT_ID = "snapshot_dequeued";
+
+function makeMessage(runFriendlyId = "run_1"): DequeuedMessage {
+  return {
+    run: { friendlyId: runFriendlyId },
+    snapshot: { friendlyId: DEQUEUED_SNAPSHOT_ID },
+  } as unknown as DequeuedMessage;
+}
+
+function createService(opts: {
+  latestSnapshotId?: string;
+  probeError?: boolean;
+}) {
+  const getLatestSnapshot = vi.fn(async (_runId: string) =>
+    opts.probeError
+      ? { success: false as const, error: "connection refused" }
+      : {
+          success: true as const,
+          data: { execution: { snapshot: { friendlyId: opts.latestSnapshotId } } },
+        }
+  );
+
+  const createWorkload = vi.fn(async () => {});
+
+  const service = new WarmStartVerificationService({
+    workerClient: { getLatestSnapshot } as unknown as SupervisorHttpClient,
+    delayMs: DELAY_MS,
+    createWorkload,
+    wideEventOpts: { service: "supervisor-test", env: {}, enabled: false },
+  });
+
+  return { service, getLatestSnapshot, createWorkload };
+}
+
+describe("WarmStartVerificationService", () => {
+  it("falls back to a cold create when the snapshot is unchanged", async () => {
+    const { service, createWorkload } = createService({
+      latestSnapshotId: DEQUEUED_SNAPSHOT_ID,
+    });
+    try {
+      const message = makeMessage();
+      const timings = { warmStartCheckMs: 12 };
+      service.schedule(message, timings);
+
+      await vi.waitFor(() => expect(createWorkload).toHaveBeenCalledTimes(1), {
+        timeout: 3_000,
+      });
+      expect(createWorkload).toHaveBeenCalledWith(message, timings);
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("does nothing when the snapshot has moved on (delivered)", async () => {
+    const { service, getLatestSnapshot, createWorkload } = createService({
+      latestSnapshotId: "snapshot_executing",
+    });
+    try {
+      service.schedule(makeMessage(), { warmStartCheckMs: 12 });
+
+      await vi.waitFor(() => expect(getLatestSnapshot).toHaveBeenCalledTimes(1), {
+        timeout: 3_000,
+      });
+      await sleep(100);
+      expect(createWorkload).not.toHaveBeenCalled();
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("never falls back when the probe errors", async () => {
+    const { service, getLatestSnapshot, createWorkload } = createService({ probeError: true });
+    try {
+      service.schedule(makeMessage(), { warmStartCheckMs: 12 });
+
+      await vi.waitFor(() => expect(getLatestSnapshot).toHaveBeenCalledTimes(1), {
+        timeout: 3_000,
+      });
+      await sleep(100);
+      expect(createWorkload).not.toHaveBeenCalled();
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("cancel before the delay prevents the probe entirely", async () => {
+    const { service, getLatestSnapshot, createWorkload } = createService({
+      latestSnapshotId: DEQUEUED_SNAPSHOT_ID,
+    });
+    try {
+      service.schedule(makeMessage(), { warmStartCheckMs: 12 });
+
+      expect(service.cancel("run_1")).toBe(true);
+
+      await sleep(SETTLE_MS);
+      expect(getLatestSnapshot).not.toHaveBeenCalled();
+      expect(createWorkload).not.toHaveBeenCalled();
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("re-scheduling the same run replaces the pending verification", async () => {
+    const { service, getLatestSnapshot } = createService({
+      latestSnapshotId: "snapshot_executing",
+    });
+    try {
+      service.schedule(makeMessage(), { warmStartCheckMs: 1 });
+      service.schedule(makeMessage(), { warmStartCheckMs: 2 });
+
+      await vi.waitFor(() => expect(getLatestSnapshot).toHaveBeenCalledTimes(1), {
+        timeout: 3_000,
+      });
+      await sleep(SETTLE_MS);
+      expect(getLatestSnapshot).toHaveBeenCalledTimes(1);
+    } finally {
+      service.stop();
+    }
+  });
+});
diff --git a/apps/supervisor/src/services/warmStartVerificationService.ts b/apps/supervisor/src/services/warmStartVerificationService.ts