Skip to content

Commit cc2d7e2

Browse files
joe4devclaude
andcommitted
fix(init): derive folded init-error reporting from invoke outcome and rapid's init duration
Reporting fixes for the on-demand folded-into-invoke (suppressed init) path: - The invoke handler no longer unconditionally forces error status from the recorded init failure: the REPORT Status/Error Type lines and the /error routing now apply only when the invocation actually failed, and the recorded failure is cleared once an invocation succeeds. Previously one init failure permanently tainted a recovered environment - every later (successful) invocation was posted to /error with an error REPORT line. Persistent failures keep re-emitting the full failure envelope per invoke. - RecordInitError captures the failure type detected by rapidcore for runtimes that crashed WITHOUT calling /init/error (sys.exit, segfault, invalid entrypoint), so those failures render the same INIT_REPORT(phase=invoke) and REPORT Status/Error Type lines as the /init/error-reported flavor instead of a healthy-looking envelope. - REPORT's Init Duration now uses rapid's authoritative Init-phase measurement captured from the INIT_REPORT(phase=init) lifecycle event, instead of wall-clock time.Since(initStart) at invoke arrival, which wrongly included the idle gap between init completion and the first invoke (minutes for provisioned concurrency). It is also omitted for non-on-demand invokes, matching AWS, which reports provisioned concurrency init in separate provisioning-time log streams. - Document that SnapStart environments intentionally use the on-demand error model in LocalStack (they initialize lazily at the first invoke, not at version publish). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent c8fc5fb commit cc2d7e2

3 files changed

Lines changed: 84 additions & 13 deletions

File tree

cmd/localstack/custom_interop.go

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ type CustomInteropServer struct {
3434
// logCollector accumulates the runtime's stdout/stderr plus the synthetic START/REPORT/
3535
// INIT_REPORT lines that are flushed to LocalStack with each invocation's logs.
3636
logCollector *LogCollector
37+
// eventsAPI provides rapid's authoritative Init-phase duration (see events.go), used for
38+
// the REPORT/INIT_REPORT log lines instead of wall-clock measurements at invoke arrival.
39+
eventsAPI *LocalStackEventsAPI
3740
// initStart is set once in Init() and warmStart is flipped on the first invoke.
3841
// Both are accessed only from the single sequential init -> invoke flow (the RIE
3942
// processes one invocation at a time), so they need no additional synchronization.
@@ -51,12 +54,20 @@ type CustomInteropServer struct {
5154
// initErrorType holds rapidcore's scrubbed fatal error type (e.g. Runtime.Unknown) when init
5255
// failed, used to render the INIT_REPORT(phase=invoke) and REPORT Status/Error Type lines for
5356
// the on-demand folded-into-invoke path. Stores a string; empty/unset means init did not fail.
57+
// It persists while invocations keep failing (each one re-runs the init as a suppressed init
58+
// and AWS re-emits the failure envelope), and is cleared by the invoke handler once an
59+
// invocation succeeds so a recovered environment is not tainted by the original failure.
5460
initErrorType atomic.Value
5561
// onDemand is true for on-demand functions, where AWS folds a failed cold-start init into
5662
// the first invocation (suppressed init). For these we do NOT report init failures via
5763
// /status/error; instead we signal ready and let the first invoke surface the error with
58-
// the full INIT_REPORT/START/END/REPORT envelope. Provisioned concurrency, SnapStart, and
59-
// Managed Instances keep the provisioning-time /status/error model.
64+
// the full INIT_REPORT/START/END/REPORT envelope. Provisioned concurrency and Managed
65+
// Instances keep the provisioning-time /status/error model. SnapStart environments are
66+
// also classified on-demand here (LocalStack sets AWS_LAMBDA_INITIALIZATION_TYPE=on-demand
67+
// for them and initializes them lazily at the first invoke, not at version publish), so the
68+
// fold-into-invoke model applies to them too.
69+
// TODO: set AWS_LAMBDA_INITIALIZATION_TYPE=snap-start on the LocalStack side for env-var
70+
// parity with AWS once SnapStart environments get their own initialization type.
6071
onDemand bool
6172
}
6273

@@ -121,13 +132,14 @@ func (l *LocalStackAdapter) SendResult(invokeId string, body []byte, isError boo
121132
return nil
122133
}
123134

124-
func NewCustomInteropServer(lsOpts *LsOpts, adapter *LocalStackAdapter, delegate interop.Server, logCollector *LogCollector) (server *CustomInteropServer) {
135+
func NewCustomInteropServer(lsOpts *LsOpts, adapter *LocalStackAdapter, delegate interop.Server, logCollector *LogCollector, eventsAPI *LocalStackEventsAPI) (server *CustomInteropServer) {
125136
server = &CustomInteropServer{
126137
delegate: delegate.(*rapidcore.Server),
127138
port: lsOpts.InteropPort,
128139
upstreamEndpoint: lsOpts.RuntimeEndpoint,
129140
localStackAdapter: adapter,
130141
logCollector: logCollector,
142+
eventsAPI: eventsAPI,
131143
onDemand: GetenvWithDefault("AWS_LAMBDA_INITIALIZATION_TYPE", "on-demand") == "on-demand",
132144
}
133145

@@ -153,17 +165,27 @@ func NewCustomInteropServer(lsOpts *LsOpts, adapter *LocalStackAdapter, delegate
153165

154166
initErrType, _ := server.initErrorType.Load().(string)
155167

168+
// First invocation into a successfully initialized on-demand environment: REPORT
169+
// carries the Init phase duration as measured by rapid (init start -> init end).
170+
// Provisioned concurrency / Managed Instances initialize at provisioning time and
171+
// AWS omits Init Duration from their invokes' REPORT lines.
156172
initDuration := ""
157-
if !server.warmStart && !server.initTimedOut.Load() && initErrType == "" {
158-
initTimeMS := float64(time.Since(server.initStart).Nanoseconds()) / float64(time.Millisecond)
159-
initDuration = fmt.Sprintf("Init Duration: %.2f ms\t", initTimeMS)
173+
if server.onDemand && !server.warmStart && !server.initTimedOut.Load() && initErrType == "" {
174+
if initTimeMS, ok := server.eventsAPI.InitDurationMS(); ok {
175+
initDuration = fmt.Sprintf("Init Duration: %.2f ms\t", initTimeMS)
176+
}
160177
}
161178
server.warmStart = true
162179

163180
// On-demand init failure folded into this invocation (AWS suppressed init): emit
164-
// the INIT_REPORT(phase=invoke) line before START (emitted during Invoke below).
181+
// the INIT_REPORT(phase=invoke) line before START (emitted during Invoke below),
182+
// reporting the failed init's duration (rapid's measurement when available; the
183+
// wall-clock fallback covers inits that died before emitting INIT_REPORT).
165184
if initErrType != "" {
166-
initTimeMS := float64(time.Since(server.initStart).Nanoseconds()) / float64(time.Millisecond)
185+
initTimeMS, ok := server.eventsAPI.InitDurationMS()
186+
if !ok {
187+
initTimeMS = float64(time.Since(server.initStart).Nanoseconds()) / float64(time.Millisecond)
188+
}
167189
_, _ = fmt.Fprintf(logCollector,
168190
"INIT_REPORT Init Duration: %.2f ms\tPhase: invoke\tStatus: error\tError Type: %s\n",
169191
initTimeMS, initErrType)
@@ -219,11 +241,19 @@ func NewCustomInteropServer(lsOpts *LsOpts, adapter *LocalStackAdapter, delegate
219241
log.Fatalln(err)
220242
}
221243
}
222-
// On-demand init failure folded into this invocation: the REPORT carries the
223-
// failure status and rapidcore's scrubbed fatal error type (e.g. Runtime.Unknown).
244+
// On-demand init failure folded into this invocation: when the suppressed init
245+
// re-run (and thus the invoke) failed again, the REPORT carries the failure status
246+
// and rapidcore's scrubbed fatal error type (e.g. Runtime.Unknown). When the
247+
// invocation succeeded (the suppressed re-init recovered from a transient init
248+
// failure), the result stands on its own — AWS reports it as successful — and the
249+
// cached init failure is cleared so later invocations are not tainted by it.
224250
if initErrType != "" {
225-
isErr = true
226-
status = "Status: error\tError Type: " + initErrType
251+
if err != nil {
252+
isErr = true
253+
status = "Status: error\tError Type: " + initErrType
254+
} else {
255+
server.initErrorType.Store("")
256+
}
227257
}
228258
// optional sleep. can be used for debugging purposes
229259
if lsOpts.PostInvokeWaitMS != "" {
@@ -365,6 +395,21 @@ func (c *CustomInteropServer) SendInitError(errType fatalerror.ErrorType, errMsg
365395
}
366396
}
367397

398+
// RecordInitError records the structured init failure detected by rapidcore for runtimes that
399+
// failed WITHOUT calling /init/error (crash, sys.exit, invalid entrypoint), so the on-demand
400+
// folded-into-invoke path renders the same INIT_REPORT(phase=invoke) and REPORT Status/Error
401+
// Type lines as the /init/error-reported flavor. It must not overwrite a type already recorded
402+
// by SendInitErrorResponse: the runtime-reported error is the authoritative one.
403+
func (c *CustomInteropServer) RecordInitError(errType fatalerror.ErrorType) {
404+
if recorded, _ := c.initErrorType.Load().(string); recorded != "" {
405+
return
406+
}
407+
if errType == "" {
408+
errType = fatalerror.RuntimeExit
409+
}
410+
c.initErrorType.Store(string(errType))
411+
}
412+
368413
func (c *CustomInteropServer) GetCurrentInvokeID() string {
369414
log.Traceln("GetCurrentInvokeID called")
370415
return c.delegate.GetCurrentInvokeID()

cmd/localstack/events.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ package main
22

33
import (
44
"fmt"
5+
"sync/atomic"
56

67
"github.com/aws/aws-lambda-runtime-interface-emulator/internal/lambda/interop"
78
"github.com/aws/aws-lambda-runtime-interface-emulator/internal/lambda/rapidcore/standalone/telemetry"
9+
lambdatelemetry "github.com/aws/aws-lambda-runtime-interface-emulator/internal/lambda/telemetry"
810
)
911

1012
// LocalStackEventsAPI rides rapidcore's invoke lifecycle events to emit the synthetic START
@@ -15,6 +17,12 @@ import (
1517
type LocalStackEventsAPI struct {
1618
*telemetry.StandaloneEventsAPI
1719
logCollector *LogCollector
20+
// initDurationMS holds rapid's authoritative measurement of the Init phase (init start ->
21+
// init end, monotonic), captured from the INIT_REPORT(phase=init) lifecycle event emitted
22+
// by doRuntimeDomainInit. The /invoke handler renders it as the first invocation's REPORT
23+
// "Init Duration" instead of re-measuring at invoke arrival, which would wrongly include
24+
// the idle gap between init completion and the first invoke dispatch.
25+
initDurationMS atomic.Value // float64
1826
}
1927

2028
func NewLocalStackEventsAPI(logCollector *LogCollector) *LocalStackEventsAPI {
@@ -28,3 +36,17 @@ func (e *LocalStackEventsAPI) SendInvokeStart(data interop.InvokeStartData) erro
2836
_, _ = fmt.Fprintf(e.logCollector, "START RequestId: %s Version: %s\n", data.RequestID, data.Version)
2937
return e.StandaloneEventsAPI.SendInvokeStart(data)
3038
}
39+
40+
func (e *LocalStackEventsAPI) SendInitReport(data interop.InitReportData) error {
41+
if data.Phase == lambdatelemetry.InitInsideInitPhase {
42+
e.initDurationMS.Store(data.Metrics.DurationMs)
43+
}
44+
return e.StandaloneEventsAPI.SendInitReport(data)
45+
}
46+
47+
// InitDurationMS returns rapid's measured duration of the startup Init phase and whether one
48+
// was recorded (false e.g. when the init phase timed out and never completed).
49+
func (e *LocalStackEventsAPI) InitDurationMS() (float64, bool) {
50+
durationMS, ok := e.initDurationMS.Load().(float64)
51+
return durationMS, ok
52+
}

cmd/localstack/main.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ func main() {
238238
runDaemon(d) // async
239239

240240
defaultInterop := sandbox.DefaultInteropServer()
241-
interopServer := NewCustomInteropServer(lsOpts, lsAdapter, defaultInterop, logCollector)
241+
interopServer := NewCustomInteropServer(lsOpts, lsAdapter, defaultInterop, logCollector, lsEventsAPI)
242242
sandbox.SetInteropServer(interopServer)
243243
if len(handler) > 0 {
244244
sandbox.SetHandler(handler)
@@ -323,7 +323,11 @@ func main() {
323323
// invoke, which surfaces the cached init error (or a runtime-exit error) together with
324324
// the full INIT_REPORT/START/END/REPORT log envelope. SendInitErrorResponse has already
325325
// cached the structured error (without reporting via /status/error for on-demand).
326+
// Record the failure type detected by rapidcore so runtimes that crashed WITHOUT
327+
// calling /init/error still get the error envelope (no-op when /init/error already
328+
// recorded the runtime-reported type).
326329
log.Debugln("Init failed; deferring to first invocation (on-demand suppressed init).")
330+
interopServer.RecordInitError(initResp.InitErrorType)
327331
case err != nil:
328332
// PC/SnapStart/MI, or an init-phase reset: report the failure now and exit. When the
329333
// runtime reported its own error via /init/error, SendInitErrorResponse already

0 commit comments

Comments
 (0)