Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion common/metrics/metric_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -841,7 +841,12 @@ var (
WithDescription("The amount of time it took to successfully send a task to the DLQ. This only records the"+
" latency of the final attempt to send the task to the DLQ, not the cumulative latency of all attempts."),
)
TaskDiscarded = NewCounterDef("task_errors_discarded")
TaskDiscarded = NewCounterDef("task_errors_discarded")

WorkerCommandsDispatchSuccess = NewCounterDef("worker_commands_dispatch_success")
WorkerCommandsDispatchFailure = NewCounterDef("worker_commands_dispatch_failure")
WorkerCommandsDispatchNoPoller = NewCounterDef("worker_commands_dispatch_no_poller")
WorkerCommandsOperationFailure = NewCounterDef("worker_commands_operation_failure")
TaskSkipped = NewCounterDef("task_skipped")
TaskVersionMisMatch = NewCounterDef("task_errors_version_mismatch")
TasksDependencyTaskNotCompleted = NewCounterDef("task_dependency_task_not_completed")
Expand Down
123 changes: 123 additions & 0 deletions common/nexus/matching_response.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package nexus

import (
"github.com/nexus-rpc/sdk-go/nexus"
enumspb "go.temporal.io/api/enums/v1"
failurepb "go.temporal.io/api/failure/v1"
nexuspb "go.temporal.io/api/nexus/v1"
matchingservice "go.temporal.io/server/api/matchingservice/v1"
"go.temporal.io/server/common/nexus/nexusrpc"
)

// StartOperationResponseToError maps the variant carried by a StartOperationResponse onto a
// Nexus SDK error.
//
// Result per variant:
//   - SyncSuccess, AsyncSuccess: nil.
//   - OperationError (deprecated): a *nexus.OperationError with the message "operation error",
//     the state taken from the proto, and the converted proto failure as its cause.
//   - Failure: the result of operationErrorFromTemporalFailure for the carried failure.
//   - any other variant, including none at all: nil. A nil return therefore does not prove
//     that a success payload is present; callers that need one must inspect the variant.
//
// If the deprecated operation error cannot be marked as a wrapper error, a *nexus.HandlerError
// of type internal is returned instead.
func StartOperationResponseToError(resp *nexuspb.StartOperationResponse) error {
	switch variant := resp.GetVariant().(type) {
	case *nexuspb.StartOperationResponse_SyncSuccess, *nexuspb.StartOperationResponse_AsyncSuccess:
		return nil
	case *nexuspb.StartOperationResponse_Failure:
		return operationErrorFromTemporalFailure(variant.Failure)
	case *nexuspb.StartOperationResponse_OperationError:
		//nolint:staticcheck // Deprecated variant still in use for backward compatibility.
		deprecated := variant.OperationError
		//nolint:staticcheck // Deprecated function still in use for backward compatibility.
		state := nexus.OperationState(deprecated.GetOperationState())
		//nolint:staticcheck // Deprecated function still in use for backward compatibility.
		failure := ProtoFailureToNexusFailure(deprecated.GetFailure())

		wrapped := &nexus.OperationError{
			Message: "operation error",
			State:   state,
			Cause:   &nexus.FailureError{Failure: failure},
		}
		markErr := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), wrapped)
		if markErr != nil {
			return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
		}
		return wrapped
	}
	// Unset or unrecognized variant: nothing to report from here.
	return nil
}

// DispatchNexusTaskResponseToError turns the outcome of a DispatchNexusTaskResponse into a
// Nexus SDK error, or nil when the outcome carries no error.
//
// Result per outcome:
//   - Response: delegated to StartOperationResponseToError on the inner StartOperation
//     response, so the result may be nil or an operation-level error.
//   - RequestTimeout: a *nexus.HandlerError of type upstream timeout.
//   - HandlerError (deprecated proto): converted by handlerErrorFromDeprecatedProto.
//   - Failure: converted by handlerErrorFromTemporalFailure.
//   - anything else, including a missing outcome: a *nexus.HandlerError of type internal
//     with the message "empty outcome".
func DispatchNexusTaskResponseToError(resp *matchingservice.DispatchNexusTaskResponse) error {
	switch outcome := resp.GetOutcome().(type) {
	case *matchingservice.DispatchNexusTaskResponse_Response:
		startOp := outcome.Response.GetStartOperation()
		return StartOperationResponseToError(startOp)
	case *matchingservice.DispatchNexusTaskResponse_RequestTimeout:
		return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeUpstreamTimeout, "upstream timeout")
	case *matchingservice.DispatchNexusTaskResponse_HandlerError:
		return handlerErrorFromDeprecatedProto(outcome.HandlerError)
	case *matchingservice.DispatchNexusTaskResponse_Failure:
		return handlerErrorFromTemporalFailure(outcome.Failure)
	}
	// No recognized outcome was set on the response.
	return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome")
}

// handlerErrorFromTemporalFailure converts a Temporal failure proto into a Go error in two
// steps: Temporal failure -> Nexus failure -> error (via the default failure converter).
// If either step fails, the conversion error is discarded and a generic *nexus.HandlerError
// of type internal is returned in its place.
func handlerErrorFromTemporalFailure(failure *failurepb.Failure) error {
	nexusFailure, convErr := TemporalFailureToNexusFailure(failure)
	if convErr != nil {
		return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
	}
	if converted, err := nexusrpc.DefaultFailureConverter().FailureToError(nexusFailure); err == nil {
		return converted
	}
	return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}

// handlerErrorFromDeprecatedProto builds a *nexus.HandlerError from the deprecated
// HandlerError proto. The error type string is copied over as-is, the proto failure becomes
// the cause, and the retry behavior is translated for the retryable / non-retryable enum
// values; any other value leaves RetryBehavior at its zero value.
func handlerErrorFromDeprecatedProto(he *nexuspb.HandlerError) *nexus.HandlerError {
	var behavior nexus.HandlerErrorRetryBehavior
	if requested := he.GetRetryBehavior(); requested == enumspb.NEXUS_HANDLER_ERROR_RETRY_BEHAVIOR_RETRYABLE {
		behavior = nexus.HandlerErrorRetryBehaviorRetryable
	} else if requested == enumspb.NEXUS_HANDLER_ERROR_RETRY_BEHAVIOR_NON_RETRYABLE {
		behavior = nexus.HandlerErrorRetryBehaviorNonRetryable
	}

	//nolint:staticcheck // Deprecated function still in use for backward compatibility.
	failure := ProtoFailureToNexusFailure(he.GetFailure())
	//nolint:staticcheck // Deprecated function still in use for backward compatibility.
	errType := nexus.HandlerErrorType(he.GetErrorType())

	return &nexus.HandlerError{
		Type:          errType,
		RetryBehavior: behavior,
		Cause:         &nexus.FailureError{Failure: failure},
	}
}

// operationErrorFromTemporalFailure wraps a Temporal failure in a *nexus.OperationError.
//
// The operation state is "canceled" when the failure carries canceled-failure info and
// "failed" otherwise. The failure itself is converted to a Nexus failure and then to an error,
// which becomes the cause. If either conversion fails, or the resulting operation error cannot
// be marked as a wrapper error, a generic *nexus.HandlerError of type internal is returned
// instead.
func operationErrorFromTemporalFailure(failure *failurepb.Failure) error {
	wrapped := &nexus.OperationError{
		State:   nexus.OperationStateFailed,
		Message: "operation error",
	}
	if failure.GetCanceledFailureInfo() != nil {
		wrapped.State = nexus.OperationStateCanceled
	}

	nexusFailure, convErr := TemporalFailureToNexusFailure(failure)
	if convErr != nil {
		return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
	}
	underlying, convErr := nexusrpc.DefaultFailureConverter().FailureToError(nexusFailure)
	if convErr != nil {
		return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
	}
	wrapped.Cause = underlying

	markErr := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), wrapped)
	if markErr != nil {
		return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
	}
	return wrapped
}
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ require (
github.com/lib/pq v1.10.9
github.com/maruel/panicparse/v2 v2.4.0
github.com/mitchellh/mapstructure v1.5.0
github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5
github.com/nexus-rpc/sdk-go v0.6.0
github.com/olekukonko/tablewriter v0.0.5
github.com/olivere/elastic/v7 v7.0.32
github.com/pkg/errors v0.9.1
Expand Down Expand Up @@ -174,4 +174,4 @@ require (
modernc.org/memory v1.11.0 // indirect
)

replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759
replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5 h1:Van9KGGs8lcDgxzSNFbDhEMNeJ80TbBxwZ45f9iBk9U=
github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk=
github.com/nexus-rpc/sdk-go v0.6.0 h1:QRgnP2zTbxEbiyWG/aXH8uSC5LV/Mg1fqb19jb4DBlo=
github.com/nexus-rpc/sdk-go v0.6.0/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
Expand Down Expand Up @@ -310,8 +310,8 @@ github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXl
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759 h1:CSlBGjKIgi770YWTYB1dt2AJuLKU6yArSZL636UStdo=
github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM=
github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3 h1:8T6S/2+0jCL//uKnmwQsH7W+O1VnkonSMuzpfR+AZe8=
github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3/go.mod h1:ucB3ZO5X2AFLJcUBzOrio08zxiQjuzdM/7aRKOEQPEc=
github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7 h1:lEebX/hZss+TSH3EBwhztnBavJVj7pWGJOH8UgKHS0w=
github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7/go.mod h1:RE+CHmY+kOZQk47AQaVzwrGmxpflnLgTd6EOK0853j4=
github.com/temporalio/sqlparser v0.0.0-20231115171017-f4060bcfa6cb h1:YzHH/U/dN7vMP+glybzcXRTczTrgfdRisNTzAj7La04=
Expand Down
153 changes: 48 additions & 105 deletions service/frontend/nexus_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,115 +447,42 @@ func (h *nexusHandler) StartOperation(
oc.logger.Error("received error from matching service for Nexus StartOperation request", tag.Error(err))
return nil, commonnexus.ConvertGRPCError(err, false)
}
// Convert to standard Nexus SDK response.
switch t := response.GetOutcome().(type) {
case *matchingservice.DispatchNexusTaskResponse_Failure:
// Set the failure source to "worker" if we've reached this case.
// Failure conversions errors below are the user's fault, as it implies that malformed completions were sent from
// the worker.
// Convert to standard Nexus SDK response and check for errors.
nexusErr := commonnexus.DispatchNexusTaskResponseToError(response)
if nexusErr != nil {
oc.setFailureSource(commonnexus.FailureSourceWorker)
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:" + t.Failure.GetNexusHandlerFailureInfo().GetType()))
nf, err := commonnexus.TemporalFailureToNexusFailure(t.Failure)
if err != nil {
oc.logger.Error("error converting Temporal failure to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}
he, err := nexusrpc.DefaultFailureConverter().FailureToError(nf)
if err != nil {
oc.logger.Error("error converting Nexus failure to Nexus HandlerError", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
oc.metricsHandler = oc.metricsHandler.WithTags(outcomeTagForNexusError(nexusErr))
return nil, nexusErr
}

// Success path: extract the result from the StartOperation response.
startOp := response.GetResponse().GetStartOperation()
switch t := startOp.GetVariant().(type) {
case *nexuspb.StartOperationResponse_SyncSuccess:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("sync_success"))
links := parseLinks(t.SyncSuccess.GetLinks(), oc.logger)
nexus.AddHandlerLinks(ctx, links...)
return &nexus.HandlerStartOperationResultSync[any]{
Value: t.SyncSuccess.GetPayload(),
}, nil

case *nexuspb.StartOperationResponse_AsyncSuccess:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("async_success"))
token := t.AsyncSuccess.GetOperationToken()
if token == "" {
token = t.AsyncSuccess.GetOperationId()
}
return nil, he

case *matchingservice.DispatchNexusTaskResponse_HandlerError:
// Deprecated case. Replaced with DispatchNexusTaskResponse_Failure
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:" + t.HandlerError.GetErrorType()))
oc.setFailureSource(commonnexus.FailureSourceWorker)
err := convertOutcomeToNexusHandlerError(t)
return nil, err

case *matchingservice.DispatchNexusTaskResponse_RequestTimeout:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_timeout"))
links := parseLinks(t.AsyncSuccess.GetLinks(), oc.logger)
nexus.AddHandlerLinks(ctx, links...)
return &nexus.HandlerStartOperationResultAsync{
OperationToken: token,
}, nil

default:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:EMPTY_OUTCOME"))
oc.setFailureSource(commonnexus.FailureSourceWorker)
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeUpstreamTimeout, "upstream timeout")

case *matchingservice.DispatchNexusTaskResponse_Response:
switch t := t.Response.GetStartOperation().GetVariant().(type) {
case *nexuspb.StartOperationResponse_SyncSuccess:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("sync_success"))
links := parseLinks(t.SyncSuccess.GetLinks(), oc.logger)
nexus.AddHandlerLinks(ctx, links...)
return &nexus.HandlerStartOperationResultSync[any]{
Value: t.SyncSuccess.GetPayload(),
}, nil

case *nexuspb.StartOperationResponse_AsyncSuccess:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("async_success"))
token := t.AsyncSuccess.GetOperationToken()
if token == "" {
token = t.AsyncSuccess.GetOperationId()
}
links := parseLinks(t.AsyncSuccess.GetLinks(), oc.logger)
nexus.AddHandlerLinks(ctx, links...)
return &nexus.HandlerStartOperationResultAsync{
OperationToken: token,
}, nil

case *nexuspb.StartOperationResponse_OperationError:
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("operation_error"))
oc.setFailureSource(commonnexus.FailureSourceWorker)
opErr := &nexus.OperationError{
Message: "operation error",
// nolint:staticcheck // Deprecated function still in use for backward compatibility.
State: nexus.OperationState(t.OperationError.GetOperationState()),
Cause: &nexus.FailureError{
// nolint:staticcheck // Deprecated function still in use for backward compatibility.
Failure: commonnexus.ProtoFailureToNexusFailure(t.OperationError.GetFailure()),
},
}
if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil {
oc.logger.Error("error converting OperationError to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}
return nil, opErr

case *nexuspb.StartOperationResponse_Failure:
// Set the failure source to "worker" if we've reached this case.
// Failure conversions errors below are the user's fault, as it implies that malformed completions were sent from
// the worker.
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("failure"))
oc.setFailureSource(commonnexus.FailureSourceWorker)
nf, err := commonnexus.TemporalFailureToNexusFailure(t.Failure)
if err != nil {
oc.logger.Error("error converting Temporal failure to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}
cause, err := nexusrpc.DefaultFailureConverter().FailureToError(nf)
if err != nil {
oc.logger.Error("error converting Nexus failure to Nexus OperationError", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}
state := nexus.OperationStateFailed
if t.Failure.GetCanceledFailureInfo() != nil {
state = nexus.OperationStateCanceled
}
opErr := &nexus.OperationError{
State: state,
Message: "operation error",
Cause: cause,
}
if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil {
oc.logger.Error("error converting OperationError to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName))
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error")
}
return nil, opErr
}
return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome")
}
// This is the worker's fault.
oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:EMPTY_OUTCOME"))
oc.setFailureSource(commonnexus.FailureSourceWorker)

return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome")
}

func parseLinks(links []*nexuspb.Link, logger log.Logger) []nexus.Link {
Expand Down Expand Up @@ -819,6 +746,22 @@ func convertOutcomeToNexusHandlerError(resp *matchingservice.DispatchNexusTaskRe
}
}

// outcomeTagForNexusError picks the metrics outcome tag for an error produced while handling
// a Nexus request:
//   - a *nexus.HandlerError of type upstream timeout -> "handler_timeout"
//   - any other *nexus.HandlerError                  -> "handler_error:<type>"
//   - a *nexus.OperationError                        -> "operation_error"
//   - everything else                                -> "handler_error:UNKNOWN"
//
// NOTE(review): the handler-error check runs first and errors.As searches the whole unwrap
// chain, so an operation error whose cause chain contains a handler error would presumably
// be tagged as a handler error — confirm that this precedence is intended.
func outcomeTagForNexusError(nexusErr error) metrics.Tag {
	var (
		asHandler   *nexus.HandlerError
		asOperation *nexus.OperationError
	)
	switch {
	case errors.As(nexusErr, &asHandler):
		outcome := "handler_error:" + string(asHandler.Type)
		if asHandler.Type == nexus.HandlerErrorTypeUpstreamTimeout {
			outcome = "handler_timeout"
		}
		return metrics.OutcomeTag(outcome)
	case errors.As(nexusErr, &asOperation):
		return metrics.OutcomeTag("operation_error")
	default:
		return metrics.OutcomeTag("handler_error:UNKNOWN")
	}
}

func (nc *nexusContext) setFailureSource(source string) {
nc.responseHeadersMutex.Lock()
defer nc.responseHeadersMutex.Unlock()
Expand Down
Loading
Loading