From fbf13b0c8373bf522137fca69e94353ae9a438ef Mon Sep 17 00:00:00 2001 From: Kannan Rajah Date: Tue, 17 Feb 2026 18:32:05 -0800 Subject: [PATCH 1/5] Add ActivityCommandTask dispatch via Nexus - Add activityCommandTaskDispatcher to dispatch tasks to workers - Use typed Nexus service definition for worker communication - Add proper error handling for DispatchNexusTask response - Release workflow lock before making RPC call - Add functional test for dispatch flow Co-authored-by: Cursor --- .../activity_command_task_dispatcher.go | 150 ++++++++++++++++++ .../outbound_queue_active_task_executor.go | 28 +++- ...utbound_queue_active_task_executor_test.go | 2 + service/history/outbound_queue_factory.go | 4 + tests/activity_command_task_test.go | 149 +++++++++++++++++ 5 files changed, 326 insertions(+), 7 deletions(-) create mode 100644 service/history/activity_command_task_dispatcher.go create mode 100644 tests/activity_command_task_test.go diff --git a/service/history/activity_command_task_dispatcher.go b/service/history/activity_command_task_dispatcher.go new file mode 100644 index 0000000000..266de84455 --- /dev/null +++ b/service/history/activity_command_task_dispatcher.go @@ -0,0 +1,150 @@ +package history + +import ( + "context" + "errors" + "fmt" + "time" + + enumspb "go.temporal.io/api/enums/v1" + nexuspb "go.temporal.io/api/nexus/v1" + taskqueuepb "go.temporal.io/api/taskqueue/v1" + workerpb "go.temporal.io/api/worker/v1" + "go.temporal.io/server/api/matchingservice/v1" + "go.temporal.io/server/common/debug" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/log/tag" + "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/payload" + "go.temporal.io/server/common/resource" + "go.temporal.io/server/service/history/configs" + "go.temporal.io/server/service/history/tasks" +) + +const ( + activityCommandTaskTimeout = time.Second * 10 * debug.TimeoutMultiplier +) + +// activityCommandTaskDispatcher handles dispatching activity command tasks to workers. +type activityCommandTaskDispatcher struct { + matchingRawClient resource.MatchingRawClient + config *configs.Config + metricsHandler metrics.Handler + logger log.Logger +} + +func newActivityCommandTaskDispatcher( + matchingRawClient resource.MatchingRawClient, + config *configs.Config, + metricsHandler metrics.Handler, + logger log.Logger, +) *activityCommandTaskDispatcher { + return &activityCommandTaskDispatcher{ + matchingRawClient: matchingRawClient, + config: config, + metricsHandler: metricsHandler, + logger: logger, + } +} + +func (d *activityCommandTaskDispatcher) execute( + ctx context.Context, + task *tasks.ActivityCommandTask, +) error { + if !d.config.EnableActivityCancellationNexusTask() { + return nil + } + + if len(task.TaskTokens) == 0 { + return nil + } + + ctx, cancel := context.WithTimeout(ctx, activityCommandTaskTimeout) + defer cancel() + + return d.dispatchToWorker(ctx, task) +} + +func (d *activityCommandTaskDispatcher) dispatchToWorker( + ctx context.Context, + task *tasks.ActivityCommandTask, +) error { + notificationRequest := &workerpb.ActivityNotificationRequest{ + NotificationType: workerpb.ActivityNotificationType(task.CommandType), + TaskTokens: task.TaskTokens, + } + requestPayload, err := payload.Encode(notificationRequest) + if err != nil { + return fmt.Errorf("failed to encode activity command request: %w", err) + } + + nexusRequest := &nexuspb.Request{ + Header: map[string]string{}, + Variant: &nexuspb.Request_StartOperation{ + StartOperation: &nexuspb.StartOperationRequest{ + Service: workerpb.WorkerService.ServiceName, + Operation: workerpb.WorkerService.NotifyActivity.Name(), + Payload: requestPayload, + }, + }, + } + + resp, err := d.matchingRawClient.DispatchNexusTask(ctx, &matchingservice.DispatchNexusTaskRequest{ + NamespaceId: task.NamespaceID, + TaskQueue: &taskqueuepb.TaskQueue{ + Name: task.Destination, + Kind: enumspb.TASK_QUEUE_KIND_NORMAL, + }, + Request: nexusRequest, + }) + if err != nil { + d.logger.Warn("Failed to dispatch activity command to worker", + tag.NewStringTag("control_queue", task.Destination), + tag.Error(err)) + return err + } + + return d.handleDispatchResponse(resp, task.Destination) +} + +func (d *activityCommandTaskDispatcher) handleDispatchResponse( + resp *matchingservice.DispatchNexusTaskResponse, + controlQueue string, +) error { + // Check for timeout (no worker polling) + if resp.GetRequestTimeout() != nil { + d.logger.Warn("No worker polling control queue for activity command", + tag.NewStringTag("control_queue", controlQueue)) + return errors.New("no worker polling control queue") + } + + // Check for worker handler failure + if failure := resp.GetFailure(); failure != nil { + d.logger.Warn("Worker handler failed for activity command", + tag.NewStringTag("control_queue", controlQueue), + tag.NewStringTag("failure_message", failure.GetMessage())) + return fmt.Errorf("worker handler failed: %s", failure.GetMessage()) + } + + // Check operation-level response + nexusResp := resp.GetResponse() + if nexusResp == nil { + return nil + } + + startOpResp := nexusResp.GetStartOperation() + if startOpResp == nil { + return nil + } + + // Check for operation failure (terminal - don't retry) + if opFailure := startOpResp.GetFailure(); opFailure != nil { + d.logger.Warn("Activity command operation failure", + tag.NewStringTag("control_queue", controlQueue), + tag.NewStringTag("failure_message", opFailure.GetMessage())) + return nil + } + + return nil +} + diff --git a/service/history/outbound_queue_active_task_executor.go b/service/history/outbound_queue_active_task_executor.go index bc3af18822..1b897ebae3 100644 --- a/service/history/outbound_queue_active_task_executor.go +++ b/service/history/outbound_queue_active_task_executor.go @@ -10,6 +10,8 @@ import ( "go.temporal.io/server/common/debug" "go.temporal.io/server/common/log" "go.temporal.io/server/common/metrics" + "go.temporal.io/server/common/resource" + "go.temporal.io/server/service/history/configs" "go.temporal.io/server/service/history/consts" historyi "go.temporal.io/server/service/history/interfaces" "go.temporal.io/server/service/history/queues" @@ -24,7 +26,8 @@ const ( type outboundQueueActiveTaskExecutor struct { stateMachineEnvironment - chasmEngine chasm.Engine + chasmEngine chasm.Engine + activityCommandTaskDispatcher *activityCommandTaskDispatcher } var _ queues.Executor = &outboundQueueActiveTaskExecutor{} @@ -35,17 +38,26 @@ func newOutboundQueueActiveTaskExecutor( logger log.Logger, metricsHandler metrics.Handler, chasmEngine chasm.Engine, + matchingRawClient resource.MatchingRawClient, + config *configs.Config, ) *outboundQueueActiveTaskExecutor { + scopedMetricsHandler := metricsHandler.WithTags( + metrics.OperationTag(metrics.OperationOutboundQueueProcessorScope), + ) return &outboundQueueActiveTaskExecutor{ stateMachineEnvironment: stateMachineEnvironment{ - shardContext: shardCtx, - cache: workflowCache, - logger: logger, - metricsHandler: metricsHandler.WithTags( - metrics.OperationTag(metrics.OperationOutboundQueueProcessorScope), - ), + shardContext: shardCtx, + cache: workflowCache, + logger: logger, + metricsHandler: scopedMetricsHandler, }, chasmEngine: chasmEngine, + activityCommandTaskDispatcher: newActivityCommandTaskDispatcher( + matchingRawClient, + config, + scopedMetricsHandler, + logger, + ), } } @@ -92,6 +104,8 @@ func (e *outboundQueueActiveTaskExecutor) Execute( return respond(e.executeStateMachineTask(ctx, task)) case *tasks.ChasmTask: return respond(e.executeChasmSideEffectTask(ctx, task)) + case *tasks.ActivityCommandTask: + return respond(e.activityCommandTaskDispatcher.execute(ctx, task)) } return respond(queueserrors.NewUnprocessableTaskError(fmt.Sprintf("unknown task type '%T'", task))) diff --git a/service/history/outbound_queue_active_task_executor_test.go b/service/history/outbound_queue_active_task_executor_test.go index 511b4e2810..dfcc860973 100644 --- a/service/history/outbound_queue_active_task_executor_test.go +++ b/service/history/outbound_queue_active_task_executor_test.go @@ -115,6 +115,8 @@ func (s *outboundQueueActiveTaskExecutorSuite) SetupTest() { s.logger, s.metricsHandler, s.mockChasmEngine, + nil, // matchingRawClient - not used in these tests + nil, // config - not used in these tests ) } diff --git a/service/history/outbound_queue_factory.go b/service/history/outbound_queue_factory.go index 77dffccef6..9dcd094405 100644 --- a/service/history/outbound_queue_factory.go +++ b/service/history/outbound_queue_factory.go @@ -10,6 +10,7 @@ import ( "go.temporal.io/server/common/metrics" "go.temporal.io/server/common/namespace" "go.temporal.io/server/common/quotas" + "go.temporal.io/server/common/resource" ctasks "go.temporal.io/server/common/tasks" "go.temporal.io/server/common/telemetry" "go.temporal.io/server/service/history/circuitbreakerpool" @@ -31,6 +32,7 @@ type outboundQueueFactoryParams struct { QueueFactoryBaseParams CircuitBreakerPool *circuitbreakerpool.OutboundQueueCircuitBreakerPool + MatchingRawClient resource.MatchingRawClient } type groupLimiter struct { @@ -227,6 +229,8 @@ func (f *outboundQueueFactory) CreateQueue( logger, metricsHandler, f.ChasmEngine, + f.MatchingRawClient, + shardContext.GetConfig(), ) standbyExecutor := newOutboundQueueStandbyTaskExecutor( diff --git a/tests/activity_command_task_test.go b/tests/activity_command_task_test.go new file mode 100644 index 0000000000..0019813e57 --- /dev/null +++ b/tests/activity_command_task_test.go @@ -0,0 +1,149 @@ +package tests + +import ( + "context" + "testing" + "time" + + commandpb "go.temporal.io/api/command/v1" + commonpb "go.temporal.io/api/common/v1" + enumspb "go.temporal.io/api/enums/v1" + taskqueuepb "go.temporal.io/api/taskqueue/v1" + workerpb "go.temporal.io/api/worker/v1" + "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/server/common/dynamicconfig" + "go.temporal.io/server/tests/testcore" + "google.golang.org/protobuf/types/known/durationpb" +) + +// TestDispatchCancelToWorker tests that when an activity cancellation is requested, +// the server dispatches an ActivityCommandTask to the worker's control queue via Nexus. +func TestDispatchCancelToWorker(t *testing.T) { + env := testcore.NewEnv(t, testcore.WithDynamicConfig(dynamicconfig.EnableActivityCancellationNexusTask, true)) + + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + defer cancel() + + tv := env.Tv() + poller := env.TaskPoller() + + // Get the control queue name from test vars + controlQueueName := tv.ControlQueueName(env.Namespace().String()) + t.Logf("WorkerInstanceKey: %s", tv.WorkerInstanceKey()) + t.Logf("ControlQueueName: %s", controlQueueName) + + // Start the workflow + startResp, err := env.FrontendClient().StartWorkflowExecution(ctx, &workflowservice.StartWorkflowExecutionRequest{ + RequestId: tv.Any().String(), + Namespace: env.Namespace().String(), + WorkflowId: tv.WorkflowID(), + WorkflowType: tv.WorkflowType(), + TaskQueue: tv.TaskQueue(), + WorkflowExecutionTimeout: durationpb.New(60 * time.Second), + WorkflowTaskTimeout: durationpb.New(10 * time.Second), + }) + env.NoError(err) + t.Logf("Started workflow: %s/%s", tv.WorkflowID(), startResp.RunId) + + // Poll and complete first workflow task - schedule the activity + _, err = poller.PollAndHandleWorkflowTask(tv, + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK, + Attributes: &commandpb.Command_ScheduleActivityTaskCommandAttributes{ + ScheduleActivityTaskCommandAttributes: &commandpb.ScheduleActivityTaskCommandAttributes{ + ActivityId: tv.ActivityID(), + ActivityType: tv.ActivityType(), + TaskQueue: tv.TaskQueue(), + ScheduleToCloseTimeout: durationpb.New(60 * time.Second), + StartToCloseTimeout: durationpb.New(60 * time.Second), + }, + }, + }, + }, + }, nil + }) + env.NoError(err) + t.Log("Scheduled activity") + + // Poll for activity task and start running the activity. + activityPollResp, err := env.FrontendClient().PollActivityTaskQueue(ctx, &workflowservice.PollActivityTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: tv.TaskQueue(), + Identity: tv.WorkerIdentity(), + WorkerInstanceKey: tv.WorkerInstanceKey(), + WorkerControlTaskQueue: controlQueueName, + }) + env.NoError(err) + env.NotNil(activityPollResp) + env.NotEmpty(activityPollResp.TaskToken) + t.Log("Activity started with WorkerInstanceKey") + + // Request workflow cancellation + t.Log("Requesting workflow cancellation...") + _, err = env.FrontendClient().RequestCancelWorkflowExecution(ctx, &workflowservice.RequestCancelWorkflowExecutionRequest{ + Namespace: env.Namespace().String(), + WorkflowExecution: &commonpb.WorkflowExecution{ + WorkflowId: tv.WorkflowID(), + RunId: startResp.RunId, + }, + }) + env.NoError(err) + + // Simulate what the SDK does when a workflow is cancelled. + // Poll and complete the workflow task with RequestCancelActivityTask command. + // This sets CancelRequested=true and triggers the dispatch of ActivityCommandTask. + _, err = poller.PollAndHandleWorkflowTask(tv, + func(task *workflowservice.PollWorkflowTaskQueueResponse) (*workflowservice.RespondWorkflowTaskCompletedRequest, error) { + // Find the scheduled event ID + var scheduledEventID int64 + for _, event := range task.History.Events { + if event.EventType == enumspb.EVENT_TYPE_ACTIVITY_TASK_SCHEDULED { + scheduledEventID = event.EventId + break + } + } + return &workflowservice.RespondWorkflowTaskCompletedRequest{ + Commands: []*commandpb.Command{ + { + CommandType: enumspb.COMMAND_TYPE_REQUEST_CANCEL_ACTIVITY_TASK, + Attributes: &commandpb.Command_RequestCancelActivityTaskCommandAttributes{ + RequestCancelActivityTaskCommandAttributes: &commandpb.RequestCancelActivityTaskCommandAttributes{ + ScheduledEventId: scheduledEventID, + }, + }, + }, + }, + }, nil + }) + env.NoError(err) + t.Log("Workflow task completed with RequestCancelActivityTask command") + + // Poll Nexus control queue until we receive the notification request + var nexusPollResp *workflowservice.PollNexusTaskQueueResponse + env.Eventually(func() bool { + pollCtx, pollCancel := context.WithTimeout(ctx, 5*time.Second) + defer pollCancel() + resp, err := env.FrontendClient().PollNexusTaskQueue(pollCtx, &workflowservice.PollNexusTaskQueueRequest{ + Namespace: env.Namespace().String(), + TaskQueue: &taskqueuepb.TaskQueue{Name: controlQueueName, Kind: enumspb.TASK_QUEUE_KIND_NORMAL}, + Identity: tv.WorkerIdentity(), + }) + if err == nil && resp != nil && resp.Request != nil { + nexusPollResp = resp + return true + } + return false + }, 120*time.Second, 100*time.Millisecond, "Timed out waiting for Nexus task") + + // Verify we received the notification request on the control queue + env.NotNil(nexusPollResp.Request, "Expected to receive Nexus request on control queue") + + startOp := nexusPollResp.Request.GetStartOperation() + env.NotNil(startOp, "Expected StartOperation in Nexus request") + env.Equal(workerpb.WorkerService.ServiceName, startOp.Service, "Expected WorkerService") + env.Equal(workerpb.WorkerService.NotifyActivity.Name(), startOp.Operation, "Expected notify-activity operation") + t.Log("SUCCESS: Received notify-activity Nexus request on control queue") +} From d182f845836550e0d8d7ea2115354c4e294f60d0 Mon Sep 17 00:00:00 2001 From: Kannan Rajah Date: Fri, 13 Mar 2026 13:07:45 -0700 Subject: [PATCH 2/5] Update dispatcher to use WorkerCommandsRequest from nexusservices package Made-with: Cursor --- .../activity_command_task_dispatcher.go | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/service/history/activity_command_task_dispatcher.go b/service/history/activity_command_task_dispatcher.go index 266de84455..e400d36578 100644 --- a/service/history/activity_command_task_dispatcher.go +++ b/service/history/activity_command_task_dispatcher.go @@ -8,8 +8,8 @@ import ( enumspb "go.temporal.io/api/enums/v1" nexuspb "go.temporal.io/api/nexus/v1" + workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" - workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/server/api/matchingservice/v1" "go.temporal.io/server/common/debug" "go.temporal.io/server/common/log" @@ -65,25 +65,40 @@ func (d *activityCommandTaskDispatcher) execute( return d.dispatchToWorker(ctx, task) } +const ( + workerServiceName = "temporal.api.nexusservices.workerservice.v1.WorkerService" + executeCommandsOperation = "executeCommands" +) + func (d *activityCommandTaskDispatcher) dispatchToWorker( ctx context.Context, task *tasks.ActivityCommandTask, ) error { - notificationRequest := &workerpb.ActivityNotificationRequest{ - NotificationType: workerpb.ActivityNotificationType(task.CommandType), - TaskTokens: task.TaskTokens, + commands := make([]*workerservicepb.WorkerCommandsRequest_WorkerCommand, 0, len(task.TaskTokens)) + for _, token := range task.TaskTokens { + commands = append(commands, &workerservicepb.WorkerCommandsRequest_WorkerCommand{ + Type: &workerservicepb.WorkerCommandsRequest_WorkerCommand_CancelActivity{ + CancelActivity: &workerservicepb.WorkerCommandsRequest_CancelActivity{ + TaskToken: token, + }, + }, + }) + } + + request := &workerservicepb.WorkerCommandsRequest{ + Commands: commands, } - requestPayload, err := payload.Encode(notificationRequest) + requestPayload, err := payload.Encode(request) if err != nil { - return fmt.Errorf("failed to encode activity command request: %w", err) + return fmt.Errorf("failed to encode worker commands request: %w", err) } nexusRequest := &nexuspb.Request{ Header: map[string]string{}, Variant: &nexuspb.Request_StartOperation{ StartOperation: &nexuspb.StartOperationRequest{ - Service: workerpb.WorkerService.ServiceName, - Operation: workerpb.WorkerService.NotifyActivity.Name(), + Service: workerServiceName, + Operation: executeCommandsOperation, Payload: requestPayload, }, }, From 0f6651c793fab6a68059948333f7b2ea02e3b606 Mon Sep 17 00:00:00 2001 From: Kannan Rajah Date: Fri, 13 Mar 2026 14:29:50 -0700 Subject: [PATCH 3/5] Update dispatcher for ExecuteCommands rename and extracted types - Update api-go dependency to pick up ExecuteCommands rename - Use workerpb.WorkerCommand, workerpb.CancelActivityCommand from worker/v1 - Use workerservicepb.ExecuteCommandsRequest from nexusservices package - Update operation name to PascalCase ExecuteCommands Made-with: Cursor --- go.mod | 2 +- go.sum | 4 ++-- service/history/activity_command_task_dispatcher.go | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 9cd443c040..b217171341 100644 --- a/go.mod +++ b/go.mod @@ -174,4 +174,4 @@ require ( modernc.org/memory v1.11.0 // indirect ) -replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260313200323-1c4f982100b9 +replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759 diff --git a/go.sum b/go.sum index 35a5f56097..e9d2b357fa 100644 --- a/go.sum +++ b/go.sum @@ -310,8 +310,8 @@ github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXl github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/temporalio/api-go v1.62.2-0.20260313200323-1c4f982100b9 h1:YssGqIj8Lkg7fRrbkMWylvs1cMKoLyAZIGYSWb/60QA= -github.com/temporalio/api-go v1.62.2-0.20260313200323-1c4f982100b9/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759 h1:CSlBGjKIgi770YWTYB1dt2AJuLKU6yArSZL636UStdo= +github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7 h1:lEebX/hZss+TSH3EBwhztnBavJVj7pWGJOH8UgKHS0w= github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7/go.mod h1:RE+CHmY+kOZQk47AQaVzwrGmxpflnLgTd6EOK0853j4= github.com/temporalio/sqlparser v0.0.0-20231115171017-f4060bcfa6cb h1:YzHH/U/dN7vMP+glybzcXRTczTrgfdRisNTzAj7La04= diff --git a/service/history/activity_command_task_dispatcher.go b/service/history/activity_command_task_dispatcher.go index e400d36578..4a3492c82e 100644 --- a/service/history/activity_command_task_dispatcher.go +++ b/service/history/activity_command_task_dispatcher.go @@ -10,6 +10,7 @@ import ( nexuspb "go.temporal.io/api/nexus/v1" workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" + workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/server/api/matchingservice/v1" "go.temporal.io/server/common/debug" "go.temporal.io/server/common/log" @@ -67,25 +68,25 @@ func (d *activityCommandTaskDispatcher) execute( const ( workerServiceName = "temporal.api.nexusservices.workerservice.v1.WorkerService" - executeCommandsOperation = "executeCommands" + executeCommandsOperation = "ExecuteCommands" ) func (d *activityCommandTaskDispatcher) dispatchToWorker( ctx context.Context, task *tasks.ActivityCommandTask, ) error { - commands := make([]*workerservicepb.WorkerCommandsRequest_WorkerCommand, 0, len(task.TaskTokens)) + commands := make([]*workerpb.WorkerCommand, 0, len(task.TaskTokens)) for _, token := range task.TaskTokens { - commands = append(commands, &workerservicepb.WorkerCommandsRequest_WorkerCommand{ - Type: &workerservicepb.WorkerCommandsRequest_WorkerCommand_CancelActivity{ - CancelActivity: &workerservicepb.WorkerCommandsRequest_CancelActivity{ + commands = append(commands, &workerpb.WorkerCommand{ + Type: &workerpb.WorkerCommand_CancelActivity{ + CancelActivity: &workerpb.CancelActivityCommand{ TaskToken: token, }, }, }) } - request := &workerservicepb.WorkerCommandsRequest{ + request := &workerservicepb.ExecuteCommandsRequest{ Commands: commands, } requestPayload, err := payload.Encode(request) From 1b85ccee1a8aae58764db3116ed6c1be9c79ed74 Mon Sep 17 00:00:00 2001 From: Kannan Rajah Date: Fri, 13 Mar 2026 15:52:27 -0700 Subject: [PATCH 4/5] Update dispatcher and integration test for WorkerCommandsTask Rename activityCommandTaskDispatcher to workerCommandsTaskDispatcher, use task.Commands directly instead of rebuilding from TaskTokens, and update dynamic config reference to EnableCancelActivityWorkerCommand. Made-with: Cursor --- .../activity_command_task_dispatcher.go | 55 +++++++------------ .../outbound_queue_active_task_executor.go | 8 +-- tests/activity_command_task_test.go | 9 ++- 3 files changed, 27 insertions(+), 45 deletions(-) diff --git a/service/history/activity_command_task_dispatcher.go b/service/history/activity_command_task_dispatcher.go index 4a3492c82e..60c61fff87 100644 --- a/service/history/activity_command_task_dispatcher.go +++ b/service/history/activity_command_task_dispatcher.go @@ -10,7 +10,6 @@ import ( nexuspb "go.temporal.io/api/nexus/v1" workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" - workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/server/api/matchingservice/v1" "go.temporal.io/server/common/debug" "go.temporal.io/server/common/log" @@ -23,24 +22,24 @@ import ( ) const ( - activityCommandTaskTimeout = time.Second * 10 * debug.TimeoutMultiplier + workerCommandsTaskTimeout = time.Second * 10 * debug.TimeoutMultiplier ) -// activityCommandTaskDispatcher handles dispatching activity command tasks to workers. -type activityCommandTaskDispatcher struct { +// workerCommandsTaskDispatcher handles dispatching worker commands to workers via Nexus. +type workerCommandsTaskDispatcher struct { matchingRawClient resource.MatchingRawClient config *configs.Config metricsHandler metrics.Handler logger log.Logger } -func newActivityCommandTaskDispatcher( +func newWorkerCommandsTaskDispatcher( matchingRawClient resource.MatchingRawClient, config *configs.Config, metricsHandler metrics.Handler, logger log.Logger, -) *activityCommandTaskDispatcher { - return &activityCommandTaskDispatcher{ +) *workerCommandsTaskDispatcher { + return &workerCommandsTaskDispatcher{ matchingRawClient: matchingRawClient, config: config, metricsHandler: metricsHandler, @@ -48,19 +47,19 @@ func newActivityCommandTaskDispatcher( } } -func (d *activityCommandTaskDispatcher) execute( +func (d *workerCommandsTaskDispatcher) execute( ctx context.Context, - task *tasks.ActivityCommandTask, + task *tasks.WorkerCommandsTask, ) error { - if !d.config.EnableActivityCancellationNexusTask() { + if !d.config.EnableCancelActivityWorkerCommand() { return nil } - if len(task.TaskTokens) == 0 { + if len(task.Commands) == 0 { return nil } - ctx, cancel := context.WithTimeout(ctx, activityCommandTaskTimeout) + ctx, cancel := context.WithTimeout(ctx, workerCommandsTaskTimeout) defer cancel() return d.dispatchToWorker(ctx, task) @@ -71,23 +70,12 @@ const ( executeCommandsOperation = "ExecuteCommands" ) -func (d *activityCommandTaskDispatcher) dispatchToWorker( +func (d *workerCommandsTaskDispatcher) dispatchToWorker( ctx context.Context, - task *tasks.ActivityCommandTask, + task *tasks.WorkerCommandsTask, ) error { - commands := make([]*workerpb.WorkerCommand, 0, len(task.TaskTokens)) - for _, token := range task.TaskTokens { - commands = append(commands, &workerpb.WorkerCommand{ - Type: &workerpb.WorkerCommand_CancelActivity{ - CancelActivity: &workerpb.CancelActivityCommand{ - TaskToken: token, - }, - }, - }) - } - request := &workerservicepb.ExecuteCommandsRequest{ - Commands: commands, + Commands: task.Commands, } requestPayload, err := payload.Encode(request) if err != nil { @@ -114,7 +102,7 @@ func (d *activityCommandTaskDispatcher) dispatchToWorker( Request: nexusRequest, }) if err != nil { - d.logger.Warn("Failed to dispatch activity command to worker", + d.logger.Warn("Failed to dispatch worker commands", tag.NewStringTag("control_queue", task.Destination), tag.Error(err)) return err @@ -123,26 +111,23 @@ func (d *activityCommandTaskDispatcher) dispatchToWorker( return d.handleDispatchResponse(resp, task.Destination) } -func (d *activityCommandTaskDispatcher) handleDispatchResponse( +func (d *workerCommandsTaskDispatcher) handleDispatchResponse( resp *matchingservice.DispatchNexusTaskResponse, controlQueue string, ) error { - // Check for timeout (no worker polling) if resp.GetRequestTimeout() != nil { - d.logger.Warn("No worker polling control queue for activity command", + d.logger.Warn("No worker polling control queue", tag.NewStringTag("control_queue", controlQueue)) return errors.New("no worker polling control queue") } - // Check for worker handler failure if failure := resp.GetFailure(); failure != nil { - d.logger.Warn("Worker handler failed for activity command", + d.logger.Warn("Worker handler failed", tag.NewStringTag("control_queue", controlQueue), tag.NewStringTag("failure_message", failure.GetMessage())) return fmt.Errorf("worker handler failed: %s", failure.GetMessage()) } - // Check operation-level response nexusResp := resp.GetResponse() if nexusResp == nil { return nil @@ -153,9 +138,8 @@ func (d *activityCommandTaskDispatcher) handleDispatchResponse( return nil } - // Check for operation failure (terminal - don't retry) if opFailure := startOpResp.GetFailure(); opFailure != nil { - d.logger.Warn("Activity command operation failure", + d.logger.Warn("Worker command operation failure", tag.NewStringTag("control_queue", controlQueue), tag.NewStringTag("failure_message", opFailure.GetMessage())) return nil @@ -163,4 +147,3 @@ func (d *activityCommandTaskDispatcher) handleDispatchResponse( return nil } - diff --git a/service/history/outbound_queue_active_task_executor.go b/service/history/outbound_queue_active_task_executor.go index 1b897ebae3..241a127110 100644 --- a/service/history/outbound_queue_active_task_executor.go +++ b/service/history/outbound_queue_active_task_executor.go @@ -27,7 +27,7 @@ const ( type outboundQueueActiveTaskExecutor struct { stateMachineEnvironment chasmEngine chasm.Engine - activityCommandTaskDispatcher *activityCommandTaskDispatcher + workerCommandsTaskDispatcher *workerCommandsTaskDispatcher } var _ queues.Executor = &outboundQueueActiveTaskExecutor{} @@ -52,7 +52,7 @@ func newOutboundQueueActiveTaskExecutor( metricsHandler: scopedMetricsHandler, }, chasmEngine: chasmEngine, - activityCommandTaskDispatcher: newActivityCommandTaskDispatcher( + workerCommandsTaskDispatcher: newWorkerCommandsTaskDispatcher( matchingRawClient, config, scopedMetricsHandler, @@ -104,8 +104,8 @@ func (e *outboundQueueActiveTaskExecutor) Execute( return respond(e.executeStateMachineTask(ctx, task)) case *tasks.ChasmTask: return respond(e.executeChasmSideEffectTask(ctx, task)) - case *tasks.ActivityCommandTask: - return respond(e.activityCommandTaskDispatcher.execute(ctx, task)) + case *tasks.WorkerCommandsTask: + return respond(e.workerCommandsTaskDispatcher.execute(ctx, task)) } return respond(queueserrors.NewUnprocessableTaskError(fmt.Sprintf("unknown task type '%T'", task))) diff --git a/tests/activity_command_task_test.go b/tests/activity_command_task_test.go index 0019813e57..e943a45683 100644 --- a/tests/activity_command_task_test.go +++ b/tests/activity_command_task_test.go @@ -9,7 +9,6 @@ import ( commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" - workerpb "go.temporal.io/api/worker/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/server/common/dynamicconfig" "go.temporal.io/server/tests/testcore" @@ -19,7 +18,7 @@ import ( // TestDispatchCancelToWorker tests that when an activity cancellation is requested, // the server dispatches an ActivityCommandTask to the worker's control queue via Nexus. func TestDispatchCancelToWorker(t *testing.T) { - env := testcore.NewEnv(t, testcore.WithDynamicConfig(dynamicconfig.EnableActivityCancellationNexusTask, true)) + env := testcore.NewEnv(t, testcore.WithDynamicConfig(dynamicconfig.EnableCancelActivityWorkerCommand, true)) ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) defer cancel() @@ -143,7 +142,7 @@ func TestDispatchCancelToWorker(t *testing.T) { startOp := nexusPollResp.Request.GetStartOperation() env.NotNil(startOp, "Expected StartOperation in Nexus request") - env.Equal(workerpb.WorkerService.ServiceName, startOp.Service, "Expected WorkerService") - env.Equal(workerpb.WorkerService.NotifyActivity.Name(), startOp.Operation, "Expected notify-activity operation") - t.Log("SUCCESS: Received notify-activity Nexus request on control queue") + env.Equal("temporal.api.nexusservices.workerservice.v1.WorkerService", startOp.Service, "Expected WorkerService") + env.Equal("ExecuteCommands", startOp.Operation, "Expected ExecuteCommands operation") + t.Log("SUCCESS: Received ExecuteCommands Nexus request on control queue") } From d15684c16091543e58abd47afd10d105ca733350 Mon Sep 17 00:00:00 2001 From: Kannan Rajah Date: Sat, 14 Mar 2026 15:32:45 -0700 Subject: [PATCH 5/5] Add worker commands dispatcher with shared Nexus response adapters Rename activity_command_task_dispatcher to worker_commands_task_dispatcher and add dispatch logic with proper error handling, metrics, and failure scenario documentation. Key changes: - Use generated WorkerService constants from nexus-rpc-gen for service and operation names. - Classify errors using Nexus SDK types (*nexus.HandlerError for transport failures, *nexus.OperationError for permanent failures). - Add dispatch metrics: success, failure, no-poller, operation failure. - Extract shared Nexus response-to-SDK-error adapters in common/nexus/matching_response.go: - StartOperationResponseToError: universal StartOperationResponse conversion. - DispatchNexusTaskResponseToError: matching API envelope conversion. - Refactor frontend nexus_handler.go StartOperation to use the shared adapter, eliminating duplicated inline conversion logic. Made-with: Cursor --- common/metrics/metric_defs.go | 7 +- common/nexus/matching_response.go | 123 +++++++++++++ go.mod | 4 +- go.sum | 8 +- service/frontend/nexus_handler.go | 153 +++++----------- .../activity_command_task_dispatcher.go | 149 ---------------- .../worker_commands_task_dispatcher.go | 168 ++++++++++++++++++ ...k_test.go => worker_commands_task_test.go} | 5 +- 8 files changed, 354 insertions(+), 263 deletions(-) create mode 100644 common/nexus/matching_response.go delete mode 100644 service/history/activity_command_task_dispatcher.go create mode 100644 service/history/worker_commands_task_dispatcher.go rename tests/{activity_command_task_test.go => worker_commands_task_test.go} (95%) diff --git a/common/metrics/metric_defs.go b/common/metrics/metric_defs.go index 08c99e689d..fd59db8f91 100644 --- a/common/metrics/metric_defs.go +++ b/common/metrics/metric_defs.go @@ -841,7 +841,12 @@ var ( WithDescription("The amount of time it took to successfully send a task to the DLQ. This only records the"+ " latency of the final attempt to send the task to the DLQ, not the cumulative latency of all attempts."), ) - TaskDiscarded = NewCounterDef("task_errors_discarded") + TaskDiscarded = NewCounterDef("task_errors_discarded") + + WorkerCommandsDispatchSuccess = NewCounterDef("worker_commands_dispatch_success") + WorkerCommandsDispatchFailure = NewCounterDef("worker_commands_dispatch_failure") + WorkerCommandsDispatchNoPoller = NewCounterDef("worker_commands_dispatch_no_poller") + WorkerCommandsOperationFailure = NewCounterDef("worker_commands_operation_failure") TaskSkipped = NewCounterDef("task_skipped") TaskVersionMisMatch = NewCounterDef("task_errors_version_mismatch") TasksDependencyTaskNotCompleted = NewCounterDef("task_dependency_task_not_completed") diff --git a/common/nexus/matching_response.go b/common/nexus/matching_response.go new file mode 100644 index 0000000000..30db7b1a44 --- /dev/null +++ b/common/nexus/matching_response.go @@ -0,0 +1,123 @@ +package nexus + +import ( + "github.com/nexus-rpc/sdk-go/nexus" + enumspb "go.temporal.io/api/enums/v1" + failurepb "go.temporal.io/api/failure/v1" + nexuspb "go.temporal.io/api/nexus/v1" + matchingservice "go.temporal.io/server/api/matchingservice/v1" + "go.temporal.io/server/common/nexus/nexusrpc" +) + +// StartOperationResponseToError converts a StartOperationResponse proto into a Nexus SDK error. +// Returns nil if the response indicates success (SyncSuccess or AsyncSuccess). +// +// Error types returned: +// - *nexus.HandlerError: internal errors (e.g., unexpected response variant) +// - *nexus.OperationError: operation-level failures from the handler +func StartOperationResponseToError(resp *nexuspb.StartOperationResponse) error { + switch t := resp.GetVariant().(type) { + case *nexuspb.StartOperationResponse_SyncSuccess: + return nil + case *nexuspb.StartOperationResponse_AsyncSuccess: + return nil + case *nexuspb.StartOperationResponse_OperationError: + //nolint:staticcheck // Deprecated variant still in use for backward compatibility. + opErr := &nexus.OperationError{ + Message: "operation error", + //nolint:staticcheck // Deprecated function still in use for backward compatibility. + State: nexus.OperationState(t.OperationError.GetOperationState()), + Cause: &nexus.FailureError{ + //nolint:staticcheck // Deprecated function still in use for backward compatibility. + Failure: ProtoFailureToNexusFailure(t.OperationError.GetFailure()), + }, + } + if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + return opErr + case *nexuspb.StartOperationResponse_Failure: + return operationErrorFromTemporalFailure(t.Failure) + default: + return nil + } +} + +// DispatchNexusTaskResponseToError converts a DispatchNexusTaskResponse proto into a Nexus SDK +// error. Returns nil if the response indicates success. +// +// This handles the outer dispatch envelope (timeout, handler error, failure) and delegates to +// StartOperationResponseToError for the inner StartOperationResponse. +// +// Error types returned: +// - *nexus.HandlerError: transport/handler failures, timeouts +// - *nexus.OperationError: operation-level failures from the worker +func DispatchNexusTaskResponseToError(resp *matchingservice.DispatchNexusTaskResponse) error { + switch t := resp.GetOutcome().(type) { + case *matchingservice.DispatchNexusTaskResponse_Failure: + return handlerErrorFromTemporalFailure(t.Failure) + case *matchingservice.DispatchNexusTaskResponse_HandlerError: + return handlerErrorFromDeprecatedProto(t.HandlerError) + case *matchingservice.DispatchNexusTaskResponse_RequestTimeout: + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeUpstreamTimeout, "upstream timeout") + case *matchingservice.DispatchNexusTaskResponse_Response: + return StartOperationResponseToError(t.Response.GetStartOperation()) + default: + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome") + } +} + +func handlerErrorFromTemporalFailure(failure *failurepb.Failure) error { + nf, err := TemporalFailureToNexusFailure(failure) + if err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + he, err := nexusrpc.DefaultFailureConverter().FailureToError(nf) + if err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + return he +} + +func handlerErrorFromDeprecatedProto(he *nexuspb.HandlerError) *nexus.HandlerError { + var retryBehavior nexus.HandlerErrorRetryBehavior + //nolint:exhaustive // unspecified is the default + switch he.GetRetryBehavior() { + case enumspb.NEXUS_HANDLER_ERROR_RETRY_BEHAVIOR_RETRYABLE: + retryBehavior = nexus.HandlerErrorRetryBehaviorRetryable + case enumspb.NEXUS_HANDLER_ERROR_RETRY_BEHAVIOR_NON_RETRYABLE: + retryBehavior = nexus.HandlerErrorRetryBehaviorNonRetryable + } + //nolint:staticcheck // Deprecated function still in use for backward compatibility. + cause := ProtoFailureToNexusFailure(he.GetFailure()) + return &nexus.HandlerError{ + //nolint:staticcheck // Deprecated function still in use for backward compatibility. + Type: nexus.HandlerErrorType(he.GetErrorType()), + RetryBehavior: retryBehavior, + Cause: &nexus.FailureError{Failure: cause}, + } +} + +func operationErrorFromTemporalFailure(failure *failurepb.Failure) error { + state := nexus.OperationStateFailed + if failure.GetCanceledFailureInfo() != nil { + state = nexus.OperationStateCanceled + } + nf, err := TemporalFailureToNexusFailure(failure) + if err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + cause, err := nexusrpc.DefaultFailureConverter().FailureToError(nf) + if err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + opErr := &nexus.OperationError{ + State: state, + Message: "operation error", + Cause: cause, + } + if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil { + return nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + } + return opErr +} diff --git a/go.mod b/go.mod index b217171341..428006efc4 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,7 @@ require ( github.com/lib/pq v1.10.9 github.com/maruel/panicparse/v2 v2.4.0 github.com/mitchellh/mapstructure v1.5.0 - github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5 + github.com/nexus-rpc/sdk-go v0.6.0 github.com/olekukonko/tablewriter v0.0.5 github.com/olivere/elastic/v7 v7.0.32 github.com/pkg/errors v0.9.1 @@ -174,4 +174,4 @@ require ( modernc.org/memory v1.11.0 // indirect ) -replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759 +replace go.temporal.io/api => github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3 diff --git a/go.sum b/go.sum index e9d2b357fa..bc007fa0c3 100644 --- a/go.sum +++ b/go.sum @@ -236,8 +236,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= -github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5 h1:Van9KGGs8lcDgxzSNFbDhEMNeJ80TbBxwZ45f9iBk9U= -github.com/nexus-rpc/sdk-go v0.5.2-0.20260211051645-26b0b4c584e5/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= +github.com/nexus-rpc/sdk-go v0.6.0 h1:QRgnP2zTbxEbiyWG/aXH8uSC5LV/Mg1fqb19jb4DBlo= +github.com/nexus-rpc/sdk-go v0.6.0/go.mod h1:FHdPfVQwRuJFZFTF0Y2GOAxCrbIBNrcPna9slkGKPYk= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= @@ -310,8 +310,8 @@ github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXl github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759 h1:CSlBGjKIgi770YWTYB1dt2AJuLKU6yArSZL636UStdo= -github.com/temporalio/api-go v1.62.2-0.20260313212811-d44912090759/go.mod h1:iaxoP/9OXMJcQkETTECfwYq4cw/bj4nwov8b3ZLVnXM= +github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3 h1:8T6S/2+0jCL//uKnmwQsH7W+O1VnkonSMuzpfR+AZe8= +github.com/temporalio/api-go v1.62.2-0.20260314000959-bbb2a94130c3/go.mod h1:ucB3ZO5X2AFLJcUBzOrio08zxiQjuzdM/7aRKOEQPEc= github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7 h1:lEebX/hZss+TSH3EBwhztnBavJVj7pWGJOH8UgKHS0w= github.com/temporalio/ringpop-go v0.0.0-20250130211428-b97329e994f7/go.mod h1:RE+CHmY+kOZQk47AQaVzwrGmxpflnLgTd6EOK0853j4= github.com/temporalio/sqlparser v0.0.0-20231115171017-f4060bcfa6cb h1:YzHH/U/dN7vMP+glybzcXRTczTrgfdRisNTzAj7La04= diff --git a/service/frontend/nexus_handler.go b/service/frontend/nexus_handler.go index 48432e45fb..c1b3591818 100644 --- a/service/frontend/nexus_handler.go +++ b/service/frontend/nexus_handler.go @@ -447,115 +447,42 @@ func (h *nexusHandler) StartOperation( oc.logger.Error("received error from matching service for Nexus StartOperation request", tag.Error(err)) return nil, commonnexus.ConvertGRPCError(err, false) } - // Convert to standard Nexus SDK response. - switch t := response.GetOutcome().(type) { - case *matchingservice.DispatchNexusTaskResponse_Failure: - // Set the failure source to "worker" if we've reached this case. - // Failure conversions errors below are the user's fault, as it implies that malformed completions were sent from - // the worker. + // Convert to standard Nexus SDK response and check for errors. + nexusErr := commonnexus.DispatchNexusTaskResponseToError(response) + if nexusErr != nil { oc.setFailureSource(commonnexus.FailureSourceWorker) - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:" + t.Failure.GetNexusHandlerFailureInfo().GetType())) - nf, err := commonnexus.TemporalFailureToNexusFailure(t.Failure) - if err != nil { - oc.logger.Error("error converting Temporal failure to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") - } - he, err := nexusrpc.DefaultFailureConverter().FailureToError(nf) - if err != nil { - oc.logger.Error("error converting Nexus failure to Nexus HandlerError", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") + oc.metricsHandler = oc.metricsHandler.WithTags(outcomeTagForNexusError(nexusErr)) + return nil, nexusErr + } + + // Success path: extract the result from the StartOperation response. + startOp := response.GetResponse().GetStartOperation() + switch t := startOp.GetVariant().(type) { + case *nexuspb.StartOperationResponse_SyncSuccess: + oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("sync_success")) + links := parseLinks(t.SyncSuccess.GetLinks(), oc.logger) + nexus.AddHandlerLinks(ctx, links...) + return &nexus.HandlerStartOperationResultSync[any]{ + Value: t.SyncSuccess.GetPayload(), + }, nil + + case *nexuspb.StartOperationResponse_AsyncSuccess: + oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("async_success")) + token := t.AsyncSuccess.GetOperationToken() + if token == "" { + token = t.AsyncSuccess.GetOperationId() } - return nil, he - - case *matchingservice.DispatchNexusTaskResponse_HandlerError: - // Deprecated case. Replaced with DispatchNexusTaskResponse_Failure - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:" + t.HandlerError.GetErrorType())) - oc.setFailureSource(commonnexus.FailureSourceWorker) - err := convertOutcomeToNexusHandlerError(t) - return nil, err - - case *matchingservice.DispatchNexusTaskResponse_RequestTimeout: - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_timeout")) + links := parseLinks(t.AsyncSuccess.GetLinks(), oc.logger) + nexus.AddHandlerLinks(ctx, links...) + return &nexus.HandlerStartOperationResultAsync{ + OperationToken: token, + }, nil + + default: + oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:EMPTY_OUTCOME")) oc.setFailureSource(commonnexus.FailureSourceWorker) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeUpstreamTimeout, "upstream timeout") - - case *matchingservice.DispatchNexusTaskResponse_Response: - switch t := t.Response.GetStartOperation().GetVariant().(type) { - case *nexuspb.StartOperationResponse_SyncSuccess: - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("sync_success")) - links := parseLinks(t.SyncSuccess.GetLinks(), oc.logger) - nexus.AddHandlerLinks(ctx, links...) - return &nexus.HandlerStartOperationResultSync[any]{ - Value: t.SyncSuccess.GetPayload(), - }, nil - - case *nexuspb.StartOperationResponse_AsyncSuccess: - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("async_success")) - token := t.AsyncSuccess.GetOperationToken() - if token == "" { - token = t.AsyncSuccess.GetOperationId() - } - links := parseLinks(t.AsyncSuccess.GetLinks(), oc.logger) - nexus.AddHandlerLinks(ctx, links...) - return &nexus.HandlerStartOperationResultAsync{ - OperationToken: token, - }, nil - - case *nexuspb.StartOperationResponse_OperationError: - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("operation_error")) - oc.setFailureSource(commonnexus.FailureSourceWorker) - opErr := &nexus.OperationError{ - Message: "operation error", - // nolint:staticcheck // Deprecated function still in use for backward compatibility. - State: nexus.OperationState(t.OperationError.GetOperationState()), - Cause: &nexus.FailureError{ - // nolint:staticcheck // Deprecated function still in use for backward compatibility. - Failure: commonnexus.ProtoFailureToNexusFailure(t.OperationError.GetFailure()), - }, - } - if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil { - oc.logger.Error("error converting OperationError to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") - } - return nil, opErr - - case *nexuspb.StartOperationResponse_Failure: - // Set the failure source to "worker" if we've reached this case. - // Failure conversions errors below are the user's fault, as it implies that malformed completions were sent from - // the worker. - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("failure")) - oc.setFailureSource(commonnexus.FailureSourceWorker) - nf, err := commonnexus.TemporalFailureToNexusFailure(t.Failure) - if err != nil { - oc.logger.Error("error converting Temporal failure to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") - } - cause, err := nexusrpc.DefaultFailureConverter().FailureToError(nf) - if err != nil { - oc.logger.Error("error converting Nexus failure to Nexus OperationError", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") - } - state := nexus.OperationStateFailed - if t.Failure.GetCanceledFailureInfo() != nil { - state = nexus.OperationStateCanceled - } - opErr := &nexus.OperationError{ - State: state, - Message: "operation error", - Cause: cause, - } - if err := nexusrpc.MarkAsWrapperError(nexusrpc.DefaultFailureConverter(), opErr); err != nil { - oc.logger.Error("error converting OperationError to Nexus failure", tag.Error(err), tag.Operation(operation), tag.WorkflowNamespace(oc.namespaceName)) - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "internal error") - } - return nil, opErr - } + return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome") } - // This is the worker's fault. - oc.metricsHandler = oc.metricsHandler.WithTags(metrics.OutcomeTag("handler_error:EMPTY_OUTCOME")) - oc.setFailureSource(commonnexus.FailureSourceWorker) - - return nil, nexus.NewHandlerErrorf(nexus.HandlerErrorTypeInternal, "empty outcome") } func parseLinks(links []*nexuspb.Link, logger log.Logger) []nexus.Link { @@ -819,6 +746,22 @@ func convertOutcomeToNexusHandlerError(resp *matchingservice.DispatchNexusTaskRe } } +// outcomeTagForNexusError returns a metrics OutcomeTag based on the Nexus SDK error type. +func outcomeTagForNexusError(nexusErr error) metrics.Tag { + var handlerErr *nexus.HandlerError + if errors.As(nexusErr, &handlerErr) { + if handlerErr.Type == nexus.HandlerErrorTypeUpstreamTimeout { + return metrics.OutcomeTag("handler_timeout") + } + return metrics.OutcomeTag("handler_error:" + string(handlerErr.Type)) + } + var opErr *nexus.OperationError + if errors.As(nexusErr, &opErr) { + return metrics.OutcomeTag("operation_error") + } + return metrics.OutcomeTag("handler_error:UNKNOWN") +} + func (nc *nexusContext) setFailureSource(source string) { nc.responseHeadersMutex.Lock() defer nc.responseHeadersMutex.Unlock() diff --git a/service/history/activity_command_task_dispatcher.go b/service/history/activity_command_task_dispatcher.go deleted file mode 100644 index 60c61fff87..0000000000 --- a/service/history/activity_command_task_dispatcher.go +++ /dev/null @@ -1,149 +0,0 @@ -package history - -import ( - "context" - "errors" - "fmt" - "time" - - enumspb "go.temporal.io/api/enums/v1" - nexuspb "go.temporal.io/api/nexus/v1" - workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" - taskqueuepb "go.temporal.io/api/taskqueue/v1" - "go.temporal.io/server/api/matchingservice/v1" - "go.temporal.io/server/common/debug" - "go.temporal.io/server/common/log" - "go.temporal.io/server/common/log/tag" - "go.temporal.io/server/common/metrics" - "go.temporal.io/server/common/payload" - "go.temporal.io/server/common/resource" - "go.temporal.io/server/service/history/configs" - "go.temporal.io/server/service/history/tasks" -) - -const ( - workerCommandsTaskTimeout = time.Second * 10 * debug.TimeoutMultiplier -) - -// workerCommandsTaskDispatcher handles dispatching worker commands to workers via Nexus. -type workerCommandsTaskDispatcher struct { - matchingRawClient resource.MatchingRawClient - config *configs.Config - metricsHandler metrics.Handler - logger log.Logger -} - -func newWorkerCommandsTaskDispatcher( - matchingRawClient resource.MatchingRawClient, - config *configs.Config, - metricsHandler metrics.Handler, - logger log.Logger, -) *workerCommandsTaskDispatcher { - return &workerCommandsTaskDispatcher{ - matchingRawClient: matchingRawClient, - config: config, - metricsHandler: metricsHandler, - logger: logger, - } -} - -func (d *workerCommandsTaskDispatcher) execute( - ctx context.Context, - task *tasks.WorkerCommandsTask, -) error { - if !d.config.EnableCancelActivityWorkerCommand() { - return nil - } - - if len(task.Commands) == 0 { - return nil - } - - ctx, cancel := context.WithTimeout(ctx, workerCommandsTaskTimeout) - defer cancel() - - return d.dispatchToWorker(ctx, task) -} - -const ( - workerServiceName = "temporal.api.nexusservices.workerservice.v1.WorkerService" - executeCommandsOperation = "ExecuteCommands" -) - -func (d *workerCommandsTaskDispatcher) dispatchToWorker( - ctx context.Context, - task *tasks.WorkerCommandsTask, -) error { - request := &workerservicepb.ExecuteCommandsRequest{ - Commands: task.Commands, - } - requestPayload, err := payload.Encode(request) - if err != nil { - return fmt.Errorf("failed to encode worker commands request: %w", err) - } - - nexusRequest := &nexuspb.Request{ - Header: map[string]string{}, - Variant: &nexuspb.Request_StartOperation{ - StartOperation: &nexuspb.StartOperationRequest{ - Service: workerServiceName, - Operation: executeCommandsOperation, - Payload: requestPayload, - }, - }, - } - - resp, err := d.matchingRawClient.DispatchNexusTask(ctx, &matchingservice.DispatchNexusTaskRequest{ - NamespaceId: task.NamespaceID, - TaskQueue: &taskqueuepb.TaskQueue{ - Name: task.Destination, - Kind: enumspb.TASK_QUEUE_KIND_NORMAL, - }, - Request: nexusRequest, - }) - if err != nil { - d.logger.Warn("Failed to dispatch worker commands", - tag.NewStringTag("control_queue", task.Destination), - tag.Error(err)) - return err - } - - return d.handleDispatchResponse(resp, task.Destination) -} - -func (d *workerCommandsTaskDispatcher) handleDispatchResponse( - resp *matchingservice.DispatchNexusTaskResponse, - controlQueue string, -) error { - if resp.GetRequestTimeout() != nil { - d.logger.Warn("No worker polling control queue", - tag.NewStringTag("control_queue", controlQueue)) - return errors.New("no worker polling control queue") - } - - if failure := resp.GetFailure(); failure != nil { - d.logger.Warn("Worker handler failed", - tag.NewStringTag("control_queue", controlQueue), - tag.NewStringTag("failure_message", failure.GetMessage())) - return fmt.Errorf("worker handler failed: %s", failure.GetMessage()) - } - - nexusResp := resp.GetResponse() - if nexusResp == nil { - return nil - } - - startOpResp := nexusResp.GetStartOperation() - if startOpResp == nil { - return nil - } - - if opFailure := startOpResp.GetFailure(); opFailure != nil { - d.logger.Warn("Worker command operation failure", - tag.NewStringTag("control_queue", controlQueue), - tag.NewStringTag("failure_message", opFailure.GetMessage())) - return nil - } - - return nil -} diff --git a/service/history/worker_commands_task_dispatcher.go b/service/history/worker_commands_task_dispatcher.go new file mode 100644 index 0000000000..c0c3a39ac6 --- /dev/null +++ b/service/history/worker_commands_task_dispatcher.go @@ -0,0 +1,168 @@ +package history + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/nexus-rpc/sdk-go/nexus" + enumspb "go.temporal.io/api/enums/v1" + nexuspb "go.temporal.io/api/nexus/v1" + workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" + taskqueuepb "go.temporal.io/api/taskqueue/v1" + "go.temporal.io/server/api/matchingservice/v1" + "go.temporal.io/server/common/debug" + "go.temporal.io/server/common/log" + "go.temporal.io/server/common/log/tag" + "go.temporal.io/server/common/metrics" + commonnexus "go.temporal.io/server/common/nexus" + "go.temporal.io/server/common/payload" + "go.temporal.io/server/common/resource" + "go.temporal.io/server/service/history/configs" + "go.temporal.io/server/service/history/tasks" +) + +const ( + workerCommandsTaskTimeout = time.Second * 10 * debug.TimeoutMultiplier +) + +// workerCommandsTaskDispatcher dispatches worker commands to workers via Nexus. +// +// Failure scenarios: +// - No worker polling: matching returns RequestTimeout → *nexus.HandlerError{Type: UpstreamTimeout}. +// Retryable — worker may come up later. +// - Worker crashes after receiving the task: matching blocks waiting for a response until +// context deadline, then returns RequestTimeout. Indistinguishable from "no worker polling". +// Safe to retry because commands are idempotent (e.g., cancelling a missing activity is a +// no-op success per the worker contract). +// - Transport/RPC failure: *nexus.HandlerError. Retryable. +// - Operation failure (worker explicitly returns error): *nexus.OperationError. Permanent — +// the worker contract requires success for all defined commands, so this indicates a bug +// or version incompatibility. +// +// TODO: Add a worker-commands-specific retry cap (e.g., 5 attempts) instead of relying on the +// shared outbound queue HistoryTaskDLQUnexpectedErrorAttempts (default 70). These commands are +// best-effort with heartbeat timeout as fallback, so excessive retries waste resources. +type workerCommandsTaskDispatcher struct { + matchingRawClient resource.MatchingRawClient + config *configs.Config + metricsHandler metrics.Handler + logger log.Logger +} + +func newWorkerCommandsTaskDispatcher( + matchingRawClient resource.MatchingRawClient, + config *configs.Config, + metricsHandler metrics.Handler, + logger log.Logger, +) *workerCommandsTaskDispatcher { + return &workerCommandsTaskDispatcher{ + matchingRawClient: matchingRawClient, + config: config, + metricsHandler: metricsHandler, + logger: logger, + } +} + +func (d *workerCommandsTaskDispatcher) execute( + ctx context.Context, + task *tasks.WorkerCommandsTask, +) error { + if !d.config.EnableCancelActivityWorkerCommand() { + return nil + } + + if len(task.Commands) == 0 { + return nil + } + + ctx, cancel := context.WithTimeout(ctx, workerCommandsTaskTimeout) + defer cancel() + + return d.dispatchToWorker(ctx, task) +} + +func (d *workerCommandsTaskDispatcher) dispatchToWorker( + ctx context.Context, + task *tasks.WorkerCommandsTask, +) error { + request := &workerservicepb.ExecuteCommandsRequest{ + Commands: task.Commands, + } + requestPayload, err := payload.Encode(request) + if err != nil { + return fmt.Errorf("failed to encode worker commands request: %w", err) + } + + nexusRequest := &nexuspb.Request{ + Header: map[string]string{}, + Variant: &nexuspb.Request_StartOperation{ + StartOperation: &nexuspb.StartOperationRequest{ + Service: workerservicepb.WorkerService.ServiceName, + Operation: workerservicepb.WorkerService.ExecuteCommands.Name(), + Payload: requestPayload, + }, + }, + } + + resp, err := d.matchingRawClient.DispatchNexusTask(ctx, &matchingservice.DispatchNexusTaskRequest{ + NamespaceId: task.NamespaceID, + TaskQueue: &taskqueuepb.TaskQueue{ + Name: task.Destination, + Kind: enumspb.TASK_QUEUE_KIND_NORMAL, + }, + Request: nexusRequest, + }) + if err != nil { + d.logger.Warn("Failed to dispatch worker commands", + tag.NewStringTag("control_queue", task.Destination), + tag.Error(err)) + metrics.WorkerCommandsDispatchFailure.With(d.metricsHandler).Record(1) + return err + } + + nexusErr := commonnexus.DispatchNexusTaskResponseToError(resp) + if nexusErr == nil { + metrics.WorkerCommandsDispatchSuccess.With(d.metricsHandler).Record(1) + return nil + } + + return d.handleError(nexusErr, task.Destination) +} + +func (d *workerCommandsTaskDispatcher) handleError(nexusErr error, controlQueue string) error { + var opErr *nexus.OperationError + if errors.As(nexusErr, &opErr) { + // Operation-level failure: the worker received and processed the request but returned + // an error. Permanent — the worker contract requires success for all defined commands, + // so this indicates a bug or version incompatibility. Retrying won't help. + d.logger.Error("Worker returned operation failure for worker commands", + tag.NewStringTag("control_queue", controlQueue), + tag.Error(nexusErr)) + metrics.WorkerCommandsOperationFailure.With(d.metricsHandler).Record(1) + return nil + } + + var handlerErr *nexus.HandlerError + if errors.As(nexusErr, &handlerErr) { + if handlerErr.Type == nexus.HandlerErrorTypeUpstreamTimeout { + d.logger.Warn("No worker polling control queue", + tag.NewStringTag("control_queue", controlQueue)) + metrics.WorkerCommandsDispatchNoPoller.With(d.metricsHandler).Record(1) + return nexusErr + } + + d.logger.Warn("Worker commands transport failure", + tag.NewStringTag("control_queue", controlQueue), + tag.Error(nexusErr)) + metrics.WorkerCommandsDispatchFailure.With(d.metricsHandler).Record(1) + return nexusErr + } + + d.logger.Warn("Worker commands unexpected error", + tag.NewStringTag("control_queue", controlQueue), + tag.Error(nexusErr)) + metrics.WorkerCommandsDispatchFailure.With(d.metricsHandler).Record(1) + return nexusErr +} diff --git a/tests/activity_command_task_test.go b/tests/worker_commands_task_test.go similarity index 95% rename from tests/activity_command_task_test.go rename to tests/worker_commands_task_test.go index e943a45683..835537c742 100644 --- a/tests/activity_command_task_test.go +++ b/tests/worker_commands_task_test.go @@ -8,6 +8,7 @@ import ( commandpb "go.temporal.io/api/command/v1" commonpb "go.temporal.io/api/common/v1" enumspb "go.temporal.io/api/enums/v1" + workerservicepb "go.temporal.io/api/nexusservices/workerservice/v1" taskqueuepb "go.temporal.io/api/taskqueue/v1" "go.temporal.io/api/workflowservice/v1" "go.temporal.io/server/common/dynamicconfig" @@ -142,7 +143,7 @@ func TestDispatchCancelToWorker(t *testing.T) { startOp := nexusPollResp.Request.GetStartOperation() env.NotNil(startOp, "Expected StartOperation in Nexus request") - env.Equal("temporal.api.nexusservices.workerservice.v1.WorkerService", startOp.Service, "Expected WorkerService") - env.Equal("ExecuteCommands", startOp.Operation, "Expected ExecuteCommands operation") + env.Equal(workerservicepb.WorkerService.ServiceName, startOp.Service, "Expected WorkerService") + env.Equal(workerservicepb.WorkerService.ExecuteCommands.Name(), startOp.Operation, "Expected ExecuteCommands operation") t.Log("SUCCESS: Received ExecuteCommands Nexus request on control queue") }