From ce950c27eaa4f17972d5025ff5c17b5d4c345741 Mon Sep 17 00:00:00 2001
From: Jakub Hrozek <jakub@stacklok.com>
Date: Wed, 19 Nov 2025 22:48:53 +0000
Subject: [PATCH 1/4] Add composer workflow integration tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tests cover core DAG execution patterns:
- Sequential two-step workflow
- Parallel fan-out with aggregation
- Diamond DAG (diverge/converge)
- Parameter template expansion

Uses table-driven approach for maintainability. Additional tests
for conditionals and nested data blocked by issue #2669.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 test/integration/vmcp/composer_basic_test.go | 252 +++++++++++++++++++
 test/integration/vmcp/helpers/vmcp_server.go |  27 +-
 2 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 test/integration/vmcp/composer_basic_test.go

diff --git a/test/integration/vmcp/composer_basic_test.go b/test/integration/vmcp/composer_basic_test.go
new file mode 100644
index 000000000..507b856d3
--- /dev/null
+++ b/test/integration/vmcp/composer_basic_test.go
@@ -0,0 +1,252 @@
+package vmcp_test
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/stacklok/toolhive/pkg/vmcp/composer"
+	"github.com/stacklok/toolhive/test/integration/vmcp/helpers"
+)
+
+// composerTestCase defines a declarative test case for composer workflows.
+type composerTestCase struct {
+	name        string
+	tools       []helpers.BackendTool // All tools (will be grouped by backend automatically)
+	workflow    *composer.WorkflowDefinition
+	params      map[string]any
+	wantStrings []string // Strings that must appear in output
+}
+
+// runComposerTest executes a single composer test case.
+func runComposerTest(t *testing.T, tc composerTestCase) {
+	t.Helper()
+	t.Parallel()
+
+	ctx := context.Background()
+
+	// Create a single backend with all tools
+	backend := helpers.CreateBackendServer(t, tc.tools, helpers.WithBackendName("test-backend"))
+	defer backend.Close()
+
+	// Create vMCP server
+	vmcpServer := helpers.NewVMCPServer(ctx, t, helpers.BackendsFromURL("test", backend.URL+"/mcp"),
+		helpers.WithPrefixConflictResolution("{workload}_"),
+		helpers.WithWorkflows([]*composer.WorkflowDefinition{tc.workflow}),
+	)
+
+	// Create client and execute workflow
+	client := helpers.NewMCPClient(ctx, t, "http://"+vmcpServer.Address()+"/mcp")
+	defer client.Close()
+
+	result := client.CallTool(ctx, tc.workflow.Name, tc.params)
+	text := helpers.AssertToolCallSuccess(t, result)
+
+	// Verify expected strings
+	helpers.AssertTextContains(t, text, tc.wantStrings...)
+
+	t.Logf("Test %q completed: %s", tc.name, text)
+}
+
+// TestVMCPComposer runs table-driven tests for composer workflow patterns.
+// Note: Tests use plain text output instead of structured JSON fields due to
+// a known bug where MCP tool results aren't parsed into structured data.
+// See: https://github.com/stacklok/toolhive/issues/2669
+func TestVMCPComposer(t *testing.T) {
+	t.Parallel()
+
+	tests := []composerTestCase{
+		{
+			name: "SequentialTwoSteps",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("generate", "Generate message",
+					func(_ context.Context, args map[string]any) string {
+						prefix, _ := args["prefix"].(string)
+						return prefix + " - generated"
+					}),
+				helpers.NewBackendTool("process", "Process message",
+					func(_ context.Context, args map[string]any) string {
+						msg, _ := args["message"].(string)
+						suffix, _ := args["suffix"].(string)
+						return strings.ToUpper(msg) + " " + suffix
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "sequential",
+				Description: "Two-step sequential workflow",
+				Parameters: map[string]any{
+					"prefix": map[string]any{"type": "string"},
+					"suffix": map[string]any{"type": "string"},
+				},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "gen", Type: composer.StepTypeTool, Tool: "test_generate",
+						Arguments: map[string]any{"prefix": "{{.params.prefix}}"}},
+					{ID: "proc", Type: composer.StepTypeTool, Tool: "test_process",
+						Arguments: map[string]any{
+							"message": "{{.steps.gen.output.text}}",
+							"suffix":  "{{.params.suffix}}",
+						},
+						DependsOn: []string{"gen"}},
+				},
+			},
+			params:      map[string]any{"prefix": "Hello", "suffix": "[DONE]"},
+			wantStrings: []string{"HELLO", "GENERATED", "[DONE]"},
+		},
+		{
+			name: "ParallelFanOut",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("task_a", "Task A", func(_ context.Context, _ map[string]any) string { return "A" }),
+				helpers.NewBackendTool("task_b", "Task B", func(_ context.Context, _ map[string]any) string { return "B" }),
+				helpers.NewBackendTool("task_c", "Task C", func(_ context.Context, _ map[string]any) string { return "C" }),
+				helpers.NewBackendTool("aggregate", "Aggregate",
+					func(_ context.Context, args map[string]any) string {
+						a, _ := args["a"].(string)
+						b, _ := args["b"].(string)
+						c, _ := args["c"].(string)
+						return "COMBINED:" + a + "," + b + "," + c
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "fanout",
+				Description: "Parallel fan-out workflow",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "a", Type: composer.StepTypeTool, Tool: "test_task_a", Arguments: map[string]any{}},
+					{ID: "b", Type: composer.StepTypeTool, Tool: "test_task_b", Arguments: map[string]any{}},
+					{ID: "c", Type: composer.StepTypeTool, Tool: "test_task_c", Arguments: map[string]any{}},
+					{ID: "agg", Type: composer.StepTypeTool, Tool: "test_aggregate",
+						Arguments: map[string]any{
+							"a": "{{.steps.a.output.text}}",
+							"b": "{{.steps.b.output.text}}",
+							"c": "{{.steps.c.output.text}}",
+						},
+						DependsOn: []string{"a", "b", "c"}},
+				},
+			},
+			params:      map[string]any{},
+			wantStrings: []string{"COMBINED", "A", "B", "C"},
+		},
+		{
+			name: "DiamondDAG",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("start", "Start",
+					func(_ context.Context, args map[string]any) string {
+						input, _ := args["input"].(string)
+						return "started:" + input
+					}),
+				helpers.NewBackendTool("left", "Left branch",
+					func(_ context.Context, args map[string]any) string {
+						data, _ := args["data"].(string)
+						return "L(" + data + ")"
+					}),
+				helpers.NewBackendTool("right", "Right branch",
+					func(_ context.Context, args map[string]any) string {
+						data, _ := args["data"].(string)
+						return "R(" + data + ")"
+					}),
+				helpers.NewBackendTool("merge", "Merge",
+					func(_ context.Context, args map[string]any) string {
+						l, _ := args["left"].(string)
+						r, _ := args["right"].(string)
+						return "MERGED[" + l + "+" + r + "]"
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "diamond",
+				Description: "Diamond-shaped DAG",
+				Parameters:  map[string]any{"input": map[string]any{"type": "string"}},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "start", Type: composer.StepTypeTool, Tool: "test_start",
+						Arguments: map[string]any{"input": "{{.params.input}}"}},
+					{ID: "left", Type: composer.StepTypeTool, Tool: "test_left",
+						Arguments: map[string]any{"data": "{{.steps.start.output.text}}"},
+						DependsOn: []string{"start"}},
+					{ID: "right", Type: composer.StepTypeTool, Tool: "test_right",
+						Arguments: map[string]any{"data": "{{.steps.start.output.text}}"},
+						DependsOn: []string{"start"}},
+					{ID: "merge", Type: composer.StepTypeTool, Tool: "test_merge",
+						Arguments: map[string]any{
+							"left":  "{{.steps.left.output.text}}",
+							"right": "{{.steps.right.output.text}}",
+						},
+						DependsOn: []string{"left", "right"}},
+				},
+			},
+			params:      map[string]any{"input": "test"},
+			wantStrings: []string{"MERGED", "L(", "R(", "started:test"},
+		},
+		{
+			name: "ParameterPassing",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("echo", "Echo params",
+					func(_ context.Context, args map[string]any) string {
+						name, _ := args["name"].(string)
+						count, _ := args["count"].(string)
+						return "name=" + name + ",count=" + count
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "params",
+				Description: "Parameter passing test",
+				Parameters: map[string]any{
+					"name":  map[string]any{"type": "string"},
+					"count": map[string]any{"type": "string"},
+				},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "echo", Type: composer.StepTypeTool, Tool: "test_echo",
+						Arguments: map[string]any{
+							"name":  "{{.params.name}}",
+							"count": "{{.params.count}}",
+						}},
+				},
+			},
+			params:      map[string]any{"name": "testuser", "count": "42"},
+			wantStrings: []string{"name=testuser", "count=42"},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			runComposerTest(t, tc)
+		})
+	}
+}
+
+// TestVMCPComposer_WorkflowListed verifies workflow appears in tool list.
+func TestVMCPComposer_WorkflowListed(t *testing.T) {
+	t.Parallel()
+	ctx := context.Background()
+
+	backend := helpers.CreateBackendServer(t, []helpers.BackendTool{
+		helpers.NewBackendTool("noop", "No-op", func(_ context.Context, _ map[string]any) string { return "ok" }),
+	}, helpers.WithBackendName("test"))
+	defer backend.Close()
+
+	workflow := &composer.WorkflowDefinition{
+		Name:        "my_workflow",
+		Description: "Test workflow",
+		Parameters:  map[string]any{},
+		FailureMode: "abort",
+		Steps: []composer.WorkflowStep{
+			{ID: "step1", Type: composer.StepTypeTool, Tool: "test_noop", Arguments: map[string]any{}},
+		},
+	}
+
+	vmcpServer := helpers.NewVMCPServer(ctx, t, helpers.BackendsFromURL("test", backend.URL+"/mcp"),
+		helpers.WithPrefixConflictResolution("{workload}_"),
+		helpers.WithWorkflows([]*composer.WorkflowDefinition{workflow}),
+	)
+
+	client := helpers.NewMCPClient(ctx, t, "http://"+vmcpServer.Address()+"/mcp")
+	defer client.Close()
+
+	toolsResp := client.ListTools(ctx)
+	toolNames := helpers.GetToolNames(toolsResp)
+	assert.Contains(t, toolNames, "my_workflow", "Workflow should be exposed as a tool")
+}
diff --git a/test/integration/vmcp/helpers/vmcp_server.go b/test/integration/vmcp/helpers/vmcp_server.go
index f001a575c..12b6bc570 100644
--- a/test/integration/vmcp/helpers/vmcp_server.go
+++ b/test/integration/vmcp/helpers/vmcp_server.go
@@ -15,6 +15,7 @@ import (
 	"github.com/stacklok/toolhive/pkg/vmcp/auth/factory"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
 	vmcpclient "github.com/stacklok/toolhive/pkg/vmcp/client"
+	"github.com/stacklok/toolhive/pkg/vmcp/composer"
 	"github.com/stacklok/toolhive/pkg/vmcp/discovery"
 	"github.com/stacklok/toolhive/pkg/vmcp/router"
 	vmcpserver "github.com/stacklok/toolhive/pkg/vmcp/server"
@@ -58,6 +59,11 @@ func WithMetadata(key, value string) func(*vmcptypes.Backend) {
 	}
 }
 
+// BackendsFromURL creates a single-element backend slice for simple test cases.
+func BackendsFromURL(id, url string) []vmcptypes.Backend {
+	return []vmcptypes.Backend{NewBackend(id, WithURL(url))}
+}
+
 // VMCPServerOption is a functional option for configuring a vMCP test server.
 type VMCPServerOption func(*vmcpServerConfig)
 
@@ -65,6 +71,7 @@ type VMCPServerOption func(*vmcpServerConfig)
 type vmcpServerConfig struct {
 	conflictStrategy string
 	prefixFormat     string
+	workflows        []*composer.WorkflowDefinition
 }
 
 // WithPrefixConflictResolution configures prefix-based conflict resolution.
@@ -75,6 +82,13 @@ func WithPrefixConflictResolution(format string) VMCPServerOption {
 	}
 }
 
+// WithWorkflows configures workflow definitions for the vMCP server.
+func WithWorkflows(workflows []*composer.WorkflowDefinition) VMCPServerOption {
+	return func(c *vmcpServerConfig) {
+		c.workflows = workflows
+	}
+}
+
 // getFreePort returns an available TCP port on localhost.
 // This is used for parallel test execution to avoid port conflicts.
 func getFreePort(tb testing.TB) int {
@@ -146,6 +160,17 @@ func NewVMCPServer(
 	// Create router
 	rtr := router.NewDefaultRouter()
 
+	// Convert workflows slice to map keyed by workflow name
+	// Note: This is a simple conversion for test-created WorkflowDefinitions.
+	// For config-based workflows, use server.ConvertConfigToWorkflowDefinitions() instead.
+	var workflowsMap map[string]*composer.WorkflowDefinition
+	if len(config.workflows) > 0 {
+		workflowsMap = make(map[string]*composer.WorkflowDefinition, len(config.workflows))
+		for _, wf := range config.workflows {
+			workflowsMap[wf.Name] = wf
+		}
+	}
+
 	// Create vMCP server with test-specific defaults
 	vmcpServer, err := vmcpserver.New(&vmcpserver.Config{
 		Name:           "test-vmcp",
@@ -153,7 +178,7 @@ func NewVMCPServer(
 		Host:           "127.0.0.1",
 		Port:           getFreePort(tb), // Get a random available port for parallel test execution
 		AuthMiddleware: auth.AnonymousMiddleware,
-	}, rtr, backendClient, discoveryMgr, backends, nil) // nil for workflowDefs in tests
+	}, rtr, backendClient, discoveryMgr, backends, workflowsMap)
 	require.NoError(tb, err, "failed to create vMCP server")
 
 	// Start server automatically

From e7831a4d7f0c10f62a83c8376abd53dd2056d844 Mon Sep 17 00:00:00 2001
From: Jakub Hrozek <jakub@stacklok.com>
Date: Mon, 24 Nov 2025 15:10:50 +0000
Subject: [PATCH 2/4] Add error handling tests for composer workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the vMCP composer integration tests with error handling scenarios:
- SingleStepToolError: basic error propagation
- AbortMode_StopsOnFirstError: abort mode stops on first failure
- ContinueMode_PartialSuccess: continue mode returns partial results
- MultipleParallelErrors: handles multiple concurrent failures

Also adds test infrastructure for error scenarios:
- ErrorHandler field on BackendTool for returning errors
- Helper functions: NewErrorBackendTool, NewSlowBackendTool, NewTransientBackendTool
- AssertToolCallError helper for asserting error responses

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 test/integration/vmcp/composer_basic_test.go | 96 ++++++++++++++++++++
 test/integration/vmcp/helpers/backend.go     | 76 +++++++++++++++-
 test/integration/vmcp/helpers/mcp_client.go  | 30 ++++++
 3 files changed, 199 insertions(+), 3 deletions(-)

diff --git a/test/integration/vmcp/composer_basic_test.go b/test/integration/vmcp/composer_basic_test.go
index 507b856d3..914c076a7 100644
--- a/test/integration/vmcp/composer_basic_test.go
+++ b/test/integration/vmcp/composer_basic_test.go
@@ -18,6 +18,7 @@ type composerTestCase struct {
 	workflow    *composer.WorkflowDefinition
 	params      map[string]any
 	wantStrings []string // Strings that must appear in output
+	wantError   bool     // Expect workflow to return an error
 }
 
 // runComposerTest executes a single composer test case.
@@ -42,6 +43,17 @@ func runComposerTest(t *testing.T, tc composerTestCase) {
 	defer client.Close()
 
 	result := client.CallTool(ctx, tc.workflow.Name, tc.params)
+
+	// Handle error vs success expectations
+	if tc.wantError {
+		text := helpers.AssertToolCallError(t, result)
+		if len(tc.wantStrings) > 0 {
+			helpers.AssertTextContains(t, text, tc.wantStrings...)
+		}
+		t.Logf("Test %q completed with expected error: %s", tc.name, text)
+		return
+	}
+
 	text := helpers.AssertToolCallSuccess(t, result)
 
 	// Verify expected strings
@@ -209,6 +221,90 @@ func TestVMCPComposer(t *testing.T) {
 			params:      map[string]any{"name": "testuser", "count": "42"},
 			wantStrings: []string{"name=testuser", "count=42"},
 		},
+		// Error handling tests
+		{
+			name: "SingleStepToolError",
+			tools: []helpers.BackendTool{
+				helpers.NewErrorBackendTool("failing_tool", "Always fails", "TOOL_ERROR: operation failed"),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "single_error",
+				Description: "Single step that errors",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "fail", Type: composer.StepTypeTool, Tool: "test_failing_tool", Arguments: map[string]any{}},
+				},
+			},
+			params:      map[string]any{},
+			wantError:   true,
+			wantStrings: []string{"TOOL_ERROR"},
+		},
+		{
+			name: "AbortMode_StopsOnFirstError",
+			tools: []helpers.BackendTool{
+				helpers.NewErrorBackendTool("error_step", "Fails", "STEP_FAILED"),
+				helpers.NewBackendTool("success_step", "Succeeds", func(_ context.Context, _ map[string]any) string { return "SUCCESS" }),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "abort_test",
+				Description: "Abort on first error",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "error", Type: composer.StepTypeTool, Tool: "test_error_step", Arguments: map[string]any{}},
+					{ID: "success", Type: composer.StepTypeTool, Tool: "test_success_step", Arguments: map[string]any{}, DependsOn: []string{"error"}},
+				},
+			},
+			params:      map[string]any{},
+			wantError:   true,
+			wantStrings: []string{"STEP_FAILED"},
+		},
+		{
+			name: "ContinueMode_PartialSuccess",
+			tools: []helpers.BackendTool{
+				helpers.NewErrorBackendTool("fail_a", "Fails A", "ERROR_A"),
+				helpers.NewBackendTool("ok_b", "OK B", func(_ context.Context, _ map[string]any) string { return "SUCCESS_B" }),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "continue_test",
+				Description: "Continue despite errors",
+				Parameters:  map[string]any{},
+				FailureMode: "continue",
+				Steps: []composer.WorkflowStep{
+					// Independent parallel steps
+					{ID: "a", Type: composer.StepTypeTool, Tool: "test_fail_a", Arguments: map[string]any{}},
+					{ID: "b", Type: composer.StepTypeTool, Tool: "test_ok_b", Arguments: map[string]any{}},
+				},
+			},
+			params: map[string]any{},
+			// Continue mode returns success with partial results
+			wantError:   false,
+			wantStrings: []string{"SUCCESS_B"},
+		},
+		{
+			name: "MultipleParallelErrors",
+			tools: []helpers.BackendTool{
+				helpers.NewErrorBackendTool("fail_1", "Fails 1", "ERROR_1"),
+				helpers.NewErrorBackendTool("fail_2", "Fails 2", "ERROR_2"),
+				helpers.NewErrorBackendTool("fail_3", "Fails 3", "ERROR_3"),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "multi_error",
+				Description: "Multiple parallel failures",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{ID: "f1", Type: composer.StepTypeTool, Tool: "test_fail_1", Arguments: map[string]any{}},
+					{ID: "f2", Type: composer.StepTypeTool, Tool: "test_fail_2", Arguments: map[string]any{}},
+					{ID: "f3", Type: composer.StepTypeTool, Tool: "test_fail_3", Arguments: map[string]any{}},
+				},
+			},
+			params:    map[string]any{},
+			wantError: true,
+			// At least one error message should appear
+			wantStrings: []string{"ERROR"},
+		},
 	}
 
 	for _, tc := range tests {
diff --git a/test/integration/vmcp/helpers/backend.go b/test/integration/vmcp/helpers/backend.go
index 3bcfa2a4a..b4d81526f 100644
--- a/test/integration/vmcp/helpers/backend.go
+++ b/test/integration/vmcp/helpers/backend.go
@@ -5,7 +5,9 @@ import (
 	"context"
 	"net/http"
 	"net/http/httptest"
+	"sync/atomic"
 	"testing"
+	"time"
 
 	"github.com/mark3labs/mcp-go/mcp"
 	"github.com/mark3labs/mcp-go/server"
@@ -31,6 +33,11 @@ type BackendTool struct {
 	// The handler receives the tool arguments as a map and should return
 	// a string representation of the result (typically JSON).
 	Handler func(ctx context.Context, args map[string]any) string
+
+	// ErrorHandler is an alternative handler that can return errors.
+	// If set, this takes precedence over Handler.
+	// Returns (result, isError).
+	ErrorHandler func(ctx context.Context, args map[string]any) (string, bool)
 }
 
 // NewBackendTool creates a new BackendTool with sensible defaults.
@@ -58,6 +65,60 @@ func NewBackendTool(name, description string, handler func(ctx context.Context,
 	}
 }
 
+// NewErrorBackendTool creates a BackendTool that always returns an error.
+// This is useful for testing error handling and failure modes.
+func NewErrorBackendTool(name, description, errorMsg string) BackendTool {
+	return BackendTool{
+		Name:        name,
+		Description: description,
+		InputSchema: mcp.ToolInputSchema{
+			Type:       "object",
+			Properties: map[string]any{},
+		},
+		ErrorHandler: func(_ context.Context, _ map[string]any) (string, bool) {
+			return errorMsg, true
+		},
+	}
+}
+
+// NewSlowBackendTool creates a BackendTool that delays before returning.
+// This is useful for testing timeout behavior.
+func NewSlowBackendTool(name, description string, delay time.Duration, result string) BackendTool {
+	return BackendTool{
+		Name:        name,
+		Description: description,
+		InputSchema: mcp.ToolInputSchema{
+			Type:       "object",
+			Properties: map[string]any{},
+		},
+		Handler: func(_ context.Context, _ map[string]any) string {
+			time.Sleep(delay)
+			return result
+		},
+	}
+}
+
+// NewTransientBackendTool creates a BackendTool that fails a specified number of times before succeeding.
+// This is useful for testing retry behavior.
+func NewTransientBackendTool(name, description string, failCount int32, errorMsg, successMsg string) BackendTool {
+	var attempts int32
+	return BackendTool{
+		Name:        name,
+		Description: description,
+		InputSchema: mcp.ToolInputSchema{
+			Type:       "object",
+			Properties: map[string]any{},
+		},
+		ErrorHandler: func(_ context.Context, _ map[string]any) (string, bool) {
+			current := atomic.AddInt32(&attempts, 1)
+			if current <= failCount {
+				return errorMsg, true
+			}
+			return successMsg, false
+		},
+	}
+}
+
 // contextKey is a private type for context keys to avoid collisions.
 type contextKey string
 
@@ -198,14 +259,23 @@ func CreateBackendServer(tb testing.TB, tools []BackendTool, opts ...BackendServ
 					args = make(map[string]any)
 				}
 
-				// Invoke the tool handler
-				result := tool.Handler(ctx, args)
+				// Invoke the appropriate handler
+				var result string
+				var isError bool
+
+				if tool.ErrorHandler != nil {
+					result, isError = tool.ErrorHandler(ctx, args)
+				} else {
+					result = tool.Handler(ctx, args)
+					isError = false
+				}
 
-				// Return successful result with text content
+				// Return result with text content
 				return &mcp.CallToolResult{
 					Content: []mcp.Content{
 						mcp.NewTextContent(result),
 					},
+					IsError: isError,
 				}, nil
 			},
 		)
diff --git a/test/integration/vmcp/helpers/mcp_client.go b/test/integration/vmcp/helpers/mcp_client.go
index 10a5a54b4..82fd531a7 100644
--- a/test/integration/vmcp/helpers/mcp_client.go
+++ b/test/integration/vmcp/helpers/mcp_client.go
@@ -196,6 +196,36 @@ func AssertToolCallSuccess(tb testing.TB, result *mcp.CallToolResult) string {
 	return text
 }
 
+// AssertToolCallError asserts that a tool call failed (IsError=true)
+// and returns the concatenated text content from all content items.
+//
+// The function uses require assertions, so it will fail the test immediately
+// if the tool call was successful.
+//
+// Example:
+//
+//	result := client.CallTool(ctx, "failing_workflow", map[string]any{})
+//	text := helpers.AssertToolCallError(t, result)
+//	assert.Contains(t, text, "error message")
+func AssertToolCallError(tb testing.TB, result *mcp.CallToolResult) string {
+	tb.Helper()
+
+	require.NotNil(tb, result, "tool call result should not be nil")
+	require.True(tb, result.IsError, "tool call should return an error, got success: %v", result.Content)
+
+	var textParts []string
+	for _, content := range result.Content {
+		if textContent, ok := mcp.AsTextContent(content); ok {
+			textParts = append(textParts, textContent.Text)
+		}
+	}
+
+	text := strings.Join(textParts, "\n")
+	tb.Logf("Tool call failed as expected with %d content items, total text length: %d", len(result.Content), len(text))
+
+	return text
+}
+
 // AssertTextContains asserts that text contains all expected substrings.
 // This is a variadic helper for checking multiple content expectations in tool results.
 //

From a92662ab00c1d9b3d65f30969a5ea729d8cf2b80 Mon Sep 17 00:00:00 2001
From: Jakub Hrozek <jakub@stacklok.com>
Date: Wed, 26 Nov 2025 15:22:16 +0000
Subject: [PATCH 3/4] Use OutputConfig for structured outputs in composer tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates integration tests to use the proper OutputConfig feature instead
of relying on the plain text workaround. This aligns with the new
structured output support added in commit 51ac7d80.

- Add Output config to all successful workflow test cases
- Remove outdated note about issue #2669 workaround
- Error test cases intentionally left without Output config

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 test/integration/vmcp/composer_basic_test.go | 58 +++++++++++++++++++-
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/test/integration/vmcp/composer_basic_test.go b/test/integration/vmcp/composer_basic_test.go
index 914c076a7..4c84ec15e 100644
--- a/test/integration/vmcp/composer_basic_test.go
+++ b/test/integration/vmcp/composer_basic_test.go
@@ -8,6 +8,7 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	"github.com/stacklok/toolhive/pkg/vmcp/composer"
+	"github.com/stacklok/toolhive/pkg/vmcp/config"
 	"github.com/stacklok/toolhive/test/integration/vmcp/helpers"
 )
 
@@ -63,9 +64,6 @@ func runComposerTest(t *testing.T, tc composerTestCase) {
 }
 
 // TestVMCPComposer runs table-driven tests for composer workflow patterns.
-// Note: Tests use plain text output instead of structured JSON fields due to
-// a known bug where MCP tool results aren't parsed into structured data.
-// See: https://github.com/stacklok/toolhive/issues/2669
 func TestVMCPComposer(t *testing.T) {
 	t.Parallel()
 
@@ -103,6 +101,15 @@ func TestVMCPComposer(t *testing.T) {
 						},
 						DependsOn: []string{"gen"}},
 				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"result": {
+							Type:        "string",
+							Description: "Processed result",
+							Value:       "{{.steps.proc.output.text}}",
+						},
+					},
+				},
 			},
 			params:      map[string]any{"prefix": "Hello", "suffix": "[DONE]"},
 			wantStrings: []string{"HELLO", "GENERATED", "[DONE]"},
@@ -138,6 +145,15 @@ func TestVMCPComposer(t *testing.T) {
 						},
 						DependsOn: []string{"a", "b", "c"}},
 				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"combined": {
+							Type:        "string",
+							Description: "Combined result from all tasks",
+							Value:       "{{.steps.agg.output.text}}",
+						},
+					},
+				},
 			},
 			params:      map[string]any{},
 			wantStrings: []string{"COMBINED", "A", "B", "C"},
@@ -188,6 +204,15 @@ func TestVMCPComposer(t *testing.T) {
 						},
 						DependsOn: []string{"left", "right"}},
 				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"merged": {
+							Type:        "string",
+							Description: "Merged result from diamond DAG",
+							Value:       "{{.steps.merge.output.text}}",
+						},
+					},
+				},
 			},
 			params:      map[string]any{"input": "test"},
 			wantStrings: []string{"MERGED", "L(", "R(", "started:test"},
@@ -217,6 +242,15 @@ func TestVMCPComposer(t *testing.T) {
 							"count": "{{.params.count}}",
 						}},
 				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"echo_result": {
+							Type:        "string",
+							Description: "Echo result with parameters",
+							Value:       "{{.steps.echo.output.text}}",
+						},
+					},
+				},
 			},
 			params:      map[string]any{"name": "testuser", "count": "42"},
 			wantStrings: []string{"name=testuser", "count=42"},
@@ -276,6 +310,15 @@ func TestVMCPComposer(t *testing.T) {
 					{ID: "a", Type: composer.StepTypeTool, Tool: "test_fail_a", Arguments: map[string]any{}},
 					{ID: "b", Type: composer.StepTypeTool, Tool: "test_ok_b", Arguments: map[string]any{}},
 				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"result_b": {
+							Type:        "string",
+							Description: "Result from successful step B",
+							Value:       "{{.steps.b.output.text}}",
+						},
+					},
+				},
 			},
 			params: map[string]any{},
 			// Continue mode returns success with partial results
@@ -332,6 +375,15 @@ func TestVMCPComposer_WorkflowListed(t *testing.T) {
 		Steps: []composer.WorkflowStep{
 			{ID: "step1", Type: composer.StepTypeTool, Tool: "test_noop", Arguments: map[string]any{}},
 		},
+		Output: &config.OutputConfig{
+			Properties: map[string]config.OutputProperty{
+				"result": {
+					Type:        "string",
+					Description: "Noop result",
+					Value:       "{{.steps.step1.output.text}}",
+				},
+			},
+		},
 	}
 
 	vmcpServer := helpers.NewVMCPServer(ctx, t, helpers.BackendsFromURL("test", backend.URL+"/mcp"),

From ecd6480d771e3c5593bba09555417cad1783e657 Mon Sep 17 00:00:00 2001
From: Jakub Hrozek <jakub@stacklok.com>
Date: Wed, 26 Nov 2025 22:48:42 +0000
Subject: [PATCH 4/4] Add edge case tests for composer workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds 4 new integration tests covering edge cases identified during QE analysis:

- StepTimeoutDuringRetry: Verifies timeout is respected during retry attempts
- ConditionalStepReferencingSkippedStep: Documents behavior when template
  references output from a skipped step (returns <no value>)
- OutputMissingRequiredField: Documents that Required field doesn't validate
  templated values are non-empty (returns <no value>)
- ContinueModeWithFailedDependency: Documents that dependent steps still run
  when their dependency fails in continue mode (receives <no value>)

Tests include DISCUSSION comments noting behaviors that may warrant review.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 test/integration/vmcp/composer_basic_test.go | 255 +++++++++++++++++++
 1 file changed, 255 insertions(+)

diff --git a/test/integration/vmcp/composer_basic_test.go b/test/integration/vmcp/composer_basic_test.go
index 4c84ec15e..3e5ba36be 100644
--- a/test/integration/vmcp/composer_basic_test.go
+++ b/test/integration/vmcp/composer_basic_test.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 
@@ -348,6 +349,260 @@ func TestVMCPComposer(t *testing.T) {
 			// At least one error message should appear
 			wantStrings: []string{"ERROR"},
 		},
+		// Advanced error handling and edge case tests
+		//
+		// Test: StepTimeoutDuringRetry
+		// Verifies that step timeout is respected even when retry is configured.
+		// The slow tool takes longer than the step timeout, so it should fail due
+		// to timeout rather than exhausting all retry attempts.
+		{
+			name: "StepTimeoutDuringRetry",
+			tools: []helpers.BackendTool{
+				// Tool that delays 500ms before returning - longer than the 50ms step timeout
+				helpers.NewSlowBackendTool("slow_tool", "Slow tool that times out", 500*time.Millisecond, "SUCCESS"),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "timeout_retry",
+				Description: "Tests that timeout is respected during retry attempts",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{
+						ID:        "slow_step",
+						Type:      composer.StepTypeTool,
+						Tool:      "test_slow_tool",
+						Arguments: map[string]any{},
+						// Step timeout is shorter than the tool delay
+						Timeout: 50 * time.Millisecond,
+						OnError: &composer.ErrorHandler{
+							Action:     "retry",
+							RetryCount: 3,
+							RetryDelay: 10 * time.Millisecond,
+						},
+					},
+				},
+			},
+			params:    map[string]any{},
+			wantError: true,
+			// Should fail due to timeout - the error message should indicate timeout or context deadline
+			wantStrings: []string{},
+		},
+		// Test: ConditionalStepReferencingSkippedStep
+		// Tests template expansion when a step references output from a skipped step.
+		// Step 1 has a condition that evaluates to false (gets skipped).
+		// Step 2 depends on Step 1 and tries to use its output in arguments.
+		//
+		// DISCUSSION: When a step is skipped, referencing its output via template
+		// (e.g., {{.steps.step1.output.text}}) returns "<no value>" (Go template default
+		// for missing map keys). The workflow succeeds and the dependent step runs with
+		// this placeholder value.
+		//
+		// This may or may not be the desired behavior. Alternatives to consider:
+		// - Fail the workflow when referencing skipped step output
+		// - Skip dependent steps automatically when their dependency is skipped
+		// - Provide an explicit "default" value mechanism for skipped step references
+		{
+			name: "ConditionalStepReferencingSkippedStep",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("producer", "Produces data",
+					func(_ context.Context, _ map[string]any) string {
+						return "PRODUCED_DATA"
+					}),
+				helpers.NewBackendTool("consumer", "Consumes data",
+					func(_ context.Context, args map[string]any) string {
+						data, _ := args["data"].(string)
+						// The data will be "<no value>" from the skipped step's template expansion
+						return "RECEIVED:" + data
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "skipped_ref",
+				Description: "Tests referencing output from a skipped step",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{
+						ID:        "step1",
+						Type:      composer.StepTypeTool,
+						Tool:      "test_producer",
+						Arguments: map[string]any{},
+						// Condition that always evaluates to false - step will be skipped
+						Condition: "false",
+					},
+					{
+						ID:   "step2",
+						Type: composer.StepTypeTool,
+						Tool: "test_consumer",
+						Arguments: map[string]any{
+							// Reference output from skipped step - will expand to "<no value>"
+							"data": "{{.steps.step1.output.text}}",
+						},
+						DependsOn: []string{"step1"},
+					},
+				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"result": {
+							Type:        "string",
+							Description: "Consumer result",
+							Value:       "{{.steps.step2.output.text}}",
+						},
+					},
+				},
+			},
+			params: map[string]any{},
+			// The workflow succeeds. Step2 receives "<no value>" from the skipped step's
+			// template expansion (Go template default for missing keys).
+			// Note: "<no value>" is JSON-escaped as "\u003cno value\u003e" in the response.
+			wantError:   false,
+			wantStrings: []string{"RECEIVED:", "no value"},
+		},
+		// Test: OutputMissingRequiredField
+		// Tests output construction when a required output field template references
+		// a non-existent field.
+		//
+		// NOTE: The `Required` field in OutputConfig does NOT validate that the
+		// templated value is non-empty. When a template references a non-existent
+		// field (e.g., {{.steps.step1.output.nonexistent_field}}), it expands to
+		// "<no value>" (Go template default) WITHOUT failing.
+		//
+		// DISCUSSION: This is potentially surprising behavior that may warrant a fix.
+		// Users may expect `Required` to ensure meaningful values are present.
+		// Consider:
+		// - Adding validation that required fields don't contain "<no value>"
+		// - Using Go template's "missingkey=error" option for strict mode
+		// - Documenting this behavior clearly if it's intentional
+		{
+			name: "OutputMissingRequiredField",
+			tools: []helpers.BackendTool{
+				helpers.NewBackendTool("simple_tool", "Returns simple output",
+					func(_ context.Context, _ map[string]any) string {
+						// Returns text output but not a "special_field" that the output requires
+						return "simple_result"
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "missing_required",
+				Description: "Tests output with template referencing non-existent field",
+				Parameters:  map[string]any{},
+				FailureMode: "abort",
+				Steps: []composer.WorkflowStep{
+					{
+						ID:        "step1",
+						Type:      composer.StepTypeTool,
+						Tool:      "test_simple_tool",
+						Arguments: map[string]any{},
+					},
+				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"result": {
+							Type:        "string",
+							Description: "Result field referencing non-existent step output field",
+							// Reference a field that doesn't exist in the step output
+							// This will expand to "<no value>" instead of failing
+							Value: "{{.steps.step1.output.nonexistent_field}}",
+						},
+					},
+					// Required does not validate that the value is non-empty
+					Required: []string{"result"},
+				},
+			},
+			params: map[string]any{},
+			// The workflow succeeds - Required does not enforce non-empty values.
+			// The result field will contain "<no value>" from the template expansion.
+			// Note: "<no value>" is JSON-escaped as "\u003cno value\u003e" in the response.
+			wantError:   false,
+			wantStrings: []string{"no value"},
+		},
+		// Test: ContinueModeWithFailedDependency
+		// Tests what happens in "continue" failure mode when a dependent step
+		// tries to use output from a failed step.
+		// - Step A fails with an error
+		// - Step B depends on Step A and tries to use its output
+		// - Step C is independent (no dependencies)
+		//
+		// NOTE: In "continue" mode, dependent steps still run even when their
+		// dependency failed. They receive "<no value>" placeholder for the failed
+		// step's output. Step B is NOT skipped - it executes with the placeholder.
+		//
+		// DISCUSSION: This may or may not be desired behavior. Alternative approaches:
+		// - Skip dependent steps automatically when their dependency fails
+		// - Allow steps to declare "skip_on_dependency_failure: true"
+		// - Provide a way to detect failed dependencies in conditions
+		//
+		// Current behavior means dependent steps must handle "<no value>" gracefully
+		// or the workflow may produce unexpected results.
+		{
+			name: "ContinueModeWithFailedDependency",
+			tools: []helpers.BackendTool{
+				helpers.NewErrorBackendTool("fail_first", "Always fails", "DEPENDENCY_FAILED"),
+				helpers.NewBackendTool("use_output", "Uses output from dependency",
+					func(_ context.Context, args map[string]any) string {
+						data, _ := args["data"].(string)
+						// data will be "<no value>" from the failed dependency
+						return "PROCESSED:" + data
+					}),
+				helpers.NewBackendTool("independent", "Independent step",
+					func(_ context.Context, _ map[string]any) string {
+						return "INDEPENDENT_SUCCESS"
+					}),
+			},
+			workflow: &composer.WorkflowDefinition{
+				Name:        "continue_failed_dep",
+				Description: "Tests continue mode when a step depends on a failed step",
+				Parameters:  map[string]any{},
+				FailureMode: "continue",
+				Steps: []composer.WorkflowStep{
+					{
+						ID:        "step_a",
+						Type:      composer.StepTypeTool,
+						Tool:      "test_fail_first",
+						Arguments: map[string]any{},
+					},
+					{
+						ID:   "step_b",
+						Type: composer.StepTypeTool,
+						Tool: "test_use_output",
+						Arguments: map[string]any{
+							// Reference output from failed step - will expand to "<no value>"
+							"data": "{{.steps.step_a.output.text}}",
+						},
+						DependsOn: []string{"step_a"},
+					},
+					{
+						// Independent step that succeeds regardless
+						ID:        "step_c",
+						Type:      composer.StepTypeTool,
+						Tool:      "test_independent",
+						Arguments: map[string]any{},
+					},
+				},
+				Output: &config.OutputConfig{
+					Properties: map[string]config.OutputProperty{
+						"independent_result": {
+							Type:        "string",
+							Description: "Result from independent step",
+							Value:       "{{.steps.step_c.output.text}}",
+						},
+						"dependent_result": {
+							Type:        "string",
+							Description: "Result from dependent step (runs with placeholder from failed dep)",
+							Value:       "{{.steps.step_b.output.text}}",
+							Default:     "SKIPPED_DUE_TO_DEPENDENCY",
+						},
+					},
+				},
+			},
+			params: map[string]any{},
+			// In continue mode, the workflow succeeds with partial results.
+			// Step B runs (not skipped!) with "<no value>" from failed step_a.
+			// Step C (independent) also succeeds.
+			// Both outputs are present in the result.
+			// Note: "<no value>" is JSON-escaped as "\u003cno value\u003e" in the response.
+			wantError:   false,
+			wantStrings: []string{"INDEPENDENT_SUCCESS", "PROCESSED:", "no value"},
+		},
 	}
 
 	for _, tc := range tests {