From 93e7aba12cdd02825291de425b7119582d50b269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:39:58 +0300 Subject: [PATCH 01/17] feat(otel): add genai and mcp telemetry primitives - `pkg/telemetry/genai/` provides the GenAI semantic-conventions surface: span helpers (`ChatSpan`, `EmbeddingSpan`, `FallbackSpan`, `SandboxSpan`, runtime helpers), attribute / operation-name / provider-name constants per the OTel GenAI semconv, conversation-id baggage round-trippers, error classification, content-capture gating (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`), stability gating (`OTEL_SEMCONV_STABILITY_OPT_IN`), `gen_ai.client.token.usage` and operation-duration histograms, the `gen_ai.evaluation.result` log emitter, and process-boundary helpers (`InjectSandboxEnv`, `InjectTraceContextEnv`) - `pkg/telemetry/mcp/` provides MCP-specific telemetry: `ConversationIDFromBaggage`, span starters for client / server, `params._meta` propagation carrier, attribute constants, and metrics - Test files cover content gating, stability defaults, conversation propagation, and span lifecycle invariants --- pkg/telemetry/genai/attrs.go | 105 +++++++ pkg/telemetry/genai/content.go | 203 +++++++++++++ pkg/telemetry/genai/conversation.go | 52 ++++ pkg/telemetry/genai/doc.go | 15 + pkg/telemetry/genai/embedding.go | 176 +++++++++++ pkg/telemetry/genai/errors.go | 85 ++++++ pkg/telemetry/genai/evaluation.go | 64 ++++ pkg/telemetry/genai/genai_test.go | 156 ++++++++++ pkg/telemetry/genai/metrics.go | 80 +++++ pkg/telemetry/genai/provider_names.go | 28 ++ pkg/telemetry/genai/runtime.go | 367 ++++++++++++++++++++++ pkg/telemetry/genai/sandbox.go | 231 ++++++++++++++ pkg/telemetry/genai/span.go | 418 ++++++++++++++++++++++++++ pkg/telemetry/genai/stability.go | 130 ++++++++ pkg/telemetry/genai/stability_test.go | 55 ++++ pkg/telemetry/genai/stream.go | 255 ++++++++++++++++ pkg/telemetry/mcp/attrs.go | 58 ++++ pkg/telemetry/mcp/conversation.go | 19 
++ pkg/telemetry/mcp/doc.go | 13 + pkg/telemetry/mcp/mcp_test.go | 97 ++++++ pkg/telemetry/mcp/metrics.go | 56 ++++ pkg/telemetry/mcp/propagation.go | 92 ++++++ pkg/telemetry/mcp/span.go | 247 +++++++++++++++ 23 files changed, 3002 insertions(+) create mode 100644 pkg/telemetry/genai/attrs.go create mode 100644 pkg/telemetry/genai/content.go create mode 100644 pkg/telemetry/genai/conversation.go create mode 100644 pkg/telemetry/genai/doc.go create mode 100644 pkg/telemetry/genai/embedding.go create mode 100644 pkg/telemetry/genai/errors.go create mode 100644 pkg/telemetry/genai/evaluation.go create mode 100644 pkg/telemetry/genai/genai_test.go create mode 100644 pkg/telemetry/genai/metrics.go create mode 100644 pkg/telemetry/genai/provider_names.go create mode 100644 pkg/telemetry/genai/runtime.go create mode 100644 pkg/telemetry/genai/sandbox.go create mode 100644 pkg/telemetry/genai/span.go create mode 100644 pkg/telemetry/genai/stability.go create mode 100644 pkg/telemetry/genai/stability_test.go create mode 100644 pkg/telemetry/genai/stream.go create mode 100644 pkg/telemetry/mcp/attrs.go create mode 100644 pkg/telemetry/mcp/conversation.go create mode 100644 pkg/telemetry/mcp/doc.go create mode 100644 pkg/telemetry/mcp/mcp_test.go create mode 100644 pkg/telemetry/mcp/metrics.go create mode 100644 pkg/telemetry/mcp/propagation.go create mode 100644 pkg/telemetry/mcp/span.go diff --git a/pkg/telemetry/genai/attrs.go b/pkg/telemetry/genai/attrs.go new file mode 100644 index 000000000..48e0b82f4 --- /dev/null +++ b/pkg/telemetry/genai/attrs.go @@ -0,0 +1,105 @@ +package genai + +// Attribute keys defined by the OTel GenAI semantic conventions. All are +// Development stability — declared as constants here so call sites depend +// on a stable local symbol rather than a moving upstream import path. 
+const ( + AttrOperationName = "gen_ai.operation.name" + AttrProviderName = "gen_ai.provider.name" + AttrConversationID = "gen_ai.conversation.id" + AttrOutputType = "gen_ai.output.type" + + AttrAgentName = "gen_ai.agent.name" + AttrAgentID = "gen_ai.agent.id" + AttrAgentDescription = "gen_ai.agent.description" + AttrAgentVersion = "gen_ai.agent.version" + + AttrWorkflowName = "gen_ai.workflow.name" + + AttrRequestModel = "gen_ai.request.model" + AttrRequestStream = "gen_ai.request.stream" + AttrRequestMaxTokens = "gen_ai.request.max_tokens" + AttrRequestTemperature = "gen_ai.request.temperature" + AttrRequestTopP = "gen_ai.request.top_p" + AttrRequestTopK = "gen_ai.request.top_k" + AttrRequestFrequencyPenalty = "gen_ai.request.frequency_penalty" + AttrRequestPresencePenalty = "gen_ai.request.presence_penalty" + AttrRequestStopSequences = "gen_ai.request.stop_sequences" + AttrRequestChoiceCount = "gen_ai.request.choice.count" + AttrRequestSeed = "gen_ai.request.seed" + AttrRequestEncodingFormats = "gen_ai.request.encoding_formats" + + AttrResponseModel = "gen_ai.response.model" + AttrResponseID = "gen_ai.response.id" + AttrResponseFinishReasons = "gen_ai.response.finish_reasons" + AttrResponseTimeToFirstChunk = "gen_ai.response.time_to_first_chunk" + + AttrUsageInputTokens = "gen_ai.usage.input_tokens" + AttrUsageOutputTokens = "gen_ai.usage.output_tokens" + AttrUsageCacheReadInputTokens = "gen_ai.usage.cache_read.input_tokens" + AttrUsageCacheCreationInputTokens = "gen_ai.usage.cache_creation.input_tokens" + AttrUsageReasoningOutputTokens = "gen_ai.usage.reasoning.output_tokens" + + AttrTokenType = "gen_ai.token.type" + + AttrToolName = "gen_ai.tool.name" + AttrToolCallID = "gen_ai.tool.call.id" + AttrToolType = "gen_ai.tool.type" + AttrToolDescription = "gen_ai.tool.description" + AttrToolDefinitions = "gen_ai.tool.definitions" + AttrToolCallArguments = "gen_ai.tool.call.arguments" + AttrToolCallResult = "gen_ai.tool.call.result" + + AttrInputMessages = 
"gen_ai.input.messages" + AttrOutputMessages = "gen_ai.output.messages" + AttrSystemInstructions = "gen_ai.system_instructions" + + AttrPromptName = "gen_ai.prompt.name" + + AttrDataSourceID = "gen_ai.data_source.id" + AttrEmbeddingsDimensionCount = "gen_ai.embeddings.dimension.count" + AttrRetrievalDocuments = "gen_ai.retrieval.documents" + AttrRetrievalQueryText = "gen_ai.retrieval.query.text" + + AttrEvaluationName = "gen_ai.evaluation.name" + AttrEvaluationScoreLabel = "gen_ai.evaluation.score.label" + AttrEvaluationScoreValue = "gen_ai.evaluation.score.value" + AttrEvaluationExplanation = "gen_ai.evaluation.explanation" +) + +// Operation names — values for AttrOperationName. +const ( + OperationChat = "chat" + OperationTextCompletion = "text_completion" + OperationGenerateContent = "generate_content" + OperationEmbeddings = "embeddings" + OperationCreateAgent = "create_agent" + OperationInvokeAgent = "invoke_agent" + OperationInvokeWorkflow = "invoke_workflow" + OperationExecuteTool = "execute_tool" + OperationRetrieval = "retrieval" +) + +// Token types — values for AttrTokenType when recording the token usage +// histogram. Spec defines `input` and `output`; we use the cache_read / +// cache_creation / reasoning variants to mirror the per-token-type +// usage attributes for richer breakdowns. +const ( + TokenTypeInput = "input" + TokenTypeOutput = "output" + TokenTypeCacheRead = "cache_read.input" + TokenTypeCacheCreation = "cache_creation.input" + TokenTypeReasoning = "reasoning.output" +) + +// Provider names — values for AttrProviderName. Names follow the values +// defined in the provider-specific GenAI semconv pages. 
+const ( + ProviderAnthropic = "anthropic" + ProviderOpenAI = "openai" + ProviderAWSBedrock = "aws.bedrock" + ProviderGCPVertexAI = "gcp.vertex_ai" + ProviderGCPGenAI = "gcp.gen_ai" + ProviderAzureAI = "azure.ai.inference" + ProviderDMR = "docker.dmr" +) diff --git a/pkg/telemetry/genai/content.go b/pkg/telemetry/genai/content.go new file mode 100644 index 000000000..108adfbac --- /dev/null +++ b/pkg/telemetry/genai/content.go @@ -0,0 +1,203 @@ +package genai + +import ( + "encoding/json" + "os" + "strings" + + "go.opentelemetry.io/otel/attribute" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/tools" +) + +// EnvCaptureMessageContent is the OTel-recommended environment variable +// that toggles capture of GenAI request/response content as span +// attributes. Default is off because chat history routinely contains +// PII, secrets, internal documents, and other content that should not +// be exported by default. +// +// Recognised truthy values: "true", "1", "yes", "on" (case-insensitive). +const EnvCaptureMessageContent = "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT" + +// IsContentCaptureEnabled reports whether the OTel content-capture +// opt-in is set. Read on every call so tests and feature flags can +// flip the value at runtime. +func IsContentCaptureEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv(EnvCaptureMessageContent))) { + case "true", "1", "yes", "on": + return true + default: + return false + } +} + +// messagePart matches the OTel GenAI semconv message part schema +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/non-normative/examples-llm-calls/). 
+// +// Field choice per spec: +// - "text" parts use Content +// - "uri" parts use URI (and may set MimeType / Modality) +// - "tool_call" / "tool_call_response" parts use ID, Name, Arguments, +// Result +type messagePart struct { + Type string `json:"type"` + Content string `json:"content,omitempty"` + URI string `json:"uri,omitempty"` + MimeType string `json:"mime_type,omitempty"` + Modality string `json:"modality,omitempty"` + ID string `json:"id,omitempty"` + Name string `json:"name,omitempty"` + Arguments any `json:"arguments,omitempty"` + Result any `json:"result,omitempty"` +} + +type structuredMessage struct { + Role string `json:"role"` + Parts []messagePart `json:"parts"` +} + +// SetInputMessages serialises chat history into `gen_ai.input.messages` +// per the OTel GenAI examples schema (role + parts) and attaches it to +// the span. System messages are removed from the array and emitted +// separately as `gen_ai.system_instructions` per the spec. +// +// No-op when content capture is disabled or the span is nil. +func SetInputMessages(span *ChatSpan, messages []chat.Message) { + if span == nil || !IsContentCaptureEnabled() { + return + } + + var systemInstructions []structuredMessage + var input []structuredMessage + for i := range messages { + msg := messageToStructured(&messages[i]) + if messages[i].Role == chat.MessageRoleSystem { + systemInstructions = append(systemInstructions, msg) + continue + } + input = append(input, msg) + } + + if len(systemInstructions) > 0 { + if encoded, err := json.Marshal(systemInstructions); err == nil { + span.SetAttributes(attribute.String(AttrSystemInstructions, string(encoded))) + } + } + if len(input) > 0 { + if encoded, err := json.Marshal(input); err == nil { + span.SetAttributes(attribute.String(AttrInputMessages, string(encoded))) + } + } +} + +// SetOutputMessages serialises the assembled response into +// `gen_ai.output.messages`. 
Use after streaming has completed and the +// final assistant message is known. +func SetOutputMessages(span *ChatSpan, content, reasoning string, toolCalls []tools.ToolCall) { + if span == nil || !IsContentCaptureEnabled() { + return + } + parts := []messagePart{} + if reasoning != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: reasoning}) + } + if content != "" { + parts = append(parts, messagePart{Type: "text", Content: content}) + } + for _, tc := range toolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if len(parts) == 0 { + return + } + out := []structuredMessage{{Role: "assistant", Parts: parts}} + if encoded, err := json.Marshal(out); err == nil { + span.SetAttributes(attribute.String(AttrOutputMessages, string(encoded))) + } +} + +// SetToolDefinitions serialises the tool definitions presented to the +// model into `gen_ai.tool.definitions`. +func SetToolDefinitions(span *ChatSpan, toolDefs []tools.Tool) { + if span == nil || !IsContentCaptureEnabled() || len(toolDefs) == 0 { + return + } + encoded, err := json.Marshal(toolDefs) + if err != nil { + return + } + span.SetAttributes(attribute.String(AttrToolDefinitions, string(encoded))) +} + +// messageToStructured converts a chat.Message to the spec-shaped +// structured message representation. Multi-content messages produce one +// part per content block; tool calls and tool results map to their +// respective part types. 
+func messageToStructured(m *chat.Message) structuredMessage { + role := string(m.Role) + parts := []messagePart{} + + switch { + case len(m.MultiContent) > 0: + for _, mc := range m.MultiContent { + switch mc.Type { + case chat.MessagePartTypeText: + if mc.Text != "" { + parts = append(parts, messagePart{Type: "text", Content: mc.Text}) + } + case chat.MessagePartTypeImageURL: + if mc.ImageURL != nil && mc.ImageURL.URL != "" { + parts = append(parts, messagePart{ + Type: "uri", + URI: mc.ImageURL.URL, + Modality: "image", + }) + } + case chat.MessagePartTypeFile: + if mc.File != nil { + p := messagePart{Type: "file", ID: mc.File.FileID} + if mc.File.MimeType != "" { + p.MimeType = mc.File.MimeType + } + parts = append(parts, p) + } + } + } + case m.ToolCallID != "": + // Tool result messages: the entire content is the tool's + // response payload, encoded as a single tool_call_response + // part. Skip the default text/reasoning branch so we don't + // also emit a duplicate `text` part with the same payload. 
+ default: + if m.ReasoningContent != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: m.ReasoningContent}) + } + if m.Content != "" { + parts = append(parts, messagePart{Type: "text", Content: m.Content}) + } + } + + for _, tc := range m.ToolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if m.ToolCallID != "" { + parts = append(parts, messagePart{ + Type: "tool_call_response", + ID: m.ToolCallID, + Content: m.Content, + }) + } + + return structuredMessage{Role: role, Parts: parts} +} diff --git a/pkg/telemetry/genai/conversation.go b/pkg/telemetry/genai/conversation.go new file mode 100644 index 000000000..06b0edf4d --- /dev/null +++ b/pkg/telemetry/genai/conversation.go @@ -0,0 +1,52 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/baggage" +) + +// baggageKeyConversationID matches the GenAI semconv attribute key for +// the conversation identifier so the value flows transparently through +// the W3C `baggage` header alongside `traceparent`. Any downstream +// service or subprocess running OTel auto-instrumentation will pick it +// up without per-helper plumbing. +const baggageKeyConversationID = "gen_ai.conversation.id" + +// WithConversationID returns a context that carries the conversation id +// in OTel baggage. Spans created later in the chain — including ones in +// helper packages that have no direct access to the session — read it +// via ConversationIDFromContext and attach `gen_ai.conversation.id` +// automatically. Empty id is a no-op. 
+func WithConversationID(ctx context.Context, id string) context.Context { + if id == "" { + return ctx + } + member, err := baggage.NewMember(baggageKeyConversationID, id) + if err != nil { + return ctx + } + bag, err := baggage.FromContext(ctx).SetMember(member) + if err != nil { + return ctx + } + return baggage.ContextWithBaggage(ctx, bag) +} + +// ConversationIDFromContext returns the conversation id stored in the +// context's baggage, or "" when none has been seeded. +func ConversationIDFromContext(ctx context.Context) string { + return baggage.FromContext(ctx).Member(baggageKeyConversationID).Value() +} + +// conversationAttribute returns the gen_ai.conversation.id attribute +// from baggage when present, or zero-value KeyValue when absent. Helper +// for span starters so they can append it in one line. +func conversationAttribute(ctx context.Context) (attribute.KeyValue, bool) { + id := ConversationIDFromContext(ctx) + if id == "" { + return attribute.KeyValue{}, false + } + return attribute.String(AttrConversationID, id), true +} diff --git a/pkg/telemetry/genai/doc.go b/pkg/telemetry/genai/doc.go new file mode 100644 index 000000000..61bf90dd0 --- /dev/null +++ b/pkg/telemetry/genai/doc.go @@ -0,0 +1,15 @@ +// Package genai provides OpenTelemetry instrumentation helpers that follow +// the GenAI semantic conventions +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/). +// +// The package is structured so that callers — provider clients, the agent +// runtime, MCP clients — describe what they are doing in domain terms and +// the helpers produce the spec-conformant spans, metrics, and log records. +// Centralising the OTel surface here lets us upgrade the semantic +// conventions in one place and keeps the call sites compact. +// +// All gen_ai.* attributes are Development stability per the spec. 
Attribute +// keys are declared as constants in this package rather than imported from +// go.opentelemetry.io/otel/semconv to insulate callers from the upstream +// reorganisations the GenAI conventions are still going through. +package genai diff --git a/pkg/telemetry/genai/embedding.go b/pkg/telemetry/genai/embedding.go new file mode 100644 index 000000000..a83ad752e --- /dev/null +++ b/pkg/telemetry/genai/embedding.go @@ -0,0 +1,176 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// EmbeddingRequest carries the inputs needed to start an +// `embeddings {model}` span per the OTel GenAI semantic conventions. +type EmbeddingRequest struct { + Provider string + Model string + // BatchSize is the number of input texts in the embedding call, + // recorded as `cagent.embeddings.batch_size`. Zero means + // single-input. + BatchSize int + // EncodingFormats is the optional list of requested output + // encodings (e.g. "float", "base64") per the GenAI semconv. + // Recorded as `gen_ai.request.encoding_formats` when non-empty. + EncodingFormats []string +} + +// EmbeddingSpan handles the lifecycle of an embedding span and the +// matching `gen_ai.client.operation.duration` / `gen_ai.client.token.usage` +// metric records. +type EmbeddingSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + inputTokens int64 + dimensions int + errType string +} + +// StartEmbedding begins a CLIENT-kind `embeddings {model}` span and +// records the spec-required `gen_ai.operation.name=embeddings`, +// `gen_ai.provider.name`, and `gen_ai.request.model` attributes. 
+func StartEmbedding(ctx context.Context, req EmbeddingRequest) (context.Context, *EmbeddingSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationEmbeddings + if req.Model != "" { + name = OperationEmbeddings + " " + req.Model + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, req.Provider), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.BatchSize > 1 { + attrs = append(attrs, attribute.Int("cagent.embeddings.batch_size", req.BatchSize)) + } + if len(req.EncodingFormats) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestEncodingFormats, req.EncodingFormats)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + return ctx, &EmbeddingSpan{ + span: span, + provider: req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetInputTokens records the number of input tokens consumed by the +// embedding call. Emitted as `gen_ai.usage.input_tokens` on the span +// and as the `gen_ai.client.token.usage` metric at End time. +func (s *EmbeddingSpan) SetInputTokens(n int64) { + if s == nil { + return + } + s.mu.Lock() + s.inputTokens = n + s.mu.Unlock() +} + +// SetDimensions records the dimensionality of the resulting embedding +// vector(s). Emitted as `gen_ai.embeddings.dimension.count`. +func (s *EmbeddingSpan) SetDimensions(d int) { + if s == nil { + return + } + s.mu.Lock() + s.dimensions = d + s.mu.Unlock() +} + +// RecordError marks the span as failed and stores `error.type` for the +// duration metric. 
+func (s *EmbeddingSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the duration + token-usage metrics. +func (s *EmbeddingSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + inputTokens := s.inputTokens + dimensions := s.dimensions + errType := s.errType + s.mu.Unlock() + + if inputTokens > 0 { + s.span.SetAttributes(attribute.Int64(AttrUsageInputTokens, inputTokens)) + } + if dimensions > 0 { + s.span.SetAttributes(attribute.Int(AttrEmbeddingsDimensionCount, dimensions)) + } + s.span.End() + + insts := getInstruments() + if insts == nil { + return + } + commonAttrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, s.provider), + } + if s.model != "" { + commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model)) + } + durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + if errType != "" { + durationAttrs = append(durationAttrs, attribute.String("error.type", errType)) + } + if insts.clientOperationDuration != nil { + insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(durationAttrs...), + ) + } + if inputTokens > 0 && insts.clientTokenUsage != nil { + tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...) 
+ tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, TokenTypeInput)) + insts.clientTokenUsage.Record(s.metricCtx, inputTokens, + metric.WithAttributes(tokenAttrs...), + ) + } +} diff --git a/pkg/telemetry/genai/errors.go b/pkg/telemetry/genai/errors.go new file mode 100644 index 000000000..8d1f7db18 --- /dev/null +++ b/pkg/telemetry/genai/errors.go @@ -0,0 +1,85 @@ +package genai + +import ( + "context" + "errors" + "net" + "strings" + + "go.opentelemetry.io/otel/attribute" +) + +// ErrorTypeOther is the OTel-mandated fallback for `error.type` when no +// classifier matches. The spec requires `_OTHER` rather than a Go type +// name so backends can rely on a bounded cardinality. +const ErrorTypeOther = "_OTHER" + +// ClassifyError maps a provider error to a low-cardinality `error.type` +// value suitable for span and metric attributes. Falls back to +// `_OTHER` (the spec-defined sentinel) when the error does not match any +// known pattern. +// +// Spec leaves the value space open for callers — these strings are picked +// for cross-provider comparability on dashboards. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "context length") || strings.Contains(msg, "context_length"): + // Bare "max_tokens" matches too eagerly: validation errors + // like `max_tokens must be > 0` and "model X does not + // support max_tokens" both contain the token but are not + // context overflows. Stick to the unambiguous phrases. 
+ return "context_length_exceeded" + case strings.Contains(msg, "rate limit") || strings.Contains(msg, "429"): + return "rate_limit" + case strings.Contains(msg, "401") || strings.Contains(msg, "unauthorized") || strings.Contains(msg, "authentication"): + return "auth" + case strings.Contains(msg, "403") || strings.Contains(msg, "forbidden") || strings.Contains(msg, "permission"): + return "forbidden" + case strings.Contains(msg, "content policy") || strings.Contains(msg, "content filter") || strings.Contains(msg, "safety"): + return "content_policy" + } + + var netErr net.Error + if errors.As(err, &netErr) { + if netErr.Timeout() { + return "network_timeout" + } + return "network" + } + + return ErrorTypeOther +} + +// applyExtraAttribute converts a StreamAttributer KeyValue into an OTel +// attribute and applies it to the span. Unsupported value types are +// dropped silently — telemetry must never crash request paths. +func applyExtraAttribute(span *ChatSpan, kv KeyValue) { + if span == nil || kv.Key == "" { + return + } + switch v := kv.Value.(type) { + case string: + span.SetAttributes(attribute.String(kv.Key, v)) + case bool: + span.SetAttributes(attribute.Bool(kv.Key, v)) + case int: + span.SetAttributes(attribute.Int(kv.Key, v)) + case int64: + span.SetAttributes(attribute.Int64(kv.Key, v)) + case float64: + span.SetAttributes(attribute.Float64(kv.Key, v)) + case []string: + span.SetAttributes(attribute.StringSlice(kv.Key, v)) + } +} diff --git a/pkg/telemetry/genai/evaluation.go b/pkg/telemetry/genai/evaluation.go new file mode 100644 index 000000000..4d1673efa --- /dev/null +++ b/pkg/telemetry/genai/evaluation.go @@ -0,0 +1,64 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/log" + "go.opentelemetry.io/otel/log/global" +) + +// EvaluationResult describes one evaluation outcome that should be emitted +// as a `gen_ai.evaluation.result` log record per the OTel GenAI semconv +// 
(https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/). +type EvaluationResult struct { + // Name is the evaluation metric — e.g. "relevance", "factuality", + // "tool_calls_f1". Required. + Name string + + // ScoreLabel is the human-readable verdict — e.g. "passed", + // "failed", "satisfactory". Optional but commonly set. + ScoreLabel string + + // ScoreValue is the numeric score (commonly 0.0–1.0). Optional. + ScoreValue float64 + HasScoreValue bool + + // Explanation is a free-form reason for the score. Optional. + Explanation string + + // ErrorType is set when the evaluation itself failed (e.g. the + // judge model errored out). Mirrors the spec's `error.type` field. + ErrorType string +} + +// EmitEvaluationResult emits a `gen_ai.evaluation.result` log record. The +// record links to the active span via the supplied context so dashboards +// can join evaluation outcomes back onto the operation that produced +// them. No-op when no logger provider is configured. +func EmitEvaluationResult(ctx context.Context, result EvaluationResult) { + logger := global.GetLoggerProvider().Logger(instrumentationName) + + var rec log.Record + rec.SetEventName("gen_ai.evaluation.result") + rec.SetSeverity(log.SeverityInfo) + rec.SetSeverityText("INFO") + + rec.AddAttributes(log.String(AttrEvaluationName, result.Name)) + if result.ScoreLabel != "" { + rec.AddAttributes(log.String(AttrEvaluationScoreLabel, result.ScoreLabel)) + } + if result.HasScoreValue { + rec.AddAttributes(log.Float64(AttrEvaluationScoreValue, result.ScoreValue)) + } + if result.Explanation != "" { + rec.AddAttributes(log.String(AttrEvaluationExplanation, result.Explanation)) + } + if result.ErrorType != "" { + rec.AddAttributes(log.String("error.type", result.ErrorType)) + } + if convID := ConversationIDFromContext(ctx); convID != "" { + rec.AddAttributes(log.String(AttrConversationID, convID)) + } + + logger.Emit(ctx, rec) +} diff --git a/pkg/telemetry/genai/genai_test.go 
b/pkg/telemetry/genai/genai_test.go new file mode 100644 index 000000000..692d41212 --- /dev/null +++ b/pkg/telemetry/genai/genai_test.go @@ -0,0 +1,156 @@ +package genai + +import ( + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/chat" +) + +func TestProviderNameForConfig(t *testing.T) { + t.Parallel() + tests := []struct { + in string + want string + }{ + {"openai", ProviderOpenAI}, + {"openai_chatcompletions", ProviderOpenAI}, + {"openai_responses", ProviderOpenAI}, + {"anthropic", ProviderAnthropic}, + {"amazon-bedrock", ProviderAWSBedrock}, + {"google", ProviderGCPGenAI}, + {"vertexai", ProviderGCPVertexAI}, + {"azure", ProviderAzureAI}, + {"dmr", ProviderDMR}, + {"custom-provider", "custom-provider"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ProviderNameForConfig(tt.in)) + }) + } +} + +func TestClassifyError(t *testing.T) { + t.Parallel() + tests := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"context canceled", context.Canceled, "context_canceled"}, + {"context deadline", context.DeadlineExceeded, "deadline_exceeded"}, + {"rate limit", errors.New("HTTP 429 Too Many Requests"), "rate_limit"}, + {"context length", errors.New("context_length_exceeded: prompt too large"), "context_length_exceeded"}, + {"unauthorized", errors.New("HTTP 401 Unauthorized"), "auth"}, + {"forbidden", errors.New("HTTP 403 Forbidden"), "forbidden"}, + {"content policy", errors.New("response blocked by content filter"), "content_policy"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ClassifyError(tt.err)) + }) + } +} + +// fakeStream produces a fixed sequence of chunks then EOF. 
+type fakeStream struct { + chunks []chat.MessageStreamResponse + idx int + closed bool +} + +func (f *fakeStream) Recv() (chat.MessageStreamResponse, error) { + if f.idx >= len(f.chunks) { + return chat.MessageStreamResponse{}, io.EOF + } + r := f.chunks[f.idx] + f.idx++ + return r, nil +} + +func (f *fakeStream) Close() { f.closed = true } + +func TestStartChatAndWrapStream(t *testing.T) { + t.Parallel() + + stream := &fakeStream{ + chunks: []chat.MessageStreamResponse{ + { + ID: "resp-1", + Model: "claude-sonnet-4", + Choices: []chat.MessageStreamChoice{ + {Delta: chat.MessageDelta{Content: "hello"}}, + }, + }, + { + Choices: []chat.MessageStreamChoice{ + {FinishReason: chat.FinishReasonStop}, + }, + Usage: &chat.Usage{ + InputTokens: 100, + OutputTokens: 50, + CachedInputTokens: 20, + CacheWriteTokens: 10, + }, + }, + }, + } + + ctx, span := StartChat(t.Context(), ChatRequest{ + Provider: ProviderAnthropic, + Model: "claude-sonnet-4", + Stream: true, + MaxTokens: 4096, + }) + require.NotNil(t, span) + require.NotNil(t, ctx) + + wrapped := WrapStream(span, stream) + + // Drain the stream. + for { + resp, err := wrapped.Recv() + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + _ = resp + } + wrapped.Close() + assert.True(t, stream.closed) + + // Re-closing should be a no-op (the wrapper guards against + // double-Close, which would otherwise emit two End() calls). 
+ wrapped.Close() +} + +func TestWrapStreamNilSpanReturnsOriginal(t *testing.T) { + t.Parallel() + s := &fakeStream{} + got := WrapStream(nil, s) + assert.Same(t, s, got) +} + +func TestServerAddressFromURL(t *testing.T) { + t.Parallel() + host, port := ServerAddressFromURL("https://api.anthropic.com:443/v1/messages") + assert.Equal(t, "api.anthropic.com", host) + assert.Equal(t, 443, port) + + host, port = ServerAddressFromURL("https://api.openai.com/v1/chat/completions") + assert.Equal(t, "api.openai.com", host) + assert.Equal(t, 0, port) + + host, port = ServerAddressFromURL("") + assert.Empty(t, host) + assert.Equal(t, 0, port) +} diff --git a/pkg/telemetry/genai/metrics.go b/pkg/telemetry/genai/metrics.go new file mode 100644 index 000000000..01f8d90f8 --- /dev/null +++ b/pkg/telemetry/genai/metrics.go @@ -0,0 +1,80 @@ +package genai + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// instrumentationName identifies this package as the OTel instrumentation +// scope for spans, metrics, and log records it produces. +const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/genai" + +// metricBucketsDuration matches the spec for `gen_ai.client.operation.duration` +// (and related per-chunk timing histograms). +var metricBucketsDuration = []float64{ + 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92, +} + +// metricBucketsTokenUsage matches the spec for `gen_ai.client.token.usage`. +var metricBucketsTokenUsage = []float64{ + 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, +} + +// instruments holds the lazily-initialised metric instruments. Resolved on +// first use because the global MeterProvider is set at SDK init time, which +// may run after package-level var initialisation in some contexts. 
+type instruments struct { + clientOperationDuration metric.Float64Histogram + clientOperationTTFC metric.Float64Histogram + clientOperationTimePerChunk metric.Float64Histogram + clientTokenUsage metric.Int64Histogram +} + +var ( + instOnce sync.Once + inst *instruments +) + +// getInstruments resolves and caches the package-level meter instruments. +// Histogram creation rarely fails in practice; when one does we keep the +// instruments that did succeed and leave the failed one nil. Call sites +// already nil-check each instrument, so a partial set is functional — +// previously a single early return left every metric permanently +// disabled, which surprised production debugging when one bucket +// configuration tripped a registration error. +func getInstruments() *instruments { + instOnce.Do(func() { + meter := otel.Meter(instrumentationName) + i := &instruments{} + + i.clientOperationDuration, _ = meter.Float64Histogram( + "gen_ai.client.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("GenAI operation duration."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTTFC, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_to_first_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time to receive the first chunk of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTimePerChunk, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_per_output_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time between consecutive output chunks of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientTokenUsage, _ = meter.Int64Histogram( + "gen_ai.client.token.usage", + metric.WithUnit("{token}"), + metric.WithDescription("Number of tokens used in a GenAI client request, broken down by token type."), + metric.WithExplicitBucketBoundaries(metricBucketsTokenUsage...), + 
) + + inst = i + }) + return inst +} diff --git a/pkg/telemetry/genai/provider_names.go b/pkg/telemetry/genai/provider_names.go new file mode 100644 index 000000000..8076583f4 --- /dev/null +++ b/pkg/telemetry/genai/provider_names.go @@ -0,0 +1,28 @@ +package genai + +// ProviderNameForConfig maps the project's internal provider type strings +// (the values used in agent YAML and resolved by +// pkg/model/provider.resolveProviderType) to the GenAI semconv provider +// names defined in the per-provider semantic conventions. Unknown +// providers fall through unchanged so dashboards still receive a value +// rather than empty string. +func ProviderNameForConfig(internalName string) string { + switch internalName { + case "openai", "openai_chatcompletions", "openai_responses": + return ProviderOpenAI + case "anthropic": + return ProviderAnthropic + case "amazon-bedrock": + return ProviderAWSBedrock + case "google": + return ProviderGCPGenAI + case "vertexai", "google-vertex": + return ProviderGCPVertexAI + case "azure", "azure-openai": + return ProviderAzureAI + case "dmr": + return ProviderDMR + default: + return internalName + } +} diff --git a/pkg/telemetry/genai/runtime.go b/pkg/telemetry/genai/runtime.go new file mode 100644 index 000000000..628b21c1c --- /dev/null +++ b/pkg/telemetry/genai/runtime.go @@ -0,0 +1,367 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Custom (non-spec) attribute keys for runtime-side observability that has +// no GenAI semconv equivalent yet (fallback chain, response cache, +// approval pipeline). Kept under the `cagent.` namespace so they are +// clearly distinguishable from the spec-defined `gen_ai.*` and `mcp.*` +// attributes when scrolling through a span. 
// Fallback-chain, cache, and sandbox attribute keys. These are the
// `cagent.*` extensions described in the package comment above — no
// GenAI semconv equivalent exists for them yet.
const (
	AttrFallbackPrimaryModel = "cagent.fallback.primary_model"
	AttrFallbackFinalModel   = "cagent.fallback.final_model"
	AttrFallbackAttempts     = "cagent.fallback.attempts"
	AttrFallbackOutcome      = "cagent.fallback.outcome"
	AttrFallbackInCooldown   = "cagent.fallback.in_cooldown"

	AttrCacheHit     = "cagent.cache.hit"
	AttrCacheBacking = "cagent.cache.backing"

	AttrAgentNameRuntime = "cagent.agent.name"

	AttrRetrievalResultCount = "cagent.retrieval.result_count"

	AttrSandboxRuntime   = "cagent.sandbox.runtime"
	AttrSandboxImage     = "cagent.sandbox.image"
	AttrSandboxContainer = "cagent.sandbox.container"
	AttrSandboxExitCode  = "cagent.sandbox.exit_code"
)

// FallbackOutcome values for AttrFallbackOutcome.
const (
	FallbackOutcomeSuccess         = "success"
	FallbackOutcomeFailed          = "failed"
	FallbackOutcomeContextCanceled = "context_canceled"
)

// FallbackSpan is the handle for an in-flight runtime.fallback span.
type FallbackSpan struct {
	span      trace.Span // underlying OTel span; closed exactly once by End
	startedAt time.Time  // creation time; informational — End emits no duration metric in this file

	mu       sync.Mutex // guards every field below
	attempts int        // total (model × retry) iterations; see IncrementAttempt
	final    string     // model that ultimately served the response; empty on full failure
	outcome  string     // one of the FallbackOutcome* constants
	errType  string     // low-cardinality classification cached by RecordError
	ended    bool       // set by End so repeat End calls are no-ops
}

// StartFallback begins a runtime.fallback span covering the whole fallback
// chain for one agent turn. Each per-model attempt produces its own
// `chat {model}` CLIENT child span (created by the provider decorator).
// Attributes set up front: primary model name, agent name, in-cooldown
// flag. The caller updates final model / attempts / outcome through the
// returned handle and calls End to flush.
+func StartFallback(ctx context.Context, agentName, primaryModel string, inCooldown bool) (context.Context, *FallbackSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{ + attribute.String(AttrAgentNameRuntime, agentName), + attribute.Bool(AttrFallbackInCooldown, inCooldown), + } + if primaryModel != "" { + attrs = append(attrs, attribute.String(AttrFallbackPrimaryModel, primaryModel)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "runtime.fallback", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &FallbackSpan{ + span: span, + startedAt: time.Now(), + } +} + +// IncrementAttempt counts one attempt against the chain. Called once per +// (model × retry) iteration so the final span carries the total count. +func (s *FallbackSpan) IncrementAttempt() { + if s == nil { + return + } + s.mu.Lock() + s.attempts++ + s.mu.Unlock() +} + +// SetFinalModel records the model that ultimately served the response. +// Called on the success path; not called on full-failure paths so the +// attribute remains absent and dashboards can distinguish the cases. +func (s *FallbackSpan) SetFinalModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.final = model + s.mu.Unlock() +} + +// RecordError stores an error and an error.type label for the metric. +func (s *FallbackSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// SetOutcome records the terminal outcome of the chain. Use one of the +// FallbackOutcome* constants. 
+func (s *FallbackSpan) SetOutcome(outcome string) { + if s == nil || outcome == "" { + return + } + s.mu.Lock() + s.outcome = outcome + s.mu.Unlock() +} + +// End closes the span and flushes accumulated attributes. +func (s *FallbackSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + final := s.final + outcome := s.outcome + attempts := s.attempts + s.mu.Unlock() + + if final != "" { + s.span.SetAttributes(attribute.String(AttrFallbackFinalModel, final)) + } + if outcome != "" { + s.span.SetAttributes(attribute.String(AttrFallbackOutcome, outcome)) + } + s.span.SetAttributes(attribute.Int(AttrFallbackAttempts, attempts)) + s.span.End() +} + +// RetrievalSpan handles a retrieval-operation span lifecycle. +type RetrievalSpan struct { + span trace.Span + startedAt time.Time + + mu sync.Mutex + resultCount int + errType string + ended bool +} + +// StartRetrieval begins a `retrieval {data_source.id}` span per the OTel +// GenAI semconv. providerName identifies the retrieval backend +// ("sqlite", "rag", an embedding-provider name) and is Required by the +// spec for retrieval operations. dataSourceID identifies the corpus / +// index / collection being queried; queryText is captured only when +// the caller has confirmed the content-capture opt-in. 
+func StartRetrieval(ctx context.Context, providerName, dataSourceID string, captureQuery bool, queryText string) (context.Context, *RetrievalSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationRetrieval + if dataSourceID != "" { + name = OperationRetrieval + " " + dataSourceID + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationRetrieval), + } + if providerName != "" { + attrs = append(attrs, attribute.String(AttrProviderName, providerName)) + } + if dataSourceID != "" { + attrs = append(attrs, attribute.String(AttrDataSourceID, dataSourceID)) + } + if captureQuery && queryText != "" { + attrs = append(attrs, attribute.String(AttrRetrievalQueryText, queryText)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &RetrievalSpan{span: span, startedAt: time.Now()} +} + +// SetAttributes adds extra attributes to the retrieval span. Use for +// retrieval-specific extensions (corpus filter, category, fusion mode, +// etc.) that don't have a dedicated setter. +func (s *RetrievalSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResultCount records how many documents the retrieval returned. +func (s *RetrievalSpan) SetResultCount(n int) { + if s == nil { + return + } + s.mu.Lock() + s.resultCount = n + s.mu.Unlock() +} + +// RecordError marks the retrieval span as failed. 
+func (s *RetrievalSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the retrieval span and flushes the result count. +func (s *RetrievalSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + count := s.resultCount + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrRetrievalResultCount, count)) + s.span.End() +} + +// CacheRequest counter — records every cache lookup with `result=hit|miss` +// and a `backing` attribute for memory-only vs file-backed caches. +var ( + cacheCounterOnce sync.Once + cacheCounter metric.Int64Counter +) + +func getCacheCounter() metric.Int64Counter { + cacheCounterOnce.Do(func() { + meter := otel.Meter(instrumentationName) + c, err := meter.Int64Counter( + "cagent.cache.requests", + metric.WithUnit("{request}"), + metric.WithDescription("Number of response-cache lookups, broken down by hit/miss."), + ) + if err != nil { + return + } + cacheCounter = c + }) + return cacheCounter +} + +// RecordCacheLookup increments the cache counter and returns a small span +// describing the lookup. Callers `defer span.End()` and the helper sets +// `cagent.cache.hit` from the value returned by SetHit. +func RecordCacheLookup(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.lookup", "lookup", backing) +} + +// RecordCacheStore is the Store-side counterpart of RecordCacheLookup. 
+func RecordCacheStore(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.store", "store", backing) +} + +func startCacheSpan(ctx context.Context, spanName, op, backing string) (context.Context, *CacheSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if backing != "" { + attrs = append(attrs, attribute.String(AttrCacheBacking, backing)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &CacheSpan{span: span, metricCtx: ctx, backing: backing, op: op} +} + +// CacheSpan handles cache-operation span lifecycle. +type CacheSpan struct { + span trace.Span + // metricCtx carries the active span context so counter Add calls + // produce span-context exemplars (drill Mimir bucket → Tempo + // trace). Without this the counter measurement gets only the + // resource attributes. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + backing string + op string + + mu sync.Mutex + hit bool + set bool +} + +// SetHit records whether the lookup found an entry. Increments the +// cache counter immediately so the metric reflects the result even if End +// is called late. 
+func (s *CacheSpan) SetHit(hit bool) {
+	if s == nil {
+		return
+	}
+	s.mu.Lock()
+	s.hit = hit
+	// NOTE(review): `set` is written here but nothing in this file reads
+	// it back — either surface it (e.g. in End) or drop the field.
+	s.set = true
+	s.mu.Unlock()
+	s.span.SetAttributes(attribute.Bool(AttrCacheHit, hit))
+
+	if c := getCacheCounter(); c != nil {
+		result := "miss"
+		if hit {
+			result = "hit"
+		}
+		attrs := []attribute.KeyValue{
+			attribute.String("result", result),
+			attribute.String("operation", s.op),
+		}
+		if s.backing != "" {
+			attrs = append(attrs, attribute.String(AttrCacheBacking, s.backing))
+		}
+		// Use the active context so the counter measurement carries
+		// the span exemplar — drill from Mimir bucket → Tempo trace
+		// works for cache operations the same way it does for chat.
+		c.Add(s.metricCtx, 1, metric.WithAttributes(attrs...))
+	}
+}
+
+// End closes the cache span.
+//
+// NOTE(review): unlike FallbackSpan / RetrievalSpan / SandboxSpan /
+// ChatSpan there is no `ended` guard here, so a second End calls
+// span.End() again. The OTel SDK is specified to ignore repeat End
+// calls, so this is a consistency nit rather than a bug — but confirm
+// before relying on double-End being safe.
+func (s *CacheSpan) End() {
+	if s == nil {
+		return
+	}
+	s.span.End()
+}
diff --git a/pkg/telemetry/genai/sandbox.go b/pkg/telemetry/genai/sandbox.go
new file mode 100644
index 000000000..4b97d7fc0
--- /dev/null
+++ b/pkg/telemetry/genai/sandbox.go
@@ -0,0 +1,231 @@
+package genai
+
+import (
+	"context"
+	"strings"
+	"sync"
+	"time"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/trace"
+)
+
+// envCarrier adapts an env-var key/value map to OTel's TextMapCarrier so
+// the configured propagator can write traceparent / tracestate / baggage
+// into a subprocess's environment. Keys are uppercased on Set to match
+// the convention subprocess-propagation tools (otel-cli, OTel SDKs)
+// expect.
+type envCarrier map[string]string + +func (c envCarrier) Get(key string) string { return c[strings.ToUpper(key)] } +func (c envCarrier) Set(key, value string) { c[strings.ToUpper(key)] = value } +func (c envCarrier) Keys() []string { + keys := make([]string, 0, len(c)) + for k := range c { + keys = append(keys, k) + } + return keys +} + +var _ propagation.TextMapCarrier = envCarrier{} + +// InjectSandboxEnv returns docker-style `-e KEY=VALUE` flags carrying the +// W3C trace context for the current span so the agent process spawned +// inside a sandbox container inherits the parent trace. Anything OTel- +// aware running in the container — another agent, an HTTP client with +// otelhttp transport, otel-cli — auto-parents its spans onto the active +// CLIENT span on the host side. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. +func InjectSandboxEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + flags := make([]string, 0, 2*len(carrier)) + for k, v := range carrier { + flags = append(flags, "-e", k+"="+v) + } + return flags +} + +// InjectTraceContextEnv returns `KEY=VALUE` env-var strings carrying the +// W3C trace context for the current span. Use to extend `exec.Cmd.Env` +// for direct subprocess spawns (hook scripts, LSP servers) so OTel-aware +// children chain onto the active span. Companion to `InjectSandboxEnv`, +// which formats for `docker -e`. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. 
+func InjectTraceContextEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + out := make([]string, 0, len(carrier)) + for k, v := range carrier { + out = append(out, k+"="+v) + } + return out +} + +// SandboxSpan handles the lifecycle of a sandbox.exec span and the +// matching sandbox.exec.duration histogram. Use to wrap the actual +// `docker sandbox exec` (or equivalent) subprocess invocation so the +// host side has timing, exit code, runtime kind, and image information +// alongside the inherited child trace from inside the sandbox. +type SandboxSpan struct { + span trace.Span + // metricCtx carries the active span context so histogram Record + // calls produce span-context exemplars (drill Mimir → Tempo). + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + startedAt time.Time + runtime string + + mu sync.Mutex + exitCode int + hasExit bool + errType string + ended bool +} + +// SandboxOptions configures the attributes set on a sandbox.exec span at +// creation time. All fields are optional except Runtime. +type SandboxOptions struct { + // Runtime is a short label identifying the sandbox backend (e.g. + // `"docker"`). Recorded as `cagent.sandbox.runtime` and used as a + // histogram label, so callers should keep the set of values small + // and stable. + Runtime string + + // Image is the container/pod image when known. + Image string + + // Container is the container/pod identifier when known. + Container string + + // AgentName is the agent being executed in the sandbox. + AgentName string +} + +// StartSandboxExec opens a `sandbox.exec` INTERNAL span. Runtime kind is +// set up front; exit code and error info attach via the returned handle. 
+func StartSandboxExec(ctx context.Context, opts SandboxOptions) (context.Context, *SandboxSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if opts.Runtime != "" { + attrs = append(attrs, attribute.String(AttrSandboxRuntime, opts.Runtime)) + } + if opts.Image != "" { + attrs = append(attrs, attribute.String(AttrSandboxImage, opts.Image)) + } + if opts.Container != "" { + attrs = append(attrs, attribute.String(AttrSandboxContainer, opts.Container)) + } + if opts.AgentName != "" { + attrs = append(attrs, attribute.String(AttrAgentNameRuntime, opts.AgentName)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "sandbox.exec", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &SandboxSpan{span: span, metricCtx: ctx, startedAt: time.Now(), runtime: opts.Runtime} +} + +// SetExitCode records the subprocess exit code as +// `cagent.sandbox.exit_code`. Set zero on success. +func (s *SandboxSpan) SetExitCode(code int) { + if s == nil { + return + } + s.mu.Lock() + s.exitCode = code + s.hasExit = true + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrSandboxExitCode, code)) +} + +// RecordError marks the span as failed. +func (s *SandboxSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the sandbox.exec.duration histogram. 
+func (s *SandboxSpan) End() {
+	if s == nil {
+		return
+	}
+	s.mu.Lock()
+	if s.ended {
+		s.mu.Unlock()
+		return
+	}
+	s.ended = true
+	errType := s.errType
+	s.mu.Unlock()
+
+	// The span closes before the histogram is recorded, so the measured
+	// duration runs a hair past the span's own end timestamp —
+	// negligible, but worth knowing when comparing the two.
+	s.span.End()
+
+	hist := getSandboxDurationHistogram()
+	if hist == nil {
+		return
+	}
+	attrs := []attribute.KeyValue{}
+	if s.runtime != "" {
+		// Partitions the histogram by sandbox backend so dashboards
+		// can compare exec latency across runtimes when more than
+		// one is wired up.
+		attrs = append(attrs, attribute.String(AttrSandboxRuntime, s.runtime))
+	}
+	if errType != "" {
+		attrs = append(attrs, attribute.String("error.type", errType))
+	}
+	// Use the active context so the histogram measurement carries the
+	// span exemplar — drill from Mimir bucket → Tempo trace.
+	hist.Record(s.metricCtx, time.Since(s.startedAt).Seconds(),
+		metric.WithAttributes(attrs...),
+	)
+}
+
+// Lazily-initialised sandbox exec-duration histogram; resolved on first
+// use because the global MeterProvider is set at SDK init time.
+var (
+	sandboxDurationOnce sync.Once
+	sandboxDurationHist metric.Float64Histogram
+)
+
+func getSandboxDurationHistogram() metric.Float64Histogram {
+	sandboxDurationOnce.Do(func() {
+		meter := otel.Meter(instrumentationName)
+		h, err := meter.Float64Histogram(
+			"cagent.sandbox.exec.duration",
+			metric.WithUnit("s"),
+			metric.WithDescription("Time the host side spent waiting for a sandbox exec invocation to complete."),
+			metric.WithExplicitBucketBoundaries(metricBucketsDuration...),
+		)
+		if err != nil {
+			return
+		}
+		sandboxDurationHist = h
+	})
+	return sandboxDurationHist
+}
diff --git a/pkg/telemetry/genai/span.go b/pkg/telemetry/genai/span.go
new file mode 100644
index 000000000..9b0542973
--- /dev/null
+++ b/pkg/telemetry/genai/span.go
@@ -0,0 +1,418 @@
+package genai
+
+import (
+	"context"
+	"net/url"
+	"slices"
+	"strconv"
+	"sync"
+	"time"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/metric"
+	"go.opentelemetry.io/otel/trace"
+	tracenoop "go.opentelemetry.io/otel/trace/noop"
+)
+
+// ChatRequest carries the
// ChatRequest carries the inputs needed to start a `chat {model}` span
// and to record the matching client metrics. Provider-specific
// extensions (openai service tier, aws.bedrock guardrail, etc.) attach
// via ChatSpan.SetAttributes after the span has started.
type ChatRequest struct {
	// Provider is the GenAI provider name. Use one of the Provider*
	// constants. Set on the span at creation time per the per-provider
	// semconv MUST clauses.
	Provider string

	// Model is the requested model identifier. Empty model is allowed
	// (some routers do not commit until inside the call) but produces a
	// span name of just "chat".
	Model string

	// Stream is true if the request is streaming. Recorded as
	// gen_ai.request.stream.
	Stream bool

	// ServerAddress / ServerPort identify the GenAI endpoint when known
	// (helpful for routing-aware dashboards). Optional.
	ServerAddress string
	ServerPort    int

	// Sampling parameters. Zero values are treated as unset and not
	// recorded on the span.
	MaxTokens        int
	Temperature      float64
	TopP             float64
	TopK             float64
	FrequencyPenalty float64
	PresencePenalty  float64
	Seed             int
	StopSequences    []string
	ChoiceCount      int

	// HasTemperature / HasTopP / HasTopK / HasFreqPenalty / HasPresPenalty
	// disambiguate "explicitly zero" from "unset" for the float params.
	// Callers that use the zero value as meaningful must set these.
	HasTemperature bool
	HasTopP        bool
	HasTopK        bool
	HasFreqPenalty bool
	HasPresPenalty bool
}

// ServerAddressFromURL extracts host and port for the ServerAddress /
// ServerPort fields when callers have a full URL handy. Returns
// ("", 0) for an empty or unparseable URL, or one without a host; the
// port is 0 when the URL does not spell one out explicitly.
func ServerAddressFromURL(raw string) (string, int) {
	if raw == "" {
		return "", 0
	}
	parsed, err := url.Parse(raw)
	if err != nil || parsed.Host == "" {
		return "", 0
	}
	// Atoi failure (no port / malformed port) deliberately maps to 0.
	portNum, _ := strconv.Atoi(parsed.Port())
	return parsed.Hostname(), portNum
}

// ChatSpan is the handle returned by StartChat.
It wraps an OTel span and +// captures enough state to emit per-operation metrics on End. +type ChatSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + // metricCtx carries the request context captured at StartChat + // time so metric Record / Add calls in End preserve the + // trace-to-metric exemplar link. Using context.Background() here + // would silently strip the active span context and break + // drill-from-metric-bucket-to-trace navigation in Tempo/Mimir. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + responseModel string + finishReasons []string + usageRecorded bool + usage chatUsage + errType string + + // Streaming metrics: the first non-empty chunk timestamp and the + // previous chunk timestamp drive the time_to_first_chunk and + // time_per_output_chunk histograms. + firstChunkAt time.Time + prevChunkAt time.Time + chunkDurations []float64 +} + +type chatUsage struct { + inputTokens int64 + outputTokens int64 + cacheReadInput int64 + cacheCreationInput int64 + reasoningOutput int64 +} + +// StartChat begins a CLIENT-kind `chat {model}` span and records the +// required gen_ai.* request attributes. The returned context carries the +// new span; callers MUST call ChatSpan.End to flush the span and metrics. 
+func StartChat(ctx context.Context, req ChatRequest) (context.Context, *ChatSpan) { + tracer := otel.Tracer(instrumentationName) + + name := OperationChat + if req.Model != "" { + name = OperationChat + " " + req.Model + } + + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationChat), + attribute.String(AttrProviderName, req.Provider), + attribute.Bool(AttrRequestStream, req.Stream), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.ServerAddress != "" { + attrs = append(attrs, attribute.String("server.address", req.ServerAddress)) + if req.ServerPort > 0 { + attrs = append(attrs, attribute.Int("server.port", req.ServerPort)) + } + } + if req.MaxTokens > 0 { + attrs = append(attrs, attribute.Int(AttrRequestMaxTokens, req.MaxTokens)) + } + if req.HasTemperature { + attrs = append(attrs, attribute.Float64(AttrRequestTemperature, req.Temperature)) + } + if req.HasTopP { + attrs = append(attrs, attribute.Float64(AttrRequestTopP, req.TopP)) + } + if req.HasTopK { + attrs = append(attrs, attribute.Float64(AttrRequestTopK, req.TopK)) + } + if req.HasFreqPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestFrequencyPenalty, req.FrequencyPenalty)) + } + if req.HasPresPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestPresencePenalty, req.PresencePenalty)) + } + if req.Seed != 0 { + attrs = append(attrs, attribute.Int(AttrRequestSeed, req.Seed)) + } + if len(req.StopSequences) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestStopSequences, req.StopSequences)) + } + if req.ChoiceCount > 0 && req.ChoiceCount != 1 { + attrs = append(attrs, attribute.Int(AttrRequestChoiceCount, req.ChoiceCount)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + + return ctx, &ChatSpan{ + span: span, + provider: 
req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetAttributes adds extra attributes to the span. Use for provider-specific +// fields (openai.*, aws.bedrock.*) and for response-side attributes the +// caller learns later. +func (s *ChatSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResponseModel records gen_ai.response.model. Some providers return a +// resolved model name that differs from the requested one (alias expansion, +// version pinning); both values are useful. +func (s *ChatSpan) SetResponseModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.responseModel = model + s.mu.Unlock() + s.span.SetAttributes(attribute.String(AttrResponseModel, model)) +} + +// SetResponseID records gen_ai.response.id. +func (s *ChatSpan) SetResponseID(id string) { + if s == nil || id == "" { + return + } + s.span.SetAttributes(attribute.String(AttrResponseID, id)) +} + +// AddFinishReason accumulates a finish reason. The spec defines the +// attribute as a string array — multiple values are recorded once on End. +func (s *ChatSpan) AddFinishReason(reason string) { + if s == nil || reason == "" { + return + } + s.mu.Lock() + defer s.mu.Unlock() + if slices.Contains(s.finishReasons, reason) { + return + } + s.finishReasons = append(s.finishReasons, reason) +} + +// RecordUsage stores the token usage for emission as both span attributes +// and the gen_ai.client.token.usage histogram. Callers pass raw provider +// values; this package applies the spec-mandated Anthropic input-token sum +// (`input_tokens` reported by Anthropic excludes cached tokens, so the +// spec requires summing input + cache_read + cache_creation). 
+func (s *ChatSpan) RecordUsage(inputTokens, outputTokens, cacheReadInput, cacheCreationInput, reasoningOutput int64) { + if s == nil { + return + } + s.mu.Lock() + defer s.mu.Unlock() + s.usage.inputTokens = inputTokens + s.usage.outputTokens = outputTokens + s.usage.cacheReadInput = cacheReadInput + s.usage.cacheCreationInput = cacheCreationInput + s.usage.reasoningOutput = reasoningOutput + s.usageRecorded = true +} + +// MarkChunk records the timing of a streamed output chunk. The first call +// drives gen_ai.response.time_to_first_chunk (and the corresponding +// metric); subsequent calls accumulate per-chunk durations. +func (s *ChatSpan) MarkChunk() { + if s == nil { + return + } + now := time.Now() + s.mu.Lock() + defer s.mu.Unlock() + if s.firstChunkAt.IsZero() { + s.firstChunkAt = now + } else { + s.chunkDurations = append(s.chunkDurations, now.Sub(s.prevChunkAt).Seconds()) + } + s.prevChunkAt = now +} + +// RecordError marks the span as failed and stores error.type for the +// duration metric. errType should be a short, low-cardinality string — +// "rate_limit", "context_length_exceeded", "auth", "network", +// "context_canceled", or "_OTHER" as the spec-defined fallback. When +// errType is empty, ClassifyError(err) is called to derive a value, so +// callers that don't already have a classification can pass "" without +// losing it to the "_OTHER" bucket. +func (s *ChatSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span, flushes accumulated finish reasons / usage / timing +// to the span, and records the duration and token-usage histograms. Safe +// to call multiple times; subsequent calls are no-ops. 
func (s *ChatSpan) End() {
	if s == nil {
		return
	}
	s.mu.Lock()
	if s.ended {
		s.mu.Unlock()
		return
	}
	s.ended = true
	// Snapshot all accumulated state under the lock, then release it
	// before calling into the OTel SDK so the mutex is never held across
	// third-party code. Slices are copied so later (harmless) mutation
	// of the originals cannot alias the snapshots.
	finishReasons := append([]string(nil), s.finishReasons...)
	usage := s.usage
	usageRecorded := s.usageRecorded
	errType := s.errType
	firstChunkAt := s.firstChunkAt
	chunkDurations := append([]float64(nil), s.chunkDurations...)
	s.mu.Unlock()

	if len(finishReasons) > 0 {
		s.span.SetAttributes(attribute.StringSlice(AttrResponseFinishReasons, finishReasons))
	}
	if !firstChunkAt.IsZero() {
		ttfc := firstChunkAt.Sub(s.startedAt).Seconds()
		s.span.SetAttributes(attribute.Float64(AttrResponseTimeToFirstChunk, ttfc))
	}
	if usageRecorded {
		// Apply the spec-mandated Anthropic input-token math: Anthropic's
		// API reports input_tokens excluding cache, but spec wants the
		// inclusive total on gen_ai.usage.input_tokens.
		spanInputTokens := usage.inputTokens
		if s.provider == ProviderAnthropic {
			spanInputTokens += usage.cacheReadInput + usage.cacheCreationInput
		}
		spanAttrs := []attribute.KeyValue{
			attribute.Int64(AttrUsageInputTokens, spanInputTokens),
			attribute.Int64(AttrUsageOutputTokens, usage.outputTokens),
		}
		// Cache / reasoning counters are optional attributes: emitted
		// only when positive so zero values don't clutter the span.
		if usage.cacheReadInput > 0 {
			spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheReadInputTokens, usage.cacheReadInput))
		}
		if usage.cacheCreationInput > 0 {
			spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheCreationInputTokens, usage.cacheCreationInput))
		}
		if usage.reasoningOutput > 0 {
			spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageReasoningOutputTokens, usage.reasoningOutput))
		}
		s.span.SetAttributes(spanAttrs...)
	}

	s.span.End()

	// Emit metrics. Failure to resolve instruments must not block span
	// completion, so we silently skip when getInstruments returns nil.
	insts := getInstruments()
	if insts == nil {
		return
	}

	commonAttrs := []attribute.KeyValue{
		attribute.String(AttrOperationName, OperationChat),
		attribute.String(AttrProviderName, s.provider),
	}
	if s.model != "" {
		commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model))
	}

	// Copy commonAttrs before appending error.type so the append cannot
	// alias the shared backing array used by the other metrics below.
	durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...)
	if errType != "" {
		durationAttrs = append(durationAttrs, attribute.String("error.type", errType))
	}
	if insts.clientOperationDuration != nil {
		insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(),
			metric.WithAttributes(durationAttrs...),
		)
	}

	if !firstChunkAt.IsZero() && insts.clientOperationTTFC != nil {
		insts.clientOperationTTFC.Record(s.metricCtx, firstChunkAt.Sub(s.startedAt).Seconds(),
			metric.WithAttributes(commonAttrs...),
		)
	}
	if insts.clientOperationTimePerChunk != nil {
		for _, d := range chunkDurations {
			insts.clientOperationTimePerChunk.Record(s.metricCtx, d,
				metric.WithAttributes(commonAttrs...),
			)
		}
	}

	if usageRecorded && insts.clientTokenUsage != nil {
		recordTokenMetric := func(tokenType string, value int64) {
			if value <= 0 {
				return
			}
			tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...)
			tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, tokenType))
			insts.clientTokenUsage.Record(s.metricCtx, value,
				metric.WithAttributes(tokenAttrs...),
			)
		}
		// Per-token-type metric data points use raw provider values so a
		// backend summing across types reconstructs the true total
		// without double-counting cached tokens. The Anthropic spec sum
		// (input + cache_read + cache_creation) is only applied to the
		// span attribute `gen_ai.usage.input_tokens` per the per-provider
		// semconv MUST clause — see span attribute emission above.
		recordTokenMetric(TokenTypeInput, usage.inputTokens)
		recordTokenMetric(TokenTypeOutput, usage.outputTokens)
		recordTokenMetric(TokenTypeCacheRead, usage.cacheReadInput)
		recordTokenMetric(TokenTypeCacheCreation, usage.cacheCreationInput)
		recordTokenMetric(TokenTypeReasoning, usage.reasoningOutput)
	}
}

// Span returns the underlying OTel span so callers can attach span events
// or links when they need finer control than the helpers expose. Returns
// a real no-op span (not a struct embedding a nil trace.Span) when the
// receiver is nil so callers don't have to nil-check before invoking
// Span methods like AddEvent / SetAttributes.
func (s *ChatSpan) Span() trace.Span {
	if s == nil {
		return tracenoop.Span{}
	}
	return s.span
}
diff --git a/pkg/telemetry/genai/stability.go b/pkg/telemetry/genai/stability.go
new file mode 100644
index 000000000..021ce0450
--- /dev/null
+++ b/pkg/telemetry/genai/stability.go
@@ -0,0 +1,130 @@
package genai

import (
	"os"
	"strings"
	"sync"

	"go.opentelemetry.io/otel/attribute"
)

// EnvSemconvStability is the OTel-defined environment variable that lets
// callers opt into experimental versions of the GenAI semantic
// conventions
// (https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/README.md).
//
// It is a comma-separated list of opt-in tokens. The only token defined
// for GenAI today is `gen_ai_latest_experimental` — when present, the
// instrumentation emits only the spec-defined `gen_ai.*` attributes and
// drops the legacy attribute names (e.g. `tool.name`, `agent`,
// `session.id`).
//
// Default behaviour (env var unset) is dual-emit: spans carry both the
// legacy keys and the `gen_ai.*` keys so existing dashboards keep
// working alongside spec-aware tooling. This matches the spec's
// recommendation that instrumentations not change the version of
// conventions they emit by default and instead require the opt-in for
// the new version.
const EnvSemconvStability = "OTEL_SEMCONV_STABILITY_OPT_IN"

// stabilityToken is the spec-defined opt-in for the latest experimental
// GenAI conventions.
const stabilityToken = "gen_ai_latest_experimental"

// Stability identifies which version of attribute names a span should
// emit.
type Stability int

const (
	// StabilityDualEmit is the default: emit both legacy attribute
	// names (`tool.name`, `agent`, `session.id`, ...) and the
	// `gen_ai.*` keys, so existing dashboards continue working while
	// spec-aware tooling sees the new values.
	StabilityDualEmit Stability = iota
	// StabilityGenAILatest is selected by
	// `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Only
	// the `gen_ai.*` attributes are emitted; the legacy keys are
	// dropped.
	StabilityGenAILatest
)

// stabilityMu guards the cached stability value. A plain "computed"
// flag under the mutex replaces the earlier sync.Once-based scheme,
// which raced: CurrentStability captured a pointer to the Once outside
// the critical section while ResetStabilityForTest overwrote the Once
// value, a data race on the Once's internal state under -race.
var (
	stabilityMu       sync.Mutex
	stabilityComputed bool
	cachedStability   Stability
)

// CurrentStability returns the active stability mode. The result is
// computed once per process from the env var and cached; tests that
// need to flip the mode at runtime should call ResetStabilityForTest
// first. Safe for concurrent use, including concurrently with
// ResetStabilityForTest — both take stabilityMu for the full
// read-modify-write.
func CurrentStability() Stability {
	stabilityMu.Lock()
	defer stabilityMu.Unlock()
	if !stabilityComputed {
		cachedStability = parseStability(os.Getenv(EnvSemconvStability))
		stabilityComputed = true
	}
	return cachedStability
}

// parseStability interprets the comma-separated opt-in list. Per the
// spec, tokens are compared case-insensitively; surrounding whitespace
// is tolerated. Any token other than stabilityToken is ignored (it may
// target another semconv area, e.g. "http").
func parseStability(raw string) Stability {
	for _, tok := range strings.Split(raw, ",") {
		if strings.EqualFold(strings.TrimSpace(tok), stabilityToken) {
			return StabilityGenAILatest
		}
	}
	return StabilityDualEmit
}

// ResetStabilityForTest clears the cached stability value so a
// subsequent CurrentStability call re-reads the env var. Test-only.
// Unlike the previous sync.Once-based implementation this is safe even
// when another goroutine is inside CurrentStability at the same time,
// since all state transitions happen under stabilityMu.
func ResetStabilityForTest() {
	stabilityMu.Lock()
	defer stabilityMu.Unlock()
	stabilityComputed = false
	cachedStability = StabilityDualEmit
}

// EmitLegacyAttributes reports whether legacy (pre-semconv) attribute
// keys should be emitted. True when stability is StabilityDualEmit;
// false when the user has opted into `gen_ai_latest_experimental`.
func EmitLegacyAttributes() bool {
	return CurrentStability() == StabilityDualEmit
}

// LegacyToolAttributes returns the historic tool dispatcher attribute
// set (`tool.name`, `agent`, `session.id`, `tool.call_id`,
// `tool.type`) — but only when legacy emission is enabled. Returns nil
// otherwise so call sites can append unconditionally.
func LegacyToolAttributes(toolName, toolType, agentName, sessionID, callID string) []attribute.KeyValue {
	if !EmitLegacyAttributes() {
		return nil
	}
	attrs := []attribute.KeyValue{
		attribute.String("tool.name", toolName),
		attribute.String("agent", agentName),
		attribute.String("session.id", sessionID),
	}
	// tool.type and tool.call_id are historically optional; they are
	// omitted when empty rather than emitted as empty strings.
	if toolType != "" {
		attrs = append(attrs, attribute.String("tool.type", toolType))
	}
	if callID != "" {
		attrs = append(attrs, attribute.String("tool.call_id", callID))
	}
	return attrs
}
diff --git a/pkg/telemetry/genai/stability_test.go b/pkg/telemetry/genai/stability_test.go
new file mode 100644
index 000000000..f89ee7991
--- /dev/null
+++ b/pkg/telemetry/genai/stability_test.go
@@ -0,0 +1,55 @@
package genai

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// Default (env unset) must dual-emit so existing dashboards keep working.
func TestCurrentStabilityDefault(t *testing.T) {
	t.Setenv(EnvSemconvStability, "")
	ResetStabilityForTest()
	assert.Equal(t, StabilityDualEmit, CurrentStability())
	assert.True(t, EmitLegacyAttributes())
}

func TestCurrentStabilityGenAILatest(t *testing.T) {
	t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental")
	ResetStabilityForTest()
	t.Cleanup(ResetStabilityForTest)
	assert.Equal(t, StabilityGenAILatest, CurrentStability())
	assert.False(t, EmitLegacyAttributes())
}

// Opt-in tokens for other semconv areas must not flip the GenAI mode.
func TestCurrentStabilityIgnoresUnrelatedTokens(t *testing.T) {
	t.Setenv(EnvSemconvStability, "http,database")
	ResetStabilityForTest()
	t.Cleanup(ResetStabilityForTest)
	assert.Equal(t, StabilityDualEmit, CurrentStability())
}

// The GenAI token must be found inside a comma-separated list, with
// surrounding whitespace tolerated.
func TestCurrentStabilityCompositeList(t *testing.T) {
	t.Setenv(EnvSemconvStability, "http, gen_ai_latest_experimental ,database")
	ResetStabilityForTest()
	t.Cleanup(ResetStabilityForTest)
	assert.Equal(t, StabilityGenAILatest, CurrentStability())
}

func TestCurrentStabilityCaseInsensitive(t *testing.T) {
	t.Setenv(EnvSemconvStability, "GEN_AI_LATEST_EXPERIMENTAL")
	ResetStabilityForTest()
	t.Cleanup(ResetStabilityForTest)
	assert.Equal(t, StabilityGenAILatest, CurrentStability())
}

// Verifies the gate in both directions: opted-in drops the legacy set,
// and the default mode re-enables it.
func TestLegacyToolAttributesGated(t *testing.T) {
	t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental")
	ResetStabilityForTest()
	t.Cleanup(ResetStabilityForTest)
	assert.Empty(t, LegacyToolAttributes("shell", "function", "main", "sess1", "call1"))

	t.Setenv(EnvSemconvStability, "")
	ResetStabilityForTest()
	got := LegacyToolAttributes("shell", "function", "main", "sess1", "call1")
	assert.NotEmpty(t, got)
}
diff --git a/pkg/telemetry/genai/stream.go b/pkg/telemetry/genai/stream.go
new file mode 100644
index 000000000..382597512
--- /dev/null
+++ b/pkg/telemetry/genai/stream.go
@@ -0,0 +1,255 @@
package genai

import (
	"errors"
	"io"
	"strings"
	"sync"

	"github.com/docker/docker-agent/pkg/chat"
	"github.com/docker/docker-agent/pkg/tools"
)

// StreamAttributer is an optional interface that provider stream adapters
// may implement to surface provider-specific attributes to the chat span
// once the response is complete. The wrapper queries the underlying stream
// on Close (in addition to the per-chunk Recv path) and applies whatever
// attributes the provider chose to expose. Implementations are expected to
// be safe to call after Close.
type StreamAttributer interface {
	GenAIStreamAttributes() []KeyValue
}

// KeyValue is a re-exported attribute key/value pair used by the optional
// StreamAttributer interface so providers can implement it without
// importing go.opentelemetry.io/otel/attribute directly. The decorator
// converts these back into OTel attributes before applying them to the
// span.
type KeyValue struct {
	Key   string
	Value any
}

// WrapStream wraps a chat.MessageStream so that consuming the stream
// drives the lifecycle of a ChatSpan: per-chunk timing, response-level
// attributes (id / response.model / finish reasons), usage capture, and
// final span End on stream close or terminal error.
//
// The returned stream forwards all Recv/Close calls to the underlying
// stream verbatim and adds no other behaviour, so swapping it in is
// invisible to callers.
func WrapStream(span *ChatSpan, stream chat.MessageStream) chat.MessageStream {
	// Nil span or nil stream: nothing to instrument, hand back the
	// caller's stream unchanged.
	if span == nil || stream == nil {
		return stream
	}
	return &instrumentedStream{
		span:    span,
		inner:   stream,
		capture: IsContentCaptureEnabled(),
	}
}

type instrumentedStream struct {
	span  *ChatSpan
	inner chat.MessageStream

	// mu guards the lifecycle flags and the streaming-state buffers
	// so a Recv that errors concurrently with the consumer's Close
	// does not race on the check-then-set in endOnce or
	// double-apply attributes through SetOutputMessages.
	mu sync.Mutex

	// ended is set when the span has been finalised (output flushed
	// and `End` called). innerClosed is set when the inner stream's
	// `Close` has been called. They are tracked separately so an
	// error in `Recv` can end the span without preempting the
	// caller's `Close` that releases the inner stream's resources.
	ended       bool
	innerClosed bool

	// capture buffers the streamed deltas for emission as
	// `gen_ai.output.messages` on Close. Filled only when content
	// capture is opted in (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true`)
	// so the buffer cost stays out of the default request path.
	capture       bool
	contentBuf    strings.Builder
	reasoningBuf  strings.Builder
	pendingTools  map[string]*tools.ToolCall
	toolCallOrder []string
}

func (s *instrumentedStream) Recv() (chat.MessageStreamResponse, error) {
	resp, err := s.inner.Recv()
	if err != nil {
		// io.EOF is the normal stream terminator and is not an error
		// for the span's purposes — End handles closing.
		// For non-EOF errors we end the span here too: callers that
		// abandon the stream after an error (a common pattern for
		// network failures) would otherwise leak the span and skip the
		// duration metric. Close remains idempotent so the canonical
		// `defer Close()` path still works.
		if !errors.Is(err, io.EOF) {
			s.span.RecordError(err, ClassifyError(err))
			s.endOnce()
		}
		return resp, err
	}

	// First chunk arrival is meaningful for the time_to_first_chunk
	// metric. Mark on every Recv that produced any content so we cover
	// cases where the provider opens with an empty preamble.
	if hasChunkPayload(&resp) {
		s.span.MarkChunk()
	}

	// Response-level attributes may repeat across chunks; the ChatSpan
	// setters are invoked each time (presumably last-write-wins — their
	// definitions live elsewhere in this package).
	if resp.ID != "" {
		s.span.SetResponseID(resp.ID)
	}
	if resp.Model != "" {
		s.span.SetResponseModel(resp.Model)
	}
	for i := range resp.Choices {
		if resp.Choices[i].FinishReason != "" {
			s.span.AddFinishReason(string(resp.Choices[i].FinishReason))
		}
	}
	if resp.Usage != nil {
		s.span.RecordUsage(
			resp.Usage.InputTokens,
			resp.Usage.OutputTokens,
			resp.Usage.CachedInputTokens,
			resp.Usage.CacheWriteTokens,
			resp.Usage.ReasoningTokens,
		)
	}

	if s.capture {
		s.mu.Lock()
		s.bufferDeltas(&resp)
		s.mu.Unlock()
	}
	return resp, nil
}

// bufferDeltas accumulates content and tool-call deltas for the
// gen_ai.output.messages attribute. Tool calls arrive across multiple
// chunks (id once, name once, arguments in pieces), so we keep a map
// keyed by id and concatenate arguments as they stream in. Callers must
// hold s.mu.
func (s *instrumentedStream) bufferDeltas(resp *chat.MessageStreamResponse) {
	for i := range resp.Choices {
		d := &resp.Choices[i].Delta
		if d.Content != "" {
			s.contentBuf.WriteString(d.Content)
		}
		if d.ReasoningContent != "" {
			s.reasoningBuf.WriteString(d.ReasoningContent)
		}
		for j := range d.ToolCalls {
			tc := &d.ToolCalls[j]
			id := tc.ID
			if id == "" {
				// Provider didn't include the id on this delta — fall
				// back to the most recent in-progress tool call.
				if len(s.toolCallOrder) == 0 {
					// No tool call in progress to attach to; drop the
					// orphan delta rather than invent an id.
					continue
				}
				id = s.toolCallOrder[len(s.toolCallOrder)-1]
			}
			if s.pendingTools == nil {
				s.pendingTools = map[string]*tools.ToolCall{}
			}
			existing, ok := s.pendingTools[id]
			if !ok {
				existing = &tools.ToolCall{ID: id, Type: tc.Type}
				s.pendingTools[id] = existing
				s.toolCallOrder = append(s.toolCallOrder, id)
			}
			if tc.Function.Name != "" {
				existing.Function.Name = tc.Function.Name
			}
			if tc.Function.Arguments != "" {
				existing.Function.Arguments += tc.Function.Arguments
			}
		}
	}
}

func (s *instrumentedStream) Close() {
	// Decide under the lock whether we are the first Close; the actual
	// inner Close happens outside the lock to avoid holding the mutex
	// across third-party code.
	s.mu.Lock()
	closeInner := !s.innerClosed
	s.innerClosed = true
	s.mu.Unlock()
	if closeInner {
		s.inner.Close()
	}
	s.endOnce()
}

// endOnce flushes captured content, applies provider-supplied attributes,
// and ends the span — at most once per stream. Both the error path in
// `Recv` and the explicit `Close` path go through here so a stream that
// errors mid-flight still ends its span without waiting for the caller.
// `inner.Close` is intentionally NOT called here: leaving it to the
// explicit `Close` path keeps the contract that the wrapper releases
// the underlying stream exactly when the caller asks.
func (s *instrumentedStream) endOnce() {
	s.mu.Lock()
	if s.ended {
		s.mu.Unlock()
		return
	}
	s.ended = true
	// Snapshot the buffers under the lock so we don't race against a
	// concurrent Recv writing more deltas. Release before calling out
	// to the OTel SDK and the StreamAttributer hook to avoid holding
	// the mutex across third-party code.
	var (
		extras       []KeyValue
		captured     bool
		content      string
		reasoning    string
		collected    []tools.ToolCall
		streamAttrer StreamAttributer
	)
	if attrer, ok := s.inner.(StreamAttributer); ok {
		streamAttrer = attrer
	}
	if s.capture {
		captured = true
		content = s.contentBuf.String()
		reasoning = s.reasoningBuf.String()
		// Emit tool calls in first-seen order, copying values so the
		// snapshot is independent of the pending map.
		for _, id := range s.toolCallOrder {
			if tc, ok := s.pendingTools[id]; ok {
				collected = append(collected, *tc)
			}
		}
	}
	s.mu.Unlock()

	if streamAttrer != nil {
		extras = streamAttrer.GenAIStreamAttributes()
	}
	for _, kv := range extras {
		applyExtraAttribute(s.span, kv)
	}
	if captured {
		SetOutputMessages(s.span, content, reasoning, collected)
	}
	s.span.End()
}

// hasChunkPayload reports whether the response carries content that should
// count as an output chunk (text, reasoning, tool call, etc.). Empty
// keep-alive frames do not advance the per-chunk timing metrics.
func hasChunkPayload(resp *chat.MessageStreamResponse) bool {
	for i := range resp.Choices {
		d := &resp.Choices[i].Delta
		if d.Content != "" || d.ReasoningContent != "" || d.ThinkingSignature != "" {
			return true
		}
		if len(d.ToolCalls) > 0 || d.FunctionCall != nil {
			return true
		}
	}
	return false
}
diff --git a/pkg/telemetry/mcp/attrs.go b/pkg/telemetry/mcp/attrs.go
new file mode 100644
index 000000000..64a1d4138
--- /dev/null
+++ b/pkg/telemetry/mcp/attrs.go
@@ -0,0 +1,58 @@
package mcp

// MCP attribute keys defined by the OTel semantic conventions
// (https://opentelemetry.io/docs/specs/semconv/registry/attributes/mcp/).
// All are Development stability.
const (
	AttrMethodName      = "mcp.method.name"
	AttrProtocolVersion = "mcp.protocol.version"
	AttrResourceURI     = "mcp.resource.uri"
	AttrSessionID       = "mcp.session.id"
)

// JSON-RPC attribute keys used alongside MCP spans for request id and
// response status when applicable.
const (
	AttrJSONRPCRequestID       = "jsonrpc.request.id"
	AttrJSONRPCProtocolVersion = "jsonrpc.protocol.version"
	AttrRPCResponseStatusCode  = "rpc.response.status_code"
)

// gen_ai.* attribute keys that the MCP semconv overlays on MCP spans when
// applicable. These are duplicated here as constants so the MCP package
// doesn't depend on the genai package — keeping the two telemetry helpers
// compositional.
const (
	AttrGenAIOperationName = "gen_ai.operation.name"
	AttrGenAIToolName      = "gen_ai.tool.name"
	AttrGenAIPromptName    = "gen_ai.prompt.name"
)

// Well-known MCP method names (https://modelcontextprotocol.io/specification).
// These match the values listed in the OTel semconv registry.
const (
	MethodInitialize         = "initialize"
	MethodPing               = "ping"
	MethodCompletionComplete = "completion/complete"
	MethodPromptsList        = "prompts/list"
	MethodPromptsGet         = "prompts/get"
	MethodResourcesList      = "resources/list"
	MethodResourcesRead      = "resources/read"
	MethodResourcesSubscribe = "resources/subscribe"
	// MethodResourcesUnsub is "resources/unsubscribe"; the const name is
	// shortened, the wire value is the full method string.
	MethodResourcesUnsub     = "resources/unsubscribe"
	MethodResourcesTemplates = "resources/templates/list"
	MethodRootsList          = "roots/list"
	MethodSamplingCreate     = "sampling/createMessage"
	MethodToolsList          = "tools/list"
	MethodToolsCall          = "tools/call"
	MethodLoggingSetLevel    = "logging/setLevel"
	MethodElicitationCreate  = "elicitation/create"
)

// OperationExecuteTool is the gen_ai.operation.name value used on MCP
// tools/call spans per the spec.
const OperationExecuteTool = "execute_tool"

// instrumentationName identifies this package as the OTel instrumentation
// scope for spans, metrics, and log records it produces.
const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/mcp"
diff --git a/pkg/telemetry/mcp/conversation.go b/pkg/telemetry/mcp/conversation.go
new file mode 100644
index 000000000..efeaad57f
--- /dev/null
+++ b/pkg/telemetry/mcp/conversation.go
@@ -0,0 +1,19 @@
package mcp

import (
	"context"

	"go.opentelemetry.io/otel/baggage"
)

// ConversationIDFromBaggage reads `gen_ai.conversation.id` from the
// context's W3C baggage. Returns the empty string when the baggage has
// no such member. The MCP package mirrors the genai package's
// convention so MCP spans automatically carry the session id when the
// runtime has seeded it; the value also propagates across MCP server
// boundaries via the standard `baggage` header alongside `traceparent`.
//
// Exported so adjacent code (e.g. the MCP OAuth transport) can attach
// the same attribute to spans it creates directly via `otel.Tracer`.
func ConversationIDFromBaggage(ctx context.Context) string {
	return baggage.FromContext(ctx).Member("gen_ai.conversation.id").Value()
}
diff --git a/pkg/telemetry/mcp/doc.go b/pkg/telemetry/mcp/doc.go
new file mode 100644
index 000000000..401f7472d
--- /dev/null
+++ b/pkg/telemetry/mcp/doc.go
@@ -0,0 +1,13 @@
// Package mcp provides OpenTelemetry instrumentation helpers that follow
// the OTel GenAI semantic conventions for the Model Context Protocol
// (https://opentelemetry.io/docs/specs/semconv/gen-ai/mcp/).
//
// MCP attributes use the `mcp.*` namespace (separate from `gen_ai.*`).
// Trace context propagates through the MCP `params._meta` field so that
// requests crossing client/server boundaries chain into a single trace.
//
// The package is structured so that callers describe what they are doing
// in MCP terms (method name, tool name, session id) and the helpers
// produce the spec-conformant spans, metrics, and propagation. All helpers
// are no-op-safe when telemetry is disabled.
package mcp
diff --git a/pkg/telemetry/mcp/mcp_test.go b/pkg/telemetry/mcp/mcp_test.go
new file mode 100644
index 000000000..0f1533803
--- /dev/null
+++ b/pkg/telemetry/mcp/mcp_test.go
@@ -0,0 +1,97 @@
package mcp

import (
	"context"
	"errors"
	"testing"

	"github.com/stretchr/testify/assert"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/trace"
	traceapi "go.opentelemetry.io/otel/trace"
)

// EnsureMeta must allocate on nil input and otherwise return an equal
// (copied) map.
func TestEnsureMeta(t *testing.T) {
	t.Parallel()
	got := EnsureMeta(nil)
	assert.NotNil(t, got)
	assert.Empty(t, got)

	existing := map[string]any{"foo": "bar"}
	got = EnsureMeta(existing)
	assert.Equal(t, existing, got)
}

func TestInjectExtractRoundTrip(t *testing.T) {
	t.Parallel()

	// A propagator must be configured for inject/extract to do anything;
	// install one for the duration of the test and put it back after.
	prev := otel.GetTextMapPropagator()
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	t.Cleanup(func() { otel.SetTextMapPropagator(prev) })

	// Start a sampled span so traceparent has a non-trivial trace id.
	tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample()))
	t.Cleanup(func() { _ = tp.Shutdown(t.Context()) })

	parentCtx, parentSpan := tp.Tracer("test").Start(t.Context(), "parent")
	defer parentSpan.End()
	parentSC := traceapi.SpanContextFromContext(parentCtx)

	meta := map[string]any{}
	InjectMeta(parentCtx, meta)
	assert.Contains(t, meta, "traceparent",
		"propagator should have written W3C traceparent into _meta")

	// Extract from a fresh context and verify the span context lines up
	// with the parent we started with.
	childCtx := ExtractMeta(t.Context(), meta)
	extracted := traceapi.SpanContextFromContext(childCtx)
	assert.Equal(t, parentSC.TraceID(), extracted.TraceID())
	assert.Equal(t, parentSC.SpanID(), extracted.SpanID())
}

func TestInjectMetaNilNoOp(t *testing.T) {
	t.Parallel()
	// Should not panic on a nil map.
	InjectMeta(t.Context(), nil)
}

func TestExtractMetaNilReturnsParent(t *testing.T) {
	t.Parallel()
	got := ExtractMeta(t.Context(), nil)
	// Without trace context to extract we get back the same context.
	assert.Equal(t, t.Context(), got)
}

func TestStartClientReturnsActiveSpan(t *testing.T) {
	t.Parallel()

	tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample()))
	t.Cleanup(func() { _ = tp.Shutdown(t.Context()) })
	prev := otel.GetTracerProvider()
	otel.SetTracerProvider(tp)
	t.Cleanup(func() { otel.SetTracerProvider(prev) })

	ctx, span := StartClient(t.Context(), CallOptions{
		Method:   MethodToolsCall,
		ToolName: "search-web",
	})
	defer span.End()

	sc := traceapi.SpanContextFromContext(ctx)
	assert.True(t, sc.IsValid(), "context should carry an active span")
}

func TestClassifyError(t *testing.T) {
	t.Parallel()
	assert.Empty(t, ClassifyError(nil))
	assert.Equal(t, "context_canceled", ClassifyError(context.Canceled))
	assert.Equal(t, "deadline_exceeded", ClassifyError(context.DeadlineExceeded))
	assert.Equal(t, "rpc_error", ClassifyError(errors.New("some other error")))
}
diff --git a/pkg/telemetry/mcp/metrics.go b/pkg/telemetry/mcp/metrics.go
new file mode 100644
index 000000000..fab407f9d
--- /dev/null
+++ b/pkg/telemetry/mcp/metrics.go
@@ -0,0 +1,56 @@
package mcp

import (
	"sync"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/metric"
)

// metricBuckets matches the spec's bucket boundaries for all four MCP
// duration histograms (mcp.client/server.operation.duration and
// mcp.client/server.session.duration).
var metricBuckets = []float64{
	0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60, 120, 300,
}

type instruments struct {
	clientOperationDuration metric.Float64Histogram
	serverOperationDuration metric.Float64Histogram
	// mcp.{client,server}.session.duration histograms are defined by
	// the spec but require a SessionSpan that tracks open/close at
	// the transport layer. Wire those up alongside the transport
	// instrumentation; until then registering them here would create
	// always-empty time series in Mimir.
}

var (
	instOnce sync.Once
	inst     *instruments
)

// getInstruments lazily creates the package's metric instruments once
// per process. After the first call inst is non-nil; record sites still
// nil-check the individual histograms since creation errors are ignored.
func getInstruments() *instruments {
	instOnce.Do(func() {
		meter := otel.Meter(instrumentationName)
		i := &instruments{}

		// Histogram registration rarely fails; on the rare miss we
		// keep the successfully created instruments rather than
		// abandoning the whole package — record sites nil-check.
		i.clientOperationDuration, _ = meter.Float64Histogram(
			"mcp.client.operation.duration",
			metric.WithUnit("s"),
			metric.WithDescription("Time taken by an MCP client to send a request and receive its response."),
			metric.WithExplicitBucketBoundaries(metricBuckets...),
		)
		i.serverOperationDuration, _ = meter.Float64Histogram(
			"mcp.server.operation.duration",
			metric.WithUnit("s"),
			metric.WithDescription("Time taken by an MCP server to handle a request and send its response."),
			metric.WithExplicitBucketBoundaries(metricBuckets...),
		)

		inst = i
	})
	return inst
}
diff --git a/pkg/telemetry/mcp/propagation.go b/pkg/telemetry/mcp/propagation.go
new file mode 100644
index 000000000..b0e62040b
--- /dev/null
+++ b/pkg/telemetry/mcp/propagation.go
@@ -0,0 +1,92 @@
package mcp

import (
	"context"
	"maps"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/propagation"
)

// metaCarrier adapts an MCP `params._meta` map (which the MCP SDK exposes
// as `map[string]any`) to OTel's TextMapCarrier interface so the package's
// configured propagator can
read and write trace context (`traceparent`,
// `tracestate`, `baggage`) the way it does for any HTTP carrier.
type metaCarrier struct {
	meta map[string]any
}

// Get returns the string value for key; non-string values (legitimate in
// a map[string]any _meta) and missing keys both read as "".
func (c metaCarrier) Get(key string) string {
	if c.meta == nil {
		return ""
	}
	v, ok := c.meta[key]
	if !ok {
		return ""
	}
	if s, ok := v.(string); ok {
		return s
	}
	return ""
}

// Set writes key=value; a nil map is silently ignored (see InjectMeta's
// contract — callers ensure the map is non-nil when they want the keys
// to persist).
func (c metaCarrier) Set(key, value string) {
	if c.meta == nil {
		return
	}
	c.meta[key] = value
}

// Keys lists only the string-valued keys, matching what Get can return.
func (c metaCarrier) Keys() []string {
	if c.meta == nil {
		return nil
	}
	keys := make([]string, 0, len(c.meta))
	for k, v := range c.meta {
		if _, ok := v.(string); ok {
			keys = append(keys, k)
		}
	}
	return keys
}

// InjectMeta writes the active trace context into the given MCP `_meta`
// map so the receiving server can extract it and parent its SERVER span
// onto our CLIENT span. Per the MCP semconv, the keys written are
// `traceparent`, `tracestate`, and `baggage` (W3C TraceContext + Baggage).
//
// If meta is nil, InjectMeta is a no-op — callers should ensure the map
// is non-nil before calling so the keys actually persist on the request.
func InjectMeta(ctx context.Context, meta map[string]any) {
	if meta == nil {
		return
	}
	otel.GetTextMapPropagator().Inject(ctx, metaCarrier{meta: meta})
}

// ExtractMeta reads trace context from the given MCP `_meta` map and
// returns a context with the parent span attached. Use on the server side
// to chain incoming spans onto the client's caller.
func ExtractMeta(ctx context.Context, meta map[string]any) context.Context {
	if meta == nil {
		return ctx
	}
	return otel.GetTextMapPropagator().Extract(ctx, metaCarrier{meta: meta})
}

// EnsureMeta returns a metadata map suitable for InjectMeta to write
// trace context into. When m is non-nil it is shallow-copied so an
// upstream caller that reuses the same request struct (e.g. on retry)
// does not see stale `traceparent` keys from a previous span injected
// into the map they own. When m is nil a fresh map is allocated.
func EnsureMeta(m map[string]any) map[string]any {
	if m == nil {
		return map[string]any{}
	}
	// +3 leaves headroom for traceparent / tracestate / baggage without
	// a rehash.
	out := make(map[string]any, len(m)+3)
	maps.Copy(out, m)
	return out
}

// Verify metaCarrier satisfies the propagator interface at compile time.
var _ propagation.TextMapCarrier = metaCarrier{}
diff --git a/pkg/telemetry/mcp/span.go b/pkg/telemetry/mcp/span.go
new file mode 100644
index 000000000..0ab3a806b
--- /dev/null
+++ b/pkg/telemetry/mcp/span.go
@@ -0,0 +1,247 @@
package mcp

import (
	"context"
	"errors"
	"strings"
	"sync"
	"time"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/metric"
	"go.opentelemetry.io/otel/trace"
)

// CallOptions describes an MCP request being made or handled. Used by
// both client- and server-side helpers so call sites depend on a single
// vocabulary.
type CallOptions struct {
	// Method is the MCP method name (e.g. "tools/call"). Required.
	Method string

	// Target is the low-cardinality target of the operation: tool name
	// for tools/call, prompt name for prompts/get, etc. When set the
	// span name becomes "{method} {target}"; otherwise just "{method}".
	Target string

	// ToolName, when set, is recorded as gen_ai.tool.name and used as
	// the default Target for tools/call.
	ToolName string

	// PromptName, when set, is recorded as gen_ai.prompt.name and used
	// as the default Target for prompts/get.
	PromptName string

	// ResourceURI, when set, is recorded as mcp.resource.uri and used
	// as the default Target for resources/* methods.
	ResourceURI string

	// SessionID identifies the MCP session and is recorded as
	// mcp.session.id when set.
	SessionID string

	// ProtocolVersion is recorded as mcp.protocol.version when set.
	ProtocolVersion string

	// JSONRPCRequestID is recorded as jsonrpc.request.id when set
	// (client-side requests; ignored for notifications).
	JSONRPCRequestID string

	// ServerAddress / ServerPort identify the MCP endpoint when known.
	ServerAddress string
	ServerPort    int
}

// Span is the handle returned by StartClient / StartServer. It carries
// enough state to record `mcp.{client,server}.operation.duration` and to
// flush span attributes as the operation proceeds.
type Span struct {
	span trace.Span
	// metricCtx carries the active span context so the duration
	// histogram measurement produces span-context exemplars (drill
	// Mimir bucket → Tempo trace).
	metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time
	startedAt time.Time
	method    string
	kind      trace.SpanKind

	mu      sync.Mutex
	errType string
	ended   bool
}

// StartClient begins a CLIENT-kind MCP span and returns a context carrying
// it. Callers MUST call Span.End to flush the span and metrics.
func StartClient(ctx context.Context, opts CallOptions) (context.Context, *Span) {
	return startSpan(ctx, opts, trace.SpanKindClient)
}

// StartServer begins a SERVER-kind MCP span. Use after extracting trace
// context from the incoming `params._meta` so the span chains onto the
// caller. Callers MUST call Span.End.
func StartServer(ctx context.Context, opts CallOptions) (context.Context, *Span) {
	return startSpan(ctx, opts, trace.SpanKindServer)
}

// startSpan is the shared implementation behind StartClient/StartServer:
// derives the span name, assembles the attribute set from opts, and
// starts the span with the requested kind.
func startSpan(ctx context.Context, opts CallOptions, kind trace.SpanKind) (context.Context, *Span) {
	tracer := otel.Tracer(instrumentationName)

	// Explicit Target wins; otherwise fall back to the most specific
	// identifier the options carry.
	target := opts.Target
	if target == "" {
		switch {
		case opts.ToolName != "":
			target = opts.ToolName
		case opts.PromptName != "":
			target = opts.PromptName
		case opts.ResourceURI != "":
			target = opts.ResourceURI
		}
	}

	name := opts.Method
	if name == "" {
		name = "mcp"
	}
	if target != "" {
		name = name + " " + target
	}

	attrs := []attribute.KeyValue{
		attribute.String(AttrMethodName, opts.Method),
	}
	if opts.ToolName != "" {
		attrs = append(attrs,
			attribute.String(AttrGenAIToolName, opts.ToolName),
		)
		// gen_ai.operation.name=execute_tool is only overlaid on
		// tools/* methods; a ToolName on other methods still records
		// gen_ai.tool.name alone.
		if strings.HasPrefix(opts.Method, "tools/") {
			attrs = append(attrs, attribute.String(AttrGenAIOperationName, OperationExecuteTool))
		}
	}
	if opts.PromptName != "" {
		attrs = append(attrs, attribute.String(AttrGenAIPromptName, opts.PromptName))
	}
	if opts.ResourceURI != "" {
		attrs = append(attrs, attribute.String(AttrResourceURI, opts.ResourceURI))
	}
	if opts.SessionID != "" {
		attrs = append(attrs, attribute.String(AttrSessionID, opts.SessionID))
	}
	if opts.ProtocolVersion != "" {
		attrs = append(attrs, attribute.String(AttrProtocolVersion, opts.ProtocolVersion))
	}
	if opts.JSONRPCRequestID != "" {
		attrs = append(attrs, attribute.String(AttrJSONRPCRequestID, opts.JSONRPCRequestID))
	}
	if opts.ServerAddress != "" {
		attrs = append(attrs, attribute.String("server.address", opts.ServerAddress))
		if opts.ServerPort > 0 {
			attrs = append(attrs, attribute.Int("server.port", opts.ServerPort))
		}
	}
	if conv := ConversationIDFromBaggage(ctx); conv != "" {
		attrs = append(attrs, attribute.String("gen_ai.conversation.id", conv))
	}

	ctx, span := tracer.Start(ctx, name,
		trace.WithSpanKind(kind),
		trace.WithAttributes(attrs...),
	)

	return ctx, &Span{
		span:      span,
		metricCtx: ctx,
		startedAt: time.Now(),
		method:    opts.Method,
		kind:      kind,
	}
}

// SetAttributes adds extra attributes to the span. Use for MCP extensions
// or for response-side attributes the caller learns later
// (e.g. rpc.response.status_code).
func (s *Span) SetAttributes(attrs ...attribute.KeyValue) {
	if s == nil {
		return
	}
	s.span.SetAttributes(attrs...)
}

// RecordError marks the span as failed and stores error.type for the
// duration metric. errType should be a short, low-cardinality string —
// "rpc_error", "transport", "context_canceled", or the underlying error's
// type name as a fallback.
func (s *Span) RecordError(err error, errType string) {
	if s == nil || err == nil {
		return
	}
	if errType == "" {
		errType = ClassifyError(err)
	}
	s.mu.Lock()
	s.errType = errType
	s.mu.Unlock()
	s.span.RecordError(err)
	s.span.SetStatus(codes.Error, err.Error())
	s.span.SetAttributes(attribute.String("error.type", errType))
}

// End closes the span and records the operation duration metric. Safe to
// call multiple times; subsequent calls are no-ops.
func (s *Span) End() {
	if s == nil {
		return
	}
	s.mu.Lock()
	if s.ended {
		s.mu.Unlock()
		return
	}
	s.ended = true
	errType := s.errType
	s.mu.Unlock()

	s.span.End()

	insts := getInstruments()
	if insts == nil {
		return
	}
	attrs := []attribute.KeyValue{
		attribute.String(AttrMethodName, s.method),
	}
	if errType != "" {
		attrs = append(attrs, attribute.String("error.type", errType))
	}

	// Span kind decides which of the spec's histograms this measurement
	// belongs to.
	histogram := insts.clientOperationDuration
	if s.kind == trace.SpanKindServer {
		histogram = insts.serverOperationDuration
	}
	if histogram == nil {
		return
	}
	// Use the span's started-at as the reference; we already snapshot
	// errType under the lock above, so no additional locking is needed
	// for the immutable startedAt field.
+ histogram.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(attrs...), + ) +} + +// ClassifyError maps an MCP error to a low-cardinality error.type value. +// MCP errors are often plain RPC errors; this helper picks reasonable +// labels for cancellation and falls back to the type name otherwise. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + return "rpc_error" +} From 53bb70665383dbc50da718e3f9de2e5c33abc4d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:39:59 +0300 Subject: [PATCH 02/17] feat(otel): wire SDK init, W3C propagation, and HTTP client gating - `cmd/root/otel.go`: stand up `TracerProvider` / `MeterProvider` / `LoggerProvider` from a single `initOTelSDK` entry, configure OTLP/HTTP exporters with explicit-scheme endpoint normalization, set the global W3C trace-context + baggage propagator unconditionally, flush providers in dependency order, attach `service.*` / `host.*` / `process.*` / `os.type` / `host.arch` resource attributes, and use `AlwaysSample` so local agent sessions are not dropped by an upstream sampling decision - `pkg/httpclient/client.go`: add a `WrapWithOTel` round-tripper gated on a single `atomic.Bool` flipped by `initOTelSDK` (avoids the prior mismatch between `--otel` and the otelhttp wrap), plus `TracedDefaultClient` / `TracedClient` helpers for one-off HTTP calls - `cmd/root/sandbox.go`: open a host-side `sandbox.exec` span and inject the active W3C trace context as `-e KEY=VALUE` flags so processes inside the container chain onto the host trace - `cmd/root/new.go`, `cmd/root/otel_test.go`: wire tracer scope and cover the endpoint normalization / localhost detection cases - `go.mod` / `go.sum`: pull in `go.opentelemetry.io/otel` SDK + OTLP/HTTP exporters --- 
cmd/root/new.go | 5 +- cmd/root/otel.go | 214 +++++++++++++++++++++++++++++++-------- cmd/root/otel_test.go | 57 +++++++++++ cmd/root/sandbox.go | 16 +++ go.mod | 9 +- go.sum | 14 ++- pkg/httpclient/client.go | 83 ++++++++++----- 7 files changed, 321 insertions(+), 77 deletions(-) diff --git a/cmd/root/new.go b/cmd/root/new.go index a52bed77d..a34c4f1f6 100644 --- a/cmd/root/new.go +++ b/cmd/root/new.go @@ -7,6 +7,7 @@ import ( tea "charm.land/bubbletea/v2" "github.com/spf13/cobra" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/app" "github.com/docker/docker-agent/pkg/config" @@ -63,7 +64,9 @@ func (f *newFlags) runNewCommand(cmd *cobra.Command, args []string) (commandErr } defer stopToolSets(t) - rt, err := runtime.New(t) + rt, err := runtime.New(t, + runtime.WithTracer(otel.Tracer(AppName)), + ) if err != nil { return err } diff --git a/cmd/root/otel.go b/cmd/root/otel.go index 9fc1f044d..32e8afd93 100644 --- a/cmd/root/otel.go +++ b/cmd/root/otel.go @@ -5,15 +5,26 @@ import ( "fmt" "net" "os" + "runtime" "strings" "time" + "github.com/google/uuid" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/log/global" "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/log" + "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + + "github.com/docker/docker-agent/pkg/httpclient" + "github.com/docker/docker-agent/pkg/version" ) const AppName = "cagent" @@ -25,73 +36,188 @@ func initOTelSDK(ctx context.Context) (err error) { return fmt.Errorf("failed to create resource: %w", err) } - var traceExporter trace.SpanExporter endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") - // Only 
initialize if endpoint is configured - if endpoint != "" { - var opts []otlptracehttp.Option - // An endpoint with an http:// or https:// scheme goes through - // WithEndpointURL so the SDK picks the transport from the scheme - // (per the OTLP/HTTP spec). Bare host:port still flows through - // WithEndpoint with the loopback-insecure shortcut preserved. - if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpointURL(endpoint)} - } else { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpoint(endpoint)} - if isLocalhostEndpoint(endpoint) { - opts = append(opts, otlptracehttp.WithInsecure()) - } - } - traceExporter, err = otlptracehttp.New(ctx, opts...) - if err != nil { - return fmt.Errorf("failed to create trace exporter: %w", err) - } + tp, err := newTracerProvider(ctx, res, endpoint) + if err != nil { + return fmt.Errorf("failed to create tracer provider: %w", err) } + otel.SetTracerProvider(tp) - // Configure tracer provider - tracerProviderOpts := []trace.TracerProviderOption{ - trace.WithResource(res), + mp, err := newMeterProvider(ctx, res, endpoint) + if err != nil { + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create meter provider: %w", err) } + otel.SetMeterProvider(mp) - if traceExporter != nil { - tracerProviderOpts = append(tracerProviderOpts, - trace.WithBatcher(traceExporter, - trace.WithBatchTimeout(5*time.Second), - trace.WithMaxExportBatchSize(512), - ), - ) + lp, err := newLoggerProvider(ctx, res, endpoint) + if err != nil { + _ = mp.Shutdown(context.Background()) + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create logger provider: %w", err) } + global.SetLoggerProvider(lp) - tp := trace.NewTracerProvider(tracerProviderOpts...) - otel.SetTracerProvider(tp) - - // Propagator must be set so otelhttp injects W3C traceparent on - // outbound requests and extracts it from incoming ones. 
Without this - // the SDK records spans locally but they never chain across services. + // Set the global text-map propagator unconditionally so otelhttp + // (and any other propagation-aware instrumentation) injects W3C + // `traceparent` / `tracestate` / `baggage` on outbound requests + // and extracts them on incoming ones. The propagator is a global + // no-op until set; without this the SDK records spans locally + // but they never chain across processes — `gen_ai.conversation.id` + // baggage and the MCP `_meta` / sandbox env-var injectors are + // dormant until this runs. otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) + // Single source of truth for "is OTel enabled?" — flip the + // httpclient gate now so outbound requests start emitting CLIENT + // spans and injecting traceparent. Previously the gate read + // OTEL_EXPORTER_OTLP_ENDPOINT directly, which diverged from the + // `--otel` CLI gate that controls this function: we'd either + // initialise providers without HTTP wrapping, or wrap HTTP without + // having a usable propagator. + httpclient.SetOTelEnabled(true) + go func() { <-ctx.Done() - shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - _ = tp.Shutdown(shutdownCtx) + // Flush in dependency order: logs and metrics first (they may + // reference active spans), then traces. Each provider gets its + // own 5s budget so a slow exporter can't starve the others — + // sharing a single timeout meant a stuck logs endpoint silently + // dropped buffered metrics and spans. + shutdown := func(fn func(context.Context) error) { + c, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = fn(c) + } + shutdown(lp.Shutdown) + shutdown(mp.Shutdown) + shutdown(tp.Shutdown) }() return nil } +// newTracerProvider builds the SDK tracer provider with an OTLP/HTTP +// exporter when an endpoint is set. 
+func newTracerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*trace.TracerProvider, error) { + opts := []trace.TracerProviderOption{trace.WithResource(res)} + + if endpoint == "" { + return trace.NewTracerProvider(opts...), nil + } + + exp, err := otlptracehttp.New(ctx, traceExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create trace exporter: %w", err) + } + opts = append(opts, trace.WithBatcher(exp, + trace.WithBatchTimeout(5*time.Second), + trace.WithMaxExportBatchSize(512), + )) + return trace.NewTracerProvider(opts...), nil +} + +// newMeterProvider builds the SDK meter provider. Without an endpoint the +// provider still wires up so meters callers create are valid no-ops; with +// an endpoint, a periodic reader exports via OTLP/HTTP. +func newMeterProvider(ctx context.Context, res *resource.Resource, endpoint string) (*metric.MeterProvider, error) { + opts := []metric.Option{metric.WithResource(res)} + + if endpoint != "" { + exp, err := otlpmetrichttp.New(ctx, metricExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create metric exporter: %w", err) + } + opts = append(opts, metric.WithReader(metric.NewPeriodicReader(exp, + metric.WithInterval(60*time.Second), + ))) + } + + return metric.NewMeterProvider(opts...), nil +} + +// newLoggerProvider builds the SDK logger provider. Required for the +// gen_ai.client.operation.exception event (a log record per spec) and for +// any future log-bridge instrumentation. +func newLoggerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*log.LoggerProvider, error) { + opts := []log.LoggerProviderOption{log.WithResource(res)} + + if endpoint != "" { + exp, err := otlploghttp.New(ctx, logExporterOptions(endpoint)...) 
+ if err != nil { + return nil, fmt.Errorf("failed to create log exporter: %w", err) + } + opts = append(opts, log.WithProcessor(log.NewBatchProcessor(exp))) + } + + return log.NewLoggerProvider(opts...), nil +} + +// normalizeOTLPEndpoint turns a possibly-bare `host:port` into a fully +// scheme-qualified URL so all three OTLP/HTTP exporters can be wired via +// `WithEndpointURL` consistently. We can't rely on the SDKs' default +// scheme inference: `otlptracehttp` (older API) treats a bare endpoint +// as TLS-by-default while `otlploghttp` (newer API) treats the same +// bare endpoint as insecure-by-default. With `OTEL_EXPORTER_OTLP_CERTIFICATE` +// set in the env, the log exporter then errors out with +// `insecure HTTP endpoint cannot use TLS client configuration`, +// `initOTelSDK` propagates the failure, and the entire telemetry +// pipeline (including traces) is torn down. +// +// Pinning the scheme up front removes that asymmetry: localhost gets +// `http://`, every other host gets `https://`, and any explicit scheme +// the caller already supplied is honoured verbatim. 
+func normalizeOTLPEndpoint(endpoint string) string { + if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { + return endpoint + } + if isLocalhostEndpoint(endpoint) { + return "http://" + endpoint + } + return "https://" + endpoint +} + +func traceExporterOptions(endpoint string) []otlptracehttp.Option { + return []otlptracehttp.Option{otlptracehttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func metricExporterOptions(endpoint string) []otlpmetrichttp.Option { + return []otlpmetrichttp.Option{otlpmetrichttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func logExporterOptions(endpoint string) []otlploghttp.Option { + return []otlploghttp.Option{otlploghttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func shutdownTracerProvider(tp *trace.TracerProvider) error { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return tp.Shutdown(shutdownCtx) +} + func newOTelResource() (*resource.Resource, error) { + // Standard OTel resource attributes; users can layer additional + // labels via the spec-defined `OTEL_RESOURCE_ATTRIBUTES` env var, + // which `resource.Default` merges in. 
+ attrs := []attribute.KeyValue{ + semconv.ServiceName(AppName), + semconv.ServiceVersion(version.Version), + semconv.ServiceInstanceID(uuid.NewString()), + semconv.ProcessPID(os.Getpid()), + semconv.ProcessRuntimeName("go"), + semconv.OSTypeKey.String(runtime.GOOS), + semconv.HostArchKey.String(runtime.GOARCH), + } + if hostname, err := os.Hostname(); err == nil && hostname != "" { + attrs = append(attrs, semconv.HostName(hostname)) + } return resource.Merge( resource.Default(), - resource.NewWithAttributes( - semconv.SchemaURL, - semconv.ServiceName(AppName), - semconv.ServiceVersion("dev"), // TODO: use actual version - ), + resource.NewWithAttributes(semconv.SchemaURL, attrs...), ) } diff --git a/cmd/root/otel_test.go b/cmd/root/otel_test.go index 042973a9e..961383e45 100644 --- a/cmd/root/otel_test.go +++ b/cmd/root/otel_test.go @@ -16,6 +16,63 @@ func TestNewOTelResourceUsesCurrentSchemaURL(t *testing.T) { assert.Equal(t, semconv.SchemaURL, res.SchemaURL()) } +// TestProvidersWithoutEndpoint verifies all three providers build cleanly +// when no OTLP endpoint is configured — they're no-op exporters but must +// still produce valid, non-nil providers so callers can create instruments. +func TestProvidersWithoutEndpoint(t *testing.T) { + t.Parallel() + + ctx := t.Context() + res, err := newOTelResource() + require.NoError(t, err) + + tp, err := newTracerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, tp) + assert.NotNil(t, tp.Tracer("test")) + + mp, err := newMeterProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, mp) + assert.NotNil(t, mp.Meter("test")) + + lp, err := newLoggerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, lp) + assert.NotNil(t, lp.Logger("test")) +} + +// TestNormalizeOTLPEndpoint pins the bare-endpoint -> URL mapping the +// three OTLP/HTTP exporters share. 
Without this normalization the log +// exporter (insecure-by-default for bare hosts) conflicted with +// OTEL_EXPORTER_OTLP_CERTIFICATE and tore down the whole telemetry +// pipeline; the trace exporter (TLS-by-default for bare hosts) hid +// the inconsistency. +func TestNormalizeOTLPEndpoint(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + endpoint string + want string + }{ + {"bare remote host:port -> https", "alloy.observability.svc.cluster.local:4318", "https://alloy.observability.svc.cluster.local:4318"}, + {"bare remote host -> https", "example.com", "https://example.com"}, + {"bare localhost host:port -> http", "localhost:4318", "http://localhost:4318"}, + {"bare localhost -> http", "localhost", "http://localhost"}, + {"bare ipv4 loopback -> http", "127.0.0.1:4318", "http://127.0.0.1:4318"}, + {"bare ipv6 loopback -> http", "[::1]:4318", "http://[::1]:4318"}, + {"explicit https preserved", "https://example.com:4318", "https://example.com:4318"}, + {"explicit http preserved", "http://localhost:4318", "http://localhost:4318"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, normalizeOTLPEndpoint(tt.endpoint)) + }) + } +} + func TestIsLocalhostEndpoint(t *testing.T) { t.Parallel() diff --git a/cmd/root/sandbox.go b/cmd/root/sandbox.go index 8a506138a..c163ed05e 100644 --- a/cmd/root/sandbox.go +++ b/cmd/root/sandbox.go @@ -18,6 +18,7 @@ import ( "github.com/docker/docker-agent/pkg/environment" "github.com/docker/docker-agent/pkg/paths" "github.com/docker/docker-agent/pkg/sandbox" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // runInSandbox delegates the current command to a Docker sandbox. 
@@ -68,15 +69,30 @@ func runInSandbox(ctx context.Context, cmd *cobra.Command, args []string, runCon envFlags = append(envFlags, "-e", envModelsGateway+"="+gateway) } + // Wrap the sandbox exec in a span so the host side captures timing + // and exit code, and inject W3C trace context via env vars so the + // agent process spawned inside the sandbox container chains its + // own spans onto this parent. + ctx, sbxSpan := genai.StartSandboxExec(ctx, genai.SandboxOptions{ + Runtime: "docker", + Container: name, + }) + defer sbxSpan.End() + envFlags = append(envFlags, genai.InjectSandboxEnv(ctx)...) + dockerCmd := backend.BuildExecCmd(ctx, name, wd, dockerAgentArgs, envFlags, envVars) slog.Debug("Executing in sandbox", "name", name, "args", dockerCmd.Args) if err := dockerCmd.Run(); err != nil { if exitErr, ok := errors.AsType[*exec.ExitError](err); ok { + sbxSpan.SetExitCode(exitErr.ExitCode()) + sbxSpan.RecordError(err, "") return cli.StatusError{StatusCode: exitErr.ExitCode()} } + sbxSpan.RecordError(err, "") return fmt.Errorf("docker sandbox exec failed: %w", err) } + sbxSpan.SetExitCode(0) return nil } diff --git a/go.mod b/go.mod index 30257027d..b07ef09d8 100644 --- a/go.mod +++ b/go.mod @@ -61,8 +61,11 @@ require ( github.com/yuin/goldmark v1.8.2 github.com/zclconf/go-cty v1.13.0 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/log v0.19.0 go.opentelemetry.io/otel/trace v1.43.0 golang.org/x/image v0.39.0 golang.org/x/oauth2 v0.36.0 @@ -235,9 +238,9 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect 
- go.opentelemetry.io/otel/log v0.16.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/log v0.19.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.yaml.in/yaml/v4 v4.0.0-rc.4 golang.org/x/crypto v0.50.0 // indirect diff --git a/go.sum b/go.sum index 24965d57e..45e90081d 100644 --- a/go.sum +++ b/go.sum @@ -545,22 +545,28 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 h1:HIBTQ3VO5aupLKjC90JgMqpezVXwFuq6Ryjn0/izoag= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0/go.mod h1:ji9vId85hMxqfvICA0Jt8JqEdrXaAkcpkI9HPXya0ro= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 h1:NOyNnS19BF2SUDApbOKbDtWZ0IK7b8FJ2uAGdIWOGb0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0/go.mod h1:VL6EgVikRLcJa9ftukrHu/ZkkhFBSo1lzvdBC9CF1ss= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= -go.opentelemetry.io/otel/log v0.16.0 h1:DeuBPqCi6pQwtCK0pO4fvMB5eBq6sNxEnuTs88pjsN4= -go.opentelemetry.io/otel/log v0.16.0/go.mod h1:rWsmqNVTLIA8UnwYVOItjyEZDbKIkMxdQunsIhpUMes= +go.opentelemetry.io/otel/log v0.19.0 h1:KUZs/GOsw79TBBMfDWsXS+KZ4g2Ckzksd1ymzsIEbo4= +go.opentelemetry.io/otel/log v0.19.0/go.mod h1:5DQYeGmxVIr4n0/BcJvF4upsraHjg6vudJJpnkL6Ipk= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= -go.opentelemetry.io/otel/sdk/log v0.16.0 h1:e/b4bdlQwC5fnGtG3dlXUrNOnP7c8YLVSpSfEBIkTnI= -go.opentelemetry.io/otel/sdk/log v0.16.0/go.mod h1:JKfP3T6ycy7QEuv3Hj8oKDy7KItrEkus8XJE6EoSzw4= +go.opentelemetry.io/otel/sdk/log v0.19.0 h1:scYVLqT22D2gqXItnWiocLUKGH9yvkkeql5dBDiXyko= +go.opentelemetry.io/otel/sdk/log v0.19.0/go.mod h1:vFBowwXGLlW9AvpuF7bMgnNI95LiW10szrOdvzBHlAg= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0 h1:BEbF7ZBB6qQloV/Ub1+3NQoOUnVtcGkU3XX4Ws3GQfk= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0/go.mod h1:Lua81/3yM0wOmoHTokLj9y9ADeA02v1naRrVrkAZuKk= go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= diff --git a/pkg/httpclient/client.go 
b/pkg/httpclient/client.go index f968ec984..7e34927d4 100644 --- a/pkg/httpclient/client.go +++ b/pkg/httpclient/client.go @@ -6,8 +6,8 @@ import ( "maps" "net/http" "net/url" - "os" "runtime" + "sync/atomic" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -40,13 +40,66 @@ func NewHTTPClient(ctx context.Context, opts ...Opt) *http.Client { rt := newTransport(ctx) return &http.Client{ - Transport: &userAgentTransport{ + Transport: WrapWithOTel(&userAgentTransport{ httpOptions: httpOptions, rt: rt, - }, + }), } } +// otelEnabled tracks whether the OTel SDK has been initialised in this +// process. `cmd/root/otel.go:initOTelSDK` calls `SetOTelEnabled(true)` +// on success; nothing else flips this flag. Gating on a single source +// of truth (rather than re-reading `OTEL_EXPORTER_OTLP_ENDPOINT`) +// avoids the previous mismatch where the SDK could be initialised +// without the HTTP wrap, or the HTTP wrap could fire without the SDK +// initialising the propagator. +var otelEnabled atomic.Bool + +// SetOTelEnabled toggles the gate consulted by WrapWithOTel. Called by +// `initOTelSDK` after providers and the propagator are wired so HTTP +// clients start injecting `traceparent` only once the rest of the SDK +// can actually use the resulting spans. +func SetOTelEnabled(enabled bool) { + otelEnabled.Store(enabled) +} + +// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry has +// been enabled via `SetOTelEnabled` (called by `initOTelSDK`), or rt +// unchanged otherwise. Gating avoids per-request span allocation on +// the no-OTel path and stops sending a `traceparent` header to +// upstream LLM providers that have no use for it. Exposed so callers +// that build their own transports outside of `NewHTTPClient` can opt +// into the same gating without duplicating the check. 
+func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { + if !otelEnabled.Load() { + return rt + } + return otelhttp.NewTransport(rt) +} + +// TracedDefaultClient returns an `http.Client` equivalent to +// `http.DefaultClient` but with the default transport wrapped via +// `WrapWithOTel`. Use as a drop-in replacement at call sites that +// previously did `http.DefaultClient.Do(req)` so OAuth metadata fetches, +// fetch-tool requests, registry probes, and similar one-off HTTP calls +// chain into the active trace. +func TracedDefaultClient() *http.Client { + return &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} +} + +// TracedClient returns a configurable `http.Client` with the default +// transport already wrapped via `WrapWithOTel`. The supplied options +// (timeout, redirect policy, jar, etc.) are applied after construction. +// Convenience wrapper for short-lived clients with custom timeouts. +func TracedClient(opts ...func(*http.Client)) *http.Client { + c := &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} + for _, opt := range opts { + opt(c) + } + return c +} + func WithHeader(key, value string) Opt { return func(o *HTTPOptions) { o.Header.Set(key, value) @@ -100,15 +153,7 @@ func WithQuery(query url.Values) Opt { } } -// newTransport returns an HTTP transport with automatic gzip compression -// disabled and using Docker Desktop proxy if available. -// -// When OpenTelemetry is enabled (i.e. OTEL_EXPORTER_OTLP_ENDPOINT is set, -// matching the gating in initOTelSDK), the transport is wrapped with -// otelhttp so each outbound request emits a CLIENT span and the W3C -// traceparent header is injected. When OTel is disabled, the bare -// transport is returned so we don't allocate per-request spans nor send -// a traceparent header to upstream LLM providers. +// newTransport returns an HTTP transport with automatic gzip compression disabled and using Docker Desktop proxy if available. 
func newTransport(ctx context.Context) http.RoundTripper { // Get the base transport with Desktop proxy support from remote package rt := remote.NewTransport(ctx) @@ -122,19 +167,7 @@ func newTransport(ctx context.Context) http.RoundTripper { t.DisableCompression() } - return WrapWithOTel(rt) -} - -// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry is -// enabled (OTEL_EXPORTER_OTLP_ENDPOINT set, matching the gating in -// cmd/root/otel.go), or rt unchanged otherwise. Exposed so callers that -// build their own transports outside of NewHTTPClient can opt into the -// same env-gated instrumentation without duplicating the gating logic. -func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { - if os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") == "" { - return rt - } - return otelhttp.NewTransport(rt) + return rt } type userAgentTransport struct { From 18591df970aa28278a9684bbdef8f58748ee397d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:00 +0300 Subject: [PATCH 03/17] feat(otel): instrument provider chat, embed, rerank with semconv spans and metrics - `pkg/model/provider/instrument.go`: decorator that wraps any `Provider` with a `chat {model}` CLIENT span (per OTel GenAI semconv), opt-in capture of `gen_ai.input.messages` / `gen_ai.output.messages` / `gen_ai.tool.definitions`, request/response attributes including the Anthropic spec-sum input-token computation (input + cache_read + cache_creation), `gen_ai.client.token.usage` histogram, and `gen_ai.client.operation.duration` histogram. 
Six wrapper variants preserve the EmbeddingProvider / RerankingProvider capability surfaces so RAG fallbacks round-trip correctly - `pkg/model/provider/factory.go`, `factory_test.go`: route construction through the decorator - `pkg/model/provider/anthropic/client.go`, `files.go`: add `anthropic.tokens.count` and `anthropic.files.get_or_upload` spans for the overflow-retry token-counting path and the file-upload cache-or-create path; drop the unnecessary `string(model)` cast --- pkg/model/provider/anthropic/client.go | 30 ++- pkg/model/provider/anthropic/files.go | 24 +- pkg/model/provider/factory.go | 11 +- pkg/model/provider/factory_test.go | 5 +- pkg/model/provider/instrument.go | 309 +++++++++++++++++++++++++ 5 files changed, 374 insertions(+), 5 deletions(-) create mode 100644 pkg/model/provider/instrument.go diff --git a/pkg/model/provider/anthropic/client.go b/pkg/model/provider/anthropic/client.go index 115274458..db82bdbe8 100644 --- a/pkg/model/provider/anthropic/client.go +++ b/pkg/model/provider/anthropic/client.go @@ -14,6 +14,10 @@ import ( "github.com/anthropics/anthropic-sdk-go/option" "github.com/anthropics/anthropic-sdk-go/packages/param" "github.com/anthropics/anthropic-sdk-go/packages/ssestream" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" @@ -22,6 +26,7 @@ import ( "github.com/docker/docker-agent/pkg/model/provider/base" "github.com/docker/docker-agent/pkg/model/provider/options" "github.com/docker/docker-agent/pkg/model/provider/providerutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -696,7 +701,30 @@ func countAnthropicTokens( messages []anthropic.MessageParam, system []anthropic.TextBlockParam, anthropicTools []anthropic.ToolUnionParam, -) (int64, error) { +) (count int64, err error) { + // Token 
counting is a blocking API call to Anthropic that fires + // on the context-overflow retry path. Span it so the latency is + // attributable when the retry stalls. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.tokens.count", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes( + attribute.String(genai.AttrProviderName, genai.ProviderAnthropic), + attribute.String(genai.AttrRequestModel, model), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if count > 0 { + span.SetAttributes(attribute.Int64("cagent.anthropic.tokens.counted", count)) + } + span.End() + }() + params := anthropic.MessageCountTokensParams{ Model: model, Messages: messages, diff --git a/pkg/model/provider/anthropic/files.go b/pkg/model/provider/anthropic/files.go index 98417abd4..015f102d2 100644 --- a/pkg/model/provider/anthropic/files.go +++ b/pkg/model/provider/anthropic/files.go @@ -15,6 +15,10 @@ import ( "time" "github.com/anthropics/anthropic-sdk-go" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" ) @@ -78,7 +82,25 @@ func NewFileManager(clientFn func(context.Context) (anthropic.Client, error)) *F // Files are deduplicated by content hash AND MIME type, so identical files with // different extensions will be uploaded separately. // Concurrent calls for the same file will wait for a single upload to complete. -func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (*UploadedFile, error) { +func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (result *UploadedFile, err error) { + // Span the whole upload — large files take seconds to minutes + // over slow links and previously the latency was completely + // dark. 
cache_hit=true paths are short-lived siblings; the + // network upload path is the long branch. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.files.get_or_upload", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attribute.String("cagent.file.path", filePath)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + absPath, err := filepath.Abs(filePath) if err != nil { return nil, fmt.Errorf("failed to get absolute path: %w", err) diff --git a/pkg/model/provider/factory.go b/pkg/model/provider/factory.go index 5ca4fdd8a..22c78288d 100644 --- a/pkg/model/provider/factory.go +++ b/pkg/model/provider/factory.go @@ -71,7 +71,16 @@ func createDirectProvider(ctx context.Context, cfg *latest.ModelConfig, env envi slog.Error("Unknown provider type", "type", providerType) return nil, fmt.Errorf("unknown provider type: %s", providerType) } - return factory(ctx, enhancedCfg, env, opts...) + p, err := factory(ctx, enhancedCfg, env, opts...) + if err != nil { + return nil, err + } + // Wrap leaf providers with the GenAI semconv tracer so every chat + // completion emits a `chat {model}` CLIENT span and the standard + // gen_ai.client.* metrics. The rule-based router constructed by + // createRuleBasedRouter is left bare — its routed targets go through + // resolveRoutedModel → createDirectProvider and end up wrapped here. + return instrumentProvider(p), nil } // providerFactory builds a Provider from a fully-resolved ModelConfig. 
diff --git a/pkg/model/provider/factory_test.go b/pkg/model/provider/factory_test.go index 3f849f786..339b86323 100644 --- a/pkg/model/provider/factory_test.go +++ b/pkg/model/provider/factory_test.go @@ -108,8 +108,9 @@ func TestCreateDirectProvider_DispatchByType(t *testing.T) { t.Run(tt.name, func(t *testing.T) { p, err := createDirectProvider(t.Context(), tt.cfg, environment.NewNoEnvProvider()) require.NoError(t, err) - fp, ok := p.(*fakeProvider) - require.True(t, ok, "expected fakeProvider, got %T", p) + leaf := unwrapProvider(p) + fp, ok := leaf.(*fakeProvider) + require.True(t, ok, "expected fakeProvider, got %T", leaf) assert.Equal(t, tt.expectID, fp.id) }) } diff --git a/pkg/model/provider/instrument.go b/pkg/model/provider/instrument.go new file mode 100644 index 000000000..92c44e42b --- /dev/null +++ b/pkg/model/provider/instrument.go @@ -0,0 +1,309 @@ +package provider + +import ( + "context" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/model/provider/base" + "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" + "github.com/docker/docker-agent/pkg/tools" +) + +// unwrapProvider returns the leaf provider underneath any number of +// instrumentation wrappers. Used by tests and by code paths that need to +// reach back to the concrete implementation (e.g. capability assertions +// that the wrappers do not transparently forward). +func unwrapProvider(p Provider) Provider { + for { + u, ok := p.(interface{ Unwrap() Provider }) + if !ok { + return p + } + p = u.Unwrap() + } +} + +// instrumentProvider wraps the leaf provider so every chat completion is +// surrounded by a GenAI semconv-compliant span and the matching client +// metrics. 
The wrapper is added once at the createDirectProvider boundary +// — the rule-based router (createRuleBasedRouter) is left bare because it +// dispatches to providers that are themselves already wrapped, so a +// single chat span is emitted per call regardless of routing depth. +// +// To avoid changing the apparent capability of the inner provider, the +// wrapper that is returned satisfies exactly the same set of interfaces +// that the inner provider satisfies — chat-only, chat+rerank, +// chat+embed+rerank, etc. RAG callers do `p.(EmbeddingProvider)` and rely +// on `ok=false` to fall back to sequential processing; if the wrapper +// always implemented EmbeddingProvider that fallback would silently +// disappear. +func instrumentProvider(p Provider) Provider { + if p == nil { + return nil + } + + tc := &tracedChat{inner: p} + + bep, isBatchEmbed := p.(BatchEmbeddingProvider) + ep, isEmbed := p.(EmbeddingProvider) + rp, isRerank := p.(RerankingProvider) + + switch { + case isBatchEmbed && isRerank: + return &tracedBatchEmbedRerank{tracedChat: tc, batchEmbed: bep, rerank: rp} + case isBatchEmbed: + return &tracedBatchEmbed{tracedChat: tc, batchEmbed: bep} + case isEmbed && isRerank: + return &tracedEmbedRerank{tracedChat: tc, embed: ep, rerank: rp} + case isEmbed: + return &tracedEmbed{tracedChat: tc, embed: ep} + case isRerank: + return &tracedRerank{tracedChat: tc, rerank: rp} + default: + return tc + } +} + +// tracedChat is the base wrapper. It satisfies just Provider and is +// embedded by every richer wrapper. CreateChatCompletionStream is the +// only method that adds behaviour — everything else delegates. +type tracedChat struct { + inner Provider +} + +func (t *tracedChat) ID() string { return t.inner.ID() } +func (t *tracedChat) BaseConfig() base.Config { return t.inner.BaseConfig() } + +// Unwrap returns the wrapped provider. Tests and any other caller that +// needs the leaf type (e.g. 
for type assertions on internal helper +// methods) can use the standard unwrap pattern: +// +// if u, ok := p.(interface{ Unwrap() Provider }); ok { p = u.Unwrap() } +func (t *tracedChat) Unwrap() Provider { return t.inner } + +func (t *tracedChat) CreateChatCompletionStream(ctx context.Context, messages []chat.Message, requestTools []tools.Tool) (chat.MessageStream, error) { + cfg := t.inner.BaseConfig() + req := genai.ChatRequest{ + Provider: genai.ProviderNameForConfig(cfg.ModelConfig.Provider), + Model: cfg.ModelConfig.Model, + Stream: true, + } + // Populate sampling parameters from the resolved model config so the + // `gen_ai.request.max_tokens` / `temperature` / `top_p` / `top_k` + // attributes the GenAI semconv conditionally requires actually land + // on the span. Without this, the helper's gated emission paths were + // unreachable. Pointer fields distinguish "explicitly set" from + // "unset"; the matching Has* flags carry that signal through. + if mc := cfg.ModelConfig.MaxTokens; mc != nil { + req.MaxTokens = int(*mc) + } + if t := cfg.ModelConfig.Temperature; t != nil { + req.Temperature = *t + req.HasTemperature = true + } + if tp := cfg.ModelConfig.TopP; tp != nil { + req.TopP = *tp + req.HasTopP = true + } + chatCtx, span := genai.StartChat(ctx, req) + + // Opt-in capture of request content. Helpers internally check the + // `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` env var and + // no-op when unset, so the cost on the default path is the + // function-call overhead and nothing else. 
+ genai.SetInputMessages(span, messages) + genai.SetToolDefinitions(span, requestTools) + + stream, err := t.inner.CreateChatCompletionStream(chatCtx, messages, requestTools) + if err != nil { + span.RecordError(err, genai.ClassifyError(err)) + span.End() + return nil, err + } + return genai.WrapStream(span, stream), nil +} + +// embeddingRequestForConfig builds an EmbeddingRequest from the inner +// provider's BaseConfig — same shape as the chat path so the spec +// `gen_ai.provider.name` / `gen_ai.request.model` attributes use the +// canonical names. +func (t *tracedChat) embeddingRequestForConfig(batchSize int) genai.EmbeddingRequest { + cfg := t.inner.BaseConfig() + return genai.EmbeddingRequest{ + Provider: genai.ProviderNameForConfig(cfg.ModelConfig.Provider), + Model: cfg.ModelConfig.Model, + BatchSize: batchSize, + } +} + +// rerankSpan opens a `cagent.rerank` span. There is no spec-defined +// rerank span yet; the operation is closely related to retrieval but +// distinct enough to warrant its own name. Custom attributes use the +// `cagent.*` namespace. +func (t *tracedChat) rerankSpan(ctx context.Context, docCount int) (context.Context, trace.Span) { + cfg := t.inner.BaseConfig() + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider") + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrProviderName, genai.ProviderNameForConfig(cfg.ModelConfig.Provider)), + attribute.String(genai.AttrRequestModel, cfg.ModelConfig.Model), + attribute.Int("cagent.rerank.document_count", docCount), + } + // Carry `gen_ai.conversation.id` from baggage like every other + // span helper in the branch. The chat / embedding / retrieval / + // fallback / sandbox / MCP starters all do this; rerank was the + // odd one out, leaving rerank latency unattributable in + // per-conversation dashboards. 
+ if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "rerank", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) +} + +// wrapEmbedding wraps a single-input embedding call with a spec +// `embeddings {model}` span. Records token usage and dimension count on +// success; classifies errors on failure. +func wrapEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.EmbeddingResult, error)) (*base.EmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + span.SetDimensions(len(res.Embedding)) + } + return res, nil +} + +// wrapBatchEmbedding wraps a batch embedding call. Records the total +// input tokens across the batch and the per-vector dimensionality. +func wrapBatchEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.BatchEmbeddingResult, error)) (*base.BatchEmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + if len(res.Embeddings) > 0 { + span.SetDimensions(len(res.Embeddings[0])) + } + } + return res, nil +} + +// wrapRerank wraps a Rerank call with a `rerank` CLIENT span that +// captures document count and error classification. 
+func (t *tracedChat) wrapRerank(ctx context.Context, query string, documents []types.Document, criteria string, fn func(context.Context, string, []types.Document, string) ([]float64, error)) ([]float64, error) { + ctx, span := t.rerankSpan(ctx, len(documents)) + defer span.End() + scores, err := fn(ctx, query, documents, criteria) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + span.SetAttributes(attribute.String("error.type", genai.ClassifyError(err))) + return nil, err + } + return scores, nil +} + +// tracedRerank adds RerankingProvider while still satisfying just Provider +// at the chat layer. +type tracedRerank struct { + *tracedChat + + rerank RerankingProvider +} + +func (t *tracedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedEmbed satisfies EmbeddingProvider. +type tracedEmbed struct { + *tracedChat + + embed EmbeddingProvider +} + +func (t *tracedEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +// tracedEmbedRerank satisfies EmbeddingProvider and RerankingProvider. 
+type tracedEmbedRerank struct { + *tracedChat + + embed EmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedBatchEmbed satisfies BatchEmbeddingProvider (which embeds +// EmbeddingProvider). +type tracedBatchEmbed struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider +} + +func (t *tracedBatchEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbed) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +// tracedBatchEmbedRerank satisfies BatchEmbeddingProvider and +// RerankingProvider — the broadest combination, used by openai and dmr. 
+type tracedBatchEmbedRerank struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedBatchEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbedRerank) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +func (t *tracedBatchEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} From 6dafda3cab529af8ed503121ee5b4eb8360145ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:01 +0300 Subject: [PATCH 04/17] feat(otel): instrument runtime sessions, streams, fallback, delegation, skills, and background agents - `pkg/runtime/loop.go`: open `runtime.session` and `runtime.stream` INTERNAL spans seeded with `gen_ai.conversation.id` baggage at session start; mark the session span with `error.type=loop_detected` + `codes.Error` when the loop detector terminates - `pkg/runtime/fallback.go`, `pkg/runtime/cache.go`: wrap the fallback chain with a `runtime.fallback` span carrying primary/final model, attempts, outcome, cooldown state; record provider-cache hit/backing on the cache span - `pkg/runtime/agent_delegation.go`: emit `runtime.task_transfer` and `runtime.handoff` spans with `gen_ai.operation.name=invoke_agent` and `gen_ai.agent.name` - `pkg/runtime/skill_runner.go`: emit `invoke_workflow {skill}` per spec - `pkg/runtime/toolexec/dispatcher.go`: open 
`runtime.tool.call` and `runtime.tool.handler` spans with the GenAI execute_tool semconv, capture `gen_ai.tool.call.{arguments,result}` under the content-capture opt-in, and stamp `cagent.approval.{decision,source}` from `notifyApproval` so denied / canceled / read-only-allowed calls are distinguishable in trace dashboards - `pkg/runtime/compactor/compactor.go`: wrap compaction with a span that carries summary tokens and cost - `pkg/tools/builtin/agent/agent.go`: open a `background_agent.run` root span with a link back to the spawning context, and stamp `gen_ai.conversation.id` from baggage so the span participates in conversation-scoped queries - `pkg/tools/startable.go`, `pkg/toolinstall/registry.go`: wrap toolset Start with a `toolset.start` span so capability discovery latency is attributable --- pkg/runtime/agent_delegation.go | 54 +++++++++++++-- pkg/runtime/cache.go | 15 +++- pkg/runtime/compactor/compactor.go | 39 ++++++++++- pkg/runtime/fallback.go | 23 +++++- pkg/runtime/loop.go | 55 ++++++++++++--- pkg/runtime/skill_runner.go | 37 ++++++++-- pkg/runtime/toolexec/dispatcher.go | 108 +++++++++++++++++++++-------- pkg/toolinstall/registry.go | 4 +- pkg/tools/builtin/agent/agent.go | 64 ++++++++++++++++- pkg/tools/startable.go | 25 ++++++- 10 files changed, 371 insertions(+), 53 deletions(-) diff --git a/pkg/runtime/agent_delegation.go b/pkg/runtime/agent_delegation.go index 2f63f71e2..7906a0d63 100644 --- a/pkg/runtime/agent_delegation.go +++ b/pkg/runtime/agent_delegation.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/builtin" agenttool "github.com/docker/docker-agent/pkg/tools/builtin/agent" @@ -408,11 +409,34 @@ func (r *LocalRuntime) handleTaskTransfer(ctx context.Context, sess *session.Ses slog.Debug("Transferring task to agent", "from_agent", 
a.Name(), "to_agent", params.Agent, "task", params.Task) - ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes( - attribute.String("from.agent", a.Name()), - attribute.String("to.agent", params.Agent), - attribute.String("session.id", sess.ID), - )) + delegationAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name identifies the target agent of the invoke_agent + // operation per the OTel GenAI semconv (Required). cagent.agent.name + // is the same value but in our internal namespace; we emit both so + // spec-aware backends and existing cagent dashboards both see it. + attribute.String(genai.AttrAgentName, params.Agent), + attribute.String("cagent.delegation.from_agent", a.Name()), + attribute.String("cagent.delegation.to_agent", params.Agent), + attribute.String("cagent.delegation.kind", "transfer_task"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, params.Agent), + } + if params.Task != "" { + // Task length is bounded enough to be useful as a span + // attribute for debugging "agent X transferred which task + // to Y". The full task body lands on the sub-session's + // runtime.session span when content capture is opt-in. 
+ delegationAttrs = append(delegationAttrs, attribute.Int("cagent.delegation.task_length", len(params.Task))) + } + if genai.EmitLegacyAttributes() { + delegationAttrs = append(delegationAttrs, + attribute.String("from.agent", a.Name()), + attribute.String("to.agent", params.Agent), + attribute.String("session.id", sess.ID), + ) + } + ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes(delegationAttrs...)) defer span.End() return r.runForwarding(ctx, sess, evts, delegationRequest{ @@ -449,6 +473,26 @@ func (r *LocalRuntime) handleHandoff(ctx context.Context, sess *session.Session, return nil, err } + // Handoff is in-place agent swap (same session, different agent + // from the next turn). Span name keeps the runtime.* family; + // attributes mirror the transfer_task span shape so dashboards + // can union both delegation kinds. Take the returned ctx so + // `executeOnAgentSwitchHooks` and any of its children parent + // onto this span instead of bypassing it. + ctx, span := r.startSpan(ctx, "runtime.handoff", trace.WithAttributes( + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name — Required by OTel GenAI semconv on invoke_agent + // spans; identifies the agent being handed off to. See task_transfer + // for the rationale of dual-emitting alongside cagent.agent.name. + attribute.String(genai.AttrAgentName, next.Name()), + attribute.String("cagent.delegation.from_agent", ca), + attribute.String("cagent.delegation.to_agent", next.Name()), + attribute.String("cagent.delegation.kind", "handoff"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, next.Name()), + )) + defer span.End() + r.executeOnAgentSwitchHooks(ctx, currentAgent, sess.ID, ca, next.Name(), agentSwitchKindHandoff) r.setCurrentAgent(next.Name()) handoffMessage := "The agent " + ca + " handed off the conversation to you. 
" + diff --git a/pkg/runtime/cache.go b/pkg/runtime/cache.go index 3e5e5a307..7448e418b 100644 --- a/pkg/runtime/cache.go +++ b/pkg/runtime/cache.go @@ -10,6 +10,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // BuiltinCacheResponse is the name of the builtin stop hook that persists @@ -63,7 +64,10 @@ func (r *LocalRuntime) tryReplayCachedResponse( if question == "" { return false } + _, cacheSpan := genai.RecordCacheLookup(ctx, "") cached, ok := c.Lookup(question) + cacheSpan.SetHit(ok && cached != "") + cacheSpan.End() // Treat empty stored values as misses: cache_response only stores // non-empty responses, so an empty entry only surfaces if the JSON // file was hand-edited or downgraded from a future version. Replaying @@ -99,7 +103,7 @@ func (r *LocalRuntime) tryReplayCachedResponse( // (handled inside [cache.Cache.Store]), which makes the replay path — // where [LocalRuntime.tryReplayCachedResponse] fires stop hooks for the // cached answer — free of redundant disk writes. -func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { +func (r *LocalRuntime) cacheResponseBuiltin(ctx context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { if in == nil || in.AgentName == "" || in.LastUserMessage == "" || strings.TrimSpace(in.StopResponse) == "" { return nil, nil @@ -111,7 +115,16 @@ func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, return nil, nil } if c := a.Cache(); c != nil { + // Thread the active context so the cache.store span chains + // onto the surrounding stop-hook trace instead of starting a + // detached one. 
Mark the operation as a successful write so + // the `cagent.cache.requests{operation="store"}` counter is + // incremented — without SetHit the store path would never + // register on the metric. + _, storeSpan := genai.RecordCacheStore(ctx, "") c.Store(in.LastUserMessage, in.StopResponse) + storeSpan.SetHit(true) + storeSpan.End() } return nil, nil } diff --git a/pkg/runtime/compactor/compactor.go b/pkg/runtime/compactor/compactor.go index e4addce95..02a7a531a 100644 --- a/pkg/runtime/compactor/compactor.go +++ b/pkg/runtime/compactor/compactor.go @@ -24,6 +24,11 @@ import ( "fmt" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/compaction" @@ -104,7 +109,39 @@ type LLMArgs struct { // Returns (nil, nil) when the model returns an empty summary; callers // should treat that as "compaction was a no-op" and skip the apply // step. -func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) { +func RunLLM(ctx context.Context, args LLMArgs) (result *Result, err error) { + // One INTERNAL `compaction` span covers the LLM-driven summarization + // strategy end-to-end. The inner LLM call gets its own `chat {model}` + // CLIENT child span via the provider decorator, so this parent span + // is a useful aggregate boundary (context limit, summary tokens, + // outcome) without duplicating per-call timing data. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/runtime/compactor").Start( + ctx, + "compaction", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.Int64("cagent.compaction.context_limit", args.ContextLimit), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if result != nil { + // `Result.InputTokens` actually holds the compaction + // sub-session's *output* token count (the summary length) + // per the field's doc — name the span attribute by what the + // value is, not by what the source struct field is named. + span.SetAttributes( + attribute.Int("cagent.compaction.summary_output_tokens", int(result.InputTokens)), + attribute.Float64("cagent.compaction.cost", result.Cost), + attribute.Int("cagent.compaction.first_kept_entry", result.FirstKeptEntry), + ) + } + span.End() + }() + if args.RunAgent == nil { return nil, errors.New("compactor: RunAgent is required") } diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go index 8b0780aab..ee539e2a1 100644 --- a/pkg/runtime/fallback.go +++ b/pkg/runtime/fallback.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/modelerrors" "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -237,6 +238,14 @@ func (e *fallbackExecutor) execute( modelChain := buildModelChain(primaryModel, fallbackModels) startIndex := e.chainStartIndex(a, len(fallbackModels)) + // One runtime.fallback span wraps the whole chain. Each per-model + // CreateChatCompletionStream call below opens its own `chat {model}` + // CLIENT child span via the provider decorator, so the fallback span + // is a useful aggregate boundary (total attempts, final model, + // terminal outcome) without duplicating per-model timing data. 
+ ctx, fbSpan := genai.StartFallback(ctx, a.Name(), primaryModel.ID(), startIndex > 0) + defer fbSpan.End() + var lastErr error primaryFailedWithNonRetryable := false hasFallbacks := len(fallbackModels) > 0 @@ -252,14 +261,17 @@ func (e *fallbackExecutor) execute( for attempt := range maxAttempts { // Check context before each attempt if ctx.Err() != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } + fbSpan.IncrementAttempt() // Apply backoff before retry (not on first attempt of each model) if attempt > 0 { backoffDelay := backoff.Calculate(attempt - 1) logRetryBackoff(a.Name(), modelEntry.provider.ID(), attempt, backoffDelay) if !backoff.SleepWithContext(ctx, backoffDelay) { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } } @@ -294,6 +306,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -317,6 +330,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -326,6 +340,8 @@ func (e *fallbackExecutor) execute( } e.recordSuccess(a, modelEntry, primaryFailedWithNonRetryable) + fbSpan.SetFinalModel(modelEntry.provider.ID()) + fbSpan.SetOutcome(genai.FallbackOutcomeSuccess) return res, modelEntry.provider, nil } } @@ -339,12 +355,17 @@ func (e *fallbackExecutor) execute( prefix = "all models failed" } wrapped := fmt.Errorf("%s: %w", prefix, lastErr) + fbSpan.RecordError(wrapped, "") + 
fbSpan.SetOutcome(genai.FallbackOutcomeFailed) if modelerrors.IsContextOverflowError(lastErr) { return streamResult{}, nil, modelerrors.NewContextOverflowError(wrapped) } return streamResult{}, nil, wrapped } - return streamResult{}, nil, errors.New("model failed with unknown error") + unknownErr := errors.New("model failed with unknown error") + fbSpan.RecordError(unknownErr, "") + fbSpan.SetOutcome(genai.FallbackOutcomeFailed) + return streamResult{}, nil, unknownErr } // retryDecision is the outcome of handleModelError. diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index 7f531257a..f24e8b21b 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -20,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/runtime/toolexec" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/builtin" bgagent "github.com/docker/docker-agent/pkg/tools/builtin/agent" @@ -174,10 +175,32 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, ctx = httpclient.ContextWithSessionID(ctx, sess.ID) r.telemetry.RecordSessionStart(ctx, r.CurrentAgentName(), sess.ID) - ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes( - attribute.String("agent", r.CurrentAgentName()), - attribute.String("session.id", sess.ID), - )) + // Seed `gen_ai.conversation.id` into baggage at the session + // boundary. Every span the runtime, providers, MCP client, RAG, + // sandbox, evaluation, hooks, and (downstream) any subprocess + // or remote service create from here on will pick it up + // automatically without per-helper plumbing — and the value + // rides over W3C `baggage` so it crosses MCP / sandbox / + // HTTP boundaries too. + ctx = genai.WithConversationID(ctx, sess.ID) + + // runtime.session is the root span for one stream. 
gen_ai.* keys + // are emitted alongside the legacy `agent` / `session.id` keys + // so existing dashboards keep matching while spec-aware tooling + // can filter by `gen_ai.conversation.id` and + // `cagent.agent.name`. Legacy keys drop out under + // OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental. + sessionAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, r.CurrentAgentName()), + } + if genai.EmitLegacyAttributes() { + sessionAttrs = append(sessionAttrs, + attribute.String("agent", r.CurrentAgentName()), + attribute.String("session.id", sess.ID), + ) + } + ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes(sessionAttrs...)) defer sessionSpan.End() // Swap in this stream's events channel for elicitation and save the @@ -440,10 +463,17 @@ func (r *LocalRuntime) runTurn( toolModelOverride *string, events chan Event, ) (ctrl turnControl) { - streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes( - attribute.String("agent", a.Name()), - attribute.String("session.id", sess.ID), - )) + streamAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, a.Name()), + } + if genai.EmitLegacyAttributes() { + streamAttrs = append(streamAttrs, + attribute.String("agent", a.Name()), + attribute.String("session.id", sess.ID), + ) + } + streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes(streamAttrs...)) // streamSpan ends inline at the natural points (success path before // recordAssistantMessage, error path after handleStreamError) so its // duration tracks the model call only, not the whole iteration. The @@ -595,6 +625,15 @@ func (r *LocalRuntime) runTurn( "Agent terminated: detected %d consecutive identical calls to %s. 
"+ "This indicates a degenerate loop where the model is not making progress.", consecutive, toolName) + // Mark the session span as Error so loop-termination shows up + // in trace status / error-rate dashboards instead of blending + // in with normal completions. + sessionSpan.SetAttributes( + attribute.String("error.type", "loop_detected"), + attribute.String("cagent.session.terminated_by", "loop_detector"), + attribute.Int("cagent.loop.consecutive_calls", consecutive), + ) + sessionSpan.SetStatus(codes.Error, errMsg) events <- Error(errMsg) r.notifyError(ctx, a, sess.ID, errMsg) loopDetector.Reset() diff --git a/pkg/runtime/skill_runner.go b/pkg/runtime/skill_runner.go index 17bcc2679..331d6f248 100644 --- a/pkg/runtime/skill_runner.go +++ b/pkg/runtime/skill_runner.go @@ -10,6 +10,7 @@ import ( "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/builtin" ) @@ -49,11 +50,37 @@ func (r *LocalRuntime) handleRunSkill(ctx context.Context, sess *session.Session // Open the span before any pre-delegation work so model resolution // (inside WithAgentModel) is recorded under runtime.run_skill rather // than the parent session span. - ctx, span := r.startSpan(ctx, "runtime.run_skill", trace.WithAttributes( - attribute.String("agent", ca), - attribute.String("skill", prepared.SkillName), - attribute.String("session.id", sess.ID), - )) + // + // Skills are workflow-shaped (a coordinated process the agent + // orchestrates), so the GenAI semconv `invoke_workflow` operation + // applies. Emit it via gen_ai.* attrs alongside the legacy keys + // for back-compat. 
+ skillAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeWorkflow), + attribute.String(genai.AttrWorkflowName, prepared.SkillName), + attribute.String(genai.AttrAgentNameRuntime, ca), + attribute.String(genai.AttrConversationID, sess.ID), + } + if genai.EmitLegacyAttributes() { + skillAttrs = append(skillAttrs, + attribute.String("agent", ca), + attribute.String("skill", prepared.SkillName), + attribute.String("session.id", sess.ID), + ) + } + // Span name follows the GenAI agent semconv pattern + // `invoke_workflow {workflow.name}` so spec-aware backends + // classify the span as a workflow invocation. SpanKindInternal is + // passed explicitly per spec rather than relying on the SDK + // default — keeps intent clear and immune to default changes. + spanName := genai.OperationInvokeWorkflow + if prepared.SkillName != "" { + spanName = genai.OperationInvokeWorkflow + " " + prepared.SkillName + } + ctx, span := r.startSpan(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(skillAttrs...), + ) defer span.End() slog.Debug("Running skill as sub-agent", diff --git a/pkg/runtime/toolexec/dispatcher.go b/pkg/runtime/toolexec/dispatcher.go index 1d1636eb7..572a9f442 100644 --- a/pkg/runtime/toolexec/dispatcher.go +++ b/pkg/runtime/toolexec/dispatcher.go @@ -19,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/telemetry" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -30,19 +31,21 @@ const ( ApprovalDecisionDeny = "deny" ApprovalDecisionCanceled = "canceled" - ApprovalSourceYolo = "yolo" - ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" - ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" - ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" - ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" - 
ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" - ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" - ApprovalSourceReadOnlyHint = "readonly_hint" - ApprovalSourceUserApproved = "user_approved" - ApprovalSourceUserApprovedSession = "user_approved_session" - ApprovalSourceUserApprovedTool = "user_approved_tool" - ApprovalSourceUserRejected = "user_rejected" - ApprovalSourceContextCanceled = "context_canceled" + ApprovalSourceYolo = "yolo" + ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" + ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" + ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" + ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" + ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" + ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" + ApprovalSourcePermissionRequestHookDeny = "permission_request_hook_deny" + ApprovalSourcePermissionRequestHook = "permission_request_hook_allow" + ApprovalSourceReadOnlyHint = "readonly_hint" + ApprovalSourceUserApproved = "user_approved" + ApprovalSourceUserApprovedSession = "user_approved_session" + ApprovalSourceUserApprovedTool = "user_approved_tool" + ApprovalSourceUserRejected = "user_rejected" + ApprovalSourceContextCanceled = "context_canceled" ) // CallOutcome captures the verdicts of a single tool invocation as @@ -245,13 +248,25 @@ type call struct { // and approval bookkeeping lives here so the call lifecycle is visible // at a glance. func (c *call) run(ctx context.Context) CallOutcome { - ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("tool.type", string(c.tc.Type)), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + // gen_ai.* attributes are always emitted (spec-compliant). 
Legacy + // attribute names are added only when the OTel stability flag is + // at its default — `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental` + // drops the legacy keys. Tool type is "function" because every tool + // presented here is an LLM-callable function (transfer_task / + // handoff are runtime-managed but still appear as functions to the + // model). + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes(attrs...)) defer span.End() slog.Debug("Processing tool call", "agent", c.a.Name(), "tool", c.tc.Function.Name, "session_id", c.sess.ID) @@ -422,9 +437,17 @@ func (c *call) applyHookModifiedInput(result *hooks.Result) { } // notifyApproval forwards the resolved approval decision to the -// HookDispatcher, when one is configured. Centralised so the nil-guard -// stays in one place. +// HookDispatcher, when one is configured. Also stamps the decision + +// source on the active runtime.tool.call span so denied / canceled +// calls are visible in trace dashboards (without it, denied tool calls +// are indistinguishable from user-canceled ones at the span level). 
func (c *call) notifyApproval(ctx context.Context, decision, source string) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.approval.decision", decision), + attribute.String("cagent.approval.source", source), + ) + } if c.d.Hooks == nil { return } @@ -529,6 +552,12 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if !result.Allowed { slog.Debug("Tool denied by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.Message) + // Stamp the deny on the runtime.tool.call span via notifyApproval + // before returning. Without this the span would end with status + // Ok and no cagent.approval.* attrs — denied-by-hook calls would + // look identical to successful ones in trace dashboards, while + // pre_tool_use deny does emit the attrs. Symmetry matters. + c.notifyApproval(ctx, ApprovalDecisionDeny, ApprovalSourcePermissionRequestHookDeny) rejectMsg := "The tool call was rejected by a permission_request hook." if reason := strings.TrimSpace(result.Message); reason != "" { rejectMsg += " Reason: " + reason @@ -539,6 +568,7 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if result.PermissionAllowed { slog.Debug("Tool auto-approved by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.AdditionalContext) + c.notifyApproval(ctx, ApprovalDecisionAllow, ApprovalSourcePermissionRequestHook) return runTool(), true } @@ -618,14 +648,28 @@ func (c *call) runHandler(ctx context.Context, handler ToolHandler) { // translation, and session message persistence. It is the only place // where a tool actually runs. 
func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx context.Context) (*tools.ToolCallResult, time.Duration, error)) *tools.ToolCallResult { - ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes(attrs...)) defer span.End() + // gen_ai.tool.call.arguments capture is gated on the same opt-in as + // chat content (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`) + // because tool arguments commonly carry the same PII / secrets as the + // chat history that produced them (file paths, API tokens, prompts). + if genai.IsContentCaptureEnabled() && c.tc.Function.Arguments != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallArguments, c.tc.Function.Arguments)) + } + c.em.EmitToolCall(c.tc, c.tool, c.a.Name()) res, duration, err := exec(ctx) @@ -647,6 +691,14 @@ func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx contex // path through Dispatch's `exec.Has(event)` short-circuit. res.Output = c.applyToolResponseTransform(ctx, res.Output, false) + // gen_ai.tool.call.result captures the post-transform output so the + // span matches what the LLM actually saw on the next turn (any + // redact_secrets / scrubber rewrite is reflected). 
Same content-capture + // gating as arguments above. + if genai.IsContentCaptureEnabled() && res != nil && res.Output != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallResult, res.Output)) + } + c.em.EmitToolCallResponse(c.tc.ID, c.tool, res, res.Output, c.a.Name()) c.recordToolResponse(res) return res diff --git a/pkg/toolinstall/registry.go b/pkg/toolinstall/registry.go index 1c53189ef..5529483b6 100644 --- a/pkg/toolinstall/registry.go +++ b/pkg/toolinstall/registry.go @@ -14,6 +14,8 @@ import ( "github.com/goccy/go-yaml" "github.com/natefinch/atomic" + + "github.com/docker/docker-agent/pkg/httpclient" ) // githubToken returns a GitHub personal access token from the environment, @@ -115,7 +117,7 @@ var ( // NewRegistry creates a new Registry with default settings. func NewRegistry() *Registry { return &Registry{ - httpClient: http.DefaultClient, + httpClient: httpclient.TracedDefaultClient(), baseURL: registryBaseURL, cacheDir: RegistryDir(), } diff --git a/pkg/tools/builtin/agent/agent.go b/pkg/tools/builtin/agent/agent.go index d195695ea..ff2b3f07a 100644 --- a/pkg/tools/builtin/agent/agent.go +++ b/pkg/tools/builtin/agent/agent.go @@ -12,9 +12,14 @@ import ( "time" "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -295,6 +300,13 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // via HandleStop which calls cancel(). taskCtx, cancel := context.WithCancel(context.WithoutCancel(ctx)) + // Capture a link to the current trace so the background task's + // new root trace can be navigated back to the spawning agent in + // observability-svc. 
The parent span context comes from the + // active `runtime.tool.call` span; the link survives even after + // that span ends, while a child-span relationship would not. + parentSpanContext := trace.SpanContextFromContext(ctx) + t := &task{ id: taskID, agentName: params.Agent, @@ -308,9 +320,50 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall h.wg.Go(func() { defer cancel() + // Each background task starts its own trace (WithNewRoot) + // because it outlives the spawning request — making it a + // child would leave a span open after the parent ended. + // A span link preserves navigability from the spawning + // trace to the background task. + spanAttrs := []attribute.KeyValue{ + attribute.String("cagent.background_agent.task_id", taskID), + attribute.String("cagent.background_agent.agent", params.Agent), + } + // Stamp gen_ai.conversation.id directly: WithNewRoot resets the + // span context but baggage flows through context.WithoutCancel, + // so the id is reachable yet would not appear as a span attr + // without an explicit lift. + if convID := genai.ConversationIDFromContext(taskCtx); convID != "" { + spanAttrs = append(spanAttrs, attribute.String(genai.AttrConversationID, convID)) + } + startOpts := []trace.SpanStartOption{ + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithNewRoot(), + trace.WithAttributes(spanAttrs...), + } + if parentSpanContext.IsValid() { + startOpts = append(startOpts, trace.WithLinks(trace.Link{ + SpanContext: parentSpanContext, + Attributes: []attribute.KeyValue{ + attribute.String("cagent.link.kind", "spawned_from"), + }, + })) + } + // Static span name; the agent name lives in the + // `cagent.background_agent.agent` attribute. Putting the + // user-defined agent name into the span name itself would + // blow up Tempo's operation-name index when many agents are + // configured. 
+ tracedCtx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/builtin/agent").Start( + taskCtx, + "background_agent.run", + startOpts..., + ) + defer span.End() + slog.Debug("Starting background agent task", "task_id", taskID, "agent", params.Agent) - result := h.runner.RunAgent(taskCtx, RunParams{ + result := h.runner.RunAgent(tracedCtx, RunParams{ AgentName: params.Agent, Task: params.Task, ExpectedOutput: params.ExpectedOutput, @@ -321,12 +374,18 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall if result.ErrMsg != "" { t.errMsg = result.ErrMsg t.storeStatus(taskFailed) + span.SetStatus(codes.Error, result.ErrMsg) + span.SetAttributes( + attribute.String("error.type", "agent_error"), + attribute.String("cagent.background_agent.outcome", "failed"), + ) slog.Debug("Background agent task failed", "task_id", taskID, "agent", params.Agent, "error", result.ErrMsg) return } - if taskCtx.Err() != nil && t.loadStatus() == taskRunning { + if tracedCtx.Err() != nil && t.loadStatus() == taskRunning { t.storeStatus(taskStopped) + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "stopped")) slog.Debug("Background agent task stopped", "task_id", taskID) return } @@ -335,6 +394,7 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // always see the populated result field. 
t.result = result.Result if t.casStatus(taskRunning, taskCompleted) { + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "completed")) slog.Debug("Background agent task completed", "task_id", taskID, "agent", params.Agent) } }) diff --git a/pkg/tools/startable.go b/pkg/tools/startable.go index f550a4553..93994b1a3 100644 --- a/pkg/tools/startable.go +++ b/pkg/tools/startable.go @@ -4,6 +4,11 @@ import ( "context" "fmt" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) // Describer can be implemented by a ToolSet to provide a short, user-visible @@ -65,7 +70,7 @@ func (s *StartableToolSet) IsStarted() bool { // Concurrent callers block until the start attempt completes. // If start fails, a future call will retry. // If the underlying toolset doesn't implement Startable, this is a no-op. -func (s *StartableToolSet) Start(ctx context.Context) error { +func (s *StartableToolSet) Start(ctx context.Context) (err error) { s.mu.Lock() defer s.mu.Unlock() @@ -74,6 +79,24 @@ func (s *StartableToolSet) Start(ctx context.Context) error { } if startable, ok := As[Startable](s.ToolSet); ok { + // Span the toolset startup — MCP handshake, OAuth probes, + // tool discovery, etc. can take seconds to minutes and the + // "tools loading…" UI was previously unattributable. Only + // fires when the toolset has work to do; cheap toolsets + // without a Startable implementation skip the span entirely. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools").Start( + ctx, + "toolset.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("cagent.toolset.kind", fmt.Sprintf("%T", s.ToolSet))), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() if err := startable.Start(ctx); err != nil { // Queue a warning ONLY on the first failure of a streak so // repeated retries don't re-queue duplicate warnings. From 9a1341d613cbc883c39736e96b1768025cab64dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:01 +0300 Subject: [PATCH 05/17] feat(otel): annotate hook executor with span verdict and subprocess trace context - `pkg/hooks/executor.go`: open a single `hook.{event}` INTERNAL span per Dispatch covering every matched hook, then `annotateHookSpan` stamps the aggregated `Result` so denied / asked / allowed / modified-input / summary-provided cases are distinguishable. 
Verdict booleans and the structured decision/reason are unconditional; free-text `message` / `additional_context` / `system_message` / `summary` are gated on `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` - `pkg/hooks/handler.go`: append `genai.InjectTraceContextEnv(ctx)` to the hook subprocess env so script-driven hooks that emit OTel spans (or call instrumented CLIs / LLM endpoints) chain onto the parent `hook.{event}` span instead of producing orphaned roots --- pkg/hooks/executor.go | 82 ++++++++++++++++++++++++++++++++++++++++++- pkg/hooks/handler.go | 15 +++++++- 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/pkg/hooks/executor.go b/pkg/hooks/executor.go index e6b0e0b8f..fb85905e1 100644 --- a/pkg/hooks/executor.go +++ b/pkg/hooks/executor.go @@ -10,6 +10,13 @@ import ( "regexp" "strings" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Executor dispatches configured hooks. Hook types are resolved against @@ -134,6 +141,27 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) return &Result{Allowed: true}, nil } + // Single span per Dispatch call covers every hook the event matched. + // Custom name `hook.{event}` because there is no GenAI semconv for + // arbitrary user-defined lifecycle hooks; we surface the event type, + // matched hook count, and session/agent identifiers so dashboards can + // split by event class without parsing span events. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/hooks").Start( + ctx, + "hook."+string(event), + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String("cagent.hook.event", string(event)), + attribute.Int("cagent.hook.count", len(hooks)), + attribute.String("cagent.agent.name", input.AgentName), + attribute.String("gen_ai.conversation.id", input.SessionID), + ), + ) + if input.ToolName != "" { + span.SetAttributes(attribute.String("gen_ai.tool.name", input.ToolName)) + } + defer span.End() + input.HookEventName = event if input.Cwd == "" { input.Cwd = e.workingDir @@ -143,6 +171,8 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) inputJSON, err := input.ToJSON() if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) return nil, fmt.Errorf("failed to serialize hook input: %w", err) } @@ -153,7 +183,57 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) } wg.Wait() - return aggregate(results, event), nil + final := aggregate(results, event) + annotateHookSpan(span, event, final) + return final, nil +} + +// annotateHookSpan stamps the aggregated verdict onto the hook.{event} +// span so dashboards can answer "did the hook block this?" and "why?" +// without re-running the hook. Prior to this the span only carried the +// event type and hook count — a denied call looked identical to an +// allowed one. The verdict booleans and short reason are unconditional +// (they're decisions, not content); free-text fields that may contain +// PII or LLM output (Message, AdditionalContext, SystemMessage, +// Summary) are gated on the GenAI content-capture opt-in. 
+func annotateHookSpan(span trace.Span, event EventType, r *Result) { + if span == nil || r == nil { + return + } + attrs := []attribute.KeyValue{ + attribute.Bool("cagent.hook.allowed", r.Allowed), + attribute.Int("cagent.hook.exit_code", r.ExitCode), + } + if r.Decision != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision", string(r.Decision))) + } + if r.DecisionReason != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision_reason", r.DecisionReason)) + } + if event == EventPermissionRequest { + attrs = append(attrs, attribute.Bool("cagent.hook.permission_allowed", r.PermissionAllowed)) + } + if r.ModifiedInput != nil { + attrs = append(attrs, attribute.Bool("cagent.hook.modified_input", true)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.Bool("cagent.hook.summary_provided", true)) + } + if genai.IsContentCaptureEnabled() { + if r.Message != "" { + attrs = append(attrs, attribute.String("cagent.hook.message", r.Message)) + } + if r.AdditionalContext != "" { + attrs = append(attrs, attribute.String("cagent.hook.additional_context", r.AdditionalContext)) + } + if r.SystemMessage != "" { + attrs = append(attrs, attribute.String("cagent.hook.system_message", r.SystemMessage)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.String("cagent.hook.summary", r.Summary)) + } + } + span.SetAttributes(attrs...) } // hooksFor returns the deduplicated list of hooks that should run for diff --git a/pkg/hooks/handler.go b/pkg/hooks/handler.go index 2d5a2974a..ea8276157 100644 --- a/pkg/hooks/handler.go +++ b/pkg/hooks/handler.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/shellpath" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Handler executes a single hook invocation. 
It is built by a @@ -188,7 +189,19 @@ type commandHandler struct { func (h *commandHandler) Run(ctx context.Context, input []byte) (HandlerResult, error) { cmd := exec.CommandContext(ctx, h.shell, append(h.shellArgs, h.command)...) cmd.Dir = h.workingDir - cmd.Env = h.env + // Expand nil to os.Environ() so the child inherits the parent env + // (matching the pre-OTel cmd.Env=h.env=nil behaviour), and copy + // into a fresh backing array so concurrent hooks don't race on a + // shared slice when adding the trace-context vars. + base := h.env + if base == nil { + base = os.Environ() + } + traceEnv := genai.InjectTraceContextEnv(ctx) + envCopy := make([]string, 0, len(base)+len(traceEnv)) + envCopy = append(envCopy, base...) + envCopy = append(envCopy, traceEnv...) + cmd.Env = envCopy cmd.Stdin = bytes.NewReader(input) var stdout, stderr bytes.Buffer From e6f56f007fd9bc2a408be6b31707d2081802aed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:02 +0300 Subject: [PATCH 06/17] feat(otel): instrument MCP client, server, and OAuth flows - `pkg/mcp/server.go`: route the MCP HTTP transport through `otelhttp.NewHandler` and `otelmcp.StartServer` so inbound requests carry `traceparent` / `baggage` and emit a SERVER span per call - `pkg/tools/mcp/session_client.go`: wrap MCP client calls (`tools/list`, `tools/call`, `prompts/list`) with CLIENT spans using the params._meta propagation carrier. 
Iterator wrappers open the span inside the iterator closure (not at call time) so unused iterators do not leak spans, and end on every exit path including early `yield` returns - `pkg/tools/mcp/oauth.go`, `oauth_helpers.go`, `oauth_login.go`, `oauth_server.go`: wrap interactive OAuth flow and token refresh with `oauth.flow` / `oauth.token.refresh` CLIENT spans, route metadata HTTP calls through `httpclient.TracedClient` / `TracedDefaultClient`, and emit `oauth.step` span events at each network sub-step boundary (`fetch_protected_resource_metadata`, `fetch_authorization_server_metadata`, `dynamic_client_registration`, `request_authorization_code`, `token_exchange`) so a failure can be attributed to a specific stage without descending into HTTP children --- pkg/mcp/server.go | 44 +++++++++++-- pkg/tools/mcp/oauth.go | 82 ++++++++++++++++++++++-- pkg/tools/mcp/oauth_helpers.go | 7 +- pkg/tools/mcp/oauth_login.go | 6 +- pkg/tools/mcp/oauth_server.go | 8 ++- pkg/tools/mcp/session_client.go | 110 ++++++++++++++++++++++++++++---- 6 files changed, 225 insertions(+), 32 deletions(-) diff --git a/pkg/mcp/server.go b/pkg/mcp/server.go index 655ad7056..43c0c988c 100644 --- a/pkg/mcp/server.go +++ b/pkg/mcp/server.go @@ -11,6 +11,7 @@ import ( "slices" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/agent" @@ -19,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" "github.com/docker/docker-agent/pkg/teamloader" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/version" ) @@ -61,10 +63,17 @@ func StartHTTPServer(ctx context.Context, agentFilename, agentName string, runCo fmt.Printf("MCP HTTP server listening on http://%s\n", ln.Addr()) + // Wrap with otelhttp so the MCP-over-HTTP transport extracts + // 
`traceparent` / `baggage` from incoming requests just like the + // stdio transport extracts them from `params._meta`. Without this + // HTTP-mode MCP clients lose trace context at the boundary. httpServer := &http.Server{ - Handler: mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { - return server - }, nil), + Handler: otelhttp.NewHandler( + mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { + return server + }, nil), + "mcp.http", + ), } errCh := make(chan error, 1) @@ -155,7 +164,25 @@ func createMCPServer(ctx context.Context, agentFilename, agentName string, runCo } func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mcp.CallToolRequest, ToolInput) (*mcp.CallToolResult, ToolOutput, error) { - return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (*mcp.CallToolResult, ToolOutput, error) { + return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (result *mcp.CallToolResult, output ToolOutput, err error) { + // Extract W3C trace context from `params._meta` (per the OTel + // MCP semconv) so the SERVER span chains onto the calling + // CLIENT span. Then start a `tools/call {agent}` SERVER span + // covering the full handler execution. 
+ if req != nil && req.Params != nil { + ctx = otelmcp.ExtractMeta(ctx, req.Params.Meta) + } + ctx, span := otelmcp.StartServer(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + ToolName: agentName, + }) + defer func() { + if err != nil { + span.RecordError(err, "") + } + span.End() + }() + slog.Debug("MCP tool called", "agent", agentName, "message", input.Message) ag, err := t.Agent(agentName) @@ -176,6 +203,9 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), runtime.WithNonInteractive(true), + // See pkg/a2a/adapter.go for rationale — without this + // the runtime's startSpan is a no-op when cagent runs as + // an MCP server, so all our runtime.* spans go silent. runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { @@ -188,11 +218,11 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc return nil, ToolOutput{}, fmt.Errorf("agent execution failed: %w", err) } - result := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") + response := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") - slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(result)) + slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(response)) - return nil, ToolOutput{Response: result}, nil + return nil, ToolOutput{Response: response}, nil } } diff --git a/pkg/tools/mcp/oauth.go b/pkg/tools/mcp/oauth.go index fa3fb3b72..6385db666 100644 --- a/pkg/tools/mcp/oauth.go +++ b/pkg/tools/mcp/oauth.go @@ -16,9 +16,15 @@ import ( "time" mcpsdk "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" + otelmcp 
"github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -475,17 +481,42 @@ func (t *oauthTransport) getValidToken(ctx context.Context) *OAuthToken { slog.Debug("Attempting silent token refresh", "url", t.baseURL) - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + // Wrap the refresh path in a span so the latency and failure + // rate of silent OAuth token refreshes are visible — the user + // otherwise just sees a stalled MCP request with no obvious + // cause. Pull conversation id from baggage so observability-svc + // can attribute the refresh to the spawning session. + refreshAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + refreshAttrs = append(refreshAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, refreshSpan := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.token.refresh", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(refreshAttrs...), + ) + defer refreshSpan.End() + + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} authServer := cmp.Or(token.AuthServer, t.baseURL) metadata, err := o.getAuthorizationServerMetadata(ctx, authServer) if err != nil { slog.Debug("Failed to fetch auth server metadata for refresh", "auth_server", authServer, "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "metadata fetch failed") + refreshSpan.SetAttributes(attribute.String("error.type", "metadata")) return nil } newToken, err := RefreshAccessToken(ctx, metadata.TokenEndpoint, token.RefreshToken, token.ClientID, token.ClientSecret) if err != nil { slog.Debug("Token refresh failed, will require interactive auth", "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "refresh failed") + 
refreshSpan.SetAttributes(attribute.String("error.type", "refresh_token")) t.mu.Lock() t.refreshFailedAt = time.Now() t.mu.Unlock() @@ -546,24 +577,54 @@ func configuredScopes(c *latest.RemoteOAuthConfig) []string { } // handleOAuthFlow performs the OAuth flow when a 401 response is received -func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { +func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) (err error) { + kind := "unmanaged" if t.managed { - return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + kind = "managed" + } + // Interactive OAuth flows can take seconds to minutes (user + // switches to browser, completes the consent screen, comes + // back). The span makes that latency attributable and gives + // dashboards a way to count auth-failure rates by managed kind. + flowAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + attribute.String("cagent.oauth.kind", kind), } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + flowAttrs = append(flowAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.flow", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(flowAttrs...), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + if t.managed { + return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + } return t.handleUnmanagedOAuthFlow(ctx, authServer, wwwAuth) } func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", 
trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -585,7 +646,8 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) @@ -628,6 +690,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, scopes = t.oauthConfig.Scopes case authServerMetadata.RegistrationEndpoint != "": slog.Debug("Attempting dynamic client registration") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "dynamic_client_registration"))) clientID, clientSecret, err = RegisterClient(ctx, authServerMetadata, redirectURI, nil) if err != nil { slog.Debug("Dynamic registration failed", "error", err) @@ -676,6 +739,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Requesting authorization code", "url", authURL) + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "request_authorization_code"))) code, receivedState, err := RequestAuthorizationCode(ctx, authURL, callbackServer, state) if err != nil { @@ -687,6 +751,7 @@ func (t 
*oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Exchanging authorization code for token") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "token_exchange"))) token, err := ExchangeCodeForToken( ctx, authServerMetadata.TokenEndpoint, @@ -720,15 +785,17 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, // where the client handles the OAuth interaction instead of us func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting unmanaged OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) // Extract resource URL from WWW-Authenticate header resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -750,7 +817,8 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) diff --git a/pkg/tools/mcp/oauth_helpers.go 
b/pkg/tools/mcp/oauth_helpers.go index ca9e862c8..768bec002 100644 --- a/pkg/tools/mcp/oauth_helpers.go +++ b/pkg/tools/mcp/oauth_helpers.go @@ -16,6 +16,7 @@ import ( "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/browser" + "github.com/docker/docker-agent/pkg/httpclient" ) // GenerateState generates a random state parameter for OAuth CSRF protection @@ -62,7 +63,7 @@ func ExchangeCodeForToken(ctx context.Context, tokenEndpoint, code, codeVerifier req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to exchange code for token: %w", err) } @@ -221,7 +222,7 @@ func RegisterClient(ctx context.Context, authMetadata *AuthorizationServerMetada } req.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return "", "", fmt.Errorf("failed to register client: %w", err) } @@ -269,7 +270,7 @@ func RefreshAccessToken(ctx context.Context, tokenEndpoint, refreshToken, client } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to refresh token: %w", err) } diff --git a/pkg/tools/mcp/oauth_login.go b/pkg/tools/mcp/oauth_login.go index 00d57c8fb..a71ddc2a2 100644 --- a/pkg/tools/mcp/oauth_login.go +++ b/pkg/tools/mcp/oauth_login.go @@ -11,6 +11,8 @@ import ( "time" "golang.org/x/oauth2" + + "github.com/docker/docker-agent/pkg/httpclient" ) // PerformOAuthLogin performs a standalone OAuth flow for the given MCP server URL. 
@@ -19,7 +21,7 @@ import ( func PerformOAuthLogin(ctx context.Context, serverURL string) error { tokenStore := NewKeyringTokenStore() - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} // Derive the base origin (scheme + host) from the server URL. // The well-known endpoints live at the origin, not under the SSE/path. @@ -35,7 +37,7 @@ func PerformOAuthLogin(ctx context.Context, serverURL string) error { if err != nil { return fmt.Errorf("failed to create resource metadata request: %w", err) } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return fmt.Errorf("failed to fetch protected resource metadata: %w", err) } diff --git a/pkg/tools/mcp/oauth_server.go b/pkg/tools/mcp/oauth_server.go index 5a355ccb4..527316a5d 100644 --- a/pkg/tools/mcp/oauth_server.go +++ b/pkg/tools/mcp/oauth_server.go @@ -12,6 +12,8 @@ import ( "strings" "sync" "time" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) // CallbackServer handles OAuth callback requests @@ -53,8 +55,12 @@ func NewCallbackServerOnPort(port int) (*CallbackServer, error) { mux := http.NewServeMux() mux.HandleFunc("/callback", cs.handleCallback) + // Wrap with otelhttp so the OAuth callback span chains onto the + // caller's trace when the OAuth provider preserves trace context + // in the redirect (most don't, but the wrap is harmless when + // they don't, and useful when they do). 
cs.server = &http.Server{ - Handler: mux, + Handler: otelhttp.NewHandler(mux, "oauth.callback"), ReadTimeout: 10 * time.Second, WriteTimeout: 10 * time.Second, } diff --git a/pkg/tools/mcp/session_client.go b/pkg/tools/mcp/session_client.go index 778ee1530..d7dd68891 100644 --- a/pkg/tools/mcp/session_client.go +++ b/pkg/tools/mcp/session_client.go @@ -10,6 +10,7 @@ import ( gomcp "github.com/modelcontextprotocol/go-sdk/mcp" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -93,35 +94,120 @@ func (c *sessionClient) Close(context.Context) error { } func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsParams) iter.Seq2[*gomcp.Tool, error] { - if s := c.getSession(); s != nil { - return s.Tools(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Tool, error) bool) { + yield(nil, errors.New("session not initialized")) + } } + // Start the span and the underlying RPC inside the closure so a + // caller that obtains the iterator and never iterates does not + // leak the span (and the in-flight RPC). Span lifetime now equals + // iteration lifetime. return func(yield func(*gomcp.Tool, error) bool) { - yield(nil, errors.New("session not initialized")) + spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsList, + }) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for tool, err := range s.Tools(spanCtx, request) { + if err != nil { + // Record each error inline rather than only the + // last one — paginated lists may yield multiple + // failures and the trace should reflect them all. 
+ span.RecordError(err, "") + } + if !yield(tool, err) { + return + } + } } } func (c *sessionClient) CallTool(ctx context.Context, request *gomcp.CallToolParams) (*gomcp.CallToolResult, error) { - if s := c.getSession(); s != nil { - return s.CallTool(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + } + if request != nil { + opts.ToolName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.CallTool(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } func (c *sessionClient) ListPrompts(ctx context.Context, request *gomcp.ListPromptsParams) iter.Seq2[*gomcp.Prompt, error] { - if s := c.getSession(); s != nil { - return s.Prompts(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Prompt, error) bool) { + yield(nil, errors.New("session not initialized")) + } } return func(yield func(*gomcp.Prompt, error) bool) { - yield(nil, errors.New("session not initialized")) + // Span and RPC start at iteration time so an unused + // iterator never leaks either. 
+ spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsList, + }) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for prompt, err := range s.Prompts(spanCtx, request) { + if err != nil { + span.RecordError(err, "") + } + if !yield(prompt, err) { + return + } + } } } func (c *sessionClient) GetPrompt(ctx context.Context, request *gomcp.GetPromptParams) (*gomcp.GetPromptResult, error) { - if s := c.getSession(); s != nil { - return s.GetPrompt(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsGet, + } + if request != nil { + opts.PromptName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.GetPrompt(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } // handleElicitationRequest forwards incoming elicitation requests from the MCP From d125606659304d8df28b49a82e9c38f68ce91703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:03 +0300 Subject: [PATCH 07/17] feat(otel): instrument A2A server with otelhttp and gen_ai.invoke_agent semconv - `pkg/a2a/server.go`: wrap the agent-card and JSON-RPC endpoints with `otelhttp.NewHandler` so inbound A2A requests extract `traceparent` / `tracestate` / `baggage` and emit a SERVER span. The outer `agent-a2a` server wrap covers any auxiliary routes - `pkg/a2a/adapter.go`: in `runDockerAgent`, decorate the active SERVER span with `gen_ai.operation.name=invoke_agent`, `gen_ai.agent.name`, and `cagent.agent.name`. 
Wires the runtime tracer scope so per-invocation `runtime.session` / `runtime.stream` / `runtime.tool.call` chain onto the inbound A2A span instead of starting fresh trace ids per request --- pkg/a2a/adapter.go | 25 +++++++++++++++++++++++++ pkg/a2a/server.go | 23 +++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/pkg/a2a/adapter.go b/pkg/a2a/adapter.go index 333083dc6..3be77917e 100644 --- a/pkg/a2a/adapter.go +++ b/pkg/a2a/adapter.go @@ -8,6 +8,8 @@ import ( "strings" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/adk/agent" "google.golang.org/adk/model" adksession "google.golang.org/adk/session" @@ -17,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" + cgenai "github.com/docker/docker-agent/pkg/telemetry/genai" ) // newDockerAgentAdapter creates a new ADK agent adapter from a docker agent team and agent name. @@ -43,6 +46,21 @@ func newDockerAgentAdapter(t *team.Team, agentName string) (agent.Agent, error) // runDockerAgent executes a docker agent and returns ADK session events func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, a *dagent.Agent) iter.Seq2[*adksession.Event, error] { return func(yield func(*adksession.Event, error) bool) { + // Decorate the inbound `a2a.message` SERVER span (created by + // otelhttp.NewHandler in server.go) with the GenAI semconv + // invoke_agent shape so dashboards can recognise A2A traffic as + // agent invocations rather than generic JSON-RPC POSTs. The + // runtime.session span we open below is the child that records + // the actual work; this annotation makes the parent searchable + // via gen_ai.operation.name="invoke_agent". 
+ if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String(cgenai.AttrOperationName, cgenai.OperationInvokeAgent), + attribute.String(cgenai.AttrAgentName, agentName), + attribute.String(cgenai.AttrAgentNameRuntime, agentName), + ) + } + // Extract user message from the ADK context userContent := ctx.UserContent() message := contentToMessage(userContent) @@ -60,6 +78,13 @@ func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, // Create runtime rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), + // Match the tracer scope used by `cmd/root/run.go` so + // MCP / A2A / API spans share the same instrumentation + // scope as the CLI's runtime spans. Without this option + // `LocalRuntime.startSpan` sees a nil tracer and silently + // returns no-op spans for runtime.session, runtime.stream, + // runtime.tool.call, runtime.fallback, runtime.run_skill, + // hook events, and so on. runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { diff --git a/pkg/a2a/server.go b/pkg/a2a/server.go index c9fa93081..8914b8f9b 100644 --- a/pkg/a2a/server.go +++ b/pkg/a2a/server.go @@ -14,6 +14,7 @@ import ( "github.com/a2aproject/a2a-go/a2asrv" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "google.golang.org/adk/runner" "google.golang.org/adk/server/adka2a" "google.golang.org/adk/session" @@ -104,8 +105,26 @@ func Run(ctx context.Context, agentFilename, agentName string, runConfig *config })) e.Use(middleware.RequestLogger()) - e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(a2asrv.NewStaticAgentCardHandler(agentCard))) - e.POST(agentPath, echo.WrapHandler(a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)))) + // Wrap both A2A endpoints with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming requests. 
The agent runtime started inside + // `runDockerAgent` then chains its spans onto the calling agent's + // trace, and the `gen_ai.conversation.id` baggage seeded by the + // caller flows through into our local runtime spans without + // per-call plumbing. The agent-card endpoint is included so + // discovery requests carry the same trace context as the + // downstream invocation — propagation is uniform across all + // public surfaces of the server. + cardHandler := otelhttp.NewHandler( + a2asrv.NewStaticAgentCardHandler(agentCard), + "a2a.agent_card", + ) + jsonrpcHandler := otelhttp.NewHandler( + a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)), + "a2a.message", + ) + e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(cardHandler)) + e.POST(agentPath, echo.WrapHandler(jsonrpcHandler)) if err := e.Server.Serve(ln); err != nil && ctx.Err() == nil { slog.Error("Failed to start server", "error", err) From 86996c7ad1c9423e8343842df4a4f4937081e2e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:03 +0300 Subject: [PATCH 08/17] feat(otel): wrap remaining HTTP servers, wire runtime tracer entry points, and add cold-start spans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `pkg/server/server.go`: wrap the agent-api Echo handler with `otelhttp.NewHandler` so inbound API requests extract `traceparent` / `tracestate` / `baggage` and the runtime spans started downstream chain onto the calling client trace - `pkg/server/session_manager.go`: wire the runtime tracer scope into per-session runtime construction; open a `session.runtime_init` INTERNAL span on the cold path (team load + runtime construction) so per-request first-use latency is attributable. 
Cached hits skip the span — they are a pointer load - `pkg/chatserver/server.go`, `pkg/chatserver/runtime_pool.go`: wrap the chat completions HTTP server with `otelhttp.NewHandler` and propagate the runtime tracer through the per-session pool - `pkg/teamloader/teamloader.go`: open a `teamloader.load` INTERNAL span around `LoadWithConfig` so the cold-start path (config parse, model alias resolution, OCI agent pulls, toolset starts) becomes attributable - `pkg/acp/agent.go`: wire the runtime tracer into the ACP entry point so its sub-spans share scope with CLI / API runs --- pkg/acp/agent.go | 4 ++++ pkg/chatserver/runtime_pool.go | 10 +++++++++- pkg/chatserver/server.go | 14 ++++++++++++-- pkg/server/server.go | 9 ++++++++- pkg/server/session_manager.go | 26 +++++++++++++++++++++++++- pkg/teamloader/teamloader.go | 29 ++++++++++++++++++++++++++++- 6 files changed, 86 insertions(+), 6 deletions(-) diff --git a/pkg/acp/agent.go b/pkg/acp/agent.go index 811bfef99..1dc15e362 100644 --- a/pkg/acp/agent.go +++ b/pkg/acp/agent.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/coder/acp-go-sdk" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -144,6 +145,9 @@ func (a *Agent) NewSession(ctx context.Context, params acp.NewSessionRequest) (a rt, err := runtime.New(a.team, runtime.WithCurrentAgent(defaultAgent.Name()), runtime.WithSessionStore(a.sessionStore), + // Match the CLI tracer scope; without this the ACP-mode + // runtime's `startSpan` is a no-op for every runtime.* span. 
+ runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { return acp.NewSessionResponse{}, fmt.Errorf("failed to create runtime: %w", err) diff --git a/pkg/chatserver/runtime_pool.go b/pkg/chatserver/runtime_pool.go index d79f03448..397d13513 100644 --- a/pkg/chatserver/runtime_pool.go +++ b/pkg/chatserver/runtime_pool.go @@ -4,6 +4,8 @@ import ( "errors" "sync" + "go.opentelemetry.io/otel" + "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/team" ) @@ -56,7 +58,13 @@ func (p *runtimePool) Get(agent string) (runtime.Runtime, error) { if rt := p.takeIdle(agent); rt != nil { return rt, nil } - rt, err := runtime.New(p.team, runtime.WithCurrentAgent(agent)) + // Match the tracer scope used by the CLI; without this the + // pooled chatserver runtimes are tracer-less so all `runtime.*` + // spans go silent in OpenAI-compatible chat-completions mode. + rt, err := runtime.New(p.team, + runtime.WithCurrentAgent(agent), + runtime.WithTracer(otel.Tracer("cagent")), + ) if err != nil { return nil, err } diff --git a/pkg/chatserver/server.go b/pkg/chatserver/server.go index 928e56636..c05b2a423 100644 --- a/pkg/chatserver/server.go +++ b/pkg/chatserver/server.go @@ -36,6 +36,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/openai/openai-go/v3" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -125,14 +126,23 @@ func Run(ctx context.Context, agentFilename string, opts Options, ln net.Listene return err } - httpServer := &http.Server{ - Handler: newRouter(&server{ + // Wrap with otelhttp so incoming /v1/chat/completions requests + // (including SSE streams) extract the caller's trace context. + // otelhttp ends the span when the response body is closed, so + // SSE streaming responses get a span that covers the full + // stream duration. 
+ handler := otelhttp.NewHandler( + newRouter(&server{ team: t, policy: policy, conversations: newConversationStore(opts.ConversationsMaxSessions, conversationTTL(opts)), conversationLocks: newConversationLockSet(), runtimes: newRuntimePool(t, opts.MaxIdleRuntimes), }, opts), + "chatserver", + ) + httpServer := &http.Server{ + Handler: handler, ReadHeaderTimeout: 30 * time.Second, } return serve(ctx, httpServer, ln) diff --git a/pkg/server/server.go b/pkg/server/server.go index dd33b1290..030cdbb1a 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -14,6 +14,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/config" @@ -80,8 +81,14 @@ func New(ctx context.Context, sessionStore session.Store, runConfig *config.Runt } func (s *Server) Serve(ctx context.Context, ln net.Listener) error { + // Wrap the Echo handler with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming API requests. Without this the API server's + // runtime spans (already wired via `WithTracer` in the session + // manager) start fresh trace ids per request rather than + // chaining onto the calling client's trace. 
srv := http.Server{ - Handler: s.e, + Handler: otelhttp.NewHandler(s.e, "agent-api"), } if err := srv.Serve(ln); err != nil && ctx.Err() == nil { diff --git a/pkg/server/session_manager.go b/pkg/server/session_manager.go index 0b6d82605..7d6130440 100644 --- a/pkg/server/session_manager.go +++ b/pkg/server/session_manager.go @@ -12,6 +12,9 @@ import ( "time" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/concurrent" @@ -402,12 +405,30 @@ func (sm *SessionManager) generateTitle(ctx context.Context, sess *session.Sessi } } -func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (runtime.Runtime, *sessiontitle.Generator, error) { +func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (_ runtime.Runtime, _ *sessiontitle.Generator, err error) { // Caller (RunSession) holds sm.mux and has already verified that no // active runtime exists for this session. This function is purely a // constructor: it must not touch sm.runtimeSessions, otherwise it would // briefly publish a half-initialised activeRuntimes (e.g. without the // cancel func) that other goroutines could observe. + // + // Every call is a cold-path construction (caller short-circuits + // cached hits), so a span here attributes per-request first-use + // latency (team load + runtime construction) without adding noise + // on warm paths. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/server").Start( + ctx, "session.runtime_init", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("gen_ai.conversation.id", sess.ID)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + t, err := sm.loadTeam(ctx, agentFilename, rc) if err != nil { return nil, nil, err @@ -427,6 +448,9 @@ func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.S runtime.WithCurrentAgent(currentAgent), runtime.WithManagedOAuth(false), runtime.WithSessionStore(sm.sessionStore), + // Match the tracer scope used by the CLI; without this the + // API-server runtime's startSpan is a no-op so all the + // runtime.* spans go silent in HTTP-server mode. runtime.WithTracer(otel.Tracer("cagent")), } run, err := runtime.New(t, opts...) diff --git a/pkg/teamloader/teamloader.go b/pkg/teamloader/teamloader.go index 72db52c5f..f49d20732 100644 --- a/pkg/teamloader/teamloader.go +++ b/pkg/teamloader/teamloader.go @@ -13,6 +13,11 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/config/latest" @@ -84,7 +89,23 @@ func Load(ctx context.Context, agentSource config.Source, runConfig *config.Runt // LoadWithConfig loads an agent team and returns both the team and config info // needed for runtime model switching. 
-func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (*LoadResult, error) { +func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (result *LoadResult, err error) { + // Cold-start path: parses config, resolves model aliases, may pull + // referenced sub-agents over the network, and starts every toolset. + // All synchronous from the caller's perspective. The span makes the + // breakdown attributable when first-use latency is high. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/teamloader").Start( + ctx, "teamloader.load", + trace.WithSpanKind(trace.SpanKindInternal), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + var loadOpts loadOptions loadOpts.toolsetRegistry = NewDefaultToolsetRegistry() @@ -99,6 +120,12 @@ func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *c if err != nil { return nil, err } + if cfg != nil { + span.SetAttributes( + attribute.Int("cagent.teamloader.agent_count", len(cfg.Agents)), + attribute.Int("cagent.teamloader.model_count", len(cfg.Models)), + ) + } // Resolve model aliases (e.g., "claude-sonnet-4-5" -> "claude-sonnet-4-5-20250929") // This ensures the API uses the pinned model version. The original name is preserved From 3e55ce47d48fbf5ba17b552d88b90f46b166782c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:04 +0300 Subject: [PATCH 09/17] feat(otel): instrument memory, RAG, sessiontitle, and evaluation - `pkg/memory/database/sqlite/sqlite.go`: open `memory.{op}` spans on `AddMemory`, `SearchMemories`, etc., with named-return error capture so failures attach to the span via `RecordError`. 
The search path additionally emits a `retrieval` semconv span for cross-tool dashboards - `pkg/rag/manager.go`: open `retrieval` (semconv) spans on `Query`, plus `rag.init` / `rag.reindex` / `rag.file_watcher` for lifecycle visibility - `pkg/sessiontitle/generator.go`: wrap title generation with a `sessiontitle.generate` span; named-return errors fold onto the span on failure - `pkg/evaluation/judge.go`: emit `gen_ai.evaluation.result` log events from the LLM-as-judge evaluator with score / explanation / error.type, linked to the active span via context for cross-signal join --- pkg/evaluation/judge.go | 25 ++++++++ pkg/memory/database/sqlite/sqlite.go | 87 +++++++++++++++++++++++++--- pkg/rag/manager.go | 75 ++++++++++++++++++++++-- pkg/sessiontitle/generator.go | 34 ++++++++++- 4 files changed, 206 insertions(+), 15 deletions(-) diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 38ae652fd..391536aee 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -13,6 +13,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/model/provider" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // relevancePrompt is the prompt template for the judge model to evaluate responses. @@ -155,10 +156,34 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] for i, r := range rawResults { if r.err != nil { errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err)) + // Emit gen_ai.evaluation.result with error.type so the + // failed checks show up alongside the successful ones in + // log-based dashboards. Set ScoreLabel="error" so + // dashboards that GROUP BY label still surface these + // rows (otherwise the missing label silently drops them). 
+ genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: "error", + ErrorType: genai.ClassifyError(r.err), + }) continue } results[i].Passed = r.passed results[i].Reason = r.reason + + score := 0.0 + label := "failed" + if r.passed { + score = 1.0 + label = "passed" + } + genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: label, + ScoreValue: score, + HasScoreValue: true, + Explanation: r.reason, + }) } if len(errs) > 0 { diff --git a/pkg/memory/database/sqlite/sqlite.go b/pkg/memory/database/sqlite/sqlite.go index e1e349893..cc2409729 100644 --- a/pkg/memory/database/sqlite/sqlite.go +++ b/pkg/memory/database/sqlite/sqlite.go @@ -6,10 +6,40 @@ import ( "fmt" "strings" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/memory/database" "github.com/docker/docker-agent/pkg/sqliteutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) +// memoryDataSourceID is the `gen_ai.data_source.id` value used on +// retrieval-shaped memory operations (SearchMemories) so observability-svc +// can group "agent recalled this memory" timeline entries the same way it +// groups RAG retrievals. +const memoryDataSourceID = "memory" + +// startMemorySpan opens a small INTERNAL span for a memory CRUD operation. +// op is recorded as `cagent.memory.op` and the span name is +// `memory.{op}`. Conversation id flows in via baggage so the span lands +// on the right session timeline. 
+func startMemorySpan(ctx context.Context, op string) (context.Context, trace.Span) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/memory/database/sqlite") + attrs := []attribute.KeyValue{ + attribute.String("cagent.memory.op", op), + } + if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "memory."+op, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) +} + type MemoryDatabase struct { db *sql.DB } @@ -40,15 +70,25 @@ func NewMemoryDatabase(path string) (database.Database, error) { } func (m *MemoryDatabase) AddMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "add") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } _, err := m.db.ExecContext(ctx, "INSERT INTO memories (id, created_at, memory, category) VALUES (?, ?, ?, ?)", memory.ID, memory.CreatedAt, memory.Memory, memory.Category) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory, error) { + ctx, span := startMemorySpan(ctx, "list") + defer span.End() + rows, err := m.db.QueryContext(ctx, "SELECT id, created_at, memory, COALESCE(category, '') FROM memories") if err != nil { return nil, err @@ -73,11 +113,37 @@ func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory } func (m *MemoryDatabase) DeleteMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "delete") + defer span.End() + _, err := m.db.ExecContext(ctx, "DELETE FROM memories WHERE id = ?", memory.ID) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } -func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) ([]database.UserMemory, error) 
{ +func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) (results []database.UserMemory, err error) { + // SearchMemories is the retrieval shape per the OTel GenAI semconv: + // the agent is recalling stored memories filtered by query/category. + // Use the spec'd `retrieval {data_source.id}` span so this lands on + // the same dashboard row as RAG retrievals. + ctx, retSpan := genai.StartRetrieval(ctx, "sqlite", memoryDataSourceID, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + if category != "" { + retSpan.SetAttributes(attribute.String("cagent.memory.category", category)) + } + + // Assign to the named returns (not local shadows) so the deferred + // span closure observes the live error and result count regardless + // of which return path fires. var conditions []string var args []any @@ -102,30 +168,35 @@ func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category str stmt += " WHERE " + strings.Join(conditions, " AND ") } - rows, err := m.db.QueryContext(ctx, stmt, args...) + var rows *sql.Rows + rows, err = m.db.QueryContext(ctx, stmt, args...) if err != nil { return nil, err } defer rows.Close() - var memories []database.UserMemory for rows.Next() { var memory database.UserMemory - err := rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category) - if err != nil { + // gocritic suggests `:=` here, but we want to assign to the + // named return `err` so the deferred span closure observes + // the failure. nolint pragma documents the intent. 
+ if err = rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - memories = append(memories, memory) + results = append(results, memory) } - if err := rows.Err(); err != nil { + if err = rows.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - return memories, nil + return results, nil } func (m *MemoryDatabase) UpdateMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "update") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } diff --git a/pkg/rag/manager.go b/pkg/rag/manager.go index 17e77675f..40b051a52 100644 --- a/pkg/rag/manager.go +++ b/pkg/rag/manager.go @@ -11,11 +11,17 @@ import ( "slices" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/rag/database" "github.com/docker/docker-agent/pkg/rag/fusion" "github.com/docker/docker-agent/pkg/rag/rerank" "github.com/docker/docker-agent/pkg/rag/strategy" "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // ToolConfig represents tool-specific configuration @@ -143,7 +149,23 @@ func New(_ context.Context, name string, config Config, strategyEvents <-chan ty // Initialize indexes all documents using all configured strategies // Each strategy indexes its own document set (shared + strategy-specific) // Strategies are initialized in parallel for better performance -func (m *Manager) Initialize(ctx context.Context) error { +func (m *Manager) Initialize(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.initialize", + trace.WithSpanKind(trace.SpanKindInternal), + 
trace.WithAttributes( + attribute.String(genai.AttrDataSourceID, m.name), + attribute.Int("cagent.rag.num_strategies", len(m.strategies)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + slog.Debug("[RAG Manager] Starting initialization", "rag_name", m.name, "num_strategies", len(m.strategies)) @@ -211,7 +233,20 @@ func (m *Manager) Initialize(ctx context.Context) error { // Query searches for relevant documents using all configured strategies // If multiple strategies are configured, results are combined using the fusion strategy -func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchResult, error) { +func (m *Manager) Query(ctx context.Context, query string) (results []database.SearchResult, err error) { + // Start a `retrieval {rag_name}` span per the OTel GenAI semconv. + // The query text itself is sensitive so we never capture it on the + // span here — content capture is gated by a separate environment + // variable in a later commit and emitted via a span event then. + ctx, retSpan := genai.StartRetrieval(ctx, "rag", m.name, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + slog.Debug("[RAG Manager] Starting query", "rag_name", m.name, "num_strategies", len(m.strategies), @@ -228,7 +263,11 @@ func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchRes "strategy_limit", strategyCfg.Limit, "strategy_threshold", strategyCfg.Threshold) - results, err := strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) + // Assign to the function's named returns (note `=`, not + // `:=`) so the deferred span closure sees the live values + // even if a future change replaces the explicit + // `return X, Y` form below with a bare `return`. 
+ results, err = strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) if err != nil { slog.Error("[RAG Manager] Strategy query failed", "rag_name", m.name, @@ -431,7 +470,20 @@ func getStrategyNames(stratMap map[string]strategy.Strategy) []string { } // CheckAndReindexChangedFiles checks for file changes and re-indexes if needed -func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { +func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.reindex", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.CheckAndReindexChangedFiles(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { @@ -442,7 +494,20 @@ func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { } // StartFileWatcher starts monitoring files and directories for changes -func (m *Manager) StartFileWatcher(ctx context.Context) error { +func (m *Manager) StartFileWatcher(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.file_watcher.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.StartFileWatcher(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { diff --git 
a/pkg/sessiontitle/generator.go b/pkg/sessiontitle/generator.go index be8b33166..21f0a8ff9 100644 --- a/pkg/sessiontitle/generator.go +++ b/pkg/sessiontitle/generator.go @@ -13,10 +13,16 @@ import ( "strings" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/model/provider" "github.com/docker/docker-agent/pkg/model/provider/options" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) const ( @@ -56,7 +62,7 @@ func New(model provider.Provider, fallbackModels ...provider.Provider) *Generato // CreateChatCompletionStream, avoiding the overhead of spinning up a nested // runtime, and falls back to the next model on failure. // Returns an empty string if no models or messages are configured. -func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (string, error) { +func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (title string, err error) { if g == nil || len(g.models) == 0 || len(userMessages) == 0 { return "", nil } @@ -67,6 +73,27 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages // the originating session. ctx = httpclient.ContextWithSessionID(ctx, sessionID) + // Wrap the whole title-generation in a span so the boundary is + // visible on the session timeline. The inner per-attempt LLM + // calls each get their own `chat {model}` CLIENT child span via + // the provider decorator. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/sessiontitle").Start( + ctx, + "sessiontitle.generate", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String(genai.AttrConversationID, sessionID), + attribute.Int("cagent.sessiontitle.candidate_count", len(g.models)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + // Apply timeout to prevent hanging on slow or unresponsive models. ctx, cancel := context.WithTimeout(ctx, titleGenerationTimeout) defer cancel() @@ -77,7 +104,10 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages var lastErr error for idx, baseModel := range g.models { - if err := ctx.Err(); err != nil { + // Assign to the named-return `err` so a context cancellation + // is observed by the deferred span closure as a recorded + // error rather than silently slipping through. + if err = ctx.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return "", err } From b0d39e7a5a7b12346f331c5e0bd48645b893215f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Sun, 3 May 2026 19:40:05 +0300 Subject: [PATCH 10/17] feat(otel): annotate built-in tool internals - `pkg/tools/builtin/shell.go`, `script_shell.go`: stamp `cagent.tool.{shell,script_shell}.{cmd,cwd,timeout_seconds}` on the active `runtime.tool.handler` span. Cmd ships unconditionally because it is the main signal of what the agent did; redact at the OTel collector if commands carry secrets - `pkg/tools/builtin/filesystem.go`: stamp `cagent.tool.filesystem.{op,path,paths,path_count}` covering all file operations. 
Paths ship unconditionally for the same incident-response reason - `pkg/tools/builtin/fetch.go`: stamp `cagent.tool.fetch.{urls,url_count,format}`; each fetched URL still emits its own HTTP CLIENT child span via `httpclient.WrapWithOTel` - `pkg/tools/builtin/lsp.go`: wrap every tool from `lspTool` so each LSP RPC stamps `cagent.tool.lsp.{tool,read_only}` on the parent span - `pkg/tools/builtin/lsp_lifecycle.go`: inject `genai.InjectTraceContextEnv(ctx)` into the LSP server spawn env so OTel-aware language servers chain onto the agent trace - `pkg/tools/builtin/openapi.go`, `pkg/tools/builtin/api.go`: route the user-facing HTTP clients through `httpclient.WrapWithOTel(remote.NewTransport(ctx))` so each API call emits a CLIENT span and propagates `traceparent` - `pkg/tools/codemode/exec.go`: stamp `cagent.tool.codemode.{script,script_length,tool_call_count}` so a code-mode turn is visible as "ran N lines of JS that called M tools" --- pkg/tools/builtin/api.go | 3 +- pkg/tools/builtin/deferred.go | 32 +++++++++++++++- pkg/tools/builtin/fetch.go | 23 +++++++++++- pkg/tools/builtin/filesystem.go | 59 +++++++++++++++++++++++++++--- pkg/tools/builtin/lsp.go | 22 ++++++++++- pkg/tools/builtin/lsp_lifecycle.go | 10 ++++- pkg/tools/builtin/openapi.go | 5 ++- pkg/tools/builtin/script_shell.go | 14 +++++++ pkg/tools/builtin/shell.go | 16 ++++++++ pkg/tools/builtin/todo.go | 43 +++++++++++++++++++++- pkg/tools/builtin/user_prompt.go | 14 +++++++ pkg/tools/codemode/exec.go | 20 ++++++++++ 12 files changed, 245 insertions(+), 16 deletions(-) diff --git a/pkg/tools/builtin/api.go b/pkg/tools/builtin/api.go index 3b6dba0a5..3ab8f96e3 100644 --- a/pkg/tools/builtin/api.go +++ b/pkg/tools/builtin/api.go @@ -13,6 +13,7 @@ import ( "time" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/js" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" @@ -32,7 +33,7 @@ var ( 
func (t *APITool) callTool(ctx context.Context, toolCall tools.ToolCall) (*tools.ToolCallResult, error) { client := &http.Client{ Timeout: 30 * time.Second, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), } endpoint := t.config.Endpoint diff --git a/pkg/tools/builtin/deferred.go b/pkg/tools/builtin/deferred.go index 5866a0093..d70f82952 100644 --- a/pkg/tools/builtin/deferred.go +++ b/pkg/tools/builtin/deferred.go @@ -8,6 +8,9 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/tools" ) @@ -84,7 +87,7 @@ type AddToolArgs struct { Name string `json:"name" jsonschema:"The name of the tool to activate"` } -func (d *DeferredToolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { +func (d *DeferredToolset) handleSearchTool(ctx context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { query := strings.ToLower(args.Query) d.mu.RLock() @@ -103,6 +106,15 @@ func (d *DeferredToolset) handleSearchTool(_ context.Context, args SearchToolArg } } + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "search_tool"), + attribute.String("cagent.tool.deferred.query", args.Query), + attribute.Int("cagent.tool.deferred.match_count", len(results)), + attribute.Int("cagent.tool.deferred.pool_size", len(d.deferredTools)), + ) + } + if len(results) == 0 { return tools.ResultError(fmt.Sprintf("No deferred tools found matching '%s'", args.Query)), nil } @@ -115,21 +127,37 @@ func (d *DeferredToolset) handleSearchTool(_ context.Context, args SearchToolArg return tools.ResultSuccess(fmt.Sprintf("Found %d deferred tool(s):\n%s", len(results), string(output))), nil } -func (d *DeferredToolset) handleAddTool(_ context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { +func (d *DeferredToolset) handleAddTool(ctx 
context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { d.mu.Lock() defer d.mu.Unlock() + span := trace.SpanFromContext(ctx) + annotate := func(outcome string) { + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "add_tool"), + attribute.String("cagent.tool.deferred.tool_name", args.Name), + attribute.String("cagent.tool.deferred.outcome", outcome), + attribute.Int("cagent.tool.deferred.activated_count", len(d.activatedTools)), + ) + } + if _, exists := d.activatedTools[args.Name]; exists { + annotate("already_active") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' is already active", args.Name)), nil } entry, exists := d.deferredTools[args.Name] if !exists { + annotate("not_found") return tools.ResultError(fmt.Sprintf("Tool '%s' not found.", args.Name)), nil } delete(d.deferredTools, args.Name) d.activatedTools[args.Name] = entry.tool + annotate("activated") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' has been activated and is now available for use.\n\nDescription: %s", args.Name, entry.tool.Description)), nil } diff --git a/pkg/tools/builtin/fetch.go b/pkg/tools/builtin/fetch.go index 235608e70..61ac01eca 100644 --- a/pkg/tools/builtin/fetch.go +++ b/pkg/tools/builtin/fetch.go @@ -15,7 +15,10 @@ import ( htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" "github.com/k3a/html2text" "github.com/temoto/robotstxt" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/useragent" @@ -52,10 +55,28 @@ func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*too return nil, errors.New("at least one URL is required") } + // Decorate the active runtime.tool.handler span with the URL list + // and request shape. 
Each fetched URL still produces its own HTTP + // CLIENT child span via `httpclient.WrapWithOTel` below, so the + // per-request status / latency / target host all show up there; + // the parent span gets the requested URLs so a quick glance answers + // "which sites did the agent hit on this turn?" without expanding + // the children. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + attrs := []attribute.KeyValue{ + attribute.Int("cagent.tool.fetch.url_count", len(params.URLs)), + attribute.StringSlice("cagent.tool.fetch.urls", params.URLs), + } + if params.Format != "" { + attrs = append(attrs, attribute.String("cagent.tool.fetch.format", params.Format)) + } + span.SetAttributes(attrs...) + } + // Set timeout if specified client := &http.Client{ Timeout: h.timeout, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), // Re-check the domain allow/deny lists on every redirect: without this, // an allowed origin could redirect into a denied one and bypass the // policy. The 10-redirect cap mirrors the net/http default. diff --git a/pkg/tools/builtin/filesystem.go b/pkg/tools/builtin/filesystem.go index 29203e043..42e8fde43 100644 --- a/pkg/tools/builtin/filesystem.go +++ b/pkg/tools/builtin/filesystem.go @@ -16,12 +16,34 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/fsx" "github.com/docker/docker-agent/pkg/shellpath" "github.com/docker/docker-agent/pkg/tools" ) +// annotateFilesystemSpan stamps the operation kind and target path +// onto the active runtime.tool.handler span. Paths ship unconditionally +// — they're the main signal of what the agent touched. Drop or hash +// `cagent.tool.filesystem.path` at the OTel collector if paths +// routinely reveal identifiers you don't want shipped. 
+func annotateFilesystemSpan(ctx context.Context, op, path string) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + attrs := []attribute.KeyValue{ + attribute.String("cagent.tool.filesystem.op", op), + } + if path != "" { + attrs = append(attrs, attribute.String("cagent.tool.filesystem.path", path)) + } + span.SetAttributes(attrs...) +} + const ( ToolNameReadFile = "read_file" ToolNameReadMultipleFiles = "read_multiple_files" @@ -626,6 +648,7 @@ func (t *FilesystemTool) shouldIgnorePath(path string) bool { // Handler implementations func (t *FilesystemTool) handleDirectoryTree(ctx context.Context, args DirectoryTreeArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "directory_tree", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -698,6 +721,7 @@ func (t *FilesystemTool) editFileHandler() tools.ToolHandler { } func (t *FilesystemTool) handleEditFile(ctx context.Context, args EditFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "edit_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -735,7 +759,8 @@ func (t *FilesystemTool) handleEditFile(ctx context.Context, args EditFileArgs) return tools.ResultSuccess("File edited successfully. 
Changes:\n" + strings.Join(changes, "\n")), nil } -func (t *FilesystemTool) handleListDirectory(_ context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *FilesystemTool) handleListDirectory(ctx context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "list_directory", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -776,7 +801,8 @@ func (t *FilesystemTool) handleListDirectory(_ context.Context, args ListDirecto }, nil } -func (t *FilesystemTool) handleReadFile(_ context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { +func (t *FilesystemTool) handleReadFile(ctx context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return &tools.ToolCallResult{ @@ -883,6 +909,13 @@ func (t *FilesystemTool) readImageFile(resolvedPath, originalPath string) (*tool } func (t *FilesystemTool) handleReadMultipleFiles(ctx context.Context, args ReadMultipleFilesArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_multiple_files", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + ) + } type PathContent struct { Path string `json:"path"` Content string `json:"content"` @@ -956,7 +989,8 @@ func (t *FilesystemTool) handleReadMultipleFiles(ctx context.Context, args ReadM }, nil } -func (t *FilesystemTool) handleSearchFilesContent(_ context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { +func (t *FilesystemTool) handleSearchFilesContent(ctx context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "search_files_content", 
args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -1076,6 +1110,7 @@ func (t *FilesystemTool) handleSearchFilesContent(_ context.Context, args Search } func (t *FilesystemTool) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "write_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -1098,7 +1133,14 @@ func (t *FilesystemTool) handleWriteFile(ctx context.Context, args WriteFileArgs return tools.ResultSuccess(fmt.Sprintf("File written successfully: %s (%d bytes)", args.Path, len(args.Content))), nil } -func (t *FilesystemTool) handleCreateDirectory(_ context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *FilesystemTool) handleCreateDirectory(ctx context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "create_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) @@ -1114,7 +1156,14 @@ func (t *FilesystemTool) handleCreateDirectory(_ context.Context, args CreateDir return tools.ResultSuccess(strings.Join(results, "\n")), nil } -func (t *FilesystemTool) handleRemoveDirectory(_ context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *FilesystemTool) handleRemoveDirectory(ctx context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "remove_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", 
len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) diff --git a/pkg/tools/builtin/lsp.go b/pkg/tools/builtin/lsp.go index 709522956..073284428 100644 --- a/pkg/tools/builtin/lsp.go +++ b/pkg/tools/builtin/lsp.go @@ -19,6 +19,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/lifecycle" @@ -460,12 +463,29 @@ type WorkspaceArgs struct{} // lspTool is a shorthand for constructing a tools.Tool with common LSP defaults. func lspTool(name, title, description string, readOnly bool, params any, handler tools.ToolHandler) tools.Tool { + // Wrap the handler so every LSP RPC stamps the LSP method name on + // the active runtime.tool.handler span. Single tool name = single + // LSP operation, so the gen_ai.tool.name attribute on the parent + // span is enough for filtering by RPC kind in dashboards. The + // `cagent.tool.lsp.tool` is redundant with gen_ai.tool.name but + // kept under the cagent.* namespace for symmetry with the other + // builtin tool annotations and so dashboards have a uniform + // `cagent.tool.{kind}.*` query surface across builtins. 
+ wrapped := func(ctx context.Context, tc tools.ToolCall) (*tools.ToolCallResult, error) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.lsp.tool", name), + attribute.Bool("cagent.tool.lsp.read_only", readOnly), + ) + } + return handler(ctx, tc) + } return tools.Tool{ Name: name, Category: "lsp", Description: description, Parameters: params, - Handler: handler, + Handler: wrapped, Annotations: tools.ToolAnnotations{ Title: title, ReadOnlyHint: readOnly, diff --git a/pkg/tools/builtin/lsp_lifecycle.go b/pkg/tools/builtin/lsp_lifecycle.go index 759dc283f..461762595 100644 --- a/pkg/tools/builtin/lsp_lifecycle.go +++ b/pkg/tools/builtin/lsp_lifecycle.go @@ -12,6 +12,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/concurrent" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools/lifecycle" ) @@ -28,7 +29,7 @@ func (c *lspConnector) Connect(ctx context.Context) (lifecycle.Session, error) { h := c.h slog.Debug("Starting LSP server", "command", h.command, "args", h.args) - p, err := spawnLSPProcess(h) + p, err := spawnLSPProcess(ctx, h) if err != nil { return nil, err } @@ -73,14 +74,19 @@ type lspProcess struct { // kicks off a stderr-drain goroutine bound to the process lifetime. // Errors are mapped to typed lifecycle errors so the supervisor can // apply the right policy. -func spawnLSPProcess(h *lspHandler) (*lspProcess, error) { +func spawnLSPProcess(callerCtx context.Context, h *lspHandler) (*lspProcess, error) { // The process must outlive the caller's request context (which is // often cancelled when an HTTP/agent turn ends). The supervisor // calls Close to shut it down on Stop or restart. processCtx, processCancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(processCtx, h.command, h.args...) 
+ // Inherit the caller's W3C trace context (the Connect call's + // `toolset.start` or per-request span) so an OTel-aware LSP server + // can chain its spans onto the agent trace. Most LSPs do not emit + // OTel today, so this is defensive parity with sandbox.exec. cmd.Env = append(os.Environ(), h.env...) + cmd.Env = append(cmd.Env, genai.InjectTraceContextEnv(callerCtx)...) cmd.Dir = h.workingDir stdin, err := cmd.StdinPipe() diff --git a/pkg/tools/builtin/openapi.go b/pkg/tools/builtin/openapi.go index ab21fa953..2dd2aa050 100644 --- a/pkg/tools/builtin/openapi.go +++ b/pkg/tools/builtin/openapi.go @@ -18,6 +18,7 @@ import ( v3 "github.com/pb33f/libopenapi/datamodel/high/v3" "go.yaml.in/yaml/v4" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/upstream" @@ -74,7 +75,7 @@ func (t *OpenAPITool) fetchSpec(ctx context.Context) (*v3.Document, error) { req.Header.Set("Accept", "application/json") setHeaders(req, t.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } @@ -423,7 +424,7 @@ func (h *openAPIHandler) callTool(ctx context.Context, params openAPICallArgs) ( req.Header.Set("Accept", "application/json") setHeaders(req, h.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } diff --git a/pkg/tools/builtin/script_shell.go b/pkg/tools/builtin/script_shell.go index 13c69f9f8..20f31c980 100644 --- a/pkg/tools/builtin/script_shell.go +++ 
b/pkg/tools/builtin/script_shell.go @@ -11,6 +11,9 @@ import ( "slices" "strings" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/shellpath" "github.com/docker/docker-agent/pkg/tools" @@ -138,6 +141,17 @@ func (t *ScriptShellTool) execute(ctx context.Context, toolConfig *latest.Script } } + // Stamp the script_shell call shape onto the active span. Cmd + // ships unconditionally for the same reason as shell.RunShell — + // see that comment for the redact-at-collector guidance. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.script_shell.tool_name", toolCall.Function.Name), + attribute.String("cagent.tool.script_shell.cmd", toolConfig.Cmd), + attribute.String("cagent.tool.script_shell.cwd", cmp.Or(toolConfig.WorkingDir, ".")), + ) + } + shell, argsPrefix := shellpath.DetectShell() cmd := exec.CommandContext(ctx, shell, append(argsPrefix, toolConfig.Cmd)...) diff --git a/pkg/tools/builtin/shell.go b/pkg/tools/builtin/shell.go index 510e4c009..5b5833fe1 100644 --- a/pkg/tools/builtin/shell.go +++ b/pkg/tools/builtin/shell.go @@ -16,6 +16,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/shellpath" @@ -199,6 +202,19 @@ func (h *shellHandler) RunShell(ctx context.Context, params RunShellArgs) (*tool cwd := h.resolveWorkDir(params.Cwd) + // Stamp the call shape (cmd, cwd, timeout) onto the active span. + // Cmd ships unconditionally — it's the main signal of what the + // agent actually did, and gating it on chat-content capture loses + // too much debug value. Drop or hash `cagent.tool.shell.cmd` at + // the OTel collector if commands routinely carry secrets. 
+ if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.shell.cmd", params.Cmd), + attribute.Float64("cagent.tool.shell.timeout_seconds", timeout.Seconds()), + attribute.String("cagent.tool.shell.cwd", cwd), + ) + } + slog.Debug("Executing native shell command", "command", params.Cmd, "cwd", cwd) return h.runNativeCommand(timeoutCtx, ctx, params.Cmd, cwd, timeout), nil diff --git a/pkg/tools/builtin/todo.go b/pkg/tools/builtin/todo.go index c80b6451a..6892e618a 100644 --- a/pkg/tools/builtin/todo.go +++ b/pkg/tools/builtin/todo.go @@ -8,10 +8,43 @@ import ( "sync" "sync/atomic" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" ) +// annotateTodoSpan stamps the operation kind, batch size, and the +// resulting list size onto the active runtime.tool.handler span so a +// glance at a session shows when the agent was actually managing +// progress vs. just chatting. +func annotateTodoSpan(ctx context.Context, op string, batch, total, completed int) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.todo.op", op), + attribute.Int("cagent.tool.todo.batch_size", batch), + attribute.Int("cagent.tool.todo.total", total), + attribute.Int("cagent.tool.todo.completed", completed), + ) +} + +// countCompleted returns how many todos in the current snapshot are +// marked completed. Cheap O(n) scan over a typically-tiny slice; called +// once per todo handler invocation for the span annotation. 
+func countCompleted(all []Todo) int { + n := 0 + for _, t := range all { + if t.Status == "completed" { + n++ + } + } + return n +} + const ( ToolNameCreateTodo = "create_todo" ToolNameCreateTodos = "create_todos" @@ -199,9 +232,11 @@ func (h *todoHandler) jsonResult(ctx context.Context, v any) (*tools.ToolCallRes func (h *todoHandler) createTodo(ctx context.Context, params CreateTodoArgs) (*tools.ToolCallResult, error) { created := h.addTodo(ctx, params.Description) + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todo", 1, len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodoOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -211,9 +246,11 @@ func (h *todoHandler) createTodos(ctx context.Context, params CreateTodosArgs) ( for _, desc := range params.Descriptions { created = append(created, h.addTodo(ctx, desc)) } + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todos", len(params.Descriptions), len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodosOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -246,6 +283,7 @@ func (h *todoHandler) updateTodos(ctx context.Context, params UpdateTodosArgs) ( result.AllTodos = h.storage.All(ctx) result.Reminder = h.incompleteReminder(ctx) + annotateTodoSpan(ctx, "update_todos", len(params.Updates), len(result.AllTodos), countCompleted(result.AllTodos)) return h.jsonResult(ctx, result) } @@ -283,6 +321,7 @@ func (h *todoHandler) listTodos(ctx context.Context, _ tools.ToolCall) (*tools.T if todos == nil { todos = []Todo{} } + annotateTodoSpan(ctx, "list_todos", 0, len(todos), countCompleted(todos)) out := ListTodosOutput{Todos: todos} out.Reminder = h.incompleteReminder(ctx) return h.jsonResult(ctx, out) diff --git a/pkg/tools/builtin/user_prompt.go b/pkg/tools/builtin/user_prompt.go index 95caab47c..d969ff1ef 100644 --- 
a/pkg/tools/builtin/user_prompt.go +++ b/pkg/tools/builtin/user_prompt.go @@ -6,6 +6,8 @@ import ( "fmt" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/tools" ) @@ -47,6 +49,14 @@ func (t *UserPromptTool) userPrompt(ctx context.Context, params UserPromptArgs) return tools.ResultError("user_prompt tool is not available in this context (no elicitation handler configured)"), nil } + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.user_prompt.message_length", len(params.Message)), + attribute.Bool("cagent.tool.user_prompt.has_schema", params.Schema != nil), + ) + } + var meta mcp.Meta if params.Title != "" { meta = mcp.Meta{"cagent/title": params.Title} @@ -68,6 +78,10 @@ func (t *UserPromptTool) userPrompt(ctx context.Context, params UserPromptArgs) Content: result.Content, } + if span.IsRecording() { + span.SetAttributes(attribute.String("cagent.tool.user_prompt.action", string(result.Action))) + } + responseJSON, err := json.Marshal(response) if err != nil { return nil, fmt.Errorf("failed to marshal response: %w", err) diff --git a/pkg/tools/codemode/exec.go b/pkg/tools/codemode/exec.go index 0d16b3035..9d3a78b16 100644 --- a/pkg/tools/codemode/exec.go +++ b/pkg/tools/codemode/exec.go @@ -8,6 +8,8 @@ import ( "slices" "github.com/dop251/goja" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/tools" ) @@ -40,6 +42,24 @@ func (c *codeModeTool) runJavascript(ctx context.Context, script string) (Script vm := goja.New() tracker := &toolCallTracker{} + // Stamp the script body and length onto the active span; the + // post-run defer adds the tool-call count. Script ships + // unconditionally — it's the main signal of what a code-mode turn + // did. 
Drop or hash `cagent.tool.codemode.script` at the OTel + // collector if scripts routinely carry secrets. + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.codemode.script", script), + attribute.Int("cagent.tool.codemode.script_length", len(script)), + ) + } + defer func() { + if span.IsRecording() { + span.SetAttributes(attribute.Int("cagent.tool.codemode.tool_call_count", len(tracker.calls))) + } + }() + // Inject console object to the help the LLM debug its own code. var ( stdOut bytes.Buffer From 4356a83c012b99b4ca3b362e52e7acb4fd0317db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Mon, 4 May 2026 11:39:10 +0300 Subject: [PATCH 11/17] fix(otel): correct tool_call_response schema and cap filesystem paths attribute - Change `tool_call_response` parts to use `result` field instead of `content` to align with OTel GenAI semconv example schema - Cap `cagent.tool.filesystem.paths` attribute to 32 entries to prevent backends from dropping oversized attributes on multi-hundred-path calls - Always record `path_count` to preserve total fidelity when paths are truncated - Fix typo in `ApprovalSourcePermissionRequestHook` constant name (add missing `Allow` suffix) - Remove `t.Parallel()` from MCP tests that mutate global OTel state --- pkg/runtime/toolexec/dispatcher.go | 32 +++++++++++++++--------------- pkg/telemetry/genai/content.go | 10 +++++++--- pkg/telemetry/mcp/mcp_test.go | 6 ++++-- pkg/telemetry/mcp/span.go | 6 +++--- pkg/tools/builtin/filesystem.go | 22 +++++++++++++++++--- 5 files changed, 49 insertions(+), 27 deletions(-) diff --git a/pkg/runtime/toolexec/dispatcher.go b/pkg/runtime/toolexec/dispatcher.go index 572a9f442..21cd1050a 100644 --- a/pkg/runtime/toolexec/dispatcher.go +++ b/pkg/runtime/toolexec/dispatcher.go @@ -31,21 +31,21 @@ const ( ApprovalDecisionDeny = "deny" ApprovalDecisionCanceled = "canceled" - ApprovalSourceYolo = "yolo" - 
ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" - ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" - ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" - ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" - ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" - ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" - ApprovalSourcePermissionRequestHookDeny = "permission_request_hook_deny" - ApprovalSourcePermissionRequestHook = "permission_request_hook_allow" - ApprovalSourceReadOnlyHint = "readonly_hint" - ApprovalSourceUserApproved = "user_approved" - ApprovalSourceUserApprovedSession = "user_approved_session" - ApprovalSourceUserApprovedTool = "user_approved_tool" - ApprovalSourceUserRejected = "user_rejected" - ApprovalSourceContextCanceled = "context_canceled" + ApprovalSourceYolo = "yolo" + ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" + ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" + ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" + ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" + ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" + ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" + ApprovalSourcePermissionRequestHookDeny = "permission_request_hook_deny" + ApprovalSourcePermissionRequestHookAllow = "permission_request_hook_allow" + ApprovalSourceReadOnlyHint = "readonly_hint" + ApprovalSourceUserApproved = "user_approved" + ApprovalSourceUserApprovedSession = "user_approved_session" + ApprovalSourceUserApprovedTool = "user_approved_tool" + ApprovalSourceUserRejected = "user_rejected" + ApprovalSourceContextCanceled = "context_canceled" ) // CallOutcome captures the verdicts of a single tool invocation as @@ -568,7 +568,7 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if result.PermissionAllowed { slog.Debug("Tool auto-approved by permission_request hook", "tool", 
toolName, "session_id", c.sess.ID, "reason", result.AdditionalContext) - c.notifyApproval(ctx, ApprovalDecisionAllow, ApprovalSourcePermissionRequestHook) + c.notifyApproval(ctx, ApprovalDecisionAllow, ApprovalSourcePermissionRequestHookAllow) return runTool(), true } diff --git a/pkg/telemetry/genai/content.go b/pkg/telemetry/genai/content.go index 108adfbac..b7d09cc24 100644 --- a/pkg/telemetry/genai/content.go +++ b/pkg/telemetry/genai/content.go @@ -192,10 +192,14 @@ func messageToStructured(m *chat.Message) structuredMessage { }) } if m.ToolCallID != "" { + // Per the OTel GenAI semconv example schema, tool_call_response + // parts carry the payload in `result`, not `content` (which is + // reserved for `text`/`reasoning` parts). Spec-aware backends + // look for the `result` key when decoding tool responses. parts = append(parts, messagePart{ - Type: "tool_call_response", - ID: m.ToolCallID, - Content: m.Content, + Type: "tool_call_response", + ID: m.ToolCallID, + Result: m.Content, }) } diff --git a/pkg/telemetry/mcp/mcp_test.go b/pkg/telemetry/mcp/mcp_test.go index 0f1533803..5e5d78342 100644 --- a/pkg/telemetry/mcp/mcp_test.go +++ b/pkg/telemetry/mcp/mcp_test.go @@ -24,7 +24,8 @@ func TestEnsureMeta(t *testing.T) { } func TestInjectExtractRoundTrip(t *testing.T) { - t.Parallel() + // Mutates the global OTel text-map propagator, so this test cannot + // run in parallel with other tests that read or modify it. // A propagator must be configured for inject/extract to do anything; // install one for the duration of the test and put it back after. @@ -70,7 +71,8 @@ func TestExtractMetaNilReturnsParent(t *testing.T) { } func TestStartClientReturnsActiveSpan(t *testing.T) { - t.Parallel() + // Mutates the global OTel tracer provider, so this test cannot run + // in parallel with other tests that read or modify it. 
tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample())) t.Cleanup(func() { _ = tp.Shutdown(t.Context()) }) diff --git a/pkg/telemetry/mcp/span.go b/pkg/telemetry/mcp/span.go index 0ab3a806b..594ba99bd 100644 --- a/pkg/telemetry/mcp/span.go +++ b/pkg/telemetry/mcp/span.go @@ -169,9 +169,9 @@ func (s *Span) SetAttributes(attrs ...attribute.KeyValue) { } // RecordError marks the span as failed and stores error.type for the -// duration metric. errType should be a short, low-cardinality string — -// "rpc_error", "transport", "context_canceled", or the underlying error's -// type name as a fallback. +// duration metric. errType should be a short, low-cardinality string; +// when empty, ClassifyError(err) supplies a value (one of +// "context_canceled", "deadline_exceeded", "rpc_error"). func (s *Span) RecordError(err error, errType string) { if s == nil || err == nil { return diff --git a/pkg/tools/builtin/filesystem.go b/pkg/tools/builtin/filesystem.go index 42e8fde43..9f323949b 100644 --- a/pkg/tools/builtin/filesystem.go +++ b/pkg/tools/builtin/filesystem.go @@ -44,6 +44,22 @@ func annotateFilesystemSpan(ctx context.Context, op, path string) { span.SetAttributes(attrs...) } +// maxFilesystemPathsAttr caps how many entries from args.Paths land on a +// span attribute. Many backends drop attributes over a few KiB and per- +// element string costs add up fast on a multi-hundred-path call. The +// path_count attribute (always recorded) preserves total fidelity. +const maxFilesystemPathsAttr = 32 + +// cappedPaths returns paths truncated to maxFilesystemPathsAttr entries. +// Callers should also record `path_count = len(paths)` separately so the +// truncation is visible. 
+func cappedPaths(paths []string) []string { + if len(paths) <= maxFilesystemPathsAttr { + return paths + } + return paths[:maxFilesystemPathsAttr] +} + const ( ToolNameReadFile = "read_file" ToolNameReadMultipleFiles = "read_multiple_files" @@ -913,7 +929,7 @@ func (t *FilesystemTool) handleReadMultipleFiles(ctx context.Context, args ReadM if span := trace.SpanFromContext(ctx); span.IsRecording() { span.SetAttributes( attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), - attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), ) } type PathContent struct { @@ -1138,7 +1154,7 @@ func (t *FilesystemTool) handleCreateDirectory(ctx context.Context, args CreateD if span := trace.SpanFromContext(ctx); span.IsRecording() { span.SetAttributes( attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), - attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), ) } var results []string @@ -1161,7 +1177,7 @@ func (t *FilesystemTool) handleRemoveDirectory(ctx context.Context, args RemoveD if span := trace.SpanFromContext(ctx); span.IsRecording() { span.SetAttributes( attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), - attribute.StringSlice("cagent.tool.filesystem.paths", args.Paths), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), ) } var results []string From b6a181b4ad8a8db325d5c01ebe8520b181755006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Mon, 4 May 2026 12:57:40 +0300 Subject: [PATCH 12/17] fix(otel): gate codemode script body on capture, sanitize fetch URL attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `pkg/tools/codemode/exec.go`: emit `cagent.tool.codemode.script_hash` (SHA-256) + `script_length` unconditionally so dashboards 
can correlate identical scripts and spot oversize submissions, but gate the full `cagent.tool.codemode.script` body behind `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`. Codemode scripts are kilobyte-scale arbitrary JS that routinely embed auth tokens / pasted user data / inline secrets, so the bundle decision (Option B, ship body unconditionally) was the wrong call for this attribute specifically - `pkg/tools/builtin/fetch.go`: strip query strings, fragments, and userinfo from `cagent.tool.fetch.urls` so the attribute can ship by default without leaking signed-URL tokens, OAuth codes, or inline credentials. Path stays intact so dashboards still answer "which sites/endpoints did the agent hit?". Unparseable URLs are emitted as a sentinel value rather than passed through verbatim. Both span attributes were flagged on the upstream PR review for the same root cause — emitting unbounded user-controlled content as a default-on telemetry attribute creates a PII/secret-exfiltration surface. The other Option B attributes (`shell.cmd`, `filesystem.path`, `script_shell.cmd`) stay unconditional: they are short, do not carry the same query-token / arbitrary-content risk, and remain decision-relevant for incident response --- pkg/tools/builtin/fetch.go | 38 ++++++++++++++++++++++++++++--------- pkg/tools/codemode/exec.go | 20 ++++++++++++++------ 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pkg/tools/builtin/fetch.go b/pkg/tools/builtin/fetch.go index 61ac01eca..08d997491 100644 --- a/pkg/tools/builtin/fetch.go +++ b/pkg/tools/builtin/fetch.go @@ -50,22 +50,44 @@ type FetchToolArgs struct { Format string `json:"format,omitempty"` } +// sanitizeFetchURLs strips query strings and userinfo from each URL so +// the resulting span attribute can ship by default without leaking +// signed-URL tokens, OAuth codes, or inline credentials.
URLs that fail +// to parse are emitted as a sentinel rather than the raw string, since +// an unparseable URL could also carry sensitive material. +func sanitizeFetchURLs(urls []string) []string { + out := make([]string, len(urls)) + for i, raw := range urls { + u, err := url.Parse(raw) + if err != nil { + out[i] = "" + continue + } + u.RawQuery = "" + u.Fragment = "" + u.User = nil + out[i] = u.String() + } + return out +} + func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*tools.ToolCallResult, error) { if len(params.URLs) == 0 { return nil, errors.New("at least one URL is required") } - // Decorate the active runtime.tool.handler span with the URL list - // and request shape. Each fetched URL still produces its own HTTP - // CLIENT child span via `httpclient.WrapWithOTel` below, so the - // per-request status / latency / target host all show up there; - // the parent span gets the requested URLs so a quick glance answers - // "which sites did the agent hit on this turn?" without expanding - // the children. + // Decorate the active runtime.tool.handler span with the requested + // URLs. Strip query params and userinfo first: query strings often + // carry signed-URL tokens, OAuth codes, or session IDs, and userinfo + // carries credentials inline. The path stays intact so dashboards + // can still answer "which sites/endpoints did the agent hit?" — the + // HTTP CLIENT child span emitted by `httpclient.WrapWithOTel` below + // retains the full URL under `http.url` for callers that opt into + // that backend's full-URL capture. 
if span := trace.SpanFromContext(ctx); span.IsRecording() { attrs := []attribute.KeyValue{ attribute.Int("cagent.tool.fetch.url_count", len(params.URLs)), - attribute.StringSlice("cagent.tool.fetch.urls", params.URLs), + attribute.StringSlice("cagent.tool.fetch.urls", sanitizeFetchURLs(params.URLs)), } if params.Format != "" { attrs = append(attrs, attribute.String("cagent.tool.fetch.format", params.Format)) diff --git a/pkg/tools/codemode/exec.go b/pkg/tools/codemode/exec.go index 9d3a78b16..df143b1f4 100644 --- a/pkg/tools/codemode/exec.go +++ b/pkg/tools/codemode/exec.go @@ -3,6 +3,8 @@ package codemode import ( "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "fmt" "slices" @@ -11,6 +13,7 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -42,17 +45,22 @@ func (c *codeModeTool) runJavascript(ctx context.Context, script string) (Script vm := goja.New() tracker := &toolCallTracker{} - // Stamp the script body and length onto the active span; the - // post-run defer adds the tool-call count. Script ships - // unconditionally — it's the main signal of what a code-mode turn - // did. Drop or hash `cagent.tool.codemode.script` at the OTel - // collector if scripts routinely carry secrets. + // Always stamp a hash + length so dashboards can correlate + // identical scripts ("model ran the same script 200 times this + // hour") without ever shipping the body. Codemode scripts are + // kilobyte-scale arbitrary JS — embedded auth tokens, pasted + // user data, and inline secrets are common — so the body itself + // is gated behind the GenAI content-capture opt-in. 
span := trace.SpanFromContext(ctx) if span.IsRecording() { + sum := sha256.Sum256([]byte(script)) span.SetAttributes( - attribute.String("cagent.tool.codemode.script", script), + attribute.String("cagent.tool.codemode.script_hash", hex.EncodeToString(sum[:])), attribute.Int("cagent.tool.codemode.script_length", len(script)), ) + if genai.IsContentCaptureEnabled() { + span.SetAttributes(attribute.String("cagent.tool.codemode.script", script)) + } } defer func() { if span.IsRecording() { From 653dcc98a1326d42bb0445e83eec792ee089795f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Tue, 5 May 2026 15:27:42 +0300 Subject: [PATCH 13/17] feat(mcp-client): span remote mcp requests with w3c traceparent injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap the HTTP transport chain with `httpclient.WrapWithOTel` so every outbound MCP request injects W3C `traceparent` headers and creates an HTTP CLIENT span. Without this wrap, the streamable-HTTP/SSE transports the gomcp SDK builds send raw POST/GET requests that never chain onto the calling cagent span—the downstream MCP server's spans then live in a separate root trace, breaking end-to-end observability for any agent talking to a remote MCP server. `WrapWithOTel` is a no-op when OTel is disabled at runtime, so the laptop-mode default stays unchanged. 
--- pkg/tools/mcp/remote.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/tools/mcp/remote.go b/pkg/tools/mcp/remote.go index 805c3fe1a..1a4ecdf7b 100644 --- a/pkg/tools/mcp/remote.go +++ b/pkg/tools/mcp/remote.go @@ -9,6 +9,7 @@ import ( gomcp "github.com/modelcontextprotocol/go-sdk/mcp" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/upstream" ) @@ -31,6 +32,7 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } return &remoteMCPClient{ + sessionClient: sessionClient{serverAddress: url}, url: url, transportType: transportType, headers: headers, @@ -132,6 +134,16 @@ func (c *remoteMCPClient) SetManagedOAuth(managed bool) { // The oauthTransport is returned alongside the client so callers can inspect // the most recent server-side failure (via lastServerError) when Connect() // returns a bare HTTP-status error and we need to surface the actual cause. +// +// The transport chain wraps `httpclient.WrapWithOTel` outermost so every +// outbound MCP request injects W3C `traceparent` (and creates an HTTP +// CLIENT span). Without this wrap, the streamable-HTTP / SSE transports +// the gomcp SDK builds with our `*http.Client` send raw POST/GET requests +// that never chain onto the calling cagent span — the downstream MCP +// server's spans then live in a separate root trace, breaking end-to-end +// observability for any agent talking to a remote MCP. `WrapWithOTel` is +// a no-op when OTel is disabled at runtime, so the laptop-mode default +// stays unchanged. 
func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { base := c.headerTransport() @@ -145,7 +157,7 @@ func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { oauthConfig: c.oauthConfig, } - return &http.Client{Transport: oauthT}, oauthT + return &http.Client{Transport: httpclient.WrapWithOTel(oauthT)}, oauthT } func (c *remoteMCPClient) headerTransport() http.RoundTripper { From d70e6d7df64adcd438707a1bf37a343ab6d07891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Tue, 5 May 2026 15:38:40 +0300 Subject: [PATCH 14/17] feat(mcp-client): stamp server.address on all client spans Capture the connection identifier (URL for remote clients, executable name for stdio) at construction time and stamp it as `server.address` on every CLIENT-kind MCP span. This makes error spans triageable when multiple MCP servers are wired up - without it, a `tools/list` failure only shows `mcp.method.name=tools/list` with no indication of which target produced the error. Also stamp `server.address` on the parent `toolset.start` span before initialization so Initialize failures (e.g. multi-replica MCP "session not found" 404) carry the target address directly on the span rather than requiring log greppage to match toolsets to URLs. 
--- pkg/tools/mcp/mcp.go | 20 ++++++++++++++++++++ pkg/tools/mcp/mcp_test.go | 2 ++ pkg/tools/mcp/reconnect_test.go | 1 + pkg/tools/mcp/session_client.go | 33 +++++++++++++++++++++++++++++---- pkg/tools/mcp/stdio.go | 13 +++++++++---- 5 files changed, 61 insertions(+), 8 deletions(-) diff --git a/pkg/tools/mcp/mcp.go b/pkg/tools/mcp/mcp.go index a0537fba3..34f9f3abf 100644 --- a/pkg/tools/mcp/mcp.go +++ b/pkg/tools/mcp/mcp.go @@ -16,6 +16,8 @@ import ( "time" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/tools" @@ -33,6 +35,11 @@ type mcpClient interface { SetManagedOAuth(managed bool) SetToolListChangedHandler(handler func()) SetPromptListChangedHandler(handler func()) + // ServerAddress returns the connection identifier (URL for remote + // clients, executable name for stdio). Used by `Toolset.Start` to + // stamp `server.address` on the parent `toolset.start` span so + // initialize failures show which target produced them. + ServerAddress() string // Wait blocks until the underlying connection is closed by the server. // It returns nil if the connection was closed gracefully. Wait() error @@ -286,6 +293,19 @@ func (ts *Toolset) Start(ctx context.Context) error { if ts.supervisor == nil { return errors.New("toolset has no supervisor: must be created via NewToolsetCommand or NewRemoteToolset") } + // Stamp the connection identifier on the parent `toolset.start` + // span before doing anything else so an Initialize failure (e.g. + // the multi-replica MCP "session not found" 404 case) carries the + // target address as `server.address` — without this, the error + // message has the only clue and triage requires log greppage to + // match toolsets to URLs. 
+ if ts.mcpClient != nil { + if addr := ts.mcpClient.ServerAddress(); addr != "" { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes(attribute.String("server.address", addr)) + } + } + } return ts.supervisor.Start(ctx) } diff --git a/pkg/tools/mcp/mcp_test.go b/pkg/tools/mcp/mcp_test.go index 8a80e6264..63be08bab 100644 --- a/pkg/tools/mcp/mcp_test.go +++ b/pkg/tools/mcp/mcp_test.go @@ -50,6 +50,8 @@ func (m *mockMCPClient) SetToolListChangedHandler(func()) {} func (m *mockMCPClient) SetPromptListChangedHandler(func()) {} +func (m *mockMCPClient) ServerAddress() string { return "mock://test" } + func (m *mockMCPClient) Wait() error { return nil } func (m *mockMCPClient) Close(context.Context) error { return nil } diff --git a/pkg/tools/mcp/reconnect_test.go b/pkg/tools/mcp/reconnect_test.go index 71ece482b..df0257a89 100644 --- a/pkg/tools/mcp/reconnect_test.go +++ b/pkg/tools/mcp/reconnect_test.go @@ -72,6 +72,7 @@ func (m *failingInitClient) SetOAuthSuccessHandler(func()) {} func (m *failingInitClient) SetManagedOAuth(bool) {} func (m *failingInitClient) SetToolListChangedHandler(func()) {} func (m *failingInitClient) SetPromptListChangedHandler(func()) {} +func (m *failingInitClient) ServerAddress() string { return "mock://failing" } func (m *failingInitClient) Wait() error { m.mu.Lock() diff --git a/pkg/tools/mcp/session_client.go b/pkg/tools/mcp/session_client.go index d7dd68891..8eee35cbc 100644 --- a/pkg/tools/mcp/session_client.go +++ b/pkg/tools/mcp/session_client.go @@ -18,8 +18,16 @@ import ( // implementations. Both stdioMCPClient and remoteMCPClient embed it to avoid // duplicating the session-nil guards, notification handlers, and delegating // methods. +// +// `serverAddress` is captured at construction time (the remote URL for +// HTTP/SSE clients, the executable name for stdio clients) and stamped on +// every CLIENT-kind MCP span as the OTel `server.address` attribute. 
Without +// it, a `tools/list` failure span carries `mcp.method.name=tools/list` and +// nothing else identifying which target produced the error — useful in a +// single-MCP agent, useless in any agent wired to two or more. type sessionClient struct { session *gomcp.ClientSession + serverAddress string toolListChangedHandler func() promptListChangedHandler func() elicitationHandler tools.ElicitationHandler @@ -34,6 +42,15 @@ func (c *sessionClient) setSession(s *gomcp.ClientSession) { c.mu.Unlock() } +// ServerAddress returns the connection identifier captured at construction +// time (URL for remote clients, executable name for stdio). Exposed so +// the parent `toolset.start` span can stamp it as `server.address` — +// otherwise an Initialize failure surfaces the error message but no +// indication of which MCP target produced it. +func (c *sessionClient) ServerAddress() string { + return c.serverAddress +} + // getSession returns the current session under the read lock. func (c *sessionClient) getSession() *gomcp.ClientSession { c.mu.RLock() @@ -106,7 +123,9 @@ func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsP // iteration lifetime. return func(yield func(*gomcp.Tool, error) bool) { spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ - Method: otelmcp.MethodToolsList, + Method: otelmcp.MethodToolsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, }) defer span.End() @@ -134,7 +153,9 @@ func (c *sessionClient) CallTool(ctx context.Context, request *gomcp.CallToolPar return nil, errors.New("session not initialized") } opts := otelmcp.CallOptions{ - Method: otelmcp.MethodToolsCall, + Method: otelmcp.MethodToolsCall, + SessionID: s.ID(), + ServerAddress: c.serverAddress, } if request != nil { opts.ToolName = request.Name @@ -165,7 +186,9 @@ func (c *sessionClient) ListPrompts(ctx context.Context, request *gomcp.ListProm // Span and RPC start at iteration time so an unused // iterator never leaks either. 
spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ - Method: otelmcp.MethodPromptsList, + Method: otelmcp.MethodPromptsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, }) defer span.End() @@ -190,7 +213,9 @@ func (c *sessionClient) GetPrompt(ctx context.Context, request *gomcp.GetPromptP return nil, errors.New("session not initialized") } opts := otelmcp.CallOptions{ - Method: otelmcp.MethodPromptsGet, + Method: otelmcp.MethodPromptsGet, + SessionID: s.ID(), + ServerAddress: c.serverAddress, } if request != nil { opts.PromptName = request.Name diff --git a/pkg/tools/mcp/stdio.go b/pkg/tools/mcp/stdio.go index 01e3fab25..454fb3139 100644 --- a/pkg/tools/mcp/stdio.go +++ b/pkg/tools/mcp/stdio.go @@ -22,10 +22,15 @@ type stdioMCPClient struct { func newStdioCmdClient(command string, args, env []string, cwd string) *stdioMCPClient { return &stdioMCPClient{ - command: command, - args: args, - env: env, - cwd: cwd, + // stdio has no real "server address" in the OTel HTTP sense; using + // the command as a stand-in keeps spans triageable when the agent + // has multiple stdio MCPs wired up. Span readers see the + // executable name (e.g. `foo-mcp-server`) on `server.address`. + sessionClient: sessionClient{serverAddress: command}, + command: command, + args: args, + env: env, + cwd: cwd, } } From 60b5ebd34c409a0ecf422510dc8640dab10f7122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Tue, 5 May 2026 16:21:50 +0300 Subject: [PATCH 15/17] fix(mcp-client): strip credentials from server.address before stamping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Storing the raw remote URL as serverAddress meant that any MCP endpoint configured with credentials in the URL — basic-auth userinfo (https://user:token@host/) or a query-string secret (https://host/?api_key=...) 
— would have those credentials replicated verbatim onto every CLIENT-kind span as server.address and shipped to the trace backend. Telemetry pipelines are not the right place for secrets to land. OTel semantic conventions for server.address specify the host (with optional port) anyway, so collapse to u.Host before storing. The existing buildRemoteDescription helper already does the same thing for user-visible toolset descriptions; this brings the span attribute in line. Empty fallback on parse failure is safe — every callsite already guards on addr != "" before stamping, so a sanitisation miss leaves the span without server.address rather than leaking a raw URL. --- pkg/tools/mcp/remote.go | 23 ++++++++++++++++++++++- pkg/tools/mcp/remote_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/pkg/tools/mcp/remote.go b/pkg/tools/mcp/remote.go index 1a4ecdf7b..42a7a4254 100644 --- a/pkg/tools/mcp/remote.go +++ b/pkg/tools/mcp/remote.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "net/http" + neturl "net/url" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" @@ -32,7 +33,7 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } return &remoteMCPClient{ - sessionClient: sessionClient{serverAddress: url}, + sessionClient: sessionClient{serverAddress: sanitizeRemoteAddress(url)}, url: url, transportType: transportType, headers: headers, @@ -41,6 +42,26 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } } +// sanitizeRemoteAddress extracts a span-safe identifier from an MCP URL +// before stamping it as `server.address`. The URL may legitimately +// contain credentials in userinfo (`https://user:token@host/`) or query +// params (`?api_key=...`); sending those to the trace backend would be +// a real exfiltration risk. 
OTel's semantic convention for +// `server.address` is the host (with optional port) anyway, so we keep +// only `u.Host` and drop everything else. +// +// Returns the empty string on parse failure or hostless URLs (file://, +// stdio commands, malformed input). The caller stamps `server.address` +// only when it's non-empty, so a sanitisation miss leaves the span +// without that attribute rather than leaking a raw URL. +func sanitizeRemoteAddress(rawURL string) string { + u, err := neturl.Parse(rawURL) + if err != nil || u.Host == "" { + return "" + } + return u.Host +} + func (c *remoteMCPClient) Initialize(ctx context.Context, _ *gomcp.InitializeRequest) (*gomcp.InitializeResult, error) { // Create HTTP client with OAuth support. We keep a reference to the // oauthTransport so we can enrich Connect errors with the server's own diff --git a/pkg/tools/mcp/remote_test.go b/pkg/tools/mcp/remote_test.go index 98678fd5d..17d97c9b0 100644 --- a/pkg/tools/mcp/remote_test.go +++ b/pkg/tools/mcp/remote_test.go @@ -12,6 +12,38 @@ import ( "github.com/stretchr/testify/require" ) +// TestSanitizeRemoteAddress verifies that URLs with embedded credentials +// (basic-auth userinfo, query-string secrets) collapse to a host-only +// string before reaching the `server.address` span attribute. The point +// is exfiltration safety: a URL like `https://user:token@host/?api_key=…` +// would otherwise be replicated verbatim into every CLIENT span and +// shipped to the trace backend. 
+func TestSanitizeRemoteAddress(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + url string + want string + }{ + {name: "plain", url: "https://example.com/mcp", want: "example.com"}, + {name: "host with port", url: "https://example.com:8443/mcp", want: "example.com:8443"}, + {name: "userinfo stripped", url: "https://alice:s3cret@example.com/mcp", want: "example.com"}, + {name: "query stripped", url: "https://example.com/mcp?api_key=s3cret", want: "example.com"}, + {name: "userinfo and query stripped", url: "https://alice:s3cret@example.com:8443/mcp?api_key=x", want: "example.com:8443"}, + {name: "fragment stripped", url: "https://example.com/mcp#frag", want: "example.com"}, + {name: "hostless empty fallback", url: "not-a-url", want: ""}, + {name: "empty input", url: "", want: ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := sanitizeRemoteAddress(tc.url) + assert.Equal(t, tc.want, got, "sanitizeRemoteAddress(%q)", tc.url) + }) + } +} + // TestRemoteClientCustomHeaders verifies that custom headers passed to the remote // MCP client are actually applied to HTTP requests sent to the MCP server. func TestRemoteClientCustomHeaders(t *testing.T) { From 75ccd04ef769b485845817adbffcad43cf513a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Wed, 6 May 2026 10:04:46 +0300 Subject: [PATCH 16/17] fix(otel): unwrap toolset wrapper in toolset.start kind attribute Every toolset goes through tools.WithName in the team-loader registry, which sandwiches a *tools.namedToolSet between the StartableToolSet and the actual implementation. %T on the embedded ToolSet therefore always reported *tools.namedToolSet regardless of whether the inner toolset was MCP, A2A, a builtin, or anything else - so the attribute could never answer the question it exists to answer ("which kind of toolset is starting right now?"). 
Unwrap once before formatting, mirroring what DescribeToolSet already does for the same reason. Now the attribute reads *mcp.Toolset, *builtin.ShellTool, etc., so a toolset.start without HTTP children is immediately distinguishable from a remote MCP whose POSTs are missing for some other reason. --- pkg/tools/startable.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/tools/startable.go b/pkg/tools/startable.go index 93994b1a3..67258ba6e 100644 --- a/pkg/tools/startable.go +++ b/pkg/tools/startable.go @@ -84,11 +84,19 @@ func (s *StartableToolSet) Start(ctx context.Context) (err error) { // "tools loading…" UI was previously unattributable. Only // fires when the toolset has work to do; cheap toolsets // without a Startable implementation skip the span entirely. + // Unwrap once so the kind attribute names the underlying toolset + // (e.g. *mcp.Toolset, *builtin.ShellTool) instead of the + // *tools.namedToolSet wrapper that every toolset gets in the + // registry — same pattern DescribeToolSet uses. 
+ inner := s.ToolSet + if u, ok := inner.(Unwrapper); ok { + inner = u.Unwrap() + } ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools").Start( ctx, "toolset.start", trace.WithSpanKind(trace.SpanKindInternal), - trace.WithAttributes(attribute.String("cagent.toolset.kind", fmt.Sprintf("%T", s.ToolSet))), + trace.WithAttributes(attribute.String("cagent.toolset.kind", fmt.Sprintf("%T", inner))), ) defer func() { if err != nil { From 8a791f732173096e6bc315aad662373b1a7d0f4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomas=20Daba=C5=A1inskas?= Date: Wed, 6 May 2026 14:35:14 +0300 Subject: [PATCH 17/17] chore(otel): add tool count attributes to session and mcp spans Record tool counts at two key points in the execution flow: - Session span: total tools available after exclusion filters - MCP list span: tools successfully yielded by each server These attributes enable quick analysis of tool availability without inspecting nested spans or JSON-RPC payloads. The MCP count preserves partial results when iteration terminates early. --- pkg/runtime/loop.go | 6 ++++++ pkg/tools/mcp/session_client.go | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index f24e8b21b..5e2f84867 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -231,6 +231,12 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, } agentTools = filterExcludedTools(agentTools, sess.ExcludedTools) + // Record the catalogue size on the session span — answers "how + // many tools could this turn actually use?" without having to + // walk into per-toolset spans. Stamped after exclusion filters + // so the count matches what was offered to the model. 
+ sessionSpan.SetAttributes(attribute.Int("cagent.agent.tools.count", len(agentTools))) + events <- ToolsetInfo(len(agentTools), false, a.Name()) messages := sess.GetMessages(a) diff --git a/pkg/tools/mcp/session_client.go b/pkg/tools/mcp/session_client.go index 8eee35cbc..e2259142c 100644 --- a/pkg/tools/mcp/session_client.go +++ b/pkg/tools/mcp/session_client.go @@ -9,6 +9,7 @@ import ( "sync" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" @@ -129,6 +130,16 @@ func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsP }) defer span.End() + // Stamp the tool count on the span when iteration finishes — + // answers "what did this server actually return?" without + // having to walk into the JSON-RPC payload. Counts only the + // tools the iterator yielded successfully; partial counts are + // preserved when the caller breaks out early. + var count int + defer func() { + span.SetAttributes(attribute.Int("cagent.mcp.tools.count", count)) + }() + if request != nil { request.Meta = otelmcp.EnsureMeta(request.Meta) otelmcp.InjectMeta(spanCtx, request.Meta) @@ -139,6 +150,8 @@ func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsP // last one — paginated lists may yield multiple // failures and the trace should reflect them all. span.RecordError(err, "") + } else if tool != nil { + count++ } if !yield(tool, err) { return