diff --git a/cmd/root/new.go b/cmd/root/new.go index a52bed77d..a34c4f1f6 100644 --- a/cmd/root/new.go +++ b/cmd/root/new.go @@ -7,6 +7,7 @@ import ( tea "charm.land/bubbletea/v2" "github.com/spf13/cobra" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/app" "github.com/docker/docker-agent/pkg/config" @@ -63,7 +64,9 @@ func (f *newFlags) runNewCommand(cmd *cobra.Command, args []string) (commandErr } defer stopToolSets(t) - rt, err := runtime.New(t) + rt, err := runtime.New(t, + runtime.WithTracer(otel.Tracer(AppName)), + ) if err != nil { return err } diff --git a/cmd/root/otel.go b/cmd/root/otel.go index 9fc1f044d..32e8afd93 100644 --- a/cmd/root/otel.go +++ b/cmd/root/otel.go @@ -5,15 +5,26 @@ import ( "fmt" "net" "os" + "runtime" "strings" "time" + "github.com/google/uuid" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/log/global" "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/log" + "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + + "github.com/docker/docker-agent/pkg/httpclient" + "github.com/docker/docker-agent/pkg/version" ) const AppName = "cagent" @@ -25,73 +36,188 @@ func initOTelSDK(ctx context.Context) (err error) { return fmt.Errorf("failed to create resource: %w", err) } - var traceExporter trace.SpanExporter endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") - // Only initialize if endpoint is configured - if endpoint != "" { - var opts []otlptracehttp.Option - // An endpoint with an http:// or https:// scheme goes through - // WithEndpointURL so the SDK picks the transport from the scheme - // (per the OTLP/HTTP spec). Bare host:port still flows through - // WithEndpoint with the loopback-insecure shortcut preserved. - if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpointURL(endpoint)} - } else { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpoint(endpoint)} - if isLocalhostEndpoint(endpoint) { - opts = append(opts, otlptracehttp.WithInsecure()) - } - } - traceExporter, err = otlptracehttp.New(ctx, opts...) - if err != nil { - return fmt.Errorf("failed to create trace exporter: %w", err) - } + tp, err := newTracerProvider(ctx, res, endpoint) + if err != nil { + return fmt.Errorf("failed to create tracer provider: %w", err) } + otel.SetTracerProvider(tp) - // Configure tracer provider - tracerProviderOpts := []trace.TracerProviderOption{ - trace.WithResource(res), + mp, err := newMeterProvider(ctx, res, endpoint) + if err != nil { + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create meter provider: %w", err) } + otel.SetMeterProvider(mp) - if traceExporter != nil { - tracerProviderOpts = append(tracerProviderOpts, - trace.WithBatcher(traceExporter, - trace.WithBatchTimeout(5*time.Second), - trace.WithMaxExportBatchSize(512), - ), - ) + lp, err := newLoggerProvider(ctx, res, endpoint) + if err != nil { + _ = mp.Shutdown(context.Background()) + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create logger provider: %w", err) } + global.SetLoggerProvider(lp) - tp := trace.NewTracerProvider(tracerProviderOpts...) 
- otel.SetTracerProvider(tp) - - // Propagator must be set so otelhttp injects W3C traceparent on - // outbound requests and extracts it from incoming ones. Without this - // the SDK records spans locally but they never chain across services. + // Set the global text-map propagator unconditionally so otelhttp + // (and any other propagation-aware instrumentation) injects W3C + // `traceparent` / `tracestate` / `baggage` on outbound requests + // and extracts them on incoming ones. The propagator is a global + // no-op until set; without this the SDK records spans locally + // but they never chain across processes — `gen_ai.conversation.id` + // baggage and the MCP `_meta` / sandbox env-var injectors are + // dormant until this runs. otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) + // Single source of truth for "is OTel enabled?" — flip the + // httpclient gate now so outbound requests start emitting CLIENT + // spans and injecting traceparent. Previously the gate read + // OTEL_EXPORTER_OTLP_ENDPOINT directly, which diverged from the + // `--otel` CLI gate that controls this function: we'd either + // initialise providers without HTTP wrapping, or wrap HTTP without + // having a usable propagator. + httpclient.SetOTelEnabled(true) + go func() { <-ctx.Done() - shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - _ = tp.Shutdown(shutdownCtx) + // Flush in dependency order: logs and metrics first (they may + // reference active spans), then traces. Each provider gets its + // own 5s budget so a slow exporter can't starve the others — + // sharing a single timeout meant a stuck logs endpoint silently + // dropped buffered metrics and spans. + shutdown := func(fn func(context.Context) error) { + c, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = fn(c) + } + shutdown(lp.Shutdown) + shutdown(mp.Shutdown) + shutdown(tp.Shutdown) }() return nil } +// newTracerProvider builds the SDK tracer provider with an OTLP/HTTP +// exporter when an endpoint is set. +func newTracerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*trace.TracerProvider, error) { + opts := []trace.TracerProviderOption{trace.WithResource(res)} + + if endpoint == "" { + return trace.NewTracerProvider(opts...), nil + } + + exp, err := otlptracehttp.New(ctx, traceExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create trace exporter: %w", err) + } + opts = append(opts, trace.WithBatcher(exp, + trace.WithBatchTimeout(5*time.Second), + trace.WithMaxExportBatchSize(512), + )) + return trace.NewTracerProvider(opts...), nil +} + +// newMeterProvider builds the SDK meter provider. Without an endpoint the +// provider still wires up so meters callers create are valid no-ops; with +// an endpoint, a periodic reader exports via OTLP/HTTP. +func newMeterProvider(ctx context.Context, res *resource.Resource, endpoint string) (*metric.MeterProvider, error) { + opts := []metric.Option{metric.WithResource(res)} + + if endpoint != "" { + exp, err := otlpmetrichttp.New(ctx, metricExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create metric exporter: %w", err) + } + opts = append(opts, metric.WithReader(metric.NewPeriodicReader(exp, + metric.WithInterval(60*time.Second), + ))) + } + + return metric.NewMeterProvider(opts...), nil +} + +// newLoggerProvider builds the SDK logger provider. 
Required for the +// gen_ai.client.operation.exception event (a log record per spec) and for +// any future log-bridge instrumentation. +func newLoggerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*log.LoggerProvider, error) { + opts := []log.LoggerProviderOption{log.WithResource(res)} + + if endpoint != "" { + exp, err := otlploghttp.New(ctx, logExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create log exporter: %w", err) + } + opts = append(opts, log.WithProcessor(log.NewBatchProcessor(exp))) + } + + return log.NewLoggerProvider(opts...), nil +} + +// normalizeOTLPEndpoint turns a possibly-bare `host:port` into a fully +// scheme-qualified URL so all three OTLP/HTTP exporters can be wired via +// `WithEndpointURL` consistently. We can't rely on the SDKs' default +// scheme inference: `otlptracehttp` (older API) treats a bare endpoint +// as TLS-by-default while `otlploghttp` (newer API) treats the same +// bare endpoint as insecure-by-default. With `OTEL_EXPORTER_OTLP_CERTIFICATE` +// set in the env, the log exporter then errors out with +// `insecure HTTP endpoint cannot use TLS client configuration`, +// `initOTelSDK` propagates the failure, and the entire telemetry +// pipeline (including traces) is torn down. +// +// Pinning the scheme up front removes that asymmetry: localhost gets +// `http://`, every other host gets `https://`, and any explicit scheme +// the caller already supplied is honoured verbatim. +func normalizeOTLPEndpoint(endpoint string) string { + if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { + return endpoint + } + if isLocalhostEndpoint(endpoint) { + return "http://" + endpoint + } + return "https://" + endpoint +} + +func traceExporterOptions(endpoint string) []otlptracehttp.Option { + return []otlptracehttp.Option{otlptracehttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func metricExporterOptions(endpoint string) []otlpmetrichttp.Option { + return []otlpmetrichttp.Option{otlpmetrichttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func logExporterOptions(endpoint string) []otlploghttp.Option { + return []otlploghttp.Option{otlploghttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func shutdownTracerProvider(tp *trace.TracerProvider) error { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return tp.Shutdown(shutdownCtx) +} + func newOTelResource() (*resource.Resource, error) { + // Standard OTel resource attributes; users can layer additional + // labels via the spec-defined `OTEL_RESOURCE_ATTRIBUTES` env var, + // which `resource.Default` merges in. 
+ attrs := []attribute.KeyValue{ + semconv.ServiceName(AppName), + semconv.ServiceVersion(version.Version), + semconv.ServiceInstanceID(uuid.NewString()), + semconv.ProcessPID(os.Getpid()), + semconv.ProcessRuntimeName("go"), + semconv.OSTypeKey.String(runtime.GOOS), + semconv.HostArchKey.String(runtime.GOARCH), + } + if hostname, err := os.Hostname(); err == nil && hostname != "" { + attrs = append(attrs, semconv.HostName(hostname)) + } return resource.Merge( resource.Default(), - resource.NewWithAttributes( - semconv.SchemaURL, - semconv.ServiceName(AppName), - semconv.ServiceVersion("dev"), // TODO: use actual version - ), + resource.NewWithAttributes(semconv.SchemaURL, attrs...), ) } diff --git a/cmd/root/otel_test.go b/cmd/root/otel_test.go index 042973a9e..961383e45 100644 --- a/cmd/root/otel_test.go +++ b/cmd/root/otel_test.go @@ -16,6 +16,63 @@ func TestNewOTelResourceUsesCurrentSchemaURL(t *testing.T) { assert.Equal(t, semconv.SchemaURL, res.SchemaURL()) } +// TestProvidersWithoutEndpoint verifies all three providers build cleanly +// when no OTLP endpoint is configured — they're no-op exporters but must +// still produce valid, non-nil providers so callers can create instruments. +func TestProvidersWithoutEndpoint(t *testing.T) { + t.Parallel() + + ctx := t.Context() + res, err := newOTelResource() + require.NoError(t, err) + + tp, err := newTracerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, tp) + assert.NotNil(t, tp.Tracer("test")) + + mp, err := newMeterProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, mp) + assert.NotNil(t, mp.Meter("test")) + + lp, err := newLoggerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, lp) + assert.NotNil(t, lp.Logger("test")) +} + +// TestNormalizeOTLPEndpoint pins the bare-endpoint -> URL mapping the +// three OTLP/HTTP exporters share. Without this normalization the log +// exporter (insecure-by-default for bare hosts) conflicted with +// OTEL_EXPORTER_OTLP_CERTIFICATE and tore down the whole telemetry +// pipeline; the trace exporter (TLS-by-default for bare hosts) hid +// the inconsistency. 
+func TestNormalizeOTLPEndpoint(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + endpoint string + want string + }{ + {"bare remote host:port -> https", "alloy.observability.svc.cluster.local:4318", "https://alloy.observability.svc.cluster.local:4318"}, + {"bare remote host -> https", "example.com", "https://example.com"}, + {"bare localhost host:port -> http", "localhost:4318", "http://localhost:4318"}, + {"bare localhost -> http", "localhost", "http://localhost"}, + {"bare ipv4 loopback -> http", "127.0.0.1:4318", "http://127.0.0.1:4318"}, + {"bare ipv6 loopback -> http", "[::1]:4318", "http://[::1]:4318"}, + {"explicit https preserved", "https://example.com:4318", "https://example.com:4318"}, + {"explicit http preserved", "http://localhost:4318", "http://localhost:4318"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, normalizeOTLPEndpoint(tt.endpoint)) + }) + } +} + func TestIsLocalhostEndpoint(t *testing.T) { t.Parallel() diff --git a/cmd/root/sandbox.go b/cmd/root/sandbox.go index 8a506138a..c163ed05e 100644 --- a/cmd/root/sandbox.go +++ b/cmd/root/sandbox.go @@ -18,6 +18,7 @@ import ( "github.com/docker/docker-agent/pkg/environment" "github.com/docker/docker-agent/pkg/paths" "github.com/docker/docker-agent/pkg/sandbox" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // runInSandbox delegates the current command to a Docker sandbox. @@ -68,15 +69,30 @@ func runInSandbox(ctx context.Context, cmd *cobra.Command, args []string, runCon envFlags = append(envFlags, "-e", envModelsGateway+"="+gateway) } + // Wrap the sandbox exec in a span so the host side captures timing + // and exit code, and inject W3C trace context via env vars so the + // agent process spawned inside the sandbox container chains its + // own spans onto this parent. + ctx, sbxSpan := genai.StartSandboxExec(ctx, genai.SandboxOptions{ + Runtime: "docker", + Container: name, + }) + defer sbxSpan.End() + envFlags = append(envFlags, genai.InjectSandboxEnv(ctx)...) 
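+	// Illustrative only; the exact variable name below is an assumption
+	// rather than something InjectSandboxEnv is known to emit. The returned
+	// slice is expected to follow the same "-e", "KEY=VALUE" pairing used
+	// above, e.g. []string{"-e", "TRACEPARENT=00-<trace-id>-<span-id>-01"},
+	// so BuildExecCmd forwards it unchanged as docker exec env flags.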
+ dockerCmd := backend.BuildExecCmd(ctx, name, wd, dockerAgentArgs, envFlags, envVars) slog.Debug("Executing in sandbox", "name", name, "args", dockerCmd.Args) if err := dockerCmd.Run(); err != nil { if exitErr, ok := errors.AsType[*exec.ExitError](err); ok { + sbxSpan.SetExitCode(exitErr.ExitCode()) + sbxSpan.RecordError(err, "") return cli.StatusError{StatusCode: exitErr.ExitCode()} } + sbxSpan.RecordError(err, "") return fmt.Errorf("docker sandbox exec failed: %w", err) } + sbxSpan.SetExitCode(0) return nil } diff --git a/go.mod b/go.mod index c050f2778..02eb88cdf 100644 --- a/go.mod +++ b/go.mod @@ -61,8 +61,11 @@ require ( github.com/yuin/goldmark v1.8.2 github.com/zclconf/go-cty v1.18.1 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/log v0.19.0 go.opentelemetry.io/otel/trace v1.43.0 golang.org/x/image v0.39.0 golang.org/x/oauth2 v0.36.0 @@ -234,9 +237,9 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect - go.opentelemetry.io/otel/log v0.16.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/log v0.19.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.yaml.in/yaml/v4 v4.0.0-rc.4 golang.org/x/crypto v0.50.0 // indirect diff --git a/go.sum b/go.sum index 7a79c0304..6164d4149 100644 --- a/go.sum +++ b/go.sum @@ -539,22 +539,28 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8V go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 h1:HIBTQ3VO5aupLKjC90JgMqpezVXwFuq6Ryjn0/izoag= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0/go.mod h1:ji9vId85hMxqfvICA0Jt8JqEdrXaAkcpkI9HPXya0ro= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 h1:NOyNnS19BF2SUDApbOKbDtWZ0IK7b8FJ2uAGdIWOGb0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0/go.mod h1:VL6EgVikRLcJa9ftukrHu/ZkkhFBSo1lzvdBC9CF1ss= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= -go.opentelemetry.io/otel/log v0.16.0 h1:DeuBPqCi6pQwtCK0pO4fvMB5eBq6sNxEnuTs88pjsN4= -go.opentelemetry.io/otel/log v0.16.0/go.mod h1:rWsmqNVTLIA8UnwYVOItjyEZDbKIkMxdQunsIhpUMes= +go.opentelemetry.io/otel/log v0.19.0 h1:KUZs/GOsw79TBBMfDWsXS+KZ4g2Ckzksd1ymzsIEbo4= +go.opentelemetry.io/otel/log v0.19.0/go.mod h1:5DQYeGmxVIr4n0/BcJvF4upsraHjg6vudJJpnkL6Ipk= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= -go.opentelemetry.io/otel/sdk/log v0.16.0 h1:e/b4bdlQwC5fnGtG3dlXUrNOnP7c8YLVSpSfEBIkTnI= -go.opentelemetry.io/otel/sdk/log v0.16.0/go.mod h1:JKfP3T6ycy7QEuv3Hj8oKDy7KItrEkus8XJE6EoSzw4= +go.opentelemetry.io/otel/sdk/log v0.19.0 h1:scYVLqT22D2gqXItnWiocLUKGH9yvkkeql5dBDiXyko= +go.opentelemetry.io/otel/sdk/log v0.19.0/go.mod h1:vFBowwXGLlW9AvpuF7bMgnNI95LiW10szrOdvzBHlAg= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0 h1:BEbF7ZBB6qQloV/Ub1+3NQoOUnVtcGkU3XX4Ws3GQfk= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0/go.mod h1:Lua81/3yM0wOmoHTokLj9y9ADeA02v1naRrVrkAZuKk= go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= diff --git a/pkg/a2a/adapter.go b/pkg/a2a/adapter.go index 333083dc6..3be77917e 100644 --- a/pkg/a2a/adapter.go +++ b/pkg/a2a/adapter.go @@ -8,6 +8,8 @@ import ( "strings" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/adk/agent" "google.golang.org/adk/model" adksession "google.golang.org/adk/session" @@ -17,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" + cgenai "github.com/docker/docker-agent/pkg/telemetry/genai" ) // newDockerAgentAdapter creates a new ADK agent adapter from a docker agent team and agent name. @@ -43,6 +46,21 @@ func newDockerAgentAdapter(t *team.Team, agentName string) (agent.Agent, error) // runDockerAgent executes a docker agent and returns ADK session events func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, a *dagent.Agent) iter.Seq2[*adksession.Event, error] { return func(yield func(*adksession.Event, error) bool) { + // Decorate the inbound `a2a.message` SERVER span (created by + // otelhttp.NewHandler in server.go) with the GenAI semconv + // invoke_agent shape so dashboards can recognise A2A traffic as + // agent invocations rather than generic JSON-RPC POSTs. The + // runtime.session span we open below is the child that records + // the actual work; this annotation makes the parent searchable + // via gen_ai.operation.name="invoke_agent". 
+ if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String(cgenai.AttrOperationName, cgenai.OperationInvokeAgent), + attribute.String(cgenai.AttrAgentName, agentName), + attribute.String(cgenai.AttrAgentNameRuntime, agentName), + ) + } + // Extract user message from the ADK context userContent := ctx.UserContent() message := contentToMessage(userContent) @@ -60,6 +78,13 @@ func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, // Create runtime rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), + // Match the tracer scope used by `cmd/root/run.go` so + // MCP / A2A / API spans share the same instrumentation + // scope as the CLI's runtime spans. Without this option + // `LocalRuntime.startSpan` sees a nil tracer and silently + // returns no-op spans for runtime.session, runtime.stream, + // runtime.tool.call, runtime.fallback, runtime.run_skill, + // hook events, and so on. runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { diff --git a/pkg/a2a/server.go b/pkg/a2a/server.go index c9fa93081..8914b8f9b 100644 --- a/pkg/a2a/server.go +++ b/pkg/a2a/server.go @@ -14,6 +14,7 @@ import ( "github.com/a2aproject/a2a-go/a2asrv" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "google.golang.org/adk/runner" "google.golang.org/adk/server/adka2a" "google.golang.org/adk/session" @@ -104,8 +105,26 @@ func Run(ctx context.Context, agentFilename, agentName string, runConfig *config })) e.Use(middleware.RequestLogger()) - e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(a2asrv.NewStaticAgentCardHandler(agentCard))) - e.POST(agentPath, echo.WrapHandler(a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)))) + // Wrap both A2A endpoints with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming requests. The agent runtime started inside + // `runDockerAgent` then chains its spans onto the calling agent's + // trace, and the `gen_ai.conversation.id` baggage seeded by the + // caller flows through into our local runtime spans without + // per-call plumbing. The agent-card endpoint is included so + // discovery requests carry the same trace context as the + // downstream invocation — propagation is uniform across all + // public surfaces of the server. + cardHandler := otelhttp.NewHandler( + a2asrv.NewStaticAgentCardHandler(agentCard), + "a2a.agent_card", + ) + jsonrpcHandler := otelhttp.NewHandler( + a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)), + "a2a.message", + ) + e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(cardHandler)) + e.POST(agentPath, echo.WrapHandler(jsonrpcHandler)) if err := e.Server.Serve(ln); err != nil && ctx.Err() == nil { slog.Error("Failed to start server", "error", err) diff --git a/pkg/acp/agent.go b/pkg/acp/agent.go index 06b8a4879..5de115fca 100644 --- a/pkg/acp/agent.go +++ b/pkg/acp/agent.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/coder/acp-go-sdk" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -144,6 +145,9 @@ func (a *Agent) NewSession(ctx context.Context, params acp.NewSessionRequest) (a rt, err := runtime.New(a.team, runtime.WithCurrentAgent(defaultAgent.Name()), runtime.WithSessionStore(a.sessionStore), + // Match the CLI tracer scope; without this the ACP-mode + // runtime's `startSpan` is a no-op for every runtime.* span. 
+ runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { return acp.NewSessionResponse{}, fmt.Errorf("failed to create runtime: %w", err) diff --git a/pkg/chatserver/runtime_pool.go b/pkg/chatserver/runtime_pool.go index d79f03448..397d13513 100644 --- a/pkg/chatserver/runtime_pool.go +++ b/pkg/chatserver/runtime_pool.go @@ -4,6 +4,8 @@ import ( "errors" "sync" + "go.opentelemetry.io/otel" + "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/team" ) @@ -56,7 +58,13 @@ func (p *runtimePool) Get(agent string) (runtime.Runtime, error) { if rt := p.takeIdle(agent); rt != nil { return rt, nil } - rt, err := runtime.New(p.team, runtime.WithCurrentAgent(agent)) + // Match the tracer scope used by the CLI; without this the + // pooled chatserver runtimes are tracer-less so all `runtime.*` + // spans go silent in OpenAI-compatible chat-completions mode. + rt, err := runtime.New(p.team, + runtime.WithCurrentAgent(agent), + runtime.WithTracer(otel.Tracer("cagent")), + ) if err != nil { return nil, err } diff --git a/pkg/chatserver/server.go b/pkg/chatserver/server.go index 60ab69d51..aa2a1bbdf 100644 --- a/pkg/chatserver/server.go +++ b/pkg/chatserver/server.go @@ -36,6 +36,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/openai/openai-go/v3" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -125,14 +126,23 @@ func Run(ctx context.Context, agentFilename string, opts Options, ln net.Listene return err } - httpServer := &http.Server{ - Handler: newRouter(&server{ + // Wrap with otelhttp so incoming /v1/chat/completions requests + // (including SSE streams) extract the caller's trace context. + // otelhttp ends the span when the response body is closed, so + // SSE streaming responses get a span that covers the full + // stream duration. + handler := otelhttp.NewHandler( + newRouter(&server{ team: t, policy: policy, conversations: newConversationStore(opts.ConversationsMaxSessions, conversationTTL(opts)), conversationLocks: newConversationLockSet(), runtimes: newRuntimePool(t, opts.MaxIdleRuntimes), }, opts), + "chatserver", + ) + httpServer := &http.Server{ + Handler: handler, ReadHeaderTimeout: 30 * time.Second, } return serve(ctx, httpServer, ln) diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 38ae652fd..391536aee 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -13,6 +13,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/model/provider" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // relevancePrompt is the prompt template for the judge model to evaluate responses. @@ -155,10 +156,34 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] for i, r := range rawResults { if r.err != nil { errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err)) + // Emit gen_ai.evaluation.result with error.type so the + // failed checks show up alongside the successful ones in + // log-based dashboards. Set ScoreLabel="error" so + // dashboards that GROUP BY label still surface these + // rows (otherwise the missing label silently drops them). 
+ genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: "error", + ErrorType: genai.ClassifyError(r.err), + }) continue } results[i].Passed = r.passed results[i].Reason = r.reason + + score := 0.0 + label := "failed" + if r.passed { + score = 1.0 + label = "passed" + } + genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: label, + ScoreValue: score, + HasScoreValue: true, + Explanation: r.reason, + }) } if len(errs) > 0 { diff --git a/pkg/hooks/executor.go b/pkg/hooks/executor.go index e6b0e0b8f..fb85905e1 100644 --- a/pkg/hooks/executor.go +++ b/pkg/hooks/executor.go @@ -10,6 +10,13 @@ import ( "regexp" "strings" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Executor dispatches configured hooks. Hook types are resolved against @@ -134,6 +141,27 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) return &Result{Allowed: true}, nil } + // Single span per Dispatch call covers every hook the event matched. + // Custom name `hook.{event}` because there is no GenAI semconv for + // arbitrary user-defined lifecycle hooks; we surface the event type, + // matched hook count, and session/agent identifiers so dashboards can + // split by event class without parsing span events. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/hooks").Start( + ctx, + "hook."+string(event), + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String("cagent.hook.event", string(event)), + attribute.Int("cagent.hook.count", len(hooks)), + attribute.String("cagent.agent.name", input.AgentName), + attribute.String("gen_ai.conversation.id", input.SessionID), + ), + ) + if input.ToolName != "" { + span.SetAttributes(attribute.String("gen_ai.tool.name", input.ToolName)) + } + defer span.End() + input.HookEventName = event if input.Cwd == "" { input.Cwd = e.workingDir @@ -143,6 +171,8 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) inputJSON, err := input.ToJSON() if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) return nil, fmt.Errorf("failed to serialize hook input: %w", err) } @@ -153,7 +183,57 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) } wg.Wait() - return aggregate(results, event), nil + final := aggregate(results, event) + annotateHookSpan(span, event, final) + return final, nil +} + +// annotateHookSpan stamps the aggregated verdict onto the hook.{event} +// span so dashboards can answer "did the hook block this?" and "why?" +// without re-running the hook. Prior to this the span only carried the +// event type and hook count — a denied call looked identical to an +// allowed one. The verdict booleans and short reason are unconditional +// (they're decisions, not content); free-text fields that may contain +// PII or LLM output (Message, AdditionalContext, SystemMessage, +// Summary) are gated on the GenAI content-capture opt-in. 
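+// For illustration (values are hypothetical): a denied permission-request
+// hook ends up with cagent.hook.allowed=false, cagent.hook.exit_code set to
+// the hook's exit status, cagent.hook.decision="deny",
+// cagent.hook.decision_reason carrying the short reason, and
+// cagent.hook.permission_allowed=false, with cagent.hook.message attached
+// only when content capture is enabled.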
+func annotateHookSpan(span trace.Span, event EventType, r *Result) { + if span == nil || r == nil { + return + } + attrs := []attribute.KeyValue{ + attribute.Bool("cagent.hook.allowed", r.Allowed), + attribute.Int("cagent.hook.exit_code", r.ExitCode), + } + if r.Decision != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision", string(r.Decision))) + } + if r.DecisionReason != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision_reason", r.DecisionReason)) + } + if event == EventPermissionRequest { + attrs = append(attrs, attribute.Bool("cagent.hook.permission_allowed", r.PermissionAllowed)) + } + if r.ModifiedInput != nil { + attrs = append(attrs, attribute.Bool("cagent.hook.modified_input", true)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.Bool("cagent.hook.summary_provided", true)) + } + if genai.IsContentCaptureEnabled() { + if r.Message != "" { + attrs = append(attrs, attribute.String("cagent.hook.message", r.Message)) + } + if r.AdditionalContext != "" { + attrs = append(attrs, attribute.String("cagent.hook.additional_context", r.AdditionalContext)) + } + if r.SystemMessage != "" { + attrs = append(attrs, attribute.String("cagent.hook.system_message", r.SystemMessage)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.String("cagent.hook.summary", r.Summary)) + } + } + span.SetAttributes(attrs...) } // hooksFor returns the deduplicated list of hooks that should run for diff --git a/pkg/hooks/handler.go b/pkg/hooks/handler.go index 2d5a2974a..ea8276157 100644 --- a/pkg/hooks/handler.go +++ b/pkg/hooks/handler.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/shellpath" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Handler executes a single hook invocation. It is built by a @@ -188,7 +189,19 @@ type commandHandler struct { func (h *commandHandler) Run(ctx context.Context, input []byte) (HandlerResult, error) { cmd := exec.CommandContext(ctx, h.shell, append(h.shellArgs, h.command)...) cmd.Dir = h.workingDir - cmd.Env = h.env + // Expand nil to os.Environ() so the child inherits the parent env + // (matching the pre-OTel cmd.Env=h.env=nil behaviour), and copy + // into a fresh backing array so concurrent hooks don't race on a + // shared slice when adding the trace-context vars. + base := h.env + if base == nil { + base = os.Environ() + } + traceEnv := genai.InjectTraceContextEnv(ctx) + envCopy := make([]string, 0, len(base)+len(traceEnv)) + envCopy = append(envCopy, base...) + envCopy = append(envCopy, traceEnv...) + cmd.Env = envCopy cmd.Stdin = bytes.NewReader(input) var stdout, stderr bytes.Buffer diff --git a/pkg/httpclient/client.go b/pkg/httpclient/client.go index bb256c7b7..b4d9e7bd5 100644 --- a/pkg/httpclient/client.go +++ b/pkg/httpclient/client.go @@ -6,8 +6,8 @@ import ( "maps" "net/http" "net/url" - "os" "runtime" + "sync/atomic" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -41,13 +41,66 @@ func NewHTTPClient(ctx context.Context, opts ...Opt) *http.Client { rt := newTransport(ctx) return &http.Client{ - Transport: &userAgentTransport{ + Transport: WrapWithOTel(&userAgentTransport{ httpOptions: httpOptions, rt: rt, - }, + }), } } +// otelEnabled tracks whether the OTel SDK has been initialised in this +// process. `cmd/root/otel.go:initOTelSDK` calls `SetOTelEnabled(true)` +// on success; nothing else flips this flag. 
Gating on a single source +// of truth (rather than re-reading `OTEL_EXPORTER_OTLP_ENDPOINT`) +// avoids the previous mismatch where the SDK could be initialised +// without the HTTP wrap, or the HTTP wrap could fire without the SDK +// initialising the propagator. +var otelEnabled atomic.Bool + +// SetOTelEnabled toggles the gate consulted by WrapWithOTel. Called by +// `initOTelSDK` after providers and the propagator are wired so HTTP +// clients start injecting `traceparent` only once the rest of the SDK +// can actually use the resulting spans. +func SetOTelEnabled(enabled bool) { + otelEnabled.Store(enabled) +} + +// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry has +// been enabled via `SetOTelEnabled` (called by `initOTelSDK`), or rt +// unchanged otherwise. Gating avoids per-request span allocation on +// the no-OTel path and stops sending a `traceparent` header to +// upstream LLM providers that have no use for it. Exposed so callers +// that build their own transports outside of `NewHTTPClient` can opt +// into the same gating without duplicating the check. +func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { + if !otelEnabled.Load() { + return rt + } + return otelhttp.NewTransport(rt) +} + +// TracedDefaultClient returns an `http.Client` equivalent to +// `http.DefaultClient` but with the default transport wrapped via +// `WrapWithOTel`. Use as a drop-in replacement at call sites that +// previously did `http.DefaultClient.Do(req)` so OAuth metadata fetches, +// fetch-tool requests, registry probes, and similar one-off HTTP calls +// chain into the active trace. +func TracedDefaultClient() *http.Client { + return &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} +} + +// TracedClient returns a configurable `http.Client` with the default +// transport already wrapped via `WrapWithOTel`. The supplied options +// (timeout, redirect policy, jar, etc.) are applied after construction. +// Convenience wrapper for short-lived clients with custom timeouts. +func TracedClient(opts ...func(*http.Client)) *http.Client { + c := &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} + for _, opt := range opts { + opt(c) + } + return c +} + func WithHeader(key, value string) Opt { return func(o *HTTPOptions) { o.Header.Set(key, value) @@ -109,15 +162,7 @@ func WithQuery(query url.Values) Opt { } } -// newTransport returns an HTTP transport with automatic gzip compression -// disabled and using Docker Desktop proxy if available. -// -// When OpenTelemetry is enabled (i.e. OTEL_EXPORTER_OTLP_ENDPOINT is set, -// matching the gating in initOTelSDK), the transport is wrapped with -// otelhttp so each outbound request emits a CLIENT span and the W3C -// traceparent header is injected. When OTel is disabled, the bare -// transport is returned so we don't allocate per-request spans nor send -// a traceparent header to upstream LLM providers. +// newTransport returns an HTTP transport with automatic gzip compression disabled and using Docker Desktop proxy if available. func newTransport(ctx context.Context) http.RoundTripper { // Get the base transport with Desktop proxy support from remote package rt := remote.NewTransport(ctx) @@ -131,19 +176,7 @@ func newTransport(ctx context.Context) http.RoundTripper { t.DisableCompression() } - return WrapWithOTel(rt) -} - -// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry is -// enabled (OTEL_EXPORTER_OTLP_ENDPOINT set, matching the gating in -// cmd/root/otel.go), or rt unchanged otherwise. 
Exposed so callers that -// build their own transports outside of NewHTTPClient can opt into the -// same env-gated instrumentation without duplicating the gating logic. -func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { - if os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") == "" { - return rt - } - return otelhttp.NewTransport(rt) + return rt } type userAgentTransport struct { diff --git a/pkg/mcp/server.go b/pkg/mcp/server.go index 9a5a0a22f..583aa01ba 100644 --- a/pkg/mcp/server.go +++ b/pkg/mcp/server.go @@ -11,6 +11,7 @@ import ( "slices" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/agent" @@ -19,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" "github.com/docker/docker-agent/pkg/teamloader" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/version" ) @@ -61,10 +63,17 @@ func StartHTTPServer(ctx context.Context, agentFilename, agentName string, runCo fmt.Printf("MCP HTTP server listening on http://%s\n", ln.Addr()) + // Wrap with otelhttp so the MCP-over-HTTP transport extracts + // `traceparent` / `baggage` from incoming requests just like the + // stdio transport extracts them from `params._meta`. Without this + // HTTP-mode MCP clients lose trace context at the boundary. httpServer := &http.Server{ - Handler: mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { - return server - }, nil), + Handler: otelhttp.NewHandler( + mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { + return server + }, nil), + "mcp.http", + ), } errCh := make(chan error, 1) @@ -158,7 +167,25 @@ func createMCPServer(ctx context.Context, agentFilename, agentName string, runCo } func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mcp.CallToolRequest, ToolInput) (*mcp.CallToolResult, ToolOutput, error) { - return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (*mcp.CallToolResult, ToolOutput, error) { + return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (result *mcp.CallToolResult, output ToolOutput, err error) { + // Extract W3C trace context from `params._meta` (per the OTel + // MCP semconv) so the SERVER span chains onto the calling + // CLIENT span. Then start a `tools/call {agent}` SERVER span + // covering the full handler execution. + if req != nil && req.Params != nil { + ctx = otelmcp.ExtractMeta(ctx, req.Params.Meta) + } + ctx, span := otelmcp.StartServer(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + ToolName: agentName, + }) + defer func() { + if err != nil { + span.RecordError(err, "") + } + span.End() + }() + slog.Debug("MCP tool called", "agent", agentName, "message", input.Message) ag, err := t.Agent(agentName) @@ -179,6 +206,9 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), runtime.WithNonInteractive(true), + // See pkg/a2a/adapter.go for rationale — without this + // the runtime's startSpan is a no-op when cagent runs as + // an MCP server, so all our runtime.* spans go silent. 
runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { @@ -191,11 +221,11 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc return nil, ToolOutput{}, fmt.Errorf("agent execution failed: %w", err) } - result := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") + response := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") - slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(result)) + slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(response)) - return nil, ToolOutput{Response: result}, nil + return nil, ToolOutput{Response: response}, nil } } diff --git a/pkg/memory/database/sqlite/sqlite.go b/pkg/memory/database/sqlite/sqlite.go index e1e349893..cc2409729 100644 --- a/pkg/memory/database/sqlite/sqlite.go +++ b/pkg/memory/database/sqlite/sqlite.go @@ -6,10 +6,40 @@ import ( "fmt" "strings" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/memory/database" "github.com/docker/docker-agent/pkg/sqliteutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) +// memoryDataSourceID is the `gen_ai.data_source.id` value used on +// retrieval-shaped memory operations (SearchMemories) so observability-svc +// can group "agent recalled this memory" timeline entries the same way it +// groups RAG retrievals. +const memoryDataSourceID = "memory" + +// startMemorySpan opens a small INTERNAL span for a memory CRUD operation. +// op is recorded as `cagent.memory.op` and the span name is +// `memory.{op}`. Conversation id flows in via baggage so the span lands +// on the right session timeline. 
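+// The CRUD methods below use it as memory.add, memory.list, memory.delete
+// and memory.update; SearchMemories instead goes through the GenAI
+// retrieval span so recalls group with RAG retrievals.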
+func startMemorySpan(ctx context.Context, op string) (context.Context, trace.Span) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/memory/database/sqlite") + attrs := []attribute.KeyValue{ + attribute.String("cagent.memory.op", op), + } + if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "memory."+op, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) +} + type MemoryDatabase struct { db *sql.DB } @@ -40,15 +70,25 @@ func NewMemoryDatabase(path string) (database.Database, error) { } func (m *MemoryDatabase) AddMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "add") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } _, err := m.db.ExecContext(ctx, "INSERT INTO memories (id, created_at, memory, category) VALUES (?, ?, ?, ?)", memory.ID, memory.CreatedAt, memory.Memory, memory.Category) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory, error) { + ctx, span := startMemorySpan(ctx, "list") + defer span.End() + rows, err := m.db.QueryContext(ctx, "SELECT id, created_at, memory, COALESCE(category, '') FROM memories") if err != nil { return nil, err @@ -73,11 +113,37 @@ func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory } func (m *MemoryDatabase) DeleteMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "delete") + defer span.End() + _, err := m.db.ExecContext(ctx, "DELETE FROM memories WHERE id = ?", memory.ID) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } -func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) ([]database.UserMemory, error) { +func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) (results []database.UserMemory, err error) { + // SearchMemories is the retrieval shape per the OTel GenAI semconv: + // the agent is recalling stored memories filtered by query/category. + // Use the spec'd `retrieval {data_source.id}` span so this lands on + // the same dashboard row as RAG retrievals. + ctx, retSpan := genai.StartRetrieval(ctx, "sqlite", memoryDataSourceID, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + if category != "" { + retSpan.SetAttributes(attribute.String("cagent.memory.category", category)) + } + + // Assign to the named returns (not local shadows) so the deferred + // span closure observes the live error and result count regardless + // of which return path fires. var conditions []string var args []any @@ -102,30 +168,35 @@ func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category str stmt += " WHERE " + strings.Join(conditions, " AND ") } - rows, err := m.db.QueryContext(ctx, stmt, args...) + var rows *sql.Rows + rows, err = m.db.QueryContext(ctx, stmt, args...) 
if err != nil { return nil, err } defer rows.Close() - var memories []database.UserMemory for rows.Next() { var memory database.UserMemory - err := rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category) - if err != nil { + // gocritic suggests `:=` here, but we want to assign to the + // named return `err` so the deferred span closure observes + // the failure. nolint pragma documents the intent. + if err = rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - memories = append(memories, memory) + results = append(results, memory) } - if err := rows.Err(); err != nil { + if err = rows.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - return memories, nil + return results, nil } func (m *MemoryDatabase) UpdateMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "update") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } diff --git a/pkg/model/provider/anthropic/client.go b/pkg/model/provider/anthropic/client.go index 115274458..db82bdbe8 100644 --- a/pkg/model/provider/anthropic/client.go +++ b/pkg/model/provider/anthropic/client.go @@ -14,6 +14,10 @@ import ( "github.com/anthropics/anthropic-sdk-go/option" "github.com/anthropics/anthropic-sdk-go/packages/param" "github.com/anthropics/anthropic-sdk-go/packages/ssestream" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" @@ -22,6 +26,7 @@ import ( "github.com/docker/docker-agent/pkg/model/provider/base" "github.com/docker/docker-agent/pkg/model/provider/options" "github.com/docker/docker-agent/pkg/model/provider/providerutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -696,7 +701,30 @@ func countAnthropicTokens( messages []anthropic.MessageParam, system []anthropic.TextBlockParam, anthropicTools []anthropic.ToolUnionParam, -) (int64, error) { +) (count int64, err error) { + // Token counting is a blocking API call to Anthropic that fires + // on the context-overflow retry path. Span it so the latency is + // attributable when the retry stalls. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.tokens.count", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes( + attribute.String(genai.AttrProviderName, genai.ProviderAnthropic), + attribute.String(genai.AttrRequestModel, model), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if count > 0 { + span.SetAttributes(attribute.Int64("cagent.anthropic.tokens.counted", count)) + } + span.End() + }() + params := anthropic.MessageCountTokensParams{ Model: model, Messages: messages, diff --git a/pkg/model/provider/anthropic/files.go b/pkg/model/provider/anthropic/files.go index 98417abd4..015f102d2 100644 --- a/pkg/model/provider/anthropic/files.go +++ b/pkg/model/provider/anthropic/files.go @@ -15,6 +15,10 @@ import ( "time" "github.com/anthropics/anthropic-sdk-go" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" ) @@ -78,7 +82,25 @@ func NewFileManager(clientFn func(context.Context) (anthropic.Client, error)) *F // Files are deduplicated by content hash AND MIME type, so identical files with // different extensions will be uploaded separately. // Concurrent calls for the same file will wait for a single upload to complete. -func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (*UploadedFile, error) { +func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (result *UploadedFile, err error) { + // Span the whole upload — large files take seconds to minutes + // over slow links and previously the latency was completely + // dark. cache_hit=true paths are short-lived siblings; the + // network upload path is the long branch. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.files.get_or_upload", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attribute.String("cagent.file.path", filePath)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + absPath, err := filepath.Abs(filePath) if err != nil { return nil, fmt.Errorf("failed to get absolute path: %w", err) diff --git a/pkg/model/provider/factory.go b/pkg/model/provider/factory.go index 5ca4fdd8a..22c78288d 100644 --- a/pkg/model/provider/factory.go +++ b/pkg/model/provider/factory.go @@ -71,7 +71,16 @@ func createDirectProvider(ctx context.Context, cfg *latest.ModelConfig, env envi slog.Error("Unknown provider type", "type", providerType) return nil, fmt.Errorf("unknown provider type: %s", providerType) } - return factory(ctx, enhancedCfg, env, opts...) + p, err := factory(ctx, enhancedCfg, env, opts...) + if err != nil { + return nil, err + } + // Wrap leaf providers with the GenAI semconv tracer so every chat + // completion emits a `chat {model}` CLIENT span and the standard + // gen_ai.client.* metrics. The rule-based router constructed by + // createRuleBasedRouter is left bare — its routed targets go through + // resolveRoutedModel → createDirectProvider and end up wrapped here. + return instrumentProvider(p), nil } // providerFactory builds a Provider from a fully-resolved ModelConfig. 
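A hedged sketch of what the wrapping means for call sites; the cfg value is a placeholder, and the capability-mirroring and Unwrap behaviour are the ones described in pkg/model/provider/instrument.go below:

	p, err := createDirectProvider(ctx, cfg, environment.NewNoEnvProvider())
	if err != nil {
		return err
	}
	// The wrapper mirrors the leaf's interface set, so a chat-only provider
	// still fails this assertion and RAG keeps its sequential fallback.
	if ep, ok := p.(EmbeddingProvider); ok {
		_, _ = ep.CreateEmbedding(ctx, "query text") // emits an `embeddings {model}` span
	}
	// Code that needs the concrete leaf type reaches through the wrapper:
	if u, ok := p.(interface{ Unwrap() Provider }); ok {
		p = u.Unwrap()
	}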
diff --git a/pkg/model/provider/factory_test.go b/pkg/model/provider/factory_test.go index 3f849f786..339b86323 100644 --- a/pkg/model/provider/factory_test.go +++ b/pkg/model/provider/factory_test.go @@ -108,8 +108,9 @@ func TestCreateDirectProvider_DispatchByType(t *testing.T) { t.Run(tt.name, func(t *testing.T) { p, err := createDirectProvider(t.Context(), tt.cfg, environment.NewNoEnvProvider()) require.NoError(t, err) - fp, ok := p.(*fakeProvider) - require.True(t, ok, "expected fakeProvider, got %T", p) + leaf := unwrapProvider(p) + fp, ok := leaf.(*fakeProvider) + require.True(t, ok, "expected fakeProvider, got %T", leaf) assert.Equal(t, tt.expectID, fp.id) }) } diff --git a/pkg/model/provider/instrument.go b/pkg/model/provider/instrument.go new file mode 100644 index 000000000..92c44e42b --- /dev/null +++ b/pkg/model/provider/instrument.go @@ -0,0 +1,309 @@ +package provider + +import ( + "context" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/model/provider/base" + "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" + "github.com/docker/docker-agent/pkg/tools" +) + +// unwrapProvider returns the leaf provider underneath any number of +// instrumentation wrappers. Used by tests and by code paths that need to +// reach back to the concrete implementation (e.g. capability assertions +// that the wrappers do not transparently forward). +func unwrapProvider(p Provider) Provider { + for { + u, ok := p.(interface{ Unwrap() Provider }) + if !ok { + return p + } + p = u.Unwrap() + } +} + +// instrumentProvider wraps the leaf provider so every chat completion is +// surrounded by a GenAI semconv-compliant span and the matching client +// metrics. The wrapper is added once at the createDirectProvider boundary +// — the rule-based router (createRuleBasedRouter) is left bare because it +// dispatches to providers that are themselves already wrapped, so a +// single chat span is emitted per call regardless of routing depth. +// +// To avoid changing the apparent capability of the inner provider, the +// wrapper that is returned satisfies exactly the same set of interfaces +// that the inner provider satisfies — chat-only, chat+rerank, +// chat+embed+rerank, etc. RAG callers do `p.(EmbeddingProvider)` and rely +// on `ok=false` to fall back to sequential processing; if the wrapper +// always implemented EmbeddingProvider that fallback would silently +// disappear. +func instrumentProvider(p Provider) Provider { + if p == nil { + return nil + } + + tc := &tracedChat{inner: p} + + bep, isBatchEmbed := p.(BatchEmbeddingProvider) + ep, isEmbed := p.(EmbeddingProvider) + rp, isRerank := p.(RerankingProvider) + + switch { + case isBatchEmbed && isRerank: + return &tracedBatchEmbedRerank{tracedChat: tc, batchEmbed: bep, rerank: rp} + case isBatchEmbed: + return &tracedBatchEmbed{tracedChat: tc, batchEmbed: bep} + case isEmbed && isRerank: + return &tracedEmbedRerank{tracedChat: tc, embed: ep, rerank: rp} + case isEmbed: + return &tracedEmbed{tracedChat: tc, embed: ep} + case isRerank: + return &tracedRerank{tracedChat: tc, rerank: rp} + default: + return tc + } +} + +// tracedChat is the base wrapper. It satisfies just Provider and is +// embedded by every richer wrapper. CreateChatCompletionStream is the +// only method that adds behaviour — everything else delegates. 
+type tracedChat struct {
+	inner Provider
+}
+
+func (t *tracedChat) ID() string { return t.inner.ID() }
+func (t *tracedChat) BaseConfig() base.Config { return t.inner.BaseConfig() }
+
+// Unwrap returns the wrapped provider. Tests and any other caller that
+// needs the leaf type (e.g. for type assertions on internal helper
+// methods) can use the standard unwrap pattern:
+//
+//	if u, ok := p.(interface{ Unwrap() Provider }); ok { p = u.Unwrap() }
+func (t *tracedChat) Unwrap() Provider { return t.inner }
+
+func (t *tracedChat) CreateChatCompletionStream(ctx context.Context, messages []chat.Message, requestTools []tools.Tool) (chat.MessageStream, error) {
+	cfg := t.inner.BaseConfig()
+	req := genai.ChatRequest{
+		Provider: genai.ProviderNameForConfig(cfg.ModelConfig.Provider),
+		Model:    cfg.ModelConfig.Model,
+		Stream:   true,
+	}
+	// Populate sampling parameters from the resolved model config so the
+	// `gen_ai.request.max_tokens` / `temperature` / `top_p` / `top_k`
+	// attributes the GenAI semconv conditionally requires actually land
+	// on the span. Without this, the helper's gated emission paths were
+	// unreachable. Pointer fields distinguish "explicitly set" from
+	// "unset"; the matching Has* flags carry that signal through.
+	if mc := cfg.ModelConfig.MaxTokens; mc != nil {
+		req.MaxTokens = int(*mc)
+	}
+	if temp := cfg.ModelConfig.Temperature; temp != nil {
+		req.Temperature = *temp
+		req.HasTemperature = true
+	}
+	if tp := cfg.ModelConfig.TopP; tp != nil {
+		req.TopP = *tp
+		req.HasTopP = true
+	}
+	chatCtx, span := genai.StartChat(ctx, req)
+
+	// Opt-in capture of request content. Helpers internally check the
+	// `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` env var and
+	// no-op when unset, so the cost on the default path is the
+	// function-call overhead and nothing else.
+	genai.SetInputMessages(span, messages)
+	genai.SetToolDefinitions(span, requestTools)
+
+	stream, err := t.inner.CreateChatCompletionStream(chatCtx, messages, requestTools)
+	if err != nil {
+		span.RecordError(err, genai.ClassifyError(err))
+		span.End()
+		return nil, err
+	}
+	return genai.WrapStream(span, stream), nil
+}
+
+// embeddingRequestForConfig builds an EmbeddingRequest from the inner
+// provider's BaseConfig — same shape as the chat path so the spec
+// `gen_ai.provider.name` / `gen_ai.request.model` attributes use the
+// canonical names.
+func (t *tracedChat) embeddingRequestForConfig(batchSize int) genai.EmbeddingRequest {
+	cfg := t.inner.BaseConfig()
+	return genai.EmbeddingRequest{
+		Provider:  genai.ProviderNameForConfig(cfg.ModelConfig.Provider),
+		Model:     cfg.ModelConfig.Model,
+		BatchSize: batchSize,
+	}
+}
+
+// rerankSpan opens a `rerank` CLIENT span. There is no spec-defined
+// rerank span yet; the operation is closely related to retrieval but
+// distinct enough to warrant its own name. Custom attributes use the
+// `cagent.*` namespace.
+func (t *tracedChat) rerankSpan(ctx context.Context, docCount int) (context.Context, trace.Span) {
+	cfg := t.inner.BaseConfig()
+	tracer := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider")
+	attrs := []attribute.KeyValue{
+		attribute.String(genai.AttrProviderName, genai.ProviderNameForConfig(cfg.ModelConfig.Provider)),
+		attribute.String(genai.AttrRequestModel, cfg.ModelConfig.Model),
+		attribute.Int("cagent.rerank.document_count", docCount),
+	}
+	// Carry `gen_ai.conversation.id` from baggage like every other
+	// span helper in the branch.
The chat / embedding / retrieval / + // fallback / sandbox / MCP starters all do this; rerank was the + // odd one out, leaving rerank latency unattributable in + // per-conversation dashboards. + if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "rerank", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) +} + +// wrapEmbedding wraps a single-input embedding call with a spec +// `embeddings {model}` span. Records token usage and dimension count on +// success; classifies errors on failure. +func wrapEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.EmbeddingResult, error)) (*base.EmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + span.SetDimensions(len(res.Embedding)) + } + return res, nil +} + +// wrapBatchEmbedding wraps a batch embedding call. Records the total +// input tokens across the batch and the per-vector dimensionality. +func wrapBatchEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.BatchEmbeddingResult, error)) (*base.BatchEmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + if len(res.Embeddings) > 0 { + span.SetDimensions(len(res.Embeddings[0])) + } + } + return res, nil +} + +// wrapRerank wraps a Rerank call with a `rerank` CLIENT span that +// captures document count and error classification. +func (t *tracedChat) wrapRerank(ctx context.Context, query string, documents []types.Document, criteria string, fn func(context.Context, string, []types.Document, string) ([]float64, error)) ([]float64, error) { + ctx, span := t.rerankSpan(ctx, len(documents)) + defer span.End() + scores, err := fn(ctx, query, documents, criteria) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + span.SetAttributes(attribute.String("error.type", genai.ClassifyError(err))) + return nil, err + } + return scores, nil +} + +// tracedRerank adds RerankingProvider while still satisfying just Provider +// at the chat layer. +type tracedRerank struct { + *tracedChat + + rerank RerankingProvider +} + +func (t *tracedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedEmbed satisfies EmbeddingProvider. +type tracedEmbed struct { + *tracedChat + + embed EmbeddingProvider +} + +func (t *tracedEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +// tracedEmbedRerank satisfies EmbeddingProvider and RerankingProvider. 
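// Illustrative caller sketch, not part of this patch: the capability probe
// the wrapper family above is built to keep honest. Because each traced*
// type implements exactly the interfaces its leaf implements, ok stays
// false for chat-only providers even after instrumentation, so RAG's
// sequential-processing fallback path is preserved.
func canBatchEmbed(p Provider) bool {
	_, ok := p.(BatchEmbeddingProvider)
	return ok
}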
+type tracedEmbedRerank struct { + *tracedChat + + embed EmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedBatchEmbed satisfies BatchEmbeddingProvider (which embeds +// EmbeddingProvider). +type tracedBatchEmbed struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider +} + +func (t *tracedBatchEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbed) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +// tracedBatchEmbedRerank satisfies BatchEmbeddingProvider and +// RerankingProvider — the broadest combination, used by openai and dmr. +type tracedBatchEmbedRerank struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedBatchEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbedRerank) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +func (t *tracedBatchEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} diff --git a/pkg/rag/manager.go b/pkg/rag/manager.go index 17e77675f..40b051a52 100644 --- a/pkg/rag/manager.go +++ b/pkg/rag/manager.go @@ -11,11 +11,17 @@ import ( "slices" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/rag/database" "github.com/docker/docker-agent/pkg/rag/fusion" "github.com/docker/docker-agent/pkg/rag/rerank" "github.com/docker/docker-agent/pkg/rag/strategy" "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // ToolConfig represents tool-specific configuration @@ -143,7 +149,23 @@ func New(_ context.Context, name string, config Config, strategyEvents <-chan ty // Initialize indexes all documents using all configured strategies // Each strategy indexes its own document set (shared + strategy-specific) // Strategies are initialized in parallel for better performance -func (m *Manager) Initialize(ctx context.Context) error { 
+func (m *Manager) Initialize(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.initialize", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String(genai.AttrDataSourceID, m.name), + attribute.Int("cagent.rag.num_strategies", len(m.strategies)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + slog.Debug("[RAG Manager] Starting initialization", "rag_name", m.name, "num_strategies", len(m.strategies)) @@ -211,7 +233,20 @@ func (m *Manager) Initialize(ctx context.Context) error { // Query searches for relevant documents using all configured strategies // If multiple strategies are configured, results are combined using the fusion strategy -func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchResult, error) { +func (m *Manager) Query(ctx context.Context, query string) (results []database.SearchResult, err error) { + // Start a `retrieval {rag_name}` span per the OTel GenAI semconv. + // The query text itself is sensitive so we never capture it on the + // span here — content capture is gated by a separate environment + // variable in a later commit and emitted via a span event then. + ctx, retSpan := genai.StartRetrieval(ctx, "rag", m.name, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + slog.Debug("[RAG Manager] Starting query", "rag_name", m.name, "num_strategies", len(m.strategies), @@ -228,7 +263,11 @@ func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchRes "strategy_limit", strategyCfg.Limit, "strategy_threshold", strategyCfg.Threshold) - results, err := strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) + // Assign to the function's named returns (note `=`, not + // `:=`) so the deferred span closure sees the live values + // even if a future change replaces the explicit + // `return X, Y` form below with a bare `return`. 
+ results, err = strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) if err != nil { slog.Error("[RAG Manager] Strategy query failed", "rag_name", m.name, @@ -431,7 +470,20 @@ func getStrategyNames(stratMap map[string]strategy.Strategy) []string { } // CheckAndReindexChangedFiles checks for file changes and re-indexes if needed -func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { +func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.reindex", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.CheckAndReindexChangedFiles(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { @@ -442,7 +494,20 @@ func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { } // StartFileWatcher starts monitoring files and directories for changes -func (m *Manager) StartFileWatcher(ctx context.Context) error { +func (m *Manager) StartFileWatcher(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.file_watcher.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.StartFileWatcher(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { diff --git a/pkg/runtime/agent_delegation.go b/pkg/runtime/agent_delegation.go index 0f46f280e..e8c5e39fe 100644 --- a/pkg/runtime/agent_delegation.go +++ b/pkg/runtime/agent_delegation.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" agenttool "github.com/docker/docker-agent/pkg/tools/builtin/agent" "github.com/docker/docker-agent/pkg/tools/builtin/handoff" @@ -408,11 +409,34 @@ func (r *LocalRuntime) handleTaskTransfer(ctx context.Context, sess *session.Ses slog.Debug("Transferring task to agent", "from_agent", a.Name(), "to_agent", params.Agent, "task", params.Task) - ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes( - attribute.String("from.agent", a.Name()), - attribute.String("to.agent", params.Agent), - attribute.String("session.id", sess.ID), - )) + delegationAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name identifies the target agent of the invoke_agent + // operation per the OTel GenAI semconv (Required). cagent.agent.name + // is the same value but in our internal namespace; we emit both so + // spec-aware backends and existing cagent dashboards both see it. 
+ attribute.String(genai.AttrAgentName, params.Agent), + attribute.String("cagent.delegation.from_agent", a.Name()), + attribute.String("cagent.delegation.to_agent", params.Agent), + attribute.String("cagent.delegation.kind", "transfer_task"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, params.Agent), + } + if params.Task != "" { + // Task length is bounded enough to be useful as a span + // attribute for debugging "agent X transferred which task + // to Y". The full task body lands on the sub-session's + // runtime.session span when content capture is opt-in. + delegationAttrs = append(delegationAttrs, attribute.Int("cagent.delegation.task_length", len(params.Task))) + } + if genai.EmitLegacyAttributes() { + delegationAttrs = append(delegationAttrs, + attribute.String("from.agent", a.Name()), + attribute.String("to.agent", params.Agent), + attribute.String("session.id", sess.ID), + ) + } + ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes(delegationAttrs...)) defer span.End() return r.runForwarding(ctx, sess, evts, delegationRequest{ @@ -449,6 +473,26 @@ func (r *LocalRuntime) handleHandoff(ctx context.Context, sess *session.Session, return nil, err } + // Handoff is in-place agent swap (same session, different agent + // from the next turn). Span name keeps the runtime.* family; + // attributes mirror the transfer_task span shape so dashboards + // can union both delegation kinds. Take the returned ctx so + // `executeOnAgentSwitchHooks` and any of its children parent + // onto this span instead of bypassing it. + ctx, span := r.startSpan(ctx, "runtime.handoff", trace.WithAttributes( + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name — Required by OTel GenAI semconv on invoke_agent + // spans; identifies the agent being handed off to. See task_transfer + // for the rationale of dual-emitting alongside cagent.agent.name. + attribute.String(genai.AttrAgentName, next.Name()), + attribute.String("cagent.delegation.from_agent", ca), + attribute.String("cagent.delegation.to_agent", next.Name()), + attribute.String("cagent.delegation.kind", "handoff"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, next.Name()), + )) + defer span.End() + r.executeOnAgentSwitchHooks(ctx, currentAgent, sess.ID, ca, next.Name(), agentSwitchKindHandoff) r.setCurrentAgent(next.Name()) handoffMessage := "The agent " + ca + " handed off the conversation to you. " + diff --git a/pkg/runtime/cache.go b/pkg/runtime/cache.go index 3e5e5a307..7448e418b 100644 --- a/pkg/runtime/cache.go +++ b/pkg/runtime/cache.go @@ -10,6 +10,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // BuiltinCacheResponse is the name of the builtin stop hook that persists @@ -63,7 +64,10 @@ func (r *LocalRuntime) tryReplayCachedResponse( if question == "" { return false } + _, cacheSpan := genai.RecordCacheLookup(ctx, "") cached, ok := c.Lookup(question) + cacheSpan.SetHit(ok && cached != "") + cacheSpan.End() // Treat empty stored values as misses: cache_response only stores // non-empty responses, so an empty entry only surfaces if the JSON // file was hand-edited or downgraded from a future version. 
Replaying @@ -99,7 +103,7 @@ func (r *LocalRuntime) tryReplayCachedResponse( // (handled inside [cache.Cache.Store]), which makes the replay path — // where [LocalRuntime.tryReplayCachedResponse] fires stop hooks for the // cached answer — free of redundant disk writes. -func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { +func (r *LocalRuntime) cacheResponseBuiltin(ctx context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { if in == nil || in.AgentName == "" || in.LastUserMessage == "" || strings.TrimSpace(in.StopResponse) == "" { return nil, nil @@ -111,7 +115,16 @@ func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, return nil, nil } if c := a.Cache(); c != nil { + // Thread the active context so the cache.store span chains + // onto the surrounding stop-hook trace instead of starting a + // detached one. Mark the operation as a successful write so + // the `cagent.cache.requests{operation="store"}` counter is + // incremented — without SetHit the store path would never + // register on the metric. + _, storeSpan := genai.RecordCacheStore(ctx, "") c.Store(in.LastUserMessage, in.StopResponse) + storeSpan.SetHit(true) + storeSpan.End() } return nil, nil } diff --git a/pkg/runtime/compactor/compactor.go b/pkg/runtime/compactor/compactor.go index 721edd2d7..cc52030d5 100644 --- a/pkg/runtime/compactor/compactor.go +++ b/pkg/runtime/compactor/compactor.go @@ -24,6 +24,11 @@ import ( "fmt" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/compaction" @@ -104,7 +109,39 @@ type LLMArgs struct { // Returns (nil, nil) when the model returns an empty summary; callers // should treat that as "compaction was a no-op" and skip the apply // step. -func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) { +func RunLLM(ctx context.Context, args LLMArgs) (result *Result, err error) { + // One INTERNAL `compaction` span covers the LLM-driven summarization + // strategy end-to-end. The inner LLM call gets its own `chat {model}` + // CLIENT child span via the provider decorator, so this parent span + // is a useful aggregate boundary (context limit, summary tokens, + // outcome) without duplicating per-call timing data. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/runtime/compactor").Start( + ctx, + "compaction", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.Int64("cagent.compaction.context_limit", args.ContextLimit), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if result != nil { + // `Result.InputTokens` actually holds the compaction + // sub-session's *output* token count (the summary length) + // per the field's doc — name the span attribute by what the + // value is, not by what the source struct field is named. 
+ span.SetAttributes( + attribute.Int("cagent.compaction.summary_output_tokens", int(result.InputTokens)), + attribute.Float64("cagent.compaction.cost", result.Cost), + attribute.Int("cagent.compaction.first_kept_entry", result.FirstKeptEntry), + ) + } + span.End() + }() + if args.RunAgent == nil { return nil, errors.New("compactor: RunAgent is required") } diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go index 8b0780aab..ee539e2a1 100644 --- a/pkg/runtime/fallback.go +++ b/pkg/runtime/fallback.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/modelerrors" "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -237,6 +238,14 @@ func (e *fallbackExecutor) execute( modelChain := buildModelChain(primaryModel, fallbackModels) startIndex := e.chainStartIndex(a, len(fallbackModels)) + // One runtime.fallback span wraps the whole chain. Each per-model + // CreateChatCompletionStream call below opens its own `chat {model}` + // CLIENT child span via the provider decorator, so the fallback span + // is a useful aggregate boundary (total attempts, final model, + // terminal outcome) without duplicating per-model timing data. + ctx, fbSpan := genai.StartFallback(ctx, a.Name(), primaryModel.ID(), startIndex > 0) + defer fbSpan.End() + var lastErr error primaryFailedWithNonRetryable := false hasFallbacks := len(fallbackModels) > 0 @@ -252,14 +261,17 @@ func (e *fallbackExecutor) execute( for attempt := range maxAttempts { // Check context before each attempt if ctx.Err() != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } + fbSpan.IncrementAttempt() // Apply backoff before retry (not on first attempt of each model) if attempt > 0 { backoffDelay := backoff.Calculate(attempt - 1) logRetryBackoff(a.Name(), modelEntry.provider.ID(), attempt, backoffDelay) if !backoff.SleepWithContext(ctx, backoffDelay) { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } } @@ -294,6 +306,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -317,6 +330,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -326,6 +340,8 @@ func (e *fallbackExecutor) execute( } e.recordSuccess(a, modelEntry, primaryFailedWithNonRetryable) + fbSpan.SetFinalModel(modelEntry.provider.ID()) + fbSpan.SetOutcome(genai.FallbackOutcomeSuccess) return res, modelEntry.provider, nil } } @@ -339,12 +355,17 @@ func (e *fallbackExecutor) execute( prefix = "all models failed" } wrapped := fmt.Errorf("%s: %w", prefix, lastErr) + fbSpan.RecordError(wrapped, "") + fbSpan.SetOutcome(genai.FallbackOutcomeFailed) if modelerrors.IsContextOverflowError(lastErr) { return streamResult{}, nil, modelerrors.NewContextOverflowError(wrapped) } return streamResult{}, nil, wrapped } - return streamResult{}, nil, errors.New("model failed with unknown error") + 
unknownErr := errors.New("model failed with unknown error") + fbSpan.RecordError(unknownErr, "") + fbSpan.SetOutcome(genai.FallbackOutcomeFailed) + return streamResult{}, nil, unknownErr } // retryDecision is the outcome of handleModelError. diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index 96f2f4f7e..e366f0de8 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -20,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/runtime/toolexec" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" bgagent "github.com/docker/docker-agent/pkg/tools/builtin/agent" "github.com/docker/docker-agent/pkg/tools/builtin/handoff" @@ -179,10 +180,32 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, ctx = httpclient.ContextWithSessionID(ctx, sess.ID) r.telemetry.RecordSessionStart(ctx, r.CurrentAgentName(), sess.ID) - ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes( - attribute.String("agent", r.CurrentAgentName()), - attribute.String("session.id", sess.ID), - )) + // Seed `gen_ai.conversation.id` into baggage at the session + // boundary. Every span the runtime, providers, MCP client, RAG, + // sandbox, evaluation, hooks, and (downstream) any subprocess + // or remote service create from here on will pick it up + // automatically without per-helper plumbing — and the value + // rides over W3C `baggage` so it crosses MCP / sandbox / + // HTTP boundaries too. + ctx = genai.WithConversationID(ctx, sess.ID) + + // runtime.session is the root span for one stream. gen_ai.* keys + // are emitted alongside the legacy `agent` / `session.id` keys + // so existing dashboards keep matching while spec-aware tooling + // can filter by `gen_ai.conversation.id` and + // `cagent.agent.name`. Legacy keys drop out under + // OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental. + sessionAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, r.CurrentAgentName()), + } + if genai.EmitLegacyAttributes() { + sessionAttrs = append(sessionAttrs, + attribute.String("agent", r.CurrentAgentName()), + attribute.String("session.id", sess.ID), + ) + } + ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes(sessionAttrs...)) defer sessionSpan.End() // Swap in this stream's events channel for elicitation and save the @@ -213,6 +236,12 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, } agentTools = filterExcludedTools(agentTools, sess.ExcludedTools) + // Record the catalogue size on the session span — answers "how + // many tools could this turn actually use?" without having to + // walk into per-toolset spans. Stamped after exclusion filters + // so the count matches what was offered to the model. 
+ sessionSpan.SetAttributes(attribute.Int("cagent.agent.tools.count", len(agentTools))) + events <- ToolsetInfo(len(agentTools), false, a.Name()) messages := sess.GetMessages(a) @@ -445,10 +474,17 @@ func (r *LocalRuntime) runTurn( toolModelOverride *string, events chan Event, ) (ctrl turnControl) { - streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes( - attribute.String("agent", a.Name()), - attribute.String("session.id", sess.ID), - )) + streamAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, a.Name()), + } + if genai.EmitLegacyAttributes() { + streamAttrs = append(streamAttrs, + attribute.String("agent", a.Name()), + attribute.String("session.id", sess.ID), + ) + } + streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes(streamAttrs...)) // streamSpan ends inline at the natural points (success path before // recordAssistantMessage, error path after handleStreamError) so its // duration tracks the model call only, not the whole iteration. The @@ -600,6 +636,15 @@ func (r *LocalRuntime) runTurn( "Agent terminated: detected %d consecutive identical calls to %s. "+ "This indicates a degenerate loop where the model is not making progress.", consecutive, toolName) + // Mark the session span as Error so loop-termination shows up + // in trace status / error-rate dashboards instead of blending + // in with normal completions. + sessionSpan.SetAttributes( + attribute.String("error.type", "loop_detected"), + attribute.String("cagent.session.terminated_by", "loop_detector"), + attribute.Int("cagent.loop.consecutive_calls", consecutive), + ) + sessionSpan.SetStatus(codes.Error, errMsg) events <- Error(errMsg) r.notifyError(ctx, a, sess.ID, errMsg) loopDetector.Reset() diff --git a/pkg/runtime/skill_runner.go b/pkg/runtime/skill_runner.go index 71e9a7c6d..6ca31c6a6 100644 --- a/pkg/runtime/skill_runner.go +++ b/pkg/runtime/skill_runner.go @@ -10,6 +10,7 @@ import ( "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/builtin/skills" ) @@ -49,11 +50,37 @@ func (r *LocalRuntime) handleRunSkill(ctx context.Context, sess *session.Session // Open the span before any pre-delegation work so model resolution // (inside WithAgentModel) is recorded under runtime.run_skill rather // than the parent session span. - ctx, span := r.startSpan(ctx, "runtime.run_skill", trace.WithAttributes( - attribute.String("agent", ca), - attribute.String("skill", prepared.SkillName), - attribute.String("session.id", sess.ID), - )) + // + // Skills are workflow-shaped (a coordinated process the agent + // orchestrates), so the GenAI semconv `invoke_workflow` operation + // applies. Emit it via gen_ai.* attrs alongside the legacy keys + // for back-compat. 
+ skillAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeWorkflow), + attribute.String(genai.AttrWorkflowName, prepared.SkillName), + attribute.String(genai.AttrAgentNameRuntime, ca), + attribute.String(genai.AttrConversationID, sess.ID), + } + if genai.EmitLegacyAttributes() { + skillAttrs = append(skillAttrs, + attribute.String("agent", ca), + attribute.String("skill", prepared.SkillName), + attribute.String("session.id", sess.ID), + ) + } + // Span name follows the GenAI agent semconv pattern + // `invoke_workflow {workflow.name}` so spec-aware backends + // classify the span as a workflow invocation. SpanKindInternal is + // passed explicitly per spec rather than relying on the SDK + // default — keeps intent clear and immune to default changes. + spanName := genai.OperationInvokeWorkflow + if prepared.SkillName != "" { + spanName = genai.OperationInvokeWorkflow + " " + prepared.SkillName + } + ctx, span := r.startSpan(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(skillAttrs...), + ) defer span.End() slog.Debug("Running skill as sub-agent", diff --git a/pkg/runtime/toolexec/dispatcher.go b/pkg/runtime/toolexec/dispatcher.go index 1d1636eb7..21cd1050a 100644 --- a/pkg/runtime/toolexec/dispatcher.go +++ b/pkg/runtime/toolexec/dispatcher.go @@ -19,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/telemetry" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -30,19 +31,21 @@ const ( ApprovalDecisionDeny = "deny" ApprovalDecisionCanceled = "canceled" - ApprovalSourceYolo = "yolo" - ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" - ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" - ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" - ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" - ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" - ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" - ApprovalSourceReadOnlyHint = "readonly_hint" - ApprovalSourceUserApproved = "user_approved" - ApprovalSourceUserApprovedSession = "user_approved_session" - ApprovalSourceUserApprovedTool = "user_approved_tool" - ApprovalSourceUserRejected = "user_rejected" - ApprovalSourceContextCanceled = "context_canceled" + ApprovalSourceYolo = "yolo" + ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" + ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" + ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" + ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" + ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" + ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" + ApprovalSourcePermissionRequestHookDeny = "permission_request_hook_deny" + ApprovalSourcePermissionRequestHookAllow = "permission_request_hook_allow" + ApprovalSourceReadOnlyHint = "readonly_hint" + ApprovalSourceUserApproved = "user_approved" + ApprovalSourceUserApprovedSession = "user_approved_session" + ApprovalSourceUserApprovedTool = "user_approved_tool" + ApprovalSourceUserRejected = "user_rejected" + ApprovalSourceContextCanceled = "context_canceled" ) // CallOutcome captures the verdicts of a single tool invocation as @@ -245,13 +248,25 @@ type call struct { // and approval bookkeeping lives here so the call lifecycle is visible // at a glance. 
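// Illustrative sketch, not part of this patch: the shape of the gate these
// call sites assume. genai.EmitLegacyAttributes is not shown in this diff;
// the sketch only mirrors the behaviour the surrounding comments describe —
// legacy keys stay on by default and drop out when the OTel stability
// opt-in selects the experimental GenAI conventions.
func emitLegacyAttributes() bool {
	optIn := os.Getenv("OTEL_SEMCONV_STABILITY_OPT_IN")
	return !strings.Contains(optIn, "gen_ai_latest_experimental")
}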
func (c *call) run(ctx context.Context) CallOutcome { - ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("tool.type", string(c.tc.Type)), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + // gen_ai.* attributes are always emitted (spec-compliant). Legacy + // attribute names are added only when the OTel stability flag is + // at its default — `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental` + // drops the legacy keys. Tool type is "function" because every tool + // presented here is an LLM-callable function (transfer_task / + // handoff are runtime-managed but still appear as functions to the + // model). + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes(attrs...)) defer span.End() slog.Debug("Processing tool call", "agent", c.a.Name(), "tool", c.tc.Function.Name, "session_id", c.sess.ID) @@ -422,9 +437,17 @@ func (c *call) applyHookModifiedInput(result *hooks.Result) { } // notifyApproval forwards the resolved approval decision to the -// HookDispatcher, when one is configured. Centralised so the nil-guard -// stays in one place. +// HookDispatcher, when one is configured. Also stamps the decision + +// source on the active runtime.tool.call span so denied / canceled +// calls are visible in trace dashboards (without it, denied tool calls +// are indistinguishable from user-canceled ones at the span level). func (c *call) notifyApproval(ctx context.Context, decision, source string) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.approval.decision", decision), + attribute.String("cagent.approval.source", source), + ) + } if c.d.Hooks == nil { return } @@ -529,6 +552,12 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if !result.Allowed { slog.Debug("Tool denied by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.Message) + // Stamp the deny on the runtime.tool.call span via notifyApproval + // before returning. Without this the span would end with status + // Ok and no cagent.approval.* attrs — denied-by-hook calls would + // look identical to successful ones in trace dashboards, while + // pre_tool_use deny does emit the attrs. Symmetry matters. + c.notifyApproval(ctx, ApprovalDecisionDeny, ApprovalSourcePermissionRequestHookDeny) rejectMsg := "The tool call was rejected by a permission_request hook." 
if reason := strings.TrimSpace(result.Message); reason != "" { rejectMsg += " Reason: " + reason @@ -539,6 +568,7 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if result.PermissionAllowed { slog.Debug("Tool auto-approved by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.AdditionalContext) + c.notifyApproval(ctx, ApprovalDecisionAllow, ApprovalSourcePermissionRequestHookAllow) return runTool(), true } @@ -618,14 +648,28 @@ func (c *call) runHandler(ctx context.Context, handler ToolHandler) { // translation, and session message persistence. It is the only place // where a tool actually runs. func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx context.Context) (*tools.ToolCallResult, time.Duration, error)) *tools.ToolCallResult { - ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes(attrs...)) defer span.End() + // gen_ai.tool.call.arguments capture is gated on the same opt-in as + // chat content (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`) + // because tool arguments commonly carry the same PII / secrets as the + // chat history that produced them (file paths, API tokens, prompts). + if genai.IsContentCaptureEnabled() && c.tc.Function.Arguments != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallArguments, c.tc.Function.Arguments)) + } + c.em.EmitToolCall(c.tc, c.tool, c.a.Name()) res, duration, err := exec(ctx) @@ -647,6 +691,14 @@ func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx contex // path through Dispatch's `exec.Has(event)` short-circuit. res.Output = c.applyToolResponseTransform(ctx, res.Output, false) + // gen_ai.tool.call.result captures the post-transform output so the + // span matches what the LLM actually saw on the next turn (any + // redact_secrets / scrubber rewrite is reflected). Same content-capture + // gating as arguments above. 
+ if genai.IsContentCaptureEnabled() && res != nil && res.Output != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallResult, res.Output)) + } + c.em.EmitToolCallResponse(c.tc.ID, c.tool, res, res.Output, c.a.Name()) c.recordToolResponse(res) return res diff --git a/pkg/server/server.go b/pkg/server/server.go index dd33b1290..030cdbb1a 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -14,6 +14,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/config" @@ -80,8 +81,14 @@ func New(ctx context.Context, sessionStore session.Store, runConfig *config.Runt } func (s *Server) Serve(ctx context.Context, ln net.Listener) error { + // Wrap the Echo handler with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming API requests. Without this the API server's + // runtime spans (already wired via `WithTracer` in the session + // manager) start fresh trace ids per request rather than + // chaining onto the calling client's trace. srv := http.Server{ - Handler: s.e, + Handler: otelhttp.NewHandler(s.e, "agent-api"), } if err := srv.Serve(ln); err != nil && ctx.Err() == nil { diff --git a/pkg/server/session_manager.go b/pkg/server/session_manager.go index 0b6d82605..7d6130440 100644 --- a/pkg/server/session_manager.go +++ b/pkg/server/session_manager.go @@ -12,6 +12,9 @@ import ( "time" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/concurrent" @@ -402,12 +405,30 @@ func (sm *SessionManager) generateTitle(ctx context.Context, sess *session.Sessi } } -func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (runtime.Runtime, *sessiontitle.Generator, error) { +func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (_ runtime.Runtime, _ *sessiontitle.Generator, err error) { // Caller (RunSession) holds sm.mux and has already verified that no // active runtime exists for this session. This function is purely a // constructor: it must not touch sm.runtimeSessions, otherwise it would // briefly publish a half-initialised activeRuntimes (e.g. without the // cancel func) that other goroutines could observe. + // + // Every call is a cold-path construction (caller short-circuits + // cached hits), so a span here attributes per-request first-use + // latency (team load + runtime construction) without adding noise + // on warm paths. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/server").Start( + ctx, "session.runtime_init", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("gen_ai.conversation.id", sess.ID)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + t, err := sm.loadTeam(ctx, agentFilename, rc) if err != nil { return nil, nil, err @@ -427,6 +448,9 @@ func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.S runtime.WithCurrentAgent(currentAgent), runtime.WithManagedOAuth(false), runtime.WithSessionStore(sm.sessionStore), + // Match the tracer scope used by the CLI; without this the + // API-server runtime's startSpan is a no-op so all the + // runtime.* spans go silent in HTTP-server mode. runtime.WithTracer(otel.Tracer("cagent")), } run, err := runtime.New(t, opts...) diff --git a/pkg/sessiontitle/generator.go b/pkg/sessiontitle/generator.go index be8b33166..21f0a8ff9 100644 --- a/pkg/sessiontitle/generator.go +++ b/pkg/sessiontitle/generator.go @@ -13,10 +13,16 @@ import ( "strings" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/model/provider" "github.com/docker/docker-agent/pkg/model/provider/options" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) const ( @@ -56,7 +62,7 @@ func New(model provider.Provider, fallbackModels ...provider.Provider) *Generato // CreateChatCompletionStream, avoiding the overhead of spinning up a nested // runtime, and falls back to the next model on failure. // Returns an empty string if no models or messages are configured. -func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (string, error) { +func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (title string, err error) { if g == nil || len(g.models) == 0 || len(userMessages) == 0 { return "", nil } @@ -67,6 +73,27 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages // the originating session. ctx = httpclient.ContextWithSessionID(ctx, sessionID) + // Wrap the whole title-generation in a span so the boundary is + // visible on the session timeline. The inner per-attempt LLM + // calls each get their own `chat {model}` CLIENT child span via + // the provider decorator. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/sessiontitle").Start( + ctx, + "sessiontitle.generate", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String(genai.AttrConversationID, sessionID), + attribute.Int("cagent.sessiontitle.candidate_count", len(g.models)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + // Apply timeout to prevent hanging on slow or unresponsive models. ctx, cancel := context.WithTimeout(ctx, titleGenerationTimeout) defer cancel() @@ -77,7 +104,10 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages var lastErr error for idx, baseModel := range g.models { - if err := ctx.Err(); err != nil { + // Assign to the named-return `err` so a context cancellation + // is observed by the deferred span closure as a recorded + // error rather than silently slipping through. 
+ if err = ctx.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return "", err } diff --git a/pkg/teamloader/teamloader.go b/pkg/teamloader/teamloader.go index f28ceea8b..bbe4a8c6f 100644 --- a/pkg/teamloader/teamloader.go +++ b/pkg/teamloader/teamloader.go @@ -13,6 +13,11 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/config/latest" @@ -88,7 +93,23 @@ func Load(ctx context.Context, agentSource config.Source, runConfig *config.Runt // LoadWithConfig loads an agent team and returns both the team and config info // needed for runtime model switching. -func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (*LoadResult, error) { +func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (result *LoadResult, err error) { + // Cold-start path: parses config, resolves model aliases, may pull + // referenced sub-agents over the network, and starts every toolset. + // All synchronous from the caller's perspective. The span makes the + // breakdown attributable when first-use latency is high. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/teamloader").Start( + ctx, "teamloader.load", + trace.WithSpanKind(trace.SpanKindInternal), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + var loadOpts loadOptions loadOpts.toolsetRegistry = NewDefaultToolsetRegistry() @@ -103,6 +124,12 @@ func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *c if err != nil { return nil, err } + if cfg != nil { + span.SetAttributes( + attribute.Int("cagent.teamloader.agent_count", len(cfg.Agents)), + attribute.Int("cagent.teamloader.model_count", len(cfg.Models)), + ) + } // Resolve model aliases (e.g., "claude-sonnet-4-5" -> "claude-sonnet-4-5-20250929") // This ensures the API uses the pinned model version. The original name is preserved diff --git a/pkg/telemetry/genai/attrs.go b/pkg/telemetry/genai/attrs.go new file mode 100644 index 000000000..48e0b82f4 --- /dev/null +++ b/pkg/telemetry/genai/attrs.go @@ -0,0 +1,105 @@ +package genai + +// Attribute keys defined by the OTel GenAI semantic conventions. All are +// Development stability — declared as constants here so call sites depend +// on a stable local symbol rather than a moving upstream import path. 
+const ( + AttrOperationName = "gen_ai.operation.name" + AttrProviderName = "gen_ai.provider.name" + AttrConversationID = "gen_ai.conversation.id" + AttrOutputType = "gen_ai.output.type" + + AttrAgentName = "gen_ai.agent.name" + AttrAgentID = "gen_ai.agent.id" + AttrAgentDescription = "gen_ai.agent.description" + AttrAgentVersion = "gen_ai.agent.version" + + AttrWorkflowName = "gen_ai.workflow.name" + + AttrRequestModel = "gen_ai.request.model" + AttrRequestStream = "gen_ai.request.stream" + AttrRequestMaxTokens = "gen_ai.request.max_tokens" + AttrRequestTemperature = "gen_ai.request.temperature" + AttrRequestTopP = "gen_ai.request.top_p" + AttrRequestTopK = "gen_ai.request.top_k" + AttrRequestFrequencyPenalty = "gen_ai.request.frequency_penalty" + AttrRequestPresencePenalty = "gen_ai.request.presence_penalty" + AttrRequestStopSequences = "gen_ai.request.stop_sequences" + AttrRequestChoiceCount = "gen_ai.request.choice.count" + AttrRequestSeed = "gen_ai.request.seed" + AttrRequestEncodingFormats = "gen_ai.request.encoding_formats" + + AttrResponseModel = "gen_ai.response.model" + AttrResponseID = "gen_ai.response.id" + AttrResponseFinishReasons = "gen_ai.response.finish_reasons" + AttrResponseTimeToFirstChunk = "gen_ai.response.time_to_first_chunk" + + AttrUsageInputTokens = "gen_ai.usage.input_tokens" + AttrUsageOutputTokens = "gen_ai.usage.output_tokens" + AttrUsageCacheReadInputTokens = "gen_ai.usage.cache_read.input_tokens" + AttrUsageCacheCreationInputTokens = "gen_ai.usage.cache_creation.input_tokens" + AttrUsageReasoningOutputTokens = "gen_ai.usage.reasoning.output_tokens" + + AttrTokenType = "gen_ai.token.type" + + AttrToolName = "gen_ai.tool.name" + AttrToolCallID = "gen_ai.tool.call.id" + AttrToolType = "gen_ai.tool.type" + AttrToolDescription = "gen_ai.tool.description" + AttrToolDefinitions = "gen_ai.tool.definitions" + AttrToolCallArguments = "gen_ai.tool.call.arguments" + AttrToolCallResult = "gen_ai.tool.call.result" + + AttrInputMessages = "gen_ai.input.messages" + AttrOutputMessages = "gen_ai.output.messages" + AttrSystemInstructions = "gen_ai.system_instructions" + + AttrPromptName = "gen_ai.prompt.name" + + AttrDataSourceID = "gen_ai.data_source.id" + AttrEmbeddingsDimensionCount = "gen_ai.embeddings.dimension.count" + AttrRetrievalDocuments = "gen_ai.retrieval.documents" + AttrRetrievalQueryText = "gen_ai.retrieval.query.text" + + AttrEvaluationName = "gen_ai.evaluation.name" + AttrEvaluationScoreLabel = "gen_ai.evaluation.score.label" + AttrEvaluationScoreValue = "gen_ai.evaluation.score.value" + AttrEvaluationExplanation = "gen_ai.evaluation.explanation" +) + +// Operation names — values for AttrOperationName. +const ( + OperationChat = "chat" + OperationTextCompletion = "text_completion" + OperationGenerateContent = "generate_content" + OperationEmbeddings = "embeddings" + OperationCreateAgent = "create_agent" + OperationInvokeAgent = "invoke_agent" + OperationInvokeWorkflow = "invoke_workflow" + OperationExecuteTool = "execute_tool" + OperationRetrieval = "retrieval" +) + +// Token types — values for AttrTokenType when recording the token usage +// histogram. Spec defines `input` and `output`; we use the cache_read / +// cache_creation / reasoning variants to mirror the per-token-type +// usage attributes for richer breakdowns. 
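// Illustrative sketch, not part of this patch: recording the
// gen_ai.client.token.usage histogram once per token type with the
// constants declared below. tokenUsage is assumed to be a previously
// created metric.Int64Histogram and model the resolved request model name.
tokenUsage.Record(ctx, inputTokens, metric.WithAttributes(
	attribute.String(AttrTokenType, TokenTypeInput),
	attribute.String(AttrRequestModel, model),
))
tokenUsage.Record(ctx, outputTokens, metric.WithAttributes(
	attribute.String(AttrTokenType, TokenTypeOutput),
	attribute.String(AttrRequestModel, model),
))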
+const ( + TokenTypeInput = "input" + TokenTypeOutput = "output" + TokenTypeCacheRead = "cache_read.input" + TokenTypeCacheCreation = "cache_creation.input" + TokenTypeReasoning = "reasoning.output" +) + +// Provider names — values for AttrProviderName. Names follow the values +// defined in the provider-specific GenAI semconv pages. +const ( + ProviderAnthropic = "anthropic" + ProviderOpenAI = "openai" + ProviderAWSBedrock = "aws.bedrock" + ProviderGCPVertexAI = "gcp.vertex_ai" + ProviderGCPGenAI = "gcp.gen_ai" + ProviderAzureAI = "azure.ai.inference" + ProviderDMR = "docker.dmr" +) diff --git a/pkg/telemetry/genai/content.go b/pkg/telemetry/genai/content.go new file mode 100644 index 000000000..b7d09cc24 --- /dev/null +++ b/pkg/telemetry/genai/content.go @@ -0,0 +1,207 @@ +package genai + +import ( + "encoding/json" + "os" + "strings" + + "go.opentelemetry.io/otel/attribute" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/tools" +) + +// EnvCaptureMessageContent is the OTel-recommended environment variable +// that toggles capture of GenAI request/response content as span +// attributes. Default is off because chat history routinely contains +// PII, secrets, internal documents, and other content that should not +// be exported by default. +// +// Recognised truthy values: "true", "1", "yes", "on" (case-insensitive). +const EnvCaptureMessageContent = "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT" + +// IsContentCaptureEnabled reports whether the OTel content-capture +// opt-in is set. Read on every call so tests and feature flags can +// flip the value at runtime. +func IsContentCaptureEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv(EnvCaptureMessageContent))) { + case "true", "1", "yes", "on": + return true + default: + return false + } +} + +// messagePart matches the OTel GenAI semconv message part schema +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/non-normative/examples-llm-calls/). +// +// Field choice per spec: +// - "text" parts use Content +// - "uri" parts use URI (and may set MimeType / Modality) +// - "tool_call" / "tool_call_response" parts use ID, Name, Arguments, +// Result +type messagePart struct { + Type string `json:"type"` + Content string `json:"content,omitempty"` + URI string `json:"uri,omitempty"` + MimeType string `json:"mime_type,omitempty"` + Modality string `json:"modality,omitempty"` + ID string `json:"id,omitempty"` + Name string `json:"name,omitempty"` + Arguments any `json:"arguments,omitempty"` + Result any `json:"result,omitempty"` +} + +type structuredMessage struct { + Role string `json:"role"` + Parts []messagePart `json:"parts"` +} + +// SetInputMessages serialises chat history into `gen_ai.input.messages` +// per the OTel GenAI examples schema (role + parts) and attaches it to +// the span. System messages are removed from the array and emitted +// separately as `gen_ai.system_instructions` per the spec. +// +// No-op when content capture is disabled or the span is nil. 
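// Illustrative test sketch, not part of this patch: the opt-in is re-read
// on every call, so t.Setenv is enough to flip capture within a single
// test. require is github.com/stretchr/testify/require.
func TestContentCaptureOptIn(t *testing.T) {
	t.Setenv(EnvCaptureMessageContent, "on")
	require.True(t, IsContentCaptureEnabled())
	t.Setenv(EnvCaptureMessageContent, "off")
	require.False(t, IsContentCaptureEnabled())
}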
+func SetInputMessages(span *ChatSpan, messages []chat.Message) { + if span == nil || !IsContentCaptureEnabled() { + return + } + + var systemInstructions []structuredMessage + var input []structuredMessage + for i := range messages { + msg := messageToStructured(&messages[i]) + if messages[i].Role == chat.MessageRoleSystem { + systemInstructions = append(systemInstructions, msg) + continue + } + input = append(input, msg) + } + + if len(systemInstructions) > 0 { + if encoded, err := json.Marshal(systemInstructions); err == nil { + span.SetAttributes(attribute.String(AttrSystemInstructions, string(encoded))) + } + } + if len(input) > 0 { + if encoded, err := json.Marshal(input); err == nil { + span.SetAttributes(attribute.String(AttrInputMessages, string(encoded))) + } + } +} + +// SetOutputMessages serialises the assembled response into +// `gen_ai.output.messages`. Use after streaming has completed and the +// final assistant message is known. +func SetOutputMessages(span *ChatSpan, content, reasoning string, toolCalls []tools.ToolCall) { + if span == nil || !IsContentCaptureEnabled() { + return + } + parts := []messagePart{} + if reasoning != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: reasoning}) + } + if content != "" { + parts = append(parts, messagePart{Type: "text", Content: content}) + } + for _, tc := range toolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if len(parts) == 0 { + return + } + out := []structuredMessage{{Role: "assistant", Parts: parts}} + if encoded, err := json.Marshal(out); err == nil { + span.SetAttributes(attribute.String(AttrOutputMessages, string(encoded))) + } +} + +// SetToolDefinitions serialises the tool definitions presented to the +// model into `gen_ai.tool.definitions`. +func SetToolDefinitions(span *ChatSpan, toolDefs []tools.Tool) { + if span == nil || !IsContentCaptureEnabled() || len(toolDefs) == 0 { + return + } + encoded, err := json.Marshal(toolDefs) + if err != nil { + return + } + span.SetAttributes(attribute.String(AttrToolDefinitions, string(encoded))) +} + +// messageToStructured converts a chat.Message to the spec-shaped +// structured message representation. Multi-content messages produce one +// part per content block; tool calls and tool results map to their +// respective part types. +func messageToStructured(m *chat.Message) structuredMessage { + role := string(m.Role) + parts := []messagePart{} + + switch { + case len(m.MultiContent) > 0: + for _, mc := range m.MultiContent { + switch mc.Type { + case chat.MessagePartTypeText: + if mc.Text != "" { + parts = append(parts, messagePart{Type: "text", Content: mc.Text}) + } + case chat.MessagePartTypeImageURL: + if mc.ImageURL != nil && mc.ImageURL.URL != "" { + parts = append(parts, messagePart{ + Type: "uri", + URI: mc.ImageURL.URL, + Modality: "image", + }) + } + case chat.MessagePartTypeFile: + if mc.File != nil { + p := messagePart{Type: "file", ID: mc.File.FileID} + if mc.File.MimeType != "" { + p.MimeType = mc.File.MimeType + } + parts = append(parts, p) + } + } + } + case m.ToolCallID != "": + // Tool result messages: the entire content is the tool's + // response payload, encoded as a single tool_call_response + // part. Skip the default text/reasoning branch so we don't + // also emit a duplicate `text` part with the same payload. 
+ default: + if m.ReasoningContent != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: m.ReasoningContent}) + } + if m.Content != "" { + parts = append(parts, messagePart{Type: "text", Content: m.Content}) + } + } + + for _, tc := range m.ToolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if m.ToolCallID != "" { + // Per the OTel GenAI semconv example schema, tool_call_response + // parts carry the payload in `result`, not `content` (which is + // reserved for `text`/`reasoning` parts). Spec-aware backends + // look for the `result` key when decoding tool responses. + parts = append(parts, messagePart{ + Type: "tool_call_response", + ID: m.ToolCallID, + Result: m.Content, + }) + } + + return structuredMessage{Role: role, Parts: parts} +} diff --git a/pkg/telemetry/genai/conversation.go b/pkg/telemetry/genai/conversation.go new file mode 100644 index 000000000..06b0edf4d --- /dev/null +++ b/pkg/telemetry/genai/conversation.go @@ -0,0 +1,52 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/baggage" +) + +// baggageKeyConversationID matches the GenAI semconv attribute key for +// the conversation identifier so the value flows transparently through +// the W3C `baggage` header alongside `traceparent`. Any downstream +// service or subprocess running OTel auto-instrumentation will pick it +// up without per-helper plumbing. +const baggageKeyConversationID = "gen_ai.conversation.id" + +// WithConversationID returns a context that carries the conversation id +// in OTel baggage. Spans created later in the chain — including ones in +// helper packages that have no direct access to the session — read it +// via ConversationIDFromContext and attach `gen_ai.conversation.id` +// automatically. Empty id is a no-op. +func WithConversationID(ctx context.Context, id string) context.Context { + if id == "" { + return ctx + } + member, err := baggage.NewMember(baggageKeyConversationID, id) + if err != nil { + return ctx + } + bag, err := baggage.FromContext(ctx).SetMember(member) + if err != nil { + return ctx + } + return baggage.ContextWithBaggage(ctx, bag) +} + +// ConversationIDFromContext returns the conversation id stored in the +// context's baggage, or "" when none has been seeded. +func ConversationIDFromContext(ctx context.Context) string { + return baggage.FromContext(ctx).Member(baggageKeyConversationID).Value() +} + +// conversationAttribute returns the gen_ai.conversation.id attribute +// from baggage when present, or zero-value KeyValue when absent. Helper +// for span starters so they can append it in one line. +func conversationAttribute(ctx context.Context) (attribute.KeyValue, bool) { + id := ConversationIDFromContext(ctx) + if id == "" { + return attribute.KeyValue{}, false + } + return attribute.String(AttrConversationID, id), true +} diff --git a/pkg/telemetry/genai/doc.go b/pkg/telemetry/genai/doc.go new file mode 100644 index 000000000..61bf90dd0 --- /dev/null +++ b/pkg/telemetry/genai/doc.go @@ -0,0 +1,15 @@ +// Package genai provides OpenTelemetry instrumentation helpers that follow +// the GenAI semantic conventions +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/). 
+// +// The package is structured so that callers — provider clients, the agent +// runtime, MCP clients — describe what they are doing in domain terms and +// the helpers produce the spec-conformant spans, metrics, and log records. +// Centralising the OTel surface here lets us upgrade the semantic +// conventions in one place and keeps the call sites compact. +// +// All gen_ai.* attributes are Development stability per the spec. Attribute +// keys are declared as constants in this package rather than imported from +// go.opentelemetry.io/otel/semconv to insulate callers from the upstream +// reorganisations the GenAI conventions are still going through. +package genai diff --git a/pkg/telemetry/genai/embedding.go b/pkg/telemetry/genai/embedding.go new file mode 100644 index 000000000..a83ad752e --- /dev/null +++ b/pkg/telemetry/genai/embedding.go @@ -0,0 +1,176 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// EmbeddingRequest carries the inputs needed to start an +// `embeddings {model}` span per the OTel GenAI semantic conventions. +type EmbeddingRequest struct { + Provider string + Model string + // BatchSize is the number of input texts in the embedding call, + // recorded as `cagent.embeddings.batch_size`. Zero means + // single-input. + BatchSize int + // EncodingFormats is the optional list of requested output + // encodings (e.g. "float", "base64") per the GenAI semconv. + // Recorded as `gen_ai.request.encoding_formats` when non-empty. + EncodingFormats []string +} + +// EmbeddingSpan handles the lifecycle of an embedding span and the +// matching `gen_ai.client.operation.duration` / `gen_ai.client.token.usage` +// metric records. +type EmbeddingSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + inputTokens int64 + dimensions int + errType string +} + +// StartEmbedding begins a CLIENT-kind `embeddings {model}` span and +// records the spec-required `gen_ai.operation.name=embeddings`, +// `gen_ai.provider.name`, and `gen_ai.request.model` attributes. 
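+//
+// Sketch of the expected call pattern (the provider call and response
+// fields are illustrative, not part of this package):
+//
+//	ctx, span := genai.StartEmbedding(ctx, genai.EmbeddingRequest{
+//		Provider:  genai.ProviderOpenAI,
+//		Model:     "text-embedding-3-small",
+//		BatchSize: len(inputs),
+//	})
+//	defer span.End()
+//	resp, err := client.Embed(ctx, inputs)
+//	if err != nil {
+//		span.RecordError(err, "")
+//		return nil, err
+//	}
+//	span.SetInputTokens(resp.Usage.InputTokens)
+//	span.SetDimensions(len(resp.Vectors[0]))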
+func StartEmbedding(ctx context.Context, req EmbeddingRequest) (context.Context, *EmbeddingSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationEmbeddings + if req.Model != "" { + name = OperationEmbeddings + " " + req.Model + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, req.Provider), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.BatchSize > 1 { + attrs = append(attrs, attribute.Int("cagent.embeddings.batch_size", req.BatchSize)) + } + if len(req.EncodingFormats) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestEncodingFormats, req.EncodingFormats)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + return ctx, &EmbeddingSpan{ + span: span, + provider: req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetInputTokens records the number of input tokens consumed by the +// embedding call. Emitted as `gen_ai.usage.input_tokens` on the span +// and as the `gen_ai.client.token.usage` metric at End time. +func (s *EmbeddingSpan) SetInputTokens(n int64) { + if s == nil { + return + } + s.mu.Lock() + s.inputTokens = n + s.mu.Unlock() +} + +// SetDimensions records the dimensionality of the resulting embedding +// vector(s). Emitted as `gen_ai.embeddings.dimension.count`. +func (s *EmbeddingSpan) SetDimensions(d int) { + if s == nil { + return + } + s.mu.Lock() + s.dimensions = d + s.mu.Unlock() +} + +// RecordError marks the span as failed and stores `error.type` for the +// duration metric. +func (s *EmbeddingSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the duration + token-usage metrics. +func (s *EmbeddingSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + inputTokens := s.inputTokens + dimensions := s.dimensions + errType := s.errType + s.mu.Unlock() + + if inputTokens > 0 { + s.span.SetAttributes(attribute.Int64(AttrUsageInputTokens, inputTokens)) + } + if dimensions > 0 { + s.span.SetAttributes(attribute.Int(AttrEmbeddingsDimensionCount, dimensions)) + } + s.span.End() + + insts := getInstruments() + if insts == nil { + return + } + commonAttrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, s.provider), + } + if s.model != "" { + commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model)) + } + durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + if errType != "" { + durationAttrs = append(durationAttrs, attribute.String("error.type", errType)) + } + if insts.clientOperationDuration != nil { + insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(durationAttrs...), + ) + } + if inputTokens > 0 && insts.clientTokenUsage != nil { + tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...) 
+ tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, TokenTypeInput)) + insts.clientTokenUsage.Record(s.metricCtx, inputTokens, + metric.WithAttributes(tokenAttrs...), + ) + } +} diff --git a/pkg/telemetry/genai/errors.go b/pkg/telemetry/genai/errors.go new file mode 100644 index 000000000..8d1f7db18 --- /dev/null +++ b/pkg/telemetry/genai/errors.go @@ -0,0 +1,85 @@ +package genai + +import ( + "context" + "errors" + "net" + "strings" + + "go.opentelemetry.io/otel/attribute" +) + +// ErrorTypeOther is the OTel-mandated fallback for `error.type` when no +// classifier matches. The spec requires `_OTHER` rather than a Go type +// name so backends can rely on a bounded cardinality. +const ErrorTypeOther = "_OTHER" + +// ClassifyError maps a provider error to a low-cardinality `error.type` +// value suitable for span and metric attributes. Falls back to +// `_OTHER` (the spec-defined sentinel) when the error does not match any +// known pattern. +// +// Spec leaves the value space open for callers — these strings are picked +// for cross-provider comparability on dashboards. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "context length") || strings.Contains(msg, "context_length"): + // Bare "max_tokens" matches too eagerly: validation errors + // like `max_tokens must be > 0` and "model X does not + // support max_tokens" both contain the token but are not + // context overflows. Stick to the unambiguous phrases. + return "context_length_exceeded" + case strings.Contains(msg, "rate limit") || strings.Contains(msg, "429"): + return "rate_limit" + case strings.Contains(msg, "401") || strings.Contains(msg, "unauthorized") || strings.Contains(msg, "authentication"): + return "auth" + case strings.Contains(msg, "403") || strings.Contains(msg, "forbidden") || strings.Contains(msg, "permission"): + return "forbidden" + case strings.Contains(msg, "content policy") || strings.Contains(msg, "content filter") || strings.Contains(msg, "safety"): + return "content_policy" + } + + var netErr net.Error + if errors.As(err, &netErr) { + if netErr.Timeout() { + return "network_timeout" + } + return "network" + } + + return ErrorTypeOther +} + +// applyExtraAttribute converts a StreamAttributer KeyValue into an OTel +// attribute and applies it to the span. Unsupported value types are +// dropped silently — telemetry must never crash request paths. 
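+//
+// For example, a provider stream returning
+// KeyValue{Key: "openai.response.service_tier", Value: "default"} (key
+// purely illustrative) lands on the chat span as a string attribute,
+// while a KeyValue carrying an unsupported type — say, a struct — is
+// silently skipped.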
+func applyExtraAttribute(span *ChatSpan, kv KeyValue) { + if span == nil || kv.Key == "" { + return + } + switch v := kv.Value.(type) { + case string: + span.SetAttributes(attribute.String(kv.Key, v)) + case bool: + span.SetAttributes(attribute.Bool(kv.Key, v)) + case int: + span.SetAttributes(attribute.Int(kv.Key, v)) + case int64: + span.SetAttributes(attribute.Int64(kv.Key, v)) + case float64: + span.SetAttributes(attribute.Float64(kv.Key, v)) + case []string: + span.SetAttributes(attribute.StringSlice(kv.Key, v)) + } +} diff --git a/pkg/telemetry/genai/evaluation.go b/pkg/telemetry/genai/evaluation.go new file mode 100644 index 000000000..4d1673efa --- /dev/null +++ b/pkg/telemetry/genai/evaluation.go @@ -0,0 +1,64 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/log" + "go.opentelemetry.io/otel/log/global" +) + +// EvaluationResult describes one evaluation outcome that should be emitted +// as a `gen_ai.evaluation.result` log record per the OTel GenAI semconv +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/). +type EvaluationResult struct { + // Name is the evaluation metric — e.g. "relevance", "factuality", + // "tool_calls_f1". Required. + Name string + + // ScoreLabel is the human-readable verdict — e.g. "passed", + // "failed", "satisfactory". Optional but commonly set. + ScoreLabel string + + // ScoreValue is the numeric score (commonly 0.0–1.0). Optional. + ScoreValue float64 + HasScoreValue bool + + // Explanation is a free-form reason for the score. Optional. + Explanation string + + // ErrorType is set when the evaluation itself failed (e.g. the + // judge model errored out). Mirrors the spec's `error.type` field. + ErrorType string +} + +// EmitEvaluationResult emits a `gen_ai.evaluation.result` log record. The +// record links to the active span via the supplied context so dashboards +// can join evaluation outcomes back onto the operation that produced +// them. No-op when no logger provider is configured. 
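+//
+// Minimal sketch of a caller-side emission (metric name and score values
+// are illustrative):
+//
+//	genai.EmitEvaluationResult(ctx, genai.EvaluationResult{
+//		Name:          "relevance",
+//		ScoreLabel:    "passed",
+//		ScoreValue:    0.92,
+//		HasScoreValue: true,
+//		Explanation:   "answer cites the retrieved document",
+//	})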
+func EmitEvaluationResult(ctx context.Context, result EvaluationResult) { + logger := global.GetLoggerProvider().Logger(instrumentationName) + + var rec log.Record + rec.SetEventName("gen_ai.evaluation.result") + rec.SetSeverity(log.SeverityInfo) + rec.SetSeverityText("INFO") + + rec.AddAttributes(log.String(AttrEvaluationName, result.Name)) + if result.ScoreLabel != "" { + rec.AddAttributes(log.String(AttrEvaluationScoreLabel, result.ScoreLabel)) + } + if result.HasScoreValue { + rec.AddAttributes(log.Float64(AttrEvaluationScoreValue, result.ScoreValue)) + } + if result.Explanation != "" { + rec.AddAttributes(log.String(AttrEvaluationExplanation, result.Explanation)) + } + if result.ErrorType != "" { + rec.AddAttributes(log.String("error.type", result.ErrorType)) + } + if convID := ConversationIDFromContext(ctx); convID != "" { + rec.AddAttributes(log.String(AttrConversationID, convID)) + } + + logger.Emit(ctx, rec) +} diff --git a/pkg/telemetry/genai/genai_test.go b/pkg/telemetry/genai/genai_test.go new file mode 100644 index 000000000..692d41212 --- /dev/null +++ b/pkg/telemetry/genai/genai_test.go @@ -0,0 +1,156 @@ +package genai + +import ( + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/chat" +) + +func TestProviderNameForConfig(t *testing.T) { + t.Parallel() + tests := []struct { + in string + want string + }{ + {"openai", ProviderOpenAI}, + {"openai_chatcompletions", ProviderOpenAI}, + {"openai_responses", ProviderOpenAI}, + {"anthropic", ProviderAnthropic}, + {"amazon-bedrock", ProviderAWSBedrock}, + {"google", ProviderGCPGenAI}, + {"vertexai", ProviderGCPVertexAI}, + {"azure", ProviderAzureAI}, + {"dmr", ProviderDMR}, + {"custom-provider", "custom-provider"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ProviderNameForConfig(tt.in)) + }) + } +} + +func TestClassifyError(t *testing.T) { + t.Parallel() + tests := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"context canceled", context.Canceled, "context_canceled"}, + {"context deadline", context.DeadlineExceeded, "deadline_exceeded"}, + {"rate limit", errors.New("HTTP 429 Too Many Requests"), "rate_limit"}, + {"context length", errors.New("context_length_exceeded: prompt too large"), "context_length_exceeded"}, + {"unauthorized", errors.New("HTTP 401 Unauthorized"), "auth"}, + {"forbidden", errors.New("HTTP 403 Forbidden"), "forbidden"}, + {"content policy", errors.New("response blocked by content filter"), "content_policy"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ClassifyError(tt.err)) + }) + } +} + +// fakeStream produces a fixed sequence of chunks then EOF. 
+type fakeStream struct { + chunks []chat.MessageStreamResponse + idx int + closed bool +} + +func (f *fakeStream) Recv() (chat.MessageStreamResponse, error) { + if f.idx >= len(f.chunks) { + return chat.MessageStreamResponse{}, io.EOF + } + r := f.chunks[f.idx] + f.idx++ + return r, nil +} + +func (f *fakeStream) Close() { f.closed = true } + +func TestStartChatAndWrapStream(t *testing.T) { + t.Parallel() + + stream := &fakeStream{ + chunks: []chat.MessageStreamResponse{ + { + ID: "resp-1", + Model: "claude-sonnet-4", + Choices: []chat.MessageStreamChoice{ + {Delta: chat.MessageDelta{Content: "hello"}}, + }, + }, + { + Choices: []chat.MessageStreamChoice{ + {FinishReason: chat.FinishReasonStop}, + }, + Usage: &chat.Usage{ + InputTokens: 100, + OutputTokens: 50, + CachedInputTokens: 20, + CacheWriteTokens: 10, + }, + }, + }, + } + + ctx, span := StartChat(t.Context(), ChatRequest{ + Provider: ProviderAnthropic, + Model: "claude-sonnet-4", + Stream: true, + MaxTokens: 4096, + }) + require.NotNil(t, span) + require.NotNil(t, ctx) + + wrapped := WrapStream(span, stream) + + // Drain the stream. + for { + resp, err := wrapped.Recv() + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + _ = resp + } + wrapped.Close() + assert.True(t, stream.closed) + + // Re-closing should be a no-op (the wrapper guards against + // double-Close, which would otherwise emit two End() calls). + wrapped.Close() +} + +func TestWrapStreamNilSpanReturnsOriginal(t *testing.T) { + t.Parallel() + s := &fakeStream{} + got := WrapStream(nil, s) + assert.Same(t, s, got) +} + +func TestServerAddressFromURL(t *testing.T) { + t.Parallel() + host, port := ServerAddressFromURL("https://api.anthropic.com:443/v1/messages") + assert.Equal(t, "api.anthropic.com", host) + assert.Equal(t, 443, port) + + host, port = ServerAddressFromURL("https://api.openai.com/v1/chat/completions") + assert.Equal(t, "api.openai.com", host) + assert.Equal(t, 0, port) + + host, port = ServerAddressFromURL("") + assert.Empty(t, host) + assert.Equal(t, 0, port) +} diff --git a/pkg/telemetry/genai/metrics.go b/pkg/telemetry/genai/metrics.go new file mode 100644 index 000000000..01f8d90f8 --- /dev/null +++ b/pkg/telemetry/genai/metrics.go @@ -0,0 +1,80 @@ +package genai + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// instrumentationName identifies this package as the OTel instrumentation +// scope for spans, metrics, and log records it produces. +const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/genai" + +// metricBucketsDuration matches the spec for `gen_ai.client.operation.duration` +// (and related per-chunk timing histograms). +var metricBucketsDuration = []float64{ + 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92, +} + +// metricBucketsTokenUsage matches the spec for `gen_ai.client.token.usage`. +var metricBucketsTokenUsage = []float64{ + 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, +} + +// instruments holds the lazily-initialised metric instruments. Resolved on +// first use because the global MeterProvider is set at SDK init time, which +// may run after package-level var initialisation in some contexts. 
+type instruments struct { + clientOperationDuration metric.Float64Histogram + clientOperationTTFC metric.Float64Histogram + clientOperationTimePerChunk metric.Float64Histogram + clientTokenUsage metric.Int64Histogram +} + +var ( + instOnce sync.Once + inst *instruments +) + +// getInstruments resolves and caches the package-level meter instruments. +// Histogram creation rarely fails in practice; when one does we keep the +// instruments that did succeed and leave the failed one nil. Call sites +// already nil-check each instrument, so a partial set is functional — +// previously a single early return left every metric permanently +// disabled, which surprised production debugging when one bucket +// configuration tripped a registration error. +func getInstruments() *instruments { + instOnce.Do(func() { + meter := otel.Meter(instrumentationName) + i := &instruments{} + + i.clientOperationDuration, _ = meter.Float64Histogram( + "gen_ai.client.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("GenAI operation duration."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTTFC, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_to_first_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time to receive the first chunk of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTimePerChunk, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_per_output_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time between consecutive output chunks of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientTokenUsage, _ = meter.Int64Histogram( + "gen_ai.client.token.usage", + metric.WithUnit("{token}"), + metric.WithDescription("Number of tokens used in a GenAI client request, broken down by token type."), + metric.WithExplicitBucketBoundaries(metricBucketsTokenUsage...), + ) + + inst = i + }) + return inst +} diff --git a/pkg/telemetry/genai/provider_names.go b/pkg/telemetry/genai/provider_names.go new file mode 100644 index 000000000..8076583f4 --- /dev/null +++ b/pkg/telemetry/genai/provider_names.go @@ -0,0 +1,28 @@ +package genai + +// ProviderNameForConfig maps the project's internal provider type strings +// (the values used in agent YAML and resolved by +// pkg/model/provider.resolveProviderType) to the GenAI semconv provider +// names defined in the per-provider semantic conventions. Unknown +// providers fall through unchanged so dashboards still receive a value +// rather than empty string. 
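+//
+// For example, "openai_responses" and "openai_chatcompletions" both map
+// to ProviderOpenAI, while an unrecognised value such as "my-gateway"
+// is returned unchanged.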
+func ProviderNameForConfig(internalName string) string { + switch internalName { + case "openai", "openai_chatcompletions", "openai_responses": + return ProviderOpenAI + case "anthropic": + return ProviderAnthropic + case "amazon-bedrock": + return ProviderAWSBedrock + case "google": + return ProviderGCPGenAI + case "vertexai", "google-vertex": + return ProviderGCPVertexAI + case "azure", "azure-openai": + return ProviderAzureAI + case "dmr": + return ProviderDMR + default: + return internalName + } +} diff --git a/pkg/telemetry/genai/runtime.go b/pkg/telemetry/genai/runtime.go new file mode 100644 index 000000000..628b21c1c --- /dev/null +++ b/pkg/telemetry/genai/runtime.go @@ -0,0 +1,367 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Custom (non-spec) attribute keys for runtime-side observability that has +// no GenAI semconv equivalent yet (fallback chain, response cache, +// approval pipeline). Kept under the `cagent.` namespace so they are +// clearly distinguishable from the spec-defined `gen_ai.*` and `mcp.*` +// attributes when scrolling through a span. +const ( + AttrFallbackPrimaryModel = "cagent.fallback.primary_model" + AttrFallbackFinalModel = "cagent.fallback.final_model" + AttrFallbackAttempts = "cagent.fallback.attempts" + AttrFallbackOutcome = "cagent.fallback.outcome" + AttrFallbackInCooldown = "cagent.fallback.in_cooldown" + + AttrCacheHit = "cagent.cache.hit" + AttrCacheBacking = "cagent.cache.backing" + + AttrAgentNameRuntime = "cagent.agent.name" + + AttrRetrievalResultCount = "cagent.retrieval.result_count" + + AttrSandboxRuntime = "cagent.sandbox.runtime" + AttrSandboxImage = "cagent.sandbox.image" + AttrSandboxContainer = "cagent.sandbox.container" + AttrSandboxExitCode = "cagent.sandbox.exit_code" +) + +// FallbackOutcome values for AttrFallbackOutcome. +const ( + FallbackOutcomeSuccess = "success" + FallbackOutcomeFailed = "failed" + FallbackOutcomeContextCanceled = "context_canceled" +) + +// FallbackSpan is the handle for an in-flight runtime.fallback span. +type FallbackSpan struct { + span trace.Span + startedAt time.Time + + mu sync.Mutex + attempts int + final string + outcome string + errType string + ended bool +} + +// StartFallback begins a runtime.fallback span covering the whole fallback +// chain for one agent turn. Each per-model attempt produces its own +// `chat {model}` CLIENT child span (created by the provider decorator). +// Attributes set up front: primary model name, agent name, in-cooldown +// flag. The caller updates final model / attempts / outcome through the +// returned handle and calls End to flush. 
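+//
+// Sketch of the runtime-side loop (the candidate iteration and provider
+// call are illustrative):
+//
+//	ctx, fb := genai.StartFallback(ctx, agentName, primary, inCooldown)
+//	defer fb.End()
+//	for _, m := range candidates {
+//		fb.IncrementAttempt()
+//		resp, err := callModel(ctx, m)
+//		if err != nil {
+//			fb.RecordError(err, "")
+//			continue
+//		}
+//		fb.SetFinalModel(m)
+//		fb.SetOutcome(genai.FallbackOutcomeSuccess)
+//		return resp, nil
+//	}
+//	fb.SetOutcome(genai.FallbackOutcomeFailed)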
+func StartFallback(ctx context.Context, agentName, primaryModel string, inCooldown bool) (context.Context, *FallbackSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{ + attribute.String(AttrAgentNameRuntime, agentName), + attribute.Bool(AttrFallbackInCooldown, inCooldown), + } + if primaryModel != "" { + attrs = append(attrs, attribute.String(AttrFallbackPrimaryModel, primaryModel)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "runtime.fallback", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &FallbackSpan{ + span: span, + startedAt: time.Now(), + } +} + +// IncrementAttempt counts one attempt against the chain. Called once per +// (model × retry) iteration so the final span carries the total count. +func (s *FallbackSpan) IncrementAttempt() { + if s == nil { + return + } + s.mu.Lock() + s.attempts++ + s.mu.Unlock() +} + +// SetFinalModel records the model that ultimately served the response. +// Called on the success path; not called on full-failure paths so the +// attribute remains absent and dashboards can distinguish the cases. +func (s *FallbackSpan) SetFinalModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.final = model + s.mu.Unlock() +} + +// RecordError stores an error and an error.type label for the metric. +func (s *FallbackSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// SetOutcome records the terminal outcome of the chain. Use one of the +// FallbackOutcome* constants. +func (s *FallbackSpan) SetOutcome(outcome string) { + if s == nil || outcome == "" { + return + } + s.mu.Lock() + s.outcome = outcome + s.mu.Unlock() +} + +// End closes the span and flushes accumulated attributes. +func (s *FallbackSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + final := s.final + outcome := s.outcome + attempts := s.attempts + s.mu.Unlock() + + if final != "" { + s.span.SetAttributes(attribute.String(AttrFallbackFinalModel, final)) + } + if outcome != "" { + s.span.SetAttributes(attribute.String(AttrFallbackOutcome, outcome)) + } + s.span.SetAttributes(attribute.Int(AttrFallbackAttempts, attempts)) + s.span.End() +} + +// RetrievalSpan handles a retrieval-operation span lifecycle. +type RetrievalSpan struct { + span trace.Span + startedAt time.Time + + mu sync.Mutex + resultCount int + errType string + ended bool +} + +// StartRetrieval begins a `retrieval {data_source.id}` span per the OTel +// GenAI semconv. providerName identifies the retrieval backend +// ("sqlite", "rag", an embedding-provider name) and is Required by the +// spec for retrieval operations. dataSourceID identifies the corpus / +// index / collection being queried; queryText is captured only when +// the caller has confirmed the content-capture opt-in. 
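+//
+// Sketch of a retrieval call site (the data-source id and store API are
+// illustrative):
+//
+//	ctx, rs := genai.StartRetrieval(ctx, "sqlite", "memories",
+//		genai.IsContentCaptureEnabled(), query)
+//	defer rs.End()
+//	docs, err := store.Search(ctx, query)
+//	if err != nil {
+//		rs.RecordError(err, "")
+//		return nil, err
+//	}
+//	rs.SetResultCount(len(docs))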
+func StartRetrieval(ctx context.Context, providerName, dataSourceID string, captureQuery bool, queryText string) (context.Context, *RetrievalSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationRetrieval + if dataSourceID != "" { + name = OperationRetrieval + " " + dataSourceID + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationRetrieval), + } + if providerName != "" { + attrs = append(attrs, attribute.String(AttrProviderName, providerName)) + } + if dataSourceID != "" { + attrs = append(attrs, attribute.String(AttrDataSourceID, dataSourceID)) + } + if captureQuery && queryText != "" { + attrs = append(attrs, attribute.String(AttrRetrievalQueryText, queryText)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &RetrievalSpan{span: span, startedAt: time.Now()} +} + +// SetAttributes adds extra attributes to the retrieval span. Use for +// retrieval-specific extensions (corpus filter, category, fusion mode, +// etc.) that don't have a dedicated setter. +func (s *RetrievalSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResultCount records how many documents the retrieval returned. +func (s *RetrievalSpan) SetResultCount(n int) { + if s == nil { + return + } + s.mu.Lock() + s.resultCount = n + s.mu.Unlock() +} + +// RecordError marks the retrieval span as failed. +func (s *RetrievalSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the retrieval span and flushes the result count. +func (s *RetrievalSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + count := s.resultCount + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrRetrievalResultCount, count)) + s.span.End() +} + +// CacheRequest counter — records every cache lookup with `result=hit|miss` +// and a `backing` attribute for memory-only vs file-backed caches. +var ( + cacheCounterOnce sync.Once + cacheCounter metric.Int64Counter +) + +func getCacheCounter() metric.Int64Counter { + cacheCounterOnce.Do(func() { + meter := otel.Meter(instrumentationName) + c, err := meter.Int64Counter( + "cagent.cache.requests", + metric.WithUnit("{request}"), + metric.WithDescription("Number of response-cache lookups, broken down by hit/miss."), + ) + if err != nil { + return + } + cacheCounter = c + }) + return cacheCounter +} + +// RecordCacheLookup increments the cache counter and returns a small span +// describing the lookup. Callers `defer span.End()` and the helper sets +// `cagent.cache.hit` from the value returned by SetHit. +func RecordCacheLookup(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.lookup", "lookup", backing) +} + +// RecordCacheStore is the Store-side counterpart of RecordCacheLookup. 
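+//
+// The lookup side is used as follows (backing label and cache API are
+// illustrative); the store side is identical minus the SetHit call:
+//
+//	ctx, cs := genai.RecordCacheLookup(ctx, "file")
+//	defer cs.End()
+//	entry, ok := cache.Get(key)
+//	cs.SetHit(ok)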
+func RecordCacheStore(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.store", "store", backing) +} + +func startCacheSpan(ctx context.Context, spanName, op, backing string) (context.Context, *CacheSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if backing != "" { + attrs = append(attrs, attribute.String(AttrCacheBacking, backing)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &CacheSpan{span: span, metricCtx: ctx, backing: backing, op: op} +} + +// CacheSpan handles cache-operation span lifecycle. +type CacheSpan struct { + span trace.Span + // metricCtx carries the active span context so counter Add calls + // produce span-context exemplars (drill Mimir bucket → Tempo + // trace). Without this the counter measurement gets only the + // resource attributes. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + backing string + op string + + mu sync.Mutex + hit bool + set bool +} + +// SetHit records whether the lookup found an entry. Increments the +// cache counter immediately so the metric reflects the result even if End +// is called late. +func (s *CacheSpan) SetHit(hit bool) { + if s == nil { + return + } + s.mu.Lock() + s.hit = hit + s.set = true + s.mu.Unlock() + s.span.SetAttributes(attribute.Bool(AttrCacheHit, hit)) + + if c := getCacheCounter(); c != nil { + result := "miss" + if hit { + result = "hit" + } + attrs := []attribute.KeyValue{ + attribute.String("result", result), + attribute.String("operation", s.op), + } + if s.backing != "" { + attrs = append(attrs, attribute.String(AttrCacheBacking, s.backing)) + } + // Use the active context so the counter measurement carries + // the span exemplar — drill from Mimir bucket → Tempo trace + // works for cache operations the same way it does for chat. + c.Add(s.metricCtx, 1, metric.WithAttributes(attrs...)) + } +} + +// End closes the cache span. +func (s *CacheSpan) End() { + if s == nil { + return + } + s.span.End() +} diff --git a/pkg/telemetry/genai/sandbox.go b/pkg/telemetry/genai/sandbox.go new file mode 100644 index 000000000..4b97d7fc0 --- /dev/null +++ b/pkg/telemetry/genai/sandbox.go @@ -0,0 +1,231 @@ +package genai + +import ( + "context" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" +) + +// envCarrier adapts an env-var key/value map to OTel's TextMapCarrier so +// the configured propagator can write traceparent / tracestate / baggage +// into a subprocess's environment. Keys are uppercased on Set to match +// the convention subprocess-propagation tools (otel-cli, OTel SDKs) +// expect. 
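+//
+// With the composite TraceContext+Baggage propagator configured at SDK
+// init, the injected keys come out as TRACEPARENT, plus TRACESTATE and
+// BAGGAGE when those values are non-empty.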
+type envCarrier map[string]string + +func (c envCarrier) Get(key string) string { return c[strings.ToUpper(key)] } +func (c envCarrier) Set(key, value string) { c[strings.ToUpper(key)] = value } +func (c envCarrier) Keys() []string { + keys := make([]string, 0, len(c)) + for k := range c { + keys = append(keys, k) + } + return keys +} + +var _ propagation.TextMapCarrier = envCarrier{} + +// InjectSandboxEnv returns docker-style `-e KEY=VALUE` flags carrying the +// W3C trace context for the current span so the agent process spawned +// inside a sandbox container inherits the parent trace. Anything OTel- +// aware running in the container — another agent, an HTTP client with +// otelhttp transport, otel-cli — auto-parents its spans onto the active +// CLIENT span on the host side. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. +func InjectSandboxEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + flags := make([]string, 0, 2*len(carrier)) + for k, v := range carrier { + flags = append(flags, "-e", k+"="+v) + } + return flags +} + +// InjectTraceContextEnv returns `KEY=VALUE` env-var strings carrying the +// W3C trace context for the current span. Use to extend `exec.Cmd.Env` +// for direct subprocess spawns (hook scripts, LSP servers) so OTel-aware +// children chain onto the active span. Companion to `InjectSandboxEnv`, +// which formats for `docker -e`. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. +func InjectTraceContextEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + out := make([]string, 0, len(carrier)) + for k, v := range carrier { + out = append(out, k+"="+v) + } + return out +} + +// SandboxSpan handles the lifecycle of a sandbox.exec span and the +// matching sandbox.exec.duration histogram. Use to wrap the actual +// `docker sandbox exec` (or equivalent) subprocess invocation so the +// host side has timing, exit code, runtime kind, and image information +// alongside the inherited child trace from inside the sandbox. +type SandboxSpan struct { + span trace.Span + // metricCtx carries the active span context so histogram Record + // calls produce span-context exemplars (drill Mimir → Tempo). + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + startedAt time.Time + runtime string + + mu sync.Mutex + exitCode int + hasExit bool + errType string + ended bool +} + +// SandboxOptions configures the attributes set on a sandbox.exec span at +// creation time. All fields are optional except Runtime. +type SandboxOptions struct { + // Runtime is a short label identifying the sandbox backend (e.g. + // `"docker"`). Recorded as `cagent.sandbox.runtime` and used as a + // histogram label, so callers should keep the set of values small + // and stable. + Runtime string + + // Image is the container/pod image when known. + Image string + + // Container is the container/pod identifier when known. + Container string + + // AgentName is the agent being executed in the sandbox. + AgentName string +} + +// StartSandboxExec opens a `sandbox.exec` INTERNAL span. Runtime kind is +// set up front; exit code and error info attach via the returned handle. 
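+//
+// Sketch of wrapping a sandbox invocation (command assembly and exit-code
+// extraction are illustrative):
+//
+//	ctx, ss := genai.StartSandboxExec(ctx, genai.SandboxOptions{
+//		Runtime: "docker",
+//		Image:   image,
+//	})
+//	defer ss.End()
+//	args := append([]string{"sandbox", "exec"}, genai.InjectSandboxEnv(ctx)...)
+//	err := runDocker(ctx, args...)
+//	ss.SetExitCode(exitCodeOf(err))
+//	if err != nil {
+//		ss.RecordError(err, "")
+//	}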
+func StartSandboxExec(ctx context.Context, opts SandboxOptions) (context.Context, *SandboxSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if opts.Runtime != "" { + attrs = append(attrs, attribute.String(AttrSandboxRuntime, opts.Runtime)) + } + if opts.Image != "" { + attrs = append(attrs, attribute.String(AttrSandboxImage, opts.Image)) + } + if opts.Container != "" { + attrs = append(attrs, attribute.String(AttrSandboxContainer, opts.Container)) + } + if opts.AgentName != "" { + attrs = append(attrs, attribute.String(AttrAgentNameRuntime, opts.AgentName)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "sandbox.exec", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &SandboxSpan{span: span, metricCtx: ctx, startedAt: time.Now(), runtime: opts.Runtime} +} + +// SetExitCode records the subprocess exit code as +// `cagent.sandbox.exit_code`. Set zero on success. +func (s *SandboxSpan) SetExitCode(code int) { + if s == nil { + return + } + s.mu.Lock() + s.exitCode = code + s.hasExit = true + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrSandboxExitCode, code)) +} + +// RecordError marks the span as failed. +func (s *SandboxSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the sandbox.exec.duration histogram. +func (s *SandboxSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + errType := s.errType + s.mu.Unlock() + + s.span.End() + + hist := getSandboxDurationHistogram() + if hist == nil { + return + } + attrs := []attribute.KeyValue{} + if s.runtime != "" { + // Partitions the histogram by sandbox backend so dashboards + // can compare exec latency across runtimes when more than + // one is wired up. + attrs = append(attrs, attribute.String(AttrSandboxRuntime, s.runtime)) + } + if errType != "" { + attrs = append(attrs, attribute.String("error.type", errType)) + } + // Use the active context so the histogram measurement carries the + // span exemplar — drill from Mimir bucket → Tempo trace. 
+ hist.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(attrs...), + ) +} + +var ( + sandboxDurationOnce sync.Once + sandboxDurationHist metric.Float64Histogram +) + +func getSandboxDurationHistogram() metric.Float64Histogram { + sandboxDurationOnce.Do(func() { + meter := otel.Meter(instrumentationName) + h, err := meter.Float64Histogram( + "cagent.sandbox.exec.duration", + metric.WithUnit("s"), + metric.WithDescription("Time the host side spent waiting for a sandbox exec invocation to complete."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + if err != nil { + return + } + sandboxDurationHist = h + }) + return sandboxDurationHist +} diff --git a/pkg/telemetry/genai/span.go b/pkg/telemetry/genai/span.go new file mode 100644 index 000000000..9b0542973 --- /dev/null +++ b/pkg/telemetry/genai/span.go @@ -0,0 +1,418 @@ +package genai + +import ( + "context" + "net/url" + "slices" + "strconv" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" + tracenoop "go.opentelemetry.io/otel/trace/noop" +) + +// ChatRequest carries the inputs needed to start a `chat {model}` span and +// to record the matching client metrics. Provider-specific extensions +// (openai service tier, aws.bedrock guardrail, etc.) attach via +// ChatSpan.SetAttributes after the span has started. +type ChatRequest struct { + // Provider is the GenAI provider name. Use one of the Provider* + // constants. Set on the span at creation time per the per-provider + // semconv MUST clauses. + Provider string + + // Model is the requested model identifier. Empty model is allowed + // (some routers do not commit until inside the call) but produces a + // span name of just "chat". + Model string + + // Stream is true if the request is streaming. Recorded as + // gen_ai.request.stream. + Stream bool + + // ServerAddress / ServerPort identify the GenAI endpoint when known + // (helpful for routing-aware dashboards). Optional. + ServerAddress string + ServerPort int + + // Sampling parameters. Zero values are treated as unset and not + // recorded on the span. + MaxTokens int + Temperature float64 + TopP float64 + TopK float64 + FrequencyPenalty float64 + PresencePenalty float64 + Seed int + StopSequences []string + ChoiceCount int + + // HasTemperature / HasTopP / HasTopK / HasFreqPenalty / HasPresPenalty + // disambiguate "explicitly zero" from "unset" for the float params. + // Callers that use the zero value as meaningful must set these. + HasTemperature bool + HasTopP bool + HasTopK bool + HasFreqPenalty bool + HasPresPenalty bool +} + +// ServerAddressFromURL extracts host and port for the ServerAddress / +// ServerPort fields when callers have a full URL handy. +func ServerAddressFromURL(raw string) (string, int) { + if raw == "" { + return "", 0 + } + u, err := url.Parse(raw) + if err != nil || u.Host == "" { + return "", 0 + } + port, _ := strconv.Atoi(u.Port()) + return u.Hostname(), port +} + +// ChatSpan is the handle returned by StartChat. It wraps an OTel span and +// captures enough state to emit per-operation metrics on End. +type ChatSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + // metricCtx carries the request context captured at StartChat + // time so metric Record / Add calls in End preserve the + // trace-to-metric exemplar link. 
Using context.Background() here + // would silently strip the active span context and break + // drill-from-metric-bucket-to-trace navigation in Tempo/Mimir. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + responseModel string + finishReasons []string + usageRecorded bool + usage chatUsage + errType string + + // Streaming metrics: the first non-empty chunk timestamp and the + // previous chunk timestamp drive the time_to_first_chunk and + // time_per_output_chunk histograms. + firstChunkAt time.Time + prevChunkAt time.Time + chunkDurations []float64 +} + +type chatUsage struct { + inputTokens int64 + outputTokens int64 + cacheReadInput int64 + cacheCreationInput int64 + reasoningOutput int64 +} + +// StartChat begins a CLIENT-kind `chat {model}` span and records the +// required gen_ai.* request attributes. The returned context carries the +// new span; callers MUST call ChatSpan.End to flush the span and metrics. +func StartChat(ctx context.Context, req ChatRequest) (context.Context, *ChatSpan) { + tracer := otel.Tracer(instrumentationName) + + name := OperationChat + if req.Model != "" { + name = OperationChat + " " + req.Model + } + + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationChat), + attribute.String(AttrProviderName, req.Provider), + attribute.Bool(AttrRequestStream, req.Stream), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.ServerAddress != "" { + attrs = append(attrs, attribute.String("server.address", req.ServerAddress)) + if req.ServerPort > 0 { + attrs = append(attrs, attribute.Int("server.port", req.ServerPort)) + } + } + if req.MaxTokens > 0 { + attrs = append(attrs, attribute.Int(AttrRequestMaxTokens, req.MaxTokens)) + } + if req.HasTemperature { + attrs = append(attrs, attribute.Float64(AttrRequestTemperature, req.Temperature)) + } + if req.HasTopP { + attrs = append(attrs, attribute.Float64(AttrRequestTopP, req.TopP)) + } + if req.HasTopK { + attrs = append(attrs, attribute.Float64(AttrRequestTopK, req.TopK)) + } + if req.HasFreqPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestFrequencyPenalty, req.FrequencyPenalty)) + } + if req.HasPresPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestPresencePenalty, req.PresencePenalty)) + } + if req.Seed != 0 { + attrs = append(attrs, attribute.Int(AttrRequestSeed, req.Seed)) + } + if len(req.StopSequences) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestStopSequences, req.StopSequences)) + } + if req.ChoiceCount > 0 && req.ChoiceCount != 1 { + attrs = append(attrs, attribute.Int(AttrRequestChoiceCount, req.ChoiceCount)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + + return ctx, &ChatSpan{ + span: span, + provider: req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetAttributes adds extra attributes to the span. Use for provider-specific +// fields (openai.*, aws.bedrock.*) and for response-side attributes the +// caller learns later. +func (s *ChatSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResponseModel records gen_ai.response.model. 
Some providers return a +// resolved model name that differs from the requested one (alias expansion, +// version pinning); both values are useful. +func (s *ChatSpan) SetResponseModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.responseModel = model + s.mu.Unlock() + s.span.SetAttributes(attribute.String(AttrResponseModel, model)) +} + +// SetResponseID records gen_ai.response.id. +func (s *ChatSpan) SetResponseID(id string) { + if s == nil || id == "" { + return + } + s.span.SetAttributes(attribute.String(AttrResponseID, id)) +} + +// AddFinishReason accumulates a finish reason. The spec defines the +// attribute as a string array — multiple values are recorded once on End. +func (s *ChatSpan) AddFinishReason(reason string) { + if s == nil || reason == "" { + return + } + s.mu.Lock() + defer s.mu.Unlock() + if slices.Contains(s.finishReasons, reason) { + return + } + s.finishReasons = append(s.finishReasons, reason) +} + +// RecordUsage stores the token usage for emission as both span attributes +// and the gen_ai.client.token.usage histogram. Callers pass raw provider +// values; this package applies the spec-mandated Anthropic input-token sum +// (`input_tokens` reported by Anthropic excludes cached tokens, so the +// spec requires summing input + cache_read + cache_creation). +func (s *ChatSpan) RecordUsage(inputTokens, outputTokens, cacheReadInput, cacheCreationInput, reasoningOutput int64) { + if s == nil { + return + } + s.mu.Lock() + defer s.mu.Unlock() + s.usage.inputTokens = inputTokens + s.usage.outputTokens = outputTokens + s.usage.cacheReadInput = cacheReadInput + s.usage.cacheCreationInput = cacheCreationInput + s.usage.reasoningOutput = reasoningOutput + s.usageRecorded = true +} + +// MarkChunk records the timing of a streamed output chunk. The first call +// drives gen_ai.response.time_to_first_chunk (and the corresponding +// metric); subsequent calls accumulate per-chunk durations. +func (s *ChatSpan) MarkChunk() { + if s == nil { + return + } + now := time.Now() + s.mu.Lock() + defer s.mu.Unlock() + if s.firstChunkAt.IsZero() { + s.firstChunkAt = now + } else { + s.chunkDurations = append(s.chunkDurations, now.Sub(s.prevChunkAt).Seconds()) + } + s.prevChunkAt = now +} + +// RecordError marks the span as failed and stores error.type for the +// duration metric. errType should be a short, low-cardinality string — +// "rate_limit", "context_length_exceeded", "auth", "network", +// "context_canceled", or "_OTHER" as the spec-defined fallback. When +// errType is empty, ClassifyError(err) is called to derive a value, so +// callers that don't already have a classification can pass "" without +// losing it to the "_OTHER" bucket. +func (s *ChatSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span, flushes accumulated finish reasons / usage / timing +// to the span, and records the duration and token-usage histograms. Safe +// to call multiple times; subsequent calls are no-ops. +func (s *ChatSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + finishReasons := append([]string(nil), s.finishReasons...) 
+ usage := s.usage + usageRecorded := s.usageRecorded + errType := s.errType + firstChunkAt := s.firstChunkAt + chunkDurations := append([]float64(nil), s.chunkDurations...) + s.mu.Unlock() + + if len(finishReasons) > 0 { + s.span.SetAttributes(attribute.StringSlice(AttrResponseFinishReasons, finishReasons)) + } + if !firstChunkAt.IsZero() { + ttfc := firstChunkAt.Sub(s.startedAt).Seconds() + s.span.SetAttributes(attribute.Float64(AttrResponseTimeToFirstChunk, ttfc)) + } + if usageRecorded { + // Apply the spec-mandated Anthropic input-token math: Anthropic's + // API reports input_tokens excluding cache, but spec wants the + // inclusive total on gen_ai.usage.input_tokens. + spanInputTokens := usage.inputTokens + if s.provider == ProviderAnthropic { + spanInputTokens += usage.cacheReadInput + usage.cacheCreationInput + } + spanAttrs := []attribute.KeyValue{ + attribute.Int64(AttrUsageInputTokens, spanInputTokens), + attribute.Int64(AttrUsageOutputTokens, usage.outputTokens), + } + if usage.cacheReadInput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheReadInputTokens, usage.cacheReadInput)) + } + if usage.cacheCreationInput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheCreationInputTokens, usage.cacheCreationInput)) + } + if usage.reasoningOutput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageReasoningOutputTokens, usage.reasoningOutput)) + } + s.span.SetAttributes(spanAttrs...) + } + + s.span.End() + + // Emit metrics. Failure to resolve instruments must not block span + // completion, so we silently skip when getInstruments returns nil. + insts := getInstruments() + if insts == nil { + return + } + + commonAttrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationChat), + attribute.String(AttrProviderName, s.provider), + } + if s.model != "" { + commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model)) + } + + durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + if errType != "" { + durationAttrs = append(durationAttrs, attribute.String("error.type", errType)) + } + if insts.clientOperationDuration != nil { + insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(durationAttrs...), + ) + } + + if !firstChunkAt.IsZero() && insts.clientOperationTTFC != nil { + insts.clientOperationTTFC.Record(s.metricCtx, firstChunkAt.Sub(s.startedAt).Seconds(), + metric.WithAttributes(commonAttrs...), + ) + } + if insts.clientOperationTimePerChunk != nil { + for _, d := range chunkDurations { + insts.clientOperationTimePerChunk.Record(s.metricCtx, d, + metric.WithAttributes(commonAttrs...), + ) + } + } + + if usageRecorded && insts.clientTokenUsage != nil { + recordTokenMetric := func(tokenType string, value int64) { + if value <= 0 { + return + } + tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, tokenType)) + insts.clientTokenUsage.Record(s.metricCtx, value, + metric.WithAttributes(tokenAttrs...), + ) + } + // Per-token-type metric data points use raw provider values so a + // backend summing across types reconstructs the true total + // without double-counting cached tokens. The Anthropic spec sum + // (input + cache_read + cache_creation) is only applied to the + // span attribute `gen_ai.usage.input_tokens` per the per-provider + // semconv MUST clause — see span attribute emission above. 
+ recordTokenMetric(TokenTypeInput, usage.inputTokens) + recordTokenMetric(TokenTypeOutput, usage.outputTokens) + recordTokenMetric(TokenTypeCacheRead, usage.cacheReadInput) + recordTokenMetric(TokenTypeCacheCreation, usage.cacheCreationInput) + recordTokenMetric(TokenTypeReasoning, usage.reasoningOutput) + } +} + +// Span returns the underlying OTel span so callers can attach span events +// or links when they need finer control than the helpers expose. Returns +// a real no-op span (not a struct embedding a nil trace.Span) when the +// receiver is nil so callers don't have to nil-check before invoking +// Span methods like AddEvent / SetAttributes. +func (s *ChatSpan) Span() trace.Span { + if s == nil { + return tracenoop.Span{} + } + return s.span +} diff --git a/pkg/telemetry/genai/stability.go b/pkg/telemetry/genai/stability.go new file mode 100644 index 000000000..021ce0450 --- /dev/null +++ b/pkg/telemetry/genai/stability.go @@ -0,0 +1,130 @@ +package genai + +import ( + "os" + "strings" + "sync" + + "go.opentelemetry.io/otel/attribute" +) + +// EnvSemconvStability is the OTel-defined environment variable that lets +// callers opt into experimental versions of the GenAI semantic +// conventions +// (https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/README.md). +// +// It is a comma-separated list of opt-in tokens. The only token defined +// for GenAI today is `gen_ai_latest_experimental` — when present, the +// instrumentation emits only the spec-defined `gen_ai.*` attributes and +// drops the legacy attribute names (e.g. `tool.name`, `agent`, +// `session.id`). +// +// Default behaviour (env var unset) is dual-emit: spans carry both the +// legacy keys and the `gen_ai.*` keys so existing dashboards keep +// working alongside spec-aware tooling. This matches the spec's +// recommendation that instrumentations not change the version of +// conventions they emit by default and instead require the opt-in for +// the new version. +const EnvSemconvStability = "OTEL_SEMCONV_STABILITY_OPT_IN" + +// stabilityToken is the spec-defined opt-in for the latest experimental +// GenAI conventions. +const stabilityToken = "gen_ai_latest_experimental" + +// Stability identifies which version of attribute names a span should +// emit. +type Stability int + +const ( + // StabilityDualEmit is the default: emit both legacy attribute + // names (`tool.name`, `agent`, `session.id`, ...) and the + // `gen_ai.*` keys, so existing dashboards continue working while + // spec-aware tooling sees the new values. + StabilityDualEmit Stability = iota + // StabilityGenAILatest is selected by + // `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Only + // the `gen_ai.*` attributes are emitted; the legacy keys are + // dropped. + StabilityGenAILatest +) + +var ( + stabilityMu sync.Mutex + stabilityOnce sync.Once + cachedStability Stability +) + +// CurrentStability returns the active stability mode. The result is +// computed once per process from the env var; tests that need to flip +// the mode at runtime should call ResetStabilityForTest first. +func CurrentStability() Stability { + stabilityMu.Lock() + once := &stabilityOnce + stabilityMu.Unlock() + + once.Do(func() { + raw := os.Getenv(EnvSemconvStability) + for tok := range strings.SplitSeq(raw, ",") { + // Spec: tokens are case-insensitive. 
+ if strings.EqualFold(strings.TrimSpace(tok), stabilityToken) { + stabilityMu.Lock() + cachedStability = StabilityGenAILatest + stabilityMu.Unlock() + return + } + } + stabilityMu.Lock() + cachedStability = StabilityDualEmit + stabilityMu.Unlock() + }) + + stabilityMu.Lock() + defer stabilityMu.Unlock() + return cachedStability +} + +// ResetStabilityForTest clears the cached stability value so a +// subsequent CurrentStability call re-reads the env var. Test-only — +// callers must ensure no other goroutine is in CurrentStability when +// this runs. The mutex protects the sync.Once and cache fields against +// other Reset calls and against the lock-protected segments of +// CurrentStability, but CurrentStability releases the mutex before +// invoking once.Do, so a concurrent reset there races on the +// sync.Once memory itself (flagged under -race). All in-tree usage is +// sequential (t.Setenv + t.Cleanup, no t.Parallel), so this is safe in +// practice; do not introduce parallel callers. +func ResetStabilityForTest() { + stabilityMu.Lock() + defer stabilityMu.Unlock() + stabilityOnce = sync.Once{} + cachedStability = StabilityDualEmit +} + +// EmitLegacyAttributes reports whether legacy (pre-semconv) attribute +// keys should be emitted. True when stability is StabilityDualEmit; +// false when the user has opted into `gen_ai_latest_experimental`. +func EmitLegacyAttributes() bool { + return CurrentStability() == StabilityDualEmit +} + +// LegacyToolAttributes returns the historic tool dispatcher attribute +// set (`tool.name`, `agent`, `session.id`, `tool.call_id`, +// `tool.type`) — but only when legacy emission is enabled. Returns nil +// otherwise so call sites can append unconditionally. +func LegacyToolAttributes(toolName, toolType, agentName, sessionID, callID string) []attribute.KeyValue { + if !EmitLegacyAttributes() { + return nil + } + attrs := []attribute.KeyValue{ + attribute.String("tool.name", toolName), + attribute.String("agent", agentName), + attribute.String("session.id", sessionID), + } + if toolType != "" { + attrs = append(attrs, attribute.String("tool.type", toolType)) + } + if callID != "" { + attrs = append(attrs, attribute.String("tool.call_id", callID)) + } + return attrs +} diff --git a/pkg/telemetry/genai/stability_test.go b/pkg/telemetry/genai/stability_test.go new file mode 100644 index 000000000..f89ee7991 --- /dev/null +++ b/pkg/telemetry/genai/stability_test.go @@ -0,0 +1,55 @@ +package genai + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCurrentStabilityDefault(t *testing.T) { + t.Setenv(EnvSemconvStability, "") + ResetStabilityForTest() + assert.Equal(t, StabilityDualEmit, CurrentStability()) + assert.True(t, EmitLegacyAttributes()) +} + +func TestCurrentStabilityGenAILatest(t *testing.T) { + t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) + assert.False(t, EmitLegacyAttributes()) +} + +func TestCurrentStabilityIgnoresUnrelatedTokens(t *testing.T) { + t.Setenv(EnvSemconvStability, "http,database") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityDualEmit, CurrentStability()) +} + +func TestCurrentStabilityCompositeList(t *testing.T) { + t.Setenv(EnvSemconvStability, "http, gen_ai_latest_experimental ,database") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) +} + +func 
TestCurrentStabilityCaseInsensitive(t *testing.T) { + t.Setenv(EnvSemconvStability, "GEN_AI_LATEST_EXPERIMENTAL") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) +} + +func TestLegacyToolAttributesGated(t *testing.T) { + t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Empty(t, LegacyToolAttributes("shell", "function", "main", "sess1", "call1")) + + t.Setenv(EnvSemconvStability, "") + ResetStabilityForTest() + got := LegacyToolAttributes("shell", "function", "main", "sess1", "call1") + assert.NotEmpty(t, got) +} diff --git a/pkg/telemetry/genai/stream.go b/pkg/telemetry/genai/stream.go new file mode 100644 index 000000000..382597512 --- /dev/null +++ b/pkg/telemetry/genai/stream.go @@ -0,0 +1,255 @@ +package genai + +import ( + "errors" + "io" + "strings" + "sync" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/tools" +) + +// StreamAttributer is an optional interface that provider stream adapters +// may implement to surface provider-specific attributes to the chat span +// once the response is complete. The wrapper queries the underlying stream +// on Close (in addition to the per-chunk Recv path) and applies whatever +// attributes the provider chose to expose. Implementations are expected to +// be safe to call after Close. +type StreamAttributer interface { + GenAIStreamAttributes() []KeyValue +} + +// KeyValue is a re-exported attribute key/value pair used by the optional +// StreamAttributer interface so providers can implement it without +// importing go.opentelemetry.io/otel/attribute directly. The decorator +// converts these back into OTel attributes before applying them to the +// span. +type KeyValue struct { + Key string + Value any +} + +// WrapStream wraps a chat.MessageStream so that consuming the stream +// drives the lifecycle of a ChatSpan: per-chunk timing, response-level +// attributes (id / response.model / finish reasons), usage capture, and +// final span End on stream close or terminal error. +// +// The returned stream forwards all Recv/Close calls to the underlying +// stream verbatim and adds no other behaviour, so swapping it in is +// invisible to callers. +func WrapStream(span *ChatSpan, stream chat.MessageStream) chat.MessageStream { + if span == nil || stream == nil { + return stream + } + return &instrumentedStream{ + span: span, + inner: stream, + capture: IsContentCaptureEnabled(), + } +} + +type instrumentedStream struct { + span *ChatSpan + inner chat.MessageStream + + // mu guards the lifecycle flags and the streaming-state buffers + // so a Recv that errors concurrently with the consumer's Close + // does not race on the check-then-set in endOnce or + // double-apply attributes through SetOutputMessages. + mu sync.Mutex + + // ended is set when the span has been finalised (output flushed + // and `End` called). innerClosed is set when the inner stream's + // `Close` has been called. They are tracked separately so an + // error in `Recv` can end the span without preempting the + // caller's `Close` that releases the inner stream's resources. + ended bool + innerClosed bool + + // capture buffers the streamed deltas for emission as + // `gen_ai.output.messages` on Close. Filled only when content + // capture is opted in (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true`) + // so the buffer cost stays out of the default request path. 
+ capture bool + contentBuf strings.Builder + reasoningBuf strings.Builder + pendingTools map[string]*tools.ToolCall + toolCallOrder []string +} + +func (s *instrumentedStream) Recv() (chat.MessageStreamResponse, error) { + resp, err := s.inner.Recv() + if err != nil { + // io.EOF is the normal stream terminator and is not an error + // for the span's purposes — End handles closing. + // For non-EOF errors we end the span here too: callers that + // abandon the stream after an error (a common pattern for + // network failures) would otherwise leak the span and skip the + // duration metric. Close remains idempotent so the canonical + // `defer Close()` path still works. + if !errors.Is(err, io.EOF) { + s.span.RecordError(err, ClassifyError(err)) + s.endOnce() + } + return resp, err + } + + // First chunk arrival is meaningful for the time_to_first_chunk + // metric. Mark on every Recv that produced any content so we cover + // cases where the provider opens with an empty preamble. + if hasChunkPayload(&resp) { + s.span.MarkChunk() + } + + if resp.ID != "" { + s.span.SetResponseID(resp.ID) + } + if resp.Model != "" { + s.span.SetResponseModel(resp.Model) + } + for i := range resp.Choices { + if resp.Choices[i].FinishReason != "" { + s.span.AddFinishReason(string(resp.Choices[i].FinishReason)) + } + } + if resp.Usage != nil { + s.span.RecordUsage( + resp.Usage.InputTokens, + resp.Usage.OutputTokens, + resp.Usage.CachedInputTokens, + resp.Usage.CacheWriteTokens, + resp.Usage.ReasoningTokens, + ) + } + + if s.capture { + s.mu.Lock() + s.bufferDeltas(&resp) + s.mu.Unlock() + } + return resp, nil +} + +// bufferDeltas accumulates content and tool-call deltas for the +// gen_ai.output.messages attribute. Tool calls arrive across multiple +// chunks (id once, name once, arguments in pieces), so we keep a map +// keyed by id and concatenate arguments as they stream in. +func (s *instrumentedStream) bufferDeltas(resp *chat.MessageStreamResponse) { + for i := range resp.Choices { + d := &resp.Choices[i].Delta + if d.Content != "" { + s.contentBuf.WriteString(d.Content) + } + if d.ReasoningContent != "" { + s.reasoningBuf.WriteString(d.ReasoningContent) + } + for j := range d.ToolCalls { + tc := &d.ToolCalls[j] + id := tc.ID + if id == "" { + // Provider didn't include the id on this delta — fall + // back to the most recent in-progress tool call. + if len(s.toolCallOrder) == 0 { + continue + } + id = s.toolCallOrder[len(s.toolCallOrder)-1] + } + if s.pendingTools == nil { + s.pendingTools = map[string]*tools.ToolCall{} + } + existing, ok := s.pendingTools[id] + if !ok { + existing = &tools.ToolCall{ID: id, Type: tc.Type} + s.pendingTools[id] = existing + s.toolCallOrder = append(s.toolCallOrder, id) + } + if tc.Function.Name != "" { + existing.Function.Name = tc.Function.Name + } + if tc.Function.Arguments != "" { + existing.Function.Arguments += tc.Function.Arguments + } + } + } +} + +func (s *instrumentedStream) Close() { + s.mu.Lock() + closeInner := !s.innerClosed + s.innerClosed = true + s.mu.Unlock() + if closeInner { + s.inner.Close() + } + s.endOnce() +} + +// endOnce flushes captured content, applies provider-supplied attributes, +// and ends the span — at most once per stream. Both the error path in +// `Recv` and the explicit `Close` path go through here so a stream that +// errors mid-flight still ends its span without waiting for the caller. 
+// `inner.Close` is intentionally NOT called here: leaving it to the +// explicit `Close` path keeps the contract that the wrapper releases +// the underlying stream exactly when the caller asks. +func (s *instrumentedStream) endOnce() { + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + // Snapshot the buffers under the lock so we don't race against a + // concurrent Recv writing more deltas. Release before calling out + // to the OTel SDK and the StreamAttributer hook to avoid holding + // the mutex across third-party code. + var ( + extras []KeyValue + captured bool + content string + reasoning string + collected []tools.ToolCall + streamAttrer StreamAttributer + ) + if attrer, ok := s.inner.(StreamAttributer); ok { + streamAttrer = attrer + } + if s.capture { + captured = true + content = s.contentBuf.String() + reasoning = s.reasoningBuf.String() + for _, id := range s.toolCallOrder { + if tc, ok := s.pendingTools[id]; ok { + collected = append(collected, *tc) + } + } + } + s.mu.Unlock() + + if streamAttrer != nil { + extras = streamAttrer.GenAIStreamAttributes() + } + for _, kv := range extras { + applyExtraAttribute(s.span, kv) + } + if captured { + SetOutputMessages(s.span, content, reasoning, collected) + } + s.span.End() +} + +// hasChunkPayload reports whether the response carries content that should +// count as an output chunk (text, reasoning, tool call, etc.). Empty +// keep-alive frames do not advance the per-chunk timing metrics. +func hasChunkPayload(resp *chat.MessageStreamResponse) bool { + for i := range resp.Choices { + d := &resp.Choices[i].Delta + if d.Content != "" || d.ReasoningContent != "" || d.ThinkingSignature != "" { + return true + } + if len(d.ToolCalls) > 0 || d.FunctionCall != nil { + return true + } + } + return false +} diff --git a/pkg/telemetry/mcp/attrs.go b/pkg/telemetry/mcp/attrs.go new file mode 100644 index 000000000..64a1d4138 --- /dev/null +++ b/pkg/telemetry/mcp/attrs.go @@ -0,0 +1,58 @@ +package mcp + +// MCP attribute keys defined by the OTel semantic conventions +// (https://opentelemetry.io/docs/specs/semconv/registry/attributes/mcp/). +// All are Development stability. +const ( + AttrMethodName = "mcp.method.name" + AttrProtocolVersion = "mcp.protocol.version" + AttrResourceURI = "mcp.resource.uri" + AttrSessionID = "mcp.session.id" +) + +// JSON-RPC attribute keys used alongside MCP spans for request id and +// response status when applicable. +const ( + AttrJSONRPCRequestID = "jsonrpc.request.id" + AttrJSONRPCProtocolVersion = "jsonrpc.protocol.version" + AttrRPCResponseStatusCode = "rpc.response.status_code" +) + +// gen_ai.* attribute keys that the MCP semconv overlays on MCP spans when +// applicable. These are duplicated here as constants so the MCP package +// doesn't depend on the genai package — keeping the two telemetry helpers +// compositional. +const ( + AttrGenAIOperationName = "gen_ai.operation.name" + AttrGenAIToolName = "gen_ai.tool.name" + AttrGenAIPromptName = "gen_ai.prompt.name" +) + +// Well-known MCP method names (https://modelcontextprotocol.io/specification). +// These match the values listed in the OTel semconv registry. 
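For orientation, a minimal sketch of how a provider integration adopts the stream decorator above, assuming the provider already has a chat.MessageStream implementation and an active *ChatSpan. The package and function names here are illustrative and not part of this patch; only WrapStream, ChatSpan, StreamAttributer, and chat.MessageStream come from the code above.

package provider // illustrative adapter package, not part of this patch

import (
	"github.com/docker/docker-agent/pkg/chat"
	"github.com/docker/docker-agent/pkg/telemetry/genai"
)

// streamCompletion shows the single integration point a provider needs:
// wrap the raw stream before handing it back to the runtime. WrapStream
// returns the stream untouched when either argument is nil, so the call
// site needs no telemetry-enabled/disabled branching. If the raw stream
// also implements genai.StreamAttributer, the attributes it exposes are
// applied to the chat span when the stream closes.
func streamCompletion(span *genai.ChatSpan, raw chat.MessageStream) chat.MessageStream {
	return genai.WrapStream(span, raw)
}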
+const ( + MethodInitialize = "initialize" + MethodPing = "ping" + MethodCompletionComplete = "completion/complete" + MethodPromptsList = "prompts/list" + MethodPromptsGet = "prompts/get" + MethodResourcesList = "resources/list" + MethodResourcesRead = "resources/read" + MethodResourcesSubscribe = "resources/subscribe" + MethodResourcesUnsub = "resources/unsubscribe" + MethodResourcesTemplates = "resources/templates/list" + MethodRootsList = "roots/list" + MethodSamplingCreate = "sampling/createMessage" + MethodToolsList = "tools/list" + MethodToolsCall = "tools/call" + MethodLoggingSetLevel = "logging/setLevel" + MethodElicitationCreate = "elicitation/create" +) + +// OperationExecuteTool is the gen_ai.operation.name value used on MCP +// tools/call spans per the spec. +const OperationExecuteTool = "execute_tool" + +// instrumentationName identifies this package as the OTel instrumentation +// scope for spans, metrics, and log records it produces. +const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/mcp" diff --git a/pkg/telemetry/mcp/conversation.go b/pkg/telemetry/mcp/conversation.go new file mode 100644 index 000000000..efeaad57f --- /dev/null +++ b/pkg/telemetry/mcp/conversation.go @@ -0,0 +1,19 @@ +package mcp + +import ( + "context" + + "go.opentelemetry.io/otel/baggage" +) + +// ConversationIDFromBaggage reads `gen_ai.conversation.id` from the +// context's W3C baggage. The MCP package mirrors the genai package's +// convention so MCP spans automatically carry the session id when the +// runtime has seeded it; the value also propagates across MCP server +// boundaries via the standard `baggage` header alongside `traceparent`. +// +// Exported so adjacent code (e.g. the MCP OAuth transport) can attach +// the same attribute to spans it creates directly via `otel.Tracer`. +func ConversationIDFromBaggage(ctx context.Context) string { + return baggage.FromContext(ctx).Member("gen_ai.conversation.id").Value() +} diff --git a/pkg/telemetry/mcp/doc.go b/pkg/telemetry/mcp/doc.go new file mode 100644 index 000000000..401f7472d --- /dev/null +++ b/pkg/telemetry/mcp/doc.go @@ -0,0 +1,13 @@ +// Package mcp provides OpenTelemetry instrumentation helpers that follow +// the OTel GenAI semantic conventions for the Model Context Protocol +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/mcp/). +// +// MCP attributes use the `mcp.*` namespace (separate from `gen_ai.*`). +// Trace context propagates through the MCP `params._meta` field so that +// requests crossing client/server boundaries chain into a single trace. +// +// The package is structured so that callers describe what they are doing +// in MCP terms (method name, tool name, session id) and the helpers +// produce the spec-conformant spans, metrics, and propagation. All helpers +// are no-op-safe when telemetry is disabled. 
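A sketch of the client-side call pattern the package doc above describes, using StartClient, EnsureMeta, InjectMeta, RecordError, and End as defined later in this patch. The callMCP stub and the surrounding package are placeholders, not part of the patch.

package mcptransport // illustrative caller, not part of this patch

import (
	"context"

	"github.com/docker/docker-agent/pkg/telemetry/mcp"
)

// callMCP stands in for the real MCP transport call.
func callMCP(ctx context.Context, name string, args, meta map[string]any) (any, error) {
	return nil, nil
}

func callToolInstrumented(ctx context.Context, name string, args, meta map[string]any) (any, error) {
	ctx, span := mcp.StartClient(ctx, mcp.CallOptions{
		Method:   mcp.MethodToolsCall,
		ToolName: name,
	})
	defer span.End()

	meta = mcp.EnsureMeta(meta) // copy so the caller's own map never accumulates traceparent keys
	mcp.InjectMeta(ctx, meta)   // writes traceparent / tracestate / baggage for the server side

	res, err := callMCP(ctx, name, args, meta)
	if err != nil {
		span.RecordError(err, "") // empty errType lets ClassifyError pick the label
		return nil, err
	}
	return res, nil
}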
+package mcp diff --git a/pkg/telemetry/mcp/mcp_test.go b/pkg/telemetry/mcp/mcp_test.go new file mode 100644 index 000000000..5e5d78342 --- /dev/null +++ b/pkg/telemetry/mcp/mcp_test.go @@ -0,0 +1,99 @@ +package mcp + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/trace" + traceapi "go.opentelemetry.io/otel/trace" +) + +func TestEnsureMeta(t *testing.T) { + t.Parallel() + got := EnsureMeta(nil) + assert.NotNil(t, got) + assert.Empty(t, got) + + existing := map[string]any{"foo": "bar"} + got = EnsureMeta(existing) + assert.Equal(t, existing, got) +} + +func TestInjectExtractRoundTrip(t *testing.T) { + // Mutates the global OTel text-map propagator, so this test cannot + // run in parallel with other tests that read or modify it. + + // A propagator must be configured for inject/extract to do anything; + // install one for the duration of the test and put it back after. + prev := otel.GetTextMapPropagator() + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + t.Cleanup(func() { otel.SetTextMapPropagator(prev) }) + + // Start a sampled span so traceparent has a non-trivial trace id. + tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample())) + t.Cleanup(func() { _ = tp.Shutdown(t.Context()) }) + + parentCtx, parentSpan := tp.Tracer("test").Start(t.Context(), "parent") + defer parentSpan.End() + parentSC := traceapi.SpanContextFromContext(parentCtx) + + meta := map[string]any{} + InjectMeta(parentCtx, meta) + assert.Contains(t, meta, "traceparent", + "propagator should have written W3C traceparent into _meta") + + // Extract from a fresh context and verify the span context lines up + // with the parent we started with. + childCtx := ExtractMeta(t.Context(), meta) + extracted := traceapi.SpanContextFromContext(childCtx) + assert.Equal(t, parentSC.TraceID(), extracted.TraceID()) + assert.Equal(t, parentSC.SpanID(), extracted.SpanID()) +} + +func TestInjectMetaNilNoOp(t *testing.T) { + t.Parallel() + // Should not panic on a nil map. + InjectMeta(t.Context(), nil) +} + +func TestExtractMetaNilReturnsParent(t *testing.T) { + t.Parallel() + got := ExtractMeta(t.Context(), nil) + // Without trace context to extract we get back the same context. + assert.Equal(t, t.Context(), got) +} + +func TestStartClientReturnsActiveSpan(t *testing.T) { + // Mutates the global OTel tracer provider, so this test cannot run + // in parallel with other tests that read or modify it. 
+ + tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample())) + t.Cleanup(func() { _ = tp.Shutdown(t.Context()) }) + prev := otel.GetTracerProvider() + otel.SetTracerProvider(tp) + t.Cleanup(func() { otel.SetTracerProvider(prev) }) + + ctx, span := StartClient(t.Context(), CallOptions{ + Method: MethodToolsCall, + ToolName: "search-web", + }) + defer span.End() + + sc := traceapi.SpanContextFromContext(ctx) + assert.True(t, sc.IsValid(), "context should carry an active span") +} + +func TestClassifyError(t *testing.T) { + t.Parallel() + assert.Empty(t, ClassifyError(nil)) + assert.Equal(t, "context_canceled", ClassifyError(context.Canceled)) + assert.Equal(t, "deadline_exceeded", ClassifyError(context.DeadlineExceeded)) + assert.Equal(t, "rpc_error", ClassifyError(errors.New("some other error"))) +} diff --git a/pkg/telemetry/mcp/metrics.go b/pkg/telemetry/mcp/metrics.go new file mode 100644 index 000000000..fab407f9d --- /dev/null +++ b/pkg/telemetry/mcp/metrics.go @@ -0,0 +1,56 @@ +package mcp + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// metricBuckets matches the spec's bucket boundaries for all four MCP +// duration histograms (mcp.client/server.operation.duration and +// mcp.client/server.session.duration). +var metricBuckets = []float64{ + 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, +} + +type instruments struct { + clientOperationDuration metric.Float64Histogram + serverOperationDuration metric.Float64Histogram + // mcp.{client,server}.session.duration histograms are defined by + // the spec but require a SessionSpan that tracks open/close at + // the transport layer. Wire those up alongside the transport + // instrumentation; until then registering them here would create + // always-empty time series in Mimir. +} + +var ( + instOnce sync.Once + inst *instruments +) + +func getInstruments() *instruments { + instOnce.Do(func() { + meter := otel.Meter(instrumentationName) + i := &instruments{} + + // Histogram registration rarely fails; on the rare miss we + // keep the successfully created instruments rather than + // abandoning the whole package — record sites nil-check. + i.clientOperationDuration, _ = meter.Float64Histogram( + "mcp.client.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("Time taken by an MCP client to send a request and receive its response."), + metric.WithExplicitBucketBoundaries(metricBuckets...), + ) + i.serverOperationDuration, _ = meter.Float64Histogram( + "mcp.server.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("Time taken by an MCP server to handle a request and send its response."), + metric.WithExplicitBucketBoundaries(metricBuckets...), + ) + + inst = i + }) + return inst +} diff --git a/pkg/telemetry/mcp/propagation.go b/pkg/telemetry/mcp/propagation.go new file mode 100644 index 000000000..b0e62040b --- /dev/null +++ b/pkg/telemetry/mcp/propagation.go @@ -0,0 +1,92 @@ +package mcp + +import ( + "context" + "maps" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" +) + +// metaCarrier adapts an MCP `params._meta` map (which the MCP SDK exposes +// as `map[string]any`) to OTel's TextMapCarrier interface so the package's +// configured propagator can read and write trace context (`traceparent`, +// `tracestate`, `baggage`) the way it does for any HTTP carrier. 
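For reference, once InjectMeta has run with a sampled span and seeded conversation baggage, the carrier leaves W3C keys in `_meta` roughly like the following sketch. The IDs are made up; `tracestate` is only written when a vendor entry exists.

// Illustrative only: the concrete IDs below are invented.
var exampleMeta = map[string]any{
	"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01", // version-traceid-spanid-flags
	"baggage":     "gen_ai.conversation.id=sess-42",
}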
+type metaCarrier struct { + meta map[string]any +} + +func (c metaCarrier) Get(key string) string { + if c.meta == nil { + return "" + } + v, ok := c.meta[key] + if !ok { + return "" + } + if s, ok := v.(string); ok { + return s + } + return "" +} + +func (c metaCarrier) Set(key, value string) { + if c.meta == nil { + return + } + c.meta[key] = value +} + +func (c metaCarrier) Keys() []string { + if c.meta == nil { + return nil + } + keys := make([]string, 0, len(c.meta)) + for k, v := range c.meta { + if _, ok := v.(string); ok { + keys = append(keys, k) + } + } + return keys +} + +// InjectMeta writes the active trace context into the given MCP `_meta` +// map so the receiving server can extract it and parent its SERVER span +// onto our CLIENT span. Per the MCP semconv, the keys written are +// `traceparent`, `tracestate`, and `baggage` (W3C TraceContext + Baggage). +// +// If meta is nil, InjectMeta is a no-op — callers should ensure the map +// is non-nil before calling so the keys actually persist on the request. +func InjectMeta(ctx context.Context, meta map[string]any) { + if meta == nil { + return + } + otel.GetTextMapPropagator().Inject(ctx, metaCarrier{meta: meta}) +} + +// ExtractMeta reads trace context from the given MCP `_meta` map and +// returns a context with the parent span attached. Use on the server side +// to chain incoming spans onto the client's caller. +func ExtractMeta(ctx context.Context, meta map[string]any) context.Context { + if meta == nil { + return ctx + } + return otel.GetTextMapPropagator().Extract(ctx, metaCarrier{meta: meta}) +} + +// EnsureMeta returns a metadata map suitable for InjectMeta to write +// trace context into. When m is non-nil it is shallow-copied so an +// upstream caller that reuses the same request struct (e.g. on retry) +// does not see stale `traceparent` keys from a previous span injected +// into the map they own. When m is nil a fresh map is allocated. +func EnsureMeta(m map[string]any) map[string]any { + if m == nil { + return map[string]any{} + } + out := make(map[string]any, len(m)+3) + maps.Copy(out, m) + return out +} + +// Verify metaCarrier satisfies the propagator interface at compile time. +var _ propagation.TextMapCarrier = metaCarrier{} diff --git a/pkg/telemetry/mcp/span.go b/pkg/telemetry/mcp/span.go new file mode 100644 index 000000000..594ba99bd --- /dev/null +++ b/pkg/telemetry/mcp/span.go @@ -0,0 +1,247 @@ +package mcp + +import ( + "context" + "errors" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// CallOptions describes an MCP request being made or handled. Used by +// both client- and server-side helpers so call sites depend on a single +// vocabulary. +type CallOptions struct { + // Method is the MCP method name (e.g. "tools/call"). Required. + Method string + + // Target is the low-cardinality target of the operation: tool name + // for tools/call, prompt name for prompts/get, etc. When set the + // span name becomes "{method} {target}"; otherwise just "{method}". + Target string + + // ToolName, when set, is recorded as gen_ai.tool.name and used as + // the default Target for tools/call. + ToolName string + + // PromptName, when set, is recorded as gen_ai.prompt.name and used + // as the default Target for prompts/get. 
+ PromptName string + + // ResourceURI, when set, is recorded as mcp.resource.uri and used + // as the default Target for resources/* methods. + ResourceURI string + + // SessionID identifies the MCP session and is recorded as + // mcp.session.id when set. + SessionID string + + // ProtocolVersion is recorded as mcp.protocol.version when set. + ProtocolVersion string + + // JSONRPCRequestID is recorded as jsonrpc.request.id when set + // (client-side requests; ignored for notifications). + JSONRPCRequestID string + + // ServerAddress / ServerPort identify the MCP endpoint when known. + ServerAddress string + ServerPort int +} + +// Span is the handle returned by StartClient / StartServer. It carries +// enough state to record `mcp.{client,server}.operation.duration` and to +// flush span attributes as the operation proceeds. +type Span struct { + span trace.Span + // metricCtx carries the active span context so the duration + // histogram measurement produces span-context exemplars (drill + // Mimir bucket → Tempo trace). + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + startedAt time.Time + method string + kind trace.SpanKind + + mu sync.Mutex + errType string + ended bool +} + +// StartClient begins a CLIENT-kind MCP span and returns a context carrying +// it. Callers MUST call Span.End to flush the span and metrics. +func StartClient(ctx context.Context, opts CallOptions) (context.Context, *Span) { + return startSpan(ctx, opts, trace.SpanKindClient) +} + +// StartServer begins a SERVER-kind MCP span. Use after extracting trace +// context from the incoming `params._meta` so the span chains onto the +// caller. Callers MUST call Span.End. +func StartServer(ctx context.Context, opts CallOptions) (context.Context, *Span) { + return startSpan(ctx, opts, trace.SpanKindServer) +} + +func startSpan(ctx context.Context, opts CallOptions, kind trace.SpanKind) (context.Context, *Span) { + tracer := otel.Tracer(instrumentationName) + + target := opts.Target + if target == "" { + switch { + case opts.ToolName != "": + target = opts.ToolName + case opts.PromptName != "": + target = opts.PromptName + case opts.ResourceURI != "": + target = opts.ResourceURI + } + } + + name := opts.Method + if name == "" { + name = "mcp" + } + if target != "" { + name = name + " " + target + } + + attrs := []attribute.KeyValue{ + attribute.String(AttrMethodName, opts.Method), + } + if opts.ToolName != "" { + attrs = append(attrs, + attribute.String(AttrGenAIToolName, opts.ToolName), + ) + if strings.HasPrefix(opts.Method, "tools/") { + attrs = append(attrs, attribute.String(AttrGenAIOperationName, OperationExecuteTool)) + } + } + if opts.PromptName != "" { + attrs = append(attrs, attribute.String(AttrGenAIPromptName, opts.PromptName)) + } + if opts.ResourceURI != "" { + attrs = append(attrs, attribute.String(AttrResourceURI, opts.ResourceURI)) + } + if opts.SessionID != "" { + attrs = append(attrs, attribute.String(AttrSessionID, opts.SessionID)) + } + if opts.ProtocolVersion != "" { + attrs = append(attrs, attribute.String(AttrProtocolVersion, opts.ProtocolVersion)) + } + if opts.JSONRPCRequestID != "" { + attrs = append(attrs, attribute.String(AttrJSONRPCRequestID, opts.JSONRPCRequestID)) + } + if opts.ServerAddress != "" { + attrs = append(attrs, attribute.String("server.address", opts.ServerAddress)) + if opts.ServerPort > 0 { + attrs = append(attrs, attribute.Int("server.port", opts.ServerPort)) + } + } + if conv := 
ConversationIDFromBaggage(ctx); conv != "" { + attrs = append(attrs, attribute.String("gen_ai.conversation.id", conv)) + } + + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(kind), + trace.WithAttributes(attrs...), + ) + + return ctx, &Span{ + span: span, + metricCtx: ctx, + startedAt: time.Now(), + method: opts.Method, + kind: kind, + } +} + +// SetAttributes adds extra attributes to the span. Use for MCP extensions +// or for response-side attributes the caller learns later +// (e.g. rpc.response.status_code). +func (s *Span) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// RecordError marks the span as failed and stores error.type for the +// duration metric. errType should be a short, low-cardinality string; +// when empty, ClassifyError(err) supplies a value (one of +// "context_canceled", "deadline_exceeded", "rpc_error"). +func (s *Span) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the operation duration metric. Safe to +// call multiple times; subsequent calls are no-ops. +func (s *Span) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + errType := s.errType + s.mu.Unlock() + + s.span.End() + + insts := getInstruments() + if insts == nil { + return + } + attrs := []attribute.KeyValue{ + attribute.String(AttrMethodName, s.method), + } + if errType != "" { + attrs = append(attrs, attribute.String("error.type", errType)) + } + + histogram := insts.clientOperationDuration + if s.kind == trace.SpanKindServer { + histogram = insts.serverOperationDuration + } + if histogram == nil { + return + } + // Use the span's started-at as the reference; we already snapshot + // errType under the lock above, so no additional locking is needed + // for the immutable startedAt field. + histogram.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(attrs...), + ) +} + +// ClassifyError maps an MCP error to a low-cardinality error.type value. +// MCP errors are often plain RPC errors; this helper picks reasonable +// labels for cancellation and falls back to the type name otherwise. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + return "rpc_error" +} diff --git a/pkg/toolinstall/registry.go b/pkg/toolinstall/registry.go index 1c53189ef..5529483b6 100644 --- a/pkg/toolinstall/registry.go +++ b/pkg/toolinstall/registry.go @@ -14,6 +14,8 @@ import ( "github.com/goccy/go-yaml" "github.com/natefinch/atomic" + + "github.com/docker/docker-agent/pkg/httpclient" ) // githubToken returns a GitHub personal access token from the environment, @@ -115,7 +117,7 @@ var ( // NewRegistry creates a new Registry with default settings. 
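The mirror-image server-side pattern for the pkg/telemetry/mcp helpers above, sketched with placeholder handler names; only ExtractMeta, StartServer, RecordError, and End are from this patch.

package mcpserver // illustrative handler, not part of this patch

import (
	"context"

	"github.com/docker/docker-agent/pkg/telemetry/mcp"
)

// dispatchTool stands in for the real tools/call handling.
func dispatchTool(ctx context.Context, toolName string) error { return nil }

func handleToolsCall(ctx context.Context, toolName, sessionID string, meta map[string]any) error {
	// Chain onto the caller's CLIENT span (and pick up its baggage) before
	// starting the SERVER span.
	ctx = mcp.ExtractMeta(ctx, meta)
	ctx, span := mcp.StartServer(ctx, mcp.CallOptions{
		Method:    mcp.MethodToolsCall,
		ToolName:  toolName,
		SessionID: sessionID,
	})
	defer span.End()

	if err := dispatchTool(ctx, toolName); err != nil {
		span.RecordError(err, "")
		return err
	}
	return nil
}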
func NewRegistry() *Registry { return &Registry{ - httpClient: http.DefaultClient, + httpClient: httpclient.TracedDefaultClient(), baseURL: registryBaseURL, cacheDir: RegistryDir(), } diff --git a/pkg/tools/builtin/agent/agent.go b/pkg/tools/builtin/agent/agent.go index d195695ea..ff2b3f07a 100644 --- a/pkg/tools/builtin/agent/agent.go +++ b/pkg/tools/builtin/agent/agent.go @@ -12,9 +12,14 @@ import ( "time" "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -295,6 +300,13 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // via HandleStop which calls cancel(). taskCtx, cancel := context.WithCancel(context.WithoutCancel(ctx)) + // Capture a link to the current trace so the background task's + // new root trace can be navigated back to the spawning agent in + // observability-svc. The parent span context comes from the + // active `runtime.tool.call` span; the link survives even after + // that span ends, while a child-span relationship would not. + parentSpanContext := trace.SpanContextFromContext(ctx) + t := &task{ id: taskID, agentName: params.Agent, @@ -308,9 +320,50 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall h.wg.Go(func() { defer cancel() + // Each background task starts its own trace (WithNewRoot) + // because it outlives the spawning request — making it a + // child would leave a span open after the parent ended. + // A span link preserves navigability from the spawning + // trace to the background task. + spanAttrs := []attribute.KeyValue{ + attribute.String("cagent.background_agent.task_id", taskID), + attribute.String("cagent.background_agent.agent", params.Agent), + } + // Stamp gen_ai.conversation.id directly: WithNewRoot resets the + // span context but baggage flows through context.WithoutCancel, + // so the id is reachable yet would not appear as a span attr + // without an explicit lift. + if convID := genai.ConversationIDFromContext(taskCtx); convID != "" { + spanAttrs = append(spanAttrs, attribute.String(genai.AttrConversationID, convID)) + } + startOpts := []trace.SpanStartOption{ + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithNewRoot(), + trace.WithAttributes(spanAttrs...), + } + if parentSpanContext.IsValid() { + startOpts = append(startOpts, trace.WithLinks(trace.Link{ + SpanContext: parentSpanContext, + Attributes: []attribute.KeyValue{ + attribute.String("cagent.link.kind", "spawned_from"), + }, + })) + } + // Static span name; the agent name lives in the + // `cagent.background_agent.agent` attribute. Putting the + // user-defined agent name into the span name itself would + // blow up Tempo's operation-name index when many agents are + // configured. 
+ tracedCtx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/builtin/agent").Start( + taskCtx, + "background_agent.run", + startOpts..., + ) + defer span.End() + slog.Debug("Starting background agent task", "task_id", taskID, "agent", params.Agent) - result := h.runner.RunAgent(taskCtx, RunParams{ + result := h.runner.RunAgent(tracedCtx, RunParams{ AgentName: params.Agent, Task: params.Task, ExpectedOutput: params.ExpectedOutput, @@ -321,12 +374,18 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall if result.ErrMsg != "" { t.errMsg = result.ErrMsg t.storeStatus(taskFailed) + span.SetStatus(codes.Error, result.ErrMsg) + span.SetAttributes( + attribute.String("error.type", "agent_error"), + attribute.String("cagent.background_agent.outcome", "failed"), + ) slog.Debug("Background agent task failed", "task_id", taskID, "agent", params.Agent, "error", result.ErrMsg) return } - if taskCtx.Err() != nil && t.loadStatus() == taskRunning { + if tracedCtx.Err() != nil && t.loadStatus() == taskRunning { t.storeStatus(taskStopped) + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "stopped")) slog.Debug("Background agent task stopped", "task_id", taskID) return } @@ -335,6 +394,7 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // always see the populated result field. t.result = result.Result if t.casStatus(taskRunning, taskCompleted) { + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "completed")) slog.Debug("Background agent task completed", "task_id", taskID, "agent", params.Agent) } }) diff --git a/pkg/tools/builtin/api/api.go b/pkg/tools/builtin/api/api.go index 9b13d5d1b..5fd30e7a2 100644 --- a/pkg/tools/builtin/api/api.go +++ b/pkg/tools/builtin/api/api.go @@ -13,6 +13,7 @@ import ( "time" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/js" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" @@ -34,7 +35,7 @@ var ( func (t *Tool) callTool(ctx context.Context, toolCall tools.ToolCall) (*tools.ToolCallResult, error) { client := &http.Client{ Timeout: 30 * time.Second, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), } endpoint := t.config.Endpoint diff --git a/pkg/tools/builtin/deferred/deferred.go b/pkg/tools/builtin/deferred/deferred.go index e18b354fb..30f35938e 100644 --- a/pkg/tools/builtin/deferred/deferred.go +++ b/pkg/tools/builtin/deferred/deferred.go @@ -8,6 +8,9 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/tools" ) @@ -84,7 +87,7 @@ type AddToolArgs struct { Name string `json:"name" jsonschema:"The name of the tool to activate"` } -func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { +func (d *Toolset) handleSearchTool(ctx context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { query := strings.ToLower(args.Query) d.mu.RLock() @@ -103,6 +106,15 @@ func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*too } } + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "search_tool"), + attribute.String("cagent.tool.deferred.query", args.Query), + attribute.Int("cagent.tool.deferred.match_count", len(results)), + 
attribute.Int("cagent.tool.deferred.pool_size", len(d.deferredTools)), + ) + } + if len(results) == 0 { return tools.ResultError(fmt.Sprintf("No deferred tools found matching '%s'", args.Query)), nil } @@ -115,21 +127,37 @@ func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*too return tools.ResultSuccess(fmt.Sprintf("Found %d deferred tool(s):\n%s", len(results), string(output))), nil } -func (d *Toolset) handleAddTool(_ context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { +func (d *Toolset) handleAddTool(ctx context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { d.mu.Lock() defer d.mu.Unlock() + span := trace.SpanFromContext(ctx) + annotate := func(outcome string) { + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "add_tool"), + attribute.String("cagent.tool.deferred.tool_name", args.Name), + attribute.String("cagent.tool.deferred.outcome", outcome), + attribute.Int("cagent.tool.deferred.activated_count", len(d.activatedTools)), + ) + } + if _, exists := d.activatedTools[args.Name]; exists { + annotate("already_active") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' is already active", args.Name)), nil } entry, exists := d.deferredTools[args.Name] if !exists { + annotate("not_found") return tools.ResultError(fmt.Sprintf("Tool '%s' not found.", args.Name)), nil } delete(d.deferredTools, args.Name) d.activatedTools[args.Name] = entry.tool + annotate("activated") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' has been activated and is now available for use.\n\nDescription: %s", args.Name, entry.tool.Description)), nil } diff --git a/pkg/tools/builtin/fetch/fetch.go b/pkg/tools/builtin/fetch/fetch.go index eb1edd522..1c50d8f48 100644 --- a/pkg/tools/builtin/fetch/fetch.go +++ b/pkg/tools/builtin/fetch/fetch.go @@ -15,7 +15,10 @@ import ( htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" "github.com/k3a/html2text" "github.com/temoto/robotstxt" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/useragent" @@ -48,15 +51,55 @@ type ToolArgs struct { Format string `json:"format,omitempty"` } +// sanitizeFetchURLs strips query strings and userinfo from each URL so +// the resulting span attribute can ship by default without leaking +// signed-URL tokens, OAuth codes, or inline credentials. URLs that fail +// to parse are emitted as a sentinel rather than the raw string, since +// an unparseable URL could also carry sensitive material. +func sanitizeFetchURLs(urls []string) []string { + out := make([]string, len(urls)) + for i, raw := range urls { + u, err := url.Parse(raw) + if err != nil { + out[i] = "" + continue + } + u.RawQuery = "" + u.Fragment = "" + u.User = nil + out[i] = u.String() + } + return out +} + func (h *fetchHandler) CallTool(ctx context.Context, params ToolArgs) (*tools.ToolCallResult, error) { if len(params.URLs) == 0 { return nil, errors.New("at least one URL is required") } + // Decorate the active runtime.tool.handler span with the requested + // URLs. Strip query params and userinfo first: query strings often + // carry signed-URL tokens, OAuth codes, or session IDs, and userinfo + // carries credentials inline. The path stays intact so dashboards + // can still answer "which sites/endpoints did the agent hit?" 
— the + // HTTP CLIENT child span emitted by `httpclient.WrapWithOTel` below + // retains the full URL under `http.url` for callers that opt into + // that backend's full-URL capture. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + attrs := []attribute.KeyValue{ + attribute.Int("cagent.tool.fetch.url_count", len(params.URLs)), + attribute.StringSlice("cagent.tool.fetch.urls", sanitizeFetchURLs(params.URLs)), + } + if params.Format != "" { + attrs = append(attrs, attribute.String("cagent.tool.fetch.format", params.Format)) + } + span.SetAttributes(attrs...) + } + // Set timeout if specified client := &http.Client{ Timeout: h.timeout, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), // Re-check the domain allow/deny lists on every redirect: without this, // an allowed origin could redirect into a denied one and bypass the // policy. The 10-redirect cap mirrors the net/http default. diff --git a/pkg/tools/builtin/filesystem/filesystem.go b/pkg/tools/builtin/filesystem/filesystem.go index 1bd63ff57..e43605e2a 100644 --- a/pkg/tools/builtin/filesystem/filesystem.go +++ b/pkg/tools/builtin/filesystem/filesystem.go @@ -15,11 +15,49 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/fsx" "github.com/docker/docker-agent/pkg/tools" ) +// annotateFilesystemSpan stamps the operation kind and target path +// onto the active runtime.tool.handler span. Paths ship unconditionally +// — they're the main signal of what the agent touched. Drop or hash +// `cagent.tool.filesystem.path` at the OTel collector if paths +// routinely reveal identifiers you don't want shipped. +func annotateFilesystemSpan(ctx context.Context, op, path string) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + attrs := []attribute.KeyValue{ + attribute.String("cagent.tool.filesystem.op", op), + } + if path != "" { + attrs = append(attrs, attribute.String("cagent.tool.filesystem.path", path)) + } + span.SetAttributes(attrs...) +} + +// maxFilesystemPathsAttr caps how many entries from args.Paths land on a +// span attribute. Many backends drop attributes over a few KiB and per- +// element string costs add up fast on a multi-hundred-path call. The +// path_count attribute (always recorded) preserves total fidelity. +const maxFilesystemPathsAttr = 32 + +// cappedPaths returns paths truncated to maxFilesystemPathsAttr entries. +// Callers should also record `path_count = len(paths)` separately so the +// truncation is visible. 
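A worked example of the URL sanitization above, phrased as a test that could sit alongside fetch.go (not part of this patch); the URLs are invented and the expected outputs follow from standard net/url behaviour.

package fetch // would live next to fetch.go; illustrative test only

import (
	"slices"
	"testing"
)

func TestSanitizeFetchURLsExample(t *testing.T) {
	got := sanitizeFetchURLs([]string{
		"https://user:app-token@api.example.com/v1/files/123?sig=abc123#body",
		"https://cdn.example.com/report.pdf",
	})
	want := []string{
		"https://api.example.com/v1/files/123", // userinfo, query, and fragment stripped
		"https://cdn.example.com/report.pdf",   // already clean, unchanged
	}
	if !slices.Equal(got, want) {
		t.Fatalf("sanitizeFetchURLs() = %v, want %v", got, want)
	}
}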
+func cappedPaths(paths []string) []string { + if len(paths) <= maxFilesystemPathsAttr { + return paths + } + return paths[:maxFilesystemPathsAttr] +} + const ( ToolNameReadFile = "read_file" ToolNameReadMultipleFiles = "read_multiple_files" @@ -604,6 +642,7 @@ func (t *Tool) shouldIgnorePath(path string) bool { // Handler implementations func (t *Tool) handleDirectoryTree(ctx context.Context, args DirectoryTreeArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "directory_tree", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -676,6 +715,7 @@ func (t *Tool) editFileHandler() tools.ToolHandler { } func (t *Tool) handleEditFile(ctx context.Context, args EditFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "edit_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -713,7 +753,8 @@ func (t *Tool) handleEditFile(ctx context.Context, args EditFileArgs) (*tools.To return tools.ResultSuccess("File edited successfully. Changes:\n" + strings.Join(changes, "\n")), nil } -func (t *Tool) handleListDirectory(_ context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleListDirectory(ctx context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "list_directory", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -754,7 +795,8 @@ func (t *Tool) handleListDirectory(_ context.Context, args ListDirectoryArgs) (* }, nil } -func (t *Tool) handleReadFile(_ context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleReadFile(ctx context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return &tools.ToolCallResult{ @@ -861,6 +903,13 @@ func (t *Tool) readImageFile(resolvedPath, originalPath string) (*tools.ToolCall } func (t *Tool) handleReadMultipleFiles(ctx context.Context, args ReadMultipleFilesArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_multiple_files", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } type PathContent struct { Path string `json:"path"` Content string `json:"content"` @@ -934,7 +983,8 @@ func (t *Tool) handleReadMultipleFiles(ctx context.Context, args ReadMultipleFil }, nil } -func (t *Tool) handleSearchFilesContent(_ context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleSearchFilesContent(ctx context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "search_files_content", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -1054,6 +1104,7 @@ func (t *Tool) handleSearchFilesContent(_ context.Context, args SearchFilesConte } func (t *Tool) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "write_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return 
tools.ResultError(err.Error()), nil @@ -1076,7 +1127,14 @@ func (t *Tool) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools. return tools.ResultSuccess(fmt.Sprintf("File written successfully: %s (%d bytes)", args.Path, len(args.Content))), nil } -func (t *Tool) handleCreateDirectory(_ context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleCreateDirectory(ctx context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "create_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) @@ -1092,7 +1150,14 @@ func (t *Tool) handleCreateDirectory(_ context.Context, args CreateDirectoryArgs return tools.ResultSuccess(strings.Join(results, "\n")), nil } -func (t *Tool) handleRemoveDirectory(_ context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleRemoveDirectory(ctx context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "remove_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) diff --git a/pkg/tools/builtin/lsp/lsp.go b/pkg/tools/builtin/lsp/lsp.go index ed1c42ace..3c3cca40b 100644 --- a/pkg/tools/builtin/lsp/lsp.go +++ b/pkg/tools/builtin/lsp/lsp.go @@ -19,6 +19,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/lifecycle" @@ -460,12 +463,29 @@ type WorkspaceArgs struct{} // lspTool is a shorthand for constructing a tools.Tool with common LSP defaults. func lspTool(name, title, description string, readOnly bool, params any, handler tools.ToolHandler) tools.Tool { + // Wrap the handler so every LSP RPC stamps the LSP method name on + // the active runtime.tool.handler span. Single tool name = single + // LSP operation, so the gen_ai.tool.name attribute on the parent + // span is enough for filtering by RPC kind in dashboards. The + // `cagent.tool.lsp.tool` is redundant with gen_ai.tool.name but + // kept under the cagent.* namespace for symmetry with the other + // builtin tool annotations and so dashboards have a uniform + // `cagent.tool.{kind}.*` query surface across builtins. 
+ wrapped := func(ctx context.Context, tc tools.ToolCall) (*tools.ToolCallResult, error) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.lsp.tool", name), + attribute.Bool("cagent.tool.lsp.read_only", readOnly), + ) + } + return handler(ctx, tc) + } return tools.Tool{ Name: name, Category: "lsp", Description: description, Parameters: params, - Handler: handler, + Handler: wrapped, Annotations: tools.ToolAnnotations{ Title: title, ReadOnlyHint: readOnly, diff --git a/pkg/tools/builtin/lsp/lsp_lifecycle.go b/pkg/tools/builtin/lsp/lsp_lifecycle.go index 4a7376497..bc8cf8469 100644 --- a/pkg/tools/builtin/lsp/lsp_lifecycle.go +++ b/pkg/tools/builtin/lsp/lsp_lifecycle.go @@ -12,6 +12,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/concurrent" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools/lifecycle" ) @@ -28,7 +29,7 @@ func (c *lspConnector) Connect(ctx context.Context) (lifecycle.Session, error) { h := c.h slog.Debug("Starting LSP server", "command", h.command, "args", h.args) - p, err := spawnLSPProcess(h) + p, err := spawnLSPProcess(ctx, h) if err != nil { return nil, err } @@ -73,14 +74,19 @@ type lspProcess struct { // kicks off a stderr-drain goroutine bound to the process lifetime. // Errors are mapped to typed lifecycle errors so the supervisor can // apply the right policy. -func spawnLSPProcess(h *lspHandler) (*lspProcess, error) { +func spawnLSPProcess(callerCtx context.Context, h *lspHandler) (*lspProcess, error) { // The process must outlive the caller's request context (which is // often cancelled when an HTTP/agent turn ends). The supervisor // calls Close to shut it down on Stop or restart. processCtx, processCancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(processCtx, h.command, h.args...) + // Inherit the caller's W3C trace context (the Connect call's + // `toolset.start` or per-request span) so an OTel-aware LSP server + // can chain its spans onto the agent trace. Most LSPs do not emit + // OTel today, so this is defensive parity with sandbox.exec. cmd.Env = append(os.Environ(), h.env...) + cmd.Env = append(cmd.Env, genai.InjectTraceContextEnv(callerCtx)...) 
cmd.Dir = h.workingDir stdin, err := cmd.StdinPipe() diff --git a/pkg/tools/builtin/openapi/openapi.go b/pkg/tools/builtin/openapi/openapi.go index 863098e30..97a7c2034 100644 --- a/pkg/tools/builtin/openapi/openapi.go +++ b/pkg/tools/builtin/openapi/openapi.go @@ -18,6 +18,7 @@ import ( v3 "github.com/pb33f/libopenapi/datamodel/high/v3" "go.yaml.in/yaml/v4" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/upstream" @@ -74,7 +75,7 @@ func (t *Tool) fetchSpec(ctx context.Context) (*v3.Document, error) { req.Header.Set("Accept", "application/json") setHeaders(req, t.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } @@ -423,7 +424,7 @@ func (h *openAPIHandler) callTool(ctx context.Context, params openAPICallArgs) ( req.Header.Set("Accept", "application/json") setHeaders(req, h.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } diff --git a/pkg/tools/builtin/shell/script_shell.go b/pkg/tools/builtin/shell/script_shell.go index 2eacd4259..061466067 100644 --- a/pkg/tools/builtin/shell/script_shell.go +++ b/pkg/tools/builtin/shell/script_shell.go @@ -11,6 +11,9 @@ import ( "slices" "strings" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/shellpath" "github.com/docker/docker-agent/pkg/tools" @@ -138,6 +141,17 @@ func (t *ScriptShellTool) execute(ctx context.Context, toolConfig *latest.Script } } + // Stamp the script_shell call shape onto the active span. Cmd + // ships unconditionally for the same reason as shell.RunShell — + // see that comment for the redact-at-collector guidance. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.script_shell.tool_name", toolCall.Function.Name), + attribute.String("cagent.tool.script_shell.cmd", toolConfig.Cmd), + attribute.String("cagent.tool.script_shell.cwd", cmp.Or(toolConfig.WorkingDir, ".")), + ) + } + shell, argsPrefix := shellpath.DetectShell() cmd := exec.CommandContext(ctx, shell, append(argsPrefix, toolConfig.Cmd)...) diff --git a/pkg/tools/builtin/shell/shell.go b/pkg/tools/builtin/shell/shell.go index 53d8beb61..96377bc24 100644 --- a/pkg/tools/builtin/shell/shell.go +++ b/pkg/tools/builtin/shell/shell.go @@ -16,6 +16,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/shellpath" @@ -199,6 +202,19 @@ func (h *shellHandler) RunShell(ctx context.Context, params RunShellArgs) (*tool cwd := h.resolveWorkDir(params.Cwd) + // Stamp the call shape (cmd, cwd, timeout) onto the active span. + // Cmd ships unconditionally — it's the main signal of what the + // agent actually did, and gating it on chat-content capture loses + // too much debug value. 
Drop or hash `cagent.tool.shell.cmd` at + // the OTel collector if commands routinely carry secrets. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.shell.cmd", params.Cmd), + attribute.Float64("cagent.tool.shell.timeout_seconds", timeout.Seconds()), + attribute.String("cagent.tool.shell.cwd", cwd), + ) + } + slog.Debug("Executing native shell command", "command", params.Cmd, "cwd", cwd) return h.runNativeCommand(timeoutCtx, ctx, params.Cmd, cwd, timeout), nil diff --git a/pkg/tools/builtin/todo/todo.go b/pkg/tools/builtin/todo/todo.go index 58dd73f98..450fa5a60 100644 --- a/pkg/tools/builtin/todo/todo.go +++ b/pkg/tools/builtin/todo/todo.go @@ -8,10 +8,43 @@ import ( "sync" "sync/atomic" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" ) +// annotateTodoSpan stamps the operation kind, batch size, and the +// resulting list size onto the active runtime.tool.handler span so a +// glance at a session shows when the agent was actually managing +// progress vs. just chatting. +func annotateTodoSpan(ctx context.Context, op string, batch, total, completed int) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.todo.op", op), + attribute.Int("cagent.tool.todo.batch_size", batch), + attribute.Int("cagent.tool.todo.total", total), + attribute.Int("cagent.tool.todo.completed", completed), + ) +} + +// countCompleted returns how many todos in the current snapshot are +// marked completed. Cheap O(n) scan over a typically-tiny slice; called +// once per todo handler invocation for the span annotation. 
+func countCompleted(all []Todo) int { + n := 0 + for _, t := range all { + if t.Status == "completed" { + n++ + } + } + return n +} + const ( ToolNameCreateTodo = "create_todo" ToolNameCreateTodos = "create_todos" @@ -199,9 +232,11 @@ func (h *todoHandler) jsonResult(ctx context.Context, v any) (*tools.ToolCallRes func (h *todoHandler) createTodo(ctx context.Context, params CreateTodoArgs) (*tools.ToolCallResult, error) { created := h.addTodo(ctx, params.Description) + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todo", 1, len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodoOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -211,9 +246,11 @@ func (h *todoHandler) createTodos(ctx context.Context, params CreateTodosArgs) ( for _, desc := range params.Descriptions { created = append(created, h.addTodo(ctx, desc)) } + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todos", len(params.Descriptions), len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodosOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -246,6 +283,7 @@ func (h *todoHandler) updateTodos(ctx context.Context, params UpdateTodosArgs) ( result.AllTodos = h.storage.All(ctx) result.Reminder = h.incompleteReminder(ctx) + annotateTodoSpan(ctx, "update_todos", len(params.Updates), len(result.AllTodos), countCompleted(result.AllTodos)) return h.jsonResult(ctx, result) } @@ -283,6 +321,7 @@ func (h *todoHandler) listTodos(ctx context.Context, _ tools.ToolCall) (*tools.T if todos == nil { todos = []Todo{} } + annotateTodoSpan(ctx, "list_todos", 0, len(todos), countCompleted(todos)) out := ListTodosOutput{Todos: todos} out.Reminder = h.incompleteReminder(ctx) return h.jsonResult(ctx, out) diff --git a/pkg/tools/builtin/userprompt/userprompt.go b/pkg/tools/builtin/userprompt/userprompt.go index 421845a5e..aecf2ce68 100644 --- a/pkg/tools/builtin/userprompt/userprompt.go +++ b/pkg/tools/builtin/userprompt/userprompt.go @@ -6,6 +6,8 @@ import ( "fmt" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/tools" ) @@ -47,6 +49,14 @@ func (t *Tool) userPrompt(ctx context.Context, params Args) (*tools.ToolCallResu return tools.ResultError("user_prompt tool is not available in this context (no elicitation handler configured)"), nil } + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.user_prompt.message_length", len(params.Message)), + attribute.Bool("cagent.tool.user_prompt.has_schema", params.Schema != nil), + ) + } + var meta mcp.Meta if params.Title != "" { meta = mcp.Meta{"cagent/title": params.Title} @@ -68,6 +78,10 @@ func (t *Tool) userPrompt(ctx context.Context, params Args) (*tools.ToolCallResu Content: result.Content, } + if span.IsRecording() { + span.SetAttributes(attribute.String("cagent.tool.user_prompt.action", string(result.Action))) + } + responseJSON, err := json.Marshal(response) if err != nil { return nil, fmt.Errorf("failed to marshal response: %w", err) diff --git a/pkg/tools/codemode/exec.go b/pkg/tools/codemode/exec.go index 0d16b3035..df143b1f4 100644 --- a/pkg/tools/codemode/exec.go +++ b/pkg/tools/codemode/exec.go @@ -3,12 +3,17 @@ package codemode import ( "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "fmt" "slices" "github.com/dop251/goja" + 
"go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -40,6 +45,29 @@ func (c *codeModeTool) runJavascript(ctx context.Context, script string) (Script vm := goja.New() tracker := &toolCallTracker{} + // Always stamp a hash + length so dashboards can correlate + // identical scripts ("model ran the same script 200 times this + // hour") without ever shipping the body. Codemode scripts are + // kilobyte-scale arbitrary JS — embedded auth tokens, pasted + // user data, and inline secrets are common — so the body itself + // is gated behind the GenAI content-capture opt-in. + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + sum := sha256.Sum256([]byte(script)) + span.SetAttributes( + attribute.String("cagent.tool.codemode.script_hash", hex.EncodeToString(sum[:])), + attribute.Int("cagent.tool.codemode.script_length", len(script)), + ) + if genai.IsContentCaptureEnabled() { + span.SetAttributes(attribute.String("cagent.tool.codemode.script", script)) + } + } + defer func() { + if span.IsRecording() { + span.SetAttributes(attribute.Int("cagent.tool.codemode.tool_call_count", len(tracker.calls))) + } + }() + // Inject console object to the help the LLM debug its own code. var ( stdOut bytes.Buffer diff --git a/pkg/tools/mcp/mcp.go b/pkg/tools/mcp/mcp.go index a0537fba3..34f9f3abf 100644 --- a/pkg/tools/mcp/mcp.go +++ b/pkg/tools/mcp/mcp.go @@ -16,6 +16,8 @@ import ( "time" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/tools" @@ -33,6 +35,11 @@ type mcpClient interface { SetManagedOAuth(managed bool) SetToolListChangedHandler(handler func()) SetPromptListChangedHandler(handler func()) + // ServerAddress returns the connection identifier (URL for remote + // clients, executable name for stdio). Used by `Toolset.Start` to + // stamp `server.address` on the parent `toolset.start` span so + // initialize failures show which target produced them. + ServerAddress() string // Wait blocks until the underlying connection is closed by the server. // It returns nil if the connection was closed gracefully. Wait() error @@ -286,6 +293,19 @@ func (ts *Toolset) Start(ctx context.Context) error { if ts.supervisor == nil { return errors.New("toolset has no supervisor: must be created via NewToolsetCommand or NewRemoteToolset") } + // Stamp the connection identifier on the parent `toolset.start` + // span before doing anything else so an Initialize failure (e.g. + // the multi-replica MCP "session not found" 404 case) carries the + // target address as `server.address` — without this, the error + // message has the only clue and triage requires log greppage to + // match toolsets to URLs. 
+ if ts.mcpClient != nil { + if addr := ts.mcpClient.ServerAddress(); addr != "" { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes(attribute.String("server.address", addr)) + } + } + } return ts.supervisor.Start(ctx) } diff --git a/pkg/tools/mcp/mcp_test.go b/pkg/tools/mcp/mcp_test.go index 8a80e6264..63be08bab 100644 --- a/pkg/tools/mcp/mcp_test.go +++ b/pkg/tools/mcp/mcp_test.go @@ -50,6 +50,8 @@ func (m *mockMCPClient) SetToolListChangedHandler(func()) {} func (m *mockMCPClient) SetPromptListChangedHandler(func()) {} +func (m *mockMCPClient) ServerAddress() string { return "mock://test" } + func (m *mockMCPClient) Wait() error { return nil } func (m *mockMCPClient) Close(context.Context) error { return nil } diff --git a/pkg/tools/mcp/oauth.go b/pkg/tools/mcp/oauth.go index fa3fb3b72..6385db666 100644 --- a/pkg/tools/mcp/oauth.go +++ b/pkg/tools/mcp/oauth.go @@ -16,9 +16,15 @@ import ( "time" mcpsdk "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -475,17 +481,42 @@ func (t *oauthTransport) getValidToken(ctx context.Context) *OAuthToken { slog.Debug("Attempting silent token refresh", "url", t.baseURL) - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + // Wrap the refresh path in a span so the latency and failure + // rate of silent OAuth token refreshes are visible — the user + // otherwise just sees a stalled MCP request with no obvious + // cause. Pull conversation id from baggage so observability-svc + // can attribute the refresh to the spawning session. 
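// otelmcp.ConversationIDFromBaggage is used below but not defined in this
// diff. A minimal sketch of its assumed shape, using
// go.opentelemetry.io/otel/baggage (the real helper lives in
// pkg/telemetry/mcp and may read a different baggage key):

func ConversationIDFromBaggage(ctx context.Context) string {
	// The zero Member's Value() is "", so callers can gate the
	// gen_ai.conversation.id span attribute on a non-empty result,
	// exactly as the refresh and flow spans below do.
	return baggage.FromContext(ctx).Member("gen_ai.conversation.id").Value()
}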
+ refreshAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + refreshAttrs = append(refreshAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, refreshSpan := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.token.refresh", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(refreshAttrs...), + ) + defer refreshSpan.End() + + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} authServer := cmp.Or(token.AuthServer, t.baseURL) metadata, err := o.getAuthorizationServerMetadata(ctx, authServer) if err != nil { slog.Debug("Failed to fetch auth server metadata for refresh", "auth_server", authServer, "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "metadata fetch failed") + refreshSpan.SetAttributes(attribute.String("error.type", "metadata")) return nil } newToken, err := RefreshAccessToken(ctx, metadata.TokenEndpoint, token.RefreshToken, token.ClientID, token.ClientSecret) if err != nil { slog.Debug("Token refresh failed, will require interactive auth", "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "refresh failed") + refreshSpan.SetAttributes(attribute.String("error.type", "refresh_token")) t.mu.Lock() t.refreshFailedAt = time.Now() t.mu.Unlock() @@ -546,24 +577,54 @@ func configuredScopes(c *latest.RemoteOAuthConfig) []string { } // handleOAuthFlow performs the OAuth flow when a 401 response is received -func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { +func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) (err error) { + kind := "unmanaged" if t.managed { - return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + kind = "managed" + } + // Interactive OAuth flows can take seconds to minutes (user + // switches to browser, completes the consent screen, comes + // back). The span makes that latency attributable and gives + // dashboards a way to count auth-failure rates by managed kind. 
+ flowAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + attribute.String("cagent.oauth.kind", kind), } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + flowAttrs = append(flowAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.flow", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(flowAttrs...), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + if t.managed { + return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + } return t.handleUnmanagedOAuthFlow(ctx, authServer, wwwAuth) } func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -585,7 +646,8 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) @@ -628,6 +690,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, scopes = t.oauthConfig.Scopes case authServerMetadata.RegistrationEndpoint != "": slog.Debug("Attempting dynamic client registration") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "dynamic_client_registration"))) clientID, clientSecret, err = RegisterClient(ctx, authServerMetadata, redirectURI, nil) if err != nil { slog.Debug("Dynamic registration failed", "error", err) @@ -676,6 +739,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Requesting authorization code", "url", authURL) + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "request_authorization_code"))) code, receivedState, err := RequestAuthorizationCode(ctx, authURL, callbackServer, state) if err != nil { @@ -687,6 +751,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Exchanging authorization code for token") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "token_exchange"))) token, err := ExchangeCodeForToken( ctx, authServerMetadata.TokenEndpoint, @@ -720,15 +785,17 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, // where the client handles the OAuth 
interaction instead of us func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting unmanaged OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) // Extract resource URL from WWW-Authenticate header resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -750,7 +817,8 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) diff --git a/pkg/tools/mcp/oauth_helpers.go b/pkg/tools/mcp/oauth_helpers.go index ca9e862c8..768bec002 100644 --- a/pkg/tools/mcp/oauth_helpers.go +++ b/pkg/tools/mcp/oauth_helpers.go @@ -16,6 +16,7 @@ import ( "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/browser" + "github.com/docker/docker-agent/pkg/httpclient" ) // GenerateState generates a random state parameter for OAuth CSRF protection @@ -62,7 +63,7 @@ func ExchangeCodeForToken(ctx context.Context, tokenEndpoint, code, codeVerifier req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to exchange code for token: %w", err) } @@ -221,7 +222,7 @@ func RegisterClient(ctx context.Context, authMetadata *AuthorizationServerMetada } req.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return "", "", fmt.Errorf("failed to register client: %w", err) } @@ -269,7 +270,7 @@ func RefreshAccessToken(ctx context.Context, tokenEndpoint, refreshToken, client } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to refresh token: %w", err) } diff --git a/pkg/tools/mcp/oauth_login.go b/pkg/tools/mcp/oauth_login.go index 00d57c8fb..a71ddc2a2 100644 --- a/pkg/tools/mcp/oauth_login.go +++ b/pkg/tools/mcp/oauth_login.go @@ -11,6 +11,8 @@ import ( "time" "golang.org/x/oauth2" + + "github.com/docker/docker-agent/pkg/httpclient" ) // PerformOAuthLogin performs a standalone OAuth flow for the given MCP server URL. 
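// The httpclient helpers swapped in throughout this change (TracedClient,
// TracedDefaultClient, WrapWithOTel) are not part of the diff. A rough
// sketch of their assumed behaviour, not the actual pkg/httpclient code:
// TracedClient applies caller tweaks to a client whose transport is wrapped
// with otelhttp, TracedDefaultClient is the zero-tweak variant, and
// WrapWithOTel degrades to a pass-through when OTel is disabled. Assumes
// net/http, time, and the otelhttp contrib package are imported.

func TracedClient(tweaks ...func(*http.Client)) *http.Client {
	c := &http.Client{Transport: WrapWithOTel(http.DefaultTransport)}
	for _, tweak := range tweaks {
		tweak(c) // e.g. func(c *http.Client) { c.Timeout = 5 * time.Second }
	}
	return c
}

func TracedDefaultClient() *http.Client { return TracedClient() }

func WrapWithOTel(rt http.RoundTripper) http.RoundTripper {
	if !otelEnabled() { // hypothetical runtime gate; name assumed
		return rt
	}
	// otelhttp.NewTransport emits an HTTP CLIENT span per request and
	// injects traceparent via the globally registered propagator.
	return otelhttp.NewTransport(rt)
}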
@@ -19,7 +21,7 @@ import ( func PerformOAuthLogin(ctx context.Context, serverURL string) error { tokenStore := NewKeyringTokenStore() - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} // Derive the base origin (scheme + host) from the server URL. // The well-known endpoints live at the origin, not under the SSE/path. @@ -35,7 +37,7 @@ func PerformOAuthLogin(ctx context.Context, serverURL string) error { if err != nil { return fmt.Errorf("failed to create resource metadata request: %w", err) } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return fmt.Errorf("failed to fetch protected resource metadata: %w", err) } diff --git a/pkg/tools/mcp/oauth_server.go b/pkg/tools/mcp/oauth_server.go index 5a355ccb4..527316a5d 100644 --- a/pkg/tools/mcp/oauth_server.go +++ b/pkg/tools/mcp/oauth_server.go @@ -12,6 +12,8 @@ import ( "strings" "sync" "time" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) // CallbackServer handles OAuth callback requests @@ -53,8 +55,12 @@ func NewCallbackServerOnPort(port int) (*CallbackServer, error) { mux := http.NewServeMux() mux.HandleFunc("/callback", cs.handleCallback) + // Wrap with otelhttp so the OAuth callback span chains onto the + // caller's trace when the OAuth provider preserves trace context + // in the redirect (most don't, but the wrap is harmless when + // they don't, and useful when they do). cs.server = &http.Server{ - Handler: mux, + Handler: otelhttp.NewHandler(mux, "oauth.callback"), ReadTimeout: 10 * time.Second, WriteTimeout: 10 * time.Second, } diff --git a/pkg/tools/mcp/reconnect_test.go b/pkg/tools/mcp/reconnect_test.go index 71ece482b..df0257a89 100644 --- a/pkg/tools/mcp/reconnect_test.go +++ b/pkg/tools/mcp/reconnect_test.go @@ -72,6 +72,7 @@ func (m *failingInitClient) SetOAuthSuccessHandler(func()) {} func (m *failingInitClient) SetManagedOAuth(bool) {} func (m *failingInitClient) SetToolListChangedHandler(func()) {} func (m *failingInitClient) SetPromptListChangedHandler(func()) {} +func (m *failingInitClient) ServerAddress() string { return "mock://failing" } func (m *failingInitClient) Wait() error { m.mu.Lock() diff --git a/pkg/tools/mcp/remote.go b/pkg/tools/mcp/remote.go index 805c3fe1a..42a7a4254 100644 --- a/pkg/tools/mcp/remote.go +++ b/pkg/tools/mcp/remote.go @@ -5,10 +5,12 @@ import ( "fmt" "log/slog" "net/http" + neturl "net/url" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/upstream" ) @@ -31,6 +33,7 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } return &remoteMCPClient{ + sessionClient: sessionClient{serverAddress: sanitizeRemoteAddress(url)}, url: url, transportType: transportType, headers: headers, @@ -39,6 +42,26 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } } +// sanitizeRemoteAddress extracts a span-safe identifier from an MCP URL +// before stamping it as `server.address`. The URL may legitimately +// contain credentials in userinfo (`https://user:token@host/`) or query +// params (`?api_key=...`); sending those to the trace backend would be +// a real exfiltration risk. 
OTel's semantic convention for +// `server.address` is the host (with optional port) anyway, so we keep +// only `u.Host` and drop everything else. +// +// Returns the empty string on parse failure or hostless URLs (file://, +// stdio commands, malformed input). The caller stamps `server.address` +// only when it's non-empty, so a sanitisation miss leaves the span +// without that attribute rather than leaking a raw URL. +func sanitizeRemoteAddress(rawURL string) string { + u, err := neturl.Parse(rawURL) + if err != nil || u.Host == "" { + return "" + } + return u.Host +} + func (c *remoteMCPClient) Initialize(ctx context.Context, _ *gomcp.InitializeRequest) (*gomcp.InitializeResult, error) { // Create HTTP client with OAuth support. We keep a reference to the // oauthTransport so we can enrich Connect errors with the server's own @@ -132,6 +155,16 @@ func (c *remoteMCPClient) SetManagedOAuth(managed bool) { // The oauthTransport is returned alongside the client so callers can inspect // the most recent server-side failure (via lastServerError) when Connect() // returns a bare HTTP-status error and we need to surface the actual cause. +// +// The transport chain wraps `httpclient.WrapWithOTel` outermost so every +// outbound MCP request injects W3C `traceparent` (and creates an HTTP +// CLIENT span). Without this wrap, the streamable-HTTP / SSE transports +// the gomcp SDK builds with our `*http.Client` send raw POST/GET requests +// that never chain onto the calling cagent span — the downstream MCP +// server's spans then live in a separate root trace, breaking end-to-end +// observability for any agent talking to a remote MCP. `WrapWithOTel` is +// a no-op when OTel is disabled at runtime, so the laptop-mode default +// stays unchanged. func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { base := c.headerTransport() @@ -145,7 +178,7 @@ func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { oauthConfig: c.oauthConfig, } - return &http.Client{Transport: oauthT}, oauthT + return &http.Client{Transport: httpclient.WrapWithOTel(oauthT)}, oauthT } func (c *remoteMCPClient) headerTransport() http.RoundTripper { diff --git a/pkg/tools/mcp/remote_test.go b/pkg/tools/mcp/remote_test.go index 98678fd5d..17d97c9b0 100644 --- a/pkg/tools/mcp/remote_test.go +++ b/pkg/tools/mcp/remote_test.go @@ -12,6 +12,38 @@ import ( "github.com/stretchr/testify/require" ) +// TestSanitizeRemoteAddress verifies that URLs with embedded credentials +// (basic-auth userinfo, query-string secrets) collapse to a host-only +// string before reaching the `server.address` span attribute. The point +// is exfiltration safety: a URL like `https://user:token@host/?api_key=…` +// would otherwise be replicated verbatim into every CLIENT span and +// shipped to the trace backend. 
+func TestSanitizeRemoteAddress(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + url string + want string + }{ + {name: "plain", url: "https://example.com/mcp", want: "example.com"}, + {name: "host with port", url: "https://example.com:8443/mcp", want: "example.com:8443"}, + {name: "userinfo stripped", url: "https://alice:s3cret@example.com/mcp", want: "example.com"}, + {name: "query stripped", url: "https://example.com/mcp?api_key=s3cret", want: "example.com"}, + {name: "userinfo and query stripped", url: "https://alice:s3cret@example.com:8443/mcp?api_key=x", want: "example.com:8443"}, + {name: "fragment stripped", url: "https://example.com/mcp#frag", want: "example.com"}, + {name: "hostless empty fallback", url: "not-a-url", want: ""}, + {name: "empty input", url: "", want: ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := sanitizeRemoteAddress(tc.url) + assert.Equal(t, tc.want, got, "sanitizeRemoteAddress(%q)", tc.url) + }) + } +} + // TestRemoteClientCustomHeaders verifies that custom headers passed to the remote // MCP client are actually applied to HTTP requests sent to the MCP server. func TestRemoteClientCustomHeaders(t *testing.T) { diff --git a/pkg/tools/mcp/session_client.go b/pkg/tools/mcp/session_client.go index 778ee1530..e2259142c 100644 --- a/pkg/tools/mcp/session_client.go +++ b/pkg/tools/mcp/session_client.go @@ -9,7 +9,9 @@ import ( "sync" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -17,8 +19,16 @@ import ( // implementations. Both stdioMCPClient and remoteMCPClient embed it to avoid // duplicating the session-nil guards, notification handlers, and delegating // methods. +// +// `serverAddress` is captured at construction time (the remote URL for +// HTTP/SSE clients, the executable name for stdio clients) and stamped on +// every CLIENT-kind MCP span as the OTel `server.address` attribute. Without +// it, a `tools/list` failure span carries `mcp.method.name=tools/list` and +// nothing else identifying which target produced the error — useful in a +// single-MCP agent, useless in any agent wired to two or more. type sessionClient struct { session *gomcp.ClientSession + serverAddress string toolListChangedHandler func() promptListChangedHandler func() elicitationHandler tools.ElicitationHandler @@ -33,6 +43,15 @@ func (c *sessionClient) setSession(s *gomcp.ClientSession) { c.mu.Unlock() } +// ServerAddress returns the connection identifier captured at construction +// time (URL for remote clients, executable name for stdio). Exposed so +// the parent `toolset.start` span can stamp it as `server.address` — +// otherwise an Initialize failure surfaces the error message but no +// indication of which MCP target produced it. +func (c *sessionClient) ServerAddress() string { + return c.serverAddress +} + // getSession returns the current session under the read lock. 
func (c *sessionClient) getSession() *gomcp.ClientSession { c.mu.RLock() @@ -93,35 +112,140 @@ func (c *sessionClient) Close(context.Context) error { } func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsParams) iter.Seq2[*gomcp.Tool, error] { - if s := c.getSession(); s != nil { - return s.Tools(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Tool, error) bool) { + yield(nil, errors.New("session not initialized")) + } } + // Start the span and the underlying RPC inside the closure so a + // caller that obtains the iterator and never iterates does not + // leak the span (and the in-flight RPC). Span lifetime now equals + // iteration lifetime. return func(yield func(*gomcp.Tool, error) bool) { - yield(nil, errors.New("session not initialized")) + spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + }) + defer span.End() + + // Stamp the tool count on the span when iteration finishes — + // answers "what did this server actually return?" without + // having to walk into the JSON-RPC payload. Counts only the + // tools the iterator yielded successfully; partial counts are + // preserved when the caller breaks out early. + var count int + defer func() { + span.SetAttributes(attribute.Int("cagent.mcp.tools.count", count)) + }() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for tool, err := range s.Tools(spanCtx, request) { + if err != nil { + // Record each error inline rather than only the + // last one — paginated lists may yield multiple + // failures and the trace should reflect them all. + span.RecordError(err, "") + } else if tool != nil { + count++ + } + if !yield(tool, err) { + return + } + } } } func (c *sessionClient) CallTool(ctx context.Context, request *gomcp.CallToolParams) (*gomcp.CallToolResult, error) { - if s := c.getSession(); s != nil { - return s.CallTool(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + } + if request != nil { + opts.ToolName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.CallTool(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } func (c *sessionClient) ListPrompts(ctx context.Context, request *gomcp.ListPromptsParams) iter.Seq2[*gomcp.Prompt, error] { - if s := c.getSession(); s != nil { - return s.Prompts(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Prompt, error) bool) { + yield(nil, errors.New("session not initialized")) + } } return func(yield func(*gomcp.Prompt, error) bool) { - yield(nil, errors.New("session not initialized")) + // Span and RPC start at iteration time so an unused + // iterator never leaks either. 
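// The otelmcp helpers driving these spans (StartClient, CallOptions,
// EnsureMeta, InjectMeta) live in pkg/telemetry/mcp and are not shown in
// this diff. A minimal sketch of the assumed shapes, treating mcp.Meta as a
// plain string-keyed map carrier; the real code may differ:

// CallOptions carries the per-call identifiers stamped on each MCP CLIENT
// span: method, session, target, plus the tool or prompt name when relevant.
type CallOptions struct {
	Method        string // e.g. MethodToolsCall
	SessionID     string
	ServerAddress string
	ToolName      string // tools/call only
	PromptName    string // prompts/get only
}

// EnsureMeta allocates the request _meta map when the caller passed nil so
// InjectMeta always has somewhere to write.
func EnsureMeta(m mcp.Meta) mcp.Meta {
	if m == nil {
		m = mcp.Meta{}
	}
	return m
}

// InjectMeta copies the active trace context (traceparent, tracestate,
// baggage) into the request _meta via the globally registered propagator so
// the MCP server can continue the trace.
func InjectMeta(ctx context.Context, m mcp.Meta) {
	carrier := propagation.MapCarrier{}
	otel.GetTextMapPropagator().Inject(ctx, carrier)
	for k, v := range carrier {
		m[k] = v
	}
}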
+ spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + }) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for prompt, err := range s.Prompts(spanCtx, request) { + if err != nil { + span.RecordError(err, "") + } + if !yield(prompt, err) { + return + } + } } } func (c *sessionClient) GetPrompt(ctx context.Context, request *gomcp.GetPromptParams) (*gomcp.GetPromptResult, error) { - if s := c.getSession(); s != nil { - return s.GetPrompt(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsGet, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + } + if request != nil { + opts.PromptName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.GetPrompt(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } // handleElicitationRequest forwards incoming elicitation requests from the MCP diff --git a/pkg/tools/mcp/stdio.go b/pkg/tools/mcp/stdio.go index 01e3fab25..454fb3139 100644 --- a/pkg/tools/mcp/stdio.go +++ b/pkg/tools/mcp/stdio.go @@ -22,10 +22,15 @@ type stdioMCPClient struct { func newStdioCmdClient(command string, args, env []string, cwd string) *stdioMCPClient { return &stdioMCPClient{ - command: command, - args: args, - env: env, - cwd: cwd, + // stdio has no real "server address" in the OTel HTTP sense; using + // the command as a stand-in keeps spans triageable when the agent + // has multiple stdio MCPs wired up. Span readers see the + // executable name (e.g. `foo-mcp-server`) on `server.address`. + sessionClient: sessionClient{serverAddress: command}, + command: command, + args: args, + env: env, + cwd: cwd, } } diff --git a/pkg/tools/startable.go b/pkg/tools/startable.go index f550a4553..67258ba6e 100644 --- a/pkg/tools/startable.go +++ b/pkg/tools/startable.go @@ -4,6 +4,11 @@ import ( "context" "fmt" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) // Describer can be implemented by a ToolSet to provide a short, user-visible @@ -65,7 +70,7 @@ func (s *StartableToolSet) IsStarted() bool { // Concurrent callers block until the start attempt completes. // If start fails, a future call will retry. // If the underlying toolset doesn't implement Startable, this is a no-op. -func (s *StartableToolSet) Start(ctx context.Context) error { +func (s *StartableToolSet) Start(ctx context.Context) (err error) { s.mu.Lock() defer s.mu.Unlock() @@ -74,6 +79,32 @@ func (s *StartableToolSet) Start(ctx context.Context) error { } if startable, ok := As[Startable](s.ToolSet); ok { + // Span the toolset startup — MCP handshake, OAuth probes, + // tool discovery, etc. can take seconds to minutes and the + // "tools loading…" UI was previously unattributable. Only + // fires when the toolset has work to do; cheap toolsets + // without a Startable implementation skip the span entirely. + // Unwrap once so the kind attribute names the underlying toolset + // (e.g. 
*mcp.Toolset, *builtin.ShellTool) instead of the + // *tools.namedToolSet wrapper that every toolset gets in the + // registry — same pattern DescribeToolSet uses. + inner := s.ToolSet + if u, ok := inner.(Unwrapper); ok { + inner = u.Unwrap() + } + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools").Start( + ctx, + "toolset.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("cagent.toolset.kind", fmt.Sprintf("%T", inner))), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() if err := startable.Start(ctx); err != nil { // Queue a warning ONLY on the first failure of a streak so // repeated retries don't re-queue duplicate warnings.
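// The Unwrapper interface used in the startup span above is defined
// elsewhere in pkg/tools and is not part of this diff; from its use here,
// its assumed shape is simply:

type Unwrapper interface {
	// Unwrap returns the toolset wrapped by a decorating ToolSet (such as
	// the registry's naming wrapper), so callers can report the concrete
	// underlying type.
	Unwrap() ToolSet
}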