diff --git a/cmd/root/new.go b/cmd/root/new.go index a52bed77d..a34c4f1f6 100644 --- a/cmd/root/new.go +++ b/cmd/root/new.go @@ -7,6 +7,7 @@ import ( tea "charm.land/bubbletea/v2" "github.com/spf13/cobra" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/app" "github.com/docker/docker-agent/pkg/config" @@ -63,7 +64,9 @@ func (f *newFlags) runNewCommand(cmd *cobra.Command, args []string) (commandErr } defer stopToolSets(t) - rt, err := runtime.New(t) + rt, err := runtime.New(t, + runtime.WithTracer(otel.Tracer(AppName)), + ) if err != nil { return err } diff --git a/cmd/root/otel.go b/cmd/root/otel.go index 9fc1f044d..32e8afd93 100644 --- a/cmd/root/otel.go +++ b/cmd/root/otel.go @@ -5,15 +5,26 @@ import ( "fmt" "net" "os" + "runtime" "strings" "time" + "github.com/google/uuid" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/log/global" "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/log" + "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + + "github.com/docker/docker-agent/pkg/httpclient" + "github.com/docker/docker-agent/pkg/version" ) const AppName = "cagent" @@ -25,73 +36,188 @@ func initOTelSDK(ctx context.Context) (err error) { return fmt.Errorf("failed to create resource: %w", err) } - var traceExporter trace.SpanExporter endpoint := os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") - // Only initialize if endpoint is configured - if endpoint != "" { - var opts []otlptracehttp.Option - // An endpoint with an http:// or https:// scheme goes through - // WithEndpointURL so the SDK picks the transport from the scheme - // (per the OTLP/HTTP spec). Bare host:port still flows through - // WithEndpoint with the loopback-insecure shortcut preserved. - if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpointURL(endpoint)} - } else { - opts = []otlptracehttp.Option{otlptracehttp.WithEndpoint(endpoint)} - if isLocalhostEndpoint(endpoint) { - opts = append(opts, otlptracehttp.WithInsecure()) - } - } - traceExporter, err = otlptracehttp.New(ctx, opts...) - if err != nil { - return fmt.Errorf("failed to create trace exporter: %w", err) - } + tp, err := newTracerProvider(ctx, res, endpoint) + if err != nil { + return fmt.Errorf("failed to create tracer provider: %w", err) } + otel.SetTracerProvider(tp) - // Configure tracer provider - tracerProviderOpts := []trace.TracerProviderOption{ - trace.WithResource(res), + mp, err := newMeterProvider(ctx, res, endpoint) + if err != nil { + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create meter provider: %w", err) } + otel.SetMeterProvider(mp) - if traceExporter != nil { - tracerProviderOpts = append(tracerProviderOpts, - trace.WithBatcher(traceExporter, - trace.WithBatchTimeout(5*time.Second), - trace.WithMaxExportBatchSize(512), - ), - ) + lp, err := newLoggerProvider(ctx, res, endpoint) + if err != nil { + _ = mp.Shutdown(context.Background()) + _ = shutdownTracerProvider(tp) + return fmt.Errorf("failed to create logger provider: %w", err) } + global.SetLoggerProvider(lp) - tp := trace.NewTracerProvider(tracerProviderOpts...) 
- otel.SetTracerProvider(tp) - - // Propagator must be set so otelhttp injects W3C traceparent on - // outbound requests and extracts it from incoming ones. Without this - // the SDK records spans locally but they never chain across services. + // Set the global text-map propagator unconditionally so otelhttp + // (and any other propagation-aware instrumentation) injects W3C + // `traceparent` / `tracestate` / `baggage` on outbound requests + // and extracts them on incoming ones. The propagator is a global + // no-op until set; without this the SDK records spans locally + // but they never chain across processes — `gen_ai.conversation.id` + // baggage and the MCP `_meta` / sandbox env-var injectors are + // dormant until this runs. otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) + // Single source of truth for "is OTel enabled?" — flip the + // httpclient gate now so outbound requests start emitting CLIENT + // spans and injecting traceparent. Previously the gate read + // OTEL_EXPORTER_OTLP_ENDPOINT directly, which diverged from the + // `--otel` CLI gate that controls this function: we'd either + // initialise providers without HTTP wrapping, or wrap HTTP without + // having a usable propagator. + httpclient.SetOTelEnabled(true) + go func() { <-ctx.Done() - shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - _ = tp.Shutdown(shutdownCtx) + // Flush in dependency order: logs and metrics first (they may + // reference active spans), then traces. Each provider gets its + // own 5s budget so a slow exporter can't starve the others — + // sharing a single timeout meant a stuck logs endpoint silently + // dropped buffered metrics and spans. + shutdown := func(fn func(context.Context) error) { + c, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = fn(c) + } + shutdown(lp.Shutdown) + shutdown(mp.Shutdown) + shutdown(tp.Shutdown) }() return nil } +// newTracerProvider builds the SDK tracer provider with an OTLP/HTTP +// exporter when an endpoint is set. +func newTracerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*trace.TracerProvider, error) { + opts := []trace.TracerProviderOption{trace.WithResource(res)} + + if endpoint == "" { + return trace.NewTracerProvider(opts...), nil + } + + exp, err := otlptracehttp.New(ctx, traceExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create trace exporter: %w", err) + } + opts = append(opts, trace.WithBatcher(exp, + trace.WithBatchTimeout(5*time.Second), + trace.WithMaxExportBatchSize(512), + )) + return trace.NewTracerProvider(opts...), nil +} + +// newMeterProvider builds the SDK meter provider. Without an endpoint the +// provider still wires up so meters callers create are valid no-ops; with +// an endpoint, a periodic reader exports via OTLP/HTTP. +func newMeterProvider(ctx context.Context, res *resource.Resource, endpoint string) (*metric.MeterProvider, error) { + opts := []metric.Option{metric.WithResource(res)} + + if endpoint != "" { + exp, err := otlpmetrichttp.New(ctx, metricExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create metric exporter: %w", err) + } + opts = append(opts, metric.WithReader(metric.NewPeriodicReader(exp, + metric.WithInterval(60*time.Second), + ))) + } + + return metric.NewMeterProvider(opts...), nil +} + +// newLoggerProvider builds the SDK logger provider. 
Required for the +// gen_ai.client.operation.exception event (a log record per spec) and for +// any future log-bridge instrumentation. +func newLoggerProvider(ctx context.Context, res *resource.Resource, endpoint string) (*log.LoggerProvider, error) { + opts := []log.LoggerProviderOption{log.WithResource(res)} + + if endpoint != "" { + exp, err := otlploghttp.New(ctx, logExporterOptions(endpoint)...) + if err != nil { + return nil, fmt.Errorf("failed to create log exporter: %w", err) + } + opts = append(opts, log.WithProcessor(log.NewBatchProcessor(exp))) + } + + return log.NewLoggerProvider(opts...), nil +} + +// normalizeOTLPEndpoint turns a possibly-bare `host:port` into a fully +// scheme-qualified URL so all three OTLP/HTTP exporters can be wired via +// `WithEndpointURL` consistently. We can't rely on the SDKs' default +// scheme inference: `otlptracehttp` (older API) treats a bare endpoint +// as TLS-by-default while `otlploghttp` (newer API) treats the same +// bare endpoint as insecure-by-default. With `OTEL_EXPORTER_OTLP_CERTIFICATE` +// set in the env, the log exporter then errors out with +// `insecure HTTP endpoint cannot use TLS client configuration`, +// `initOTelSDK` propagates the failure, and the entire telemetry +// pipeline (including traces) is torn down. +// +// Pinning the scheme up front removes that asymmetry: localhost gets +// `http://`, every other host gets `https://`, and any explicit scheme +// the caller already supplied is honoured verbatim. +func normalizeOTLPEndpoint(endpoint string) string { + if strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") { + return endpoint + } + if isLocalhostEndpoint(endpoint) { + return "http://" + endpoint + } + return "https://" + endpoint +} + +func traceExporterOptions(endpoint string) []otlptracehttp.Option { + return []otlptracehttp.Option{otlptracehttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func metricExporterOptions(endpoint string) []otlpmetrichttp.Option { + return []otlpmetrichttp.Option{otlpmetrichttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func logExporterOptions(endpoint string) []otlploghttp.Option { + return []otlploghttp.Option{otlploghttp.WithEndpointURL(normalizeOTLPEndpoint(endpoint))} +} + +func shutdownTracerProvider(tp *trace.TracerProvider) error { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return tp.Shutdown(shutdownCtx) +} + func newOTelResource() (*resource.Resource, error) { + // Standard OTel resource attributes; users can layer additional + // labels via the spec-defined `OTEL_RESOURCE_ATTRIBUTES` env var, + // which `resource.Default` merges in. 
+ attrs := []attribute.KeyValue{ + semconv.ServiceName(AppName), + semconv.ServiceVersion(version.Version), + semconv.ServiceInstanceID(uuid.NewString()), + semconv.ProcessPID(os.Getpid()), + semconv.ProcessRuntimeName("go"), + semconv.OSTypeKey.String(runtime.GOOS), + semconv.HostArchKey.String(runtime.GOARCH), + } + if hostname, err := os.Hostname(); err == nil && hostname != "" { + attrs = append(attrs, semconv.HostName(hostname)) + } return resource.Merge( resource.Default(), - resource.NewWithAttributes( - semconv.SchemaURL, - semconv.ServiceName(AppName), - semconv.ServiceVersion("dev"), // TODO: use actual version - ), + resource.NewWithAttributes(semconv.SchemaURL, attrs...), ) } diff --git a/cmd/root/otel_test.go b/cmd/root/otel_test.go index 042973a9e..961383e45 100644 --- a/cmd/root/otel_test.go +++ b/cmd/root/otel_test.go @@ -16,6 +16,63 @@ func TestNewOTelResourceUsesCurrentSchemaURL(t *testing.T) { assert.Equal(t, semconv.SchemaURL, res.SchemaURL()) } +// TestProvidersWithoutEndpoint verifies all three providers build cleanly +// when no OTLP endpoint is configured — they're no-op exporters but must +// still produce valid, non-nil providers so callers can create instruments. +func TestProvidersWithoutEndpoint(t *testing.T) { + t.Parallel() + + ctx := t.Context() + res, err := newOTelResource() + require.NoError(t, err) + + tp, err := newTracerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, tp) + assert.NotNil(t, tp.Tracer("test")) + + mp, err := newMeterProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, mp) + assert.NotNil(t, mp.Meter("test")) + + lp, err := newLoggerProvider(ctx, res, "") + require.NoError(t, err) + require.NotNil(t, lp) + assert.NotNil(t, lp.Logger("test")) +} + +// TestNormalizeOTLPEndpoint pins the bare-endpoint -> URL mapping the +// three OTLP/HTTP exporters share. Without this normalization the log +// exporter (insecure-by-default for bare hosts) conflicted with +// OTEL_EXPORTER_OTLP_CERTIFICATE and tore down the whole telemetry +// pipeline; the trace exporter (TLS-by-default for bare hosts) hid +// the inconsistency. 
+func TestNormalizeOTLPEndpoint(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + endpoint string + want string + }{ + {"bare remote host:port -> https", "alloy.observability.svc.cluster.local:4318", "https://alloy.observability.svc.cluster.local:4318"}, + {"bare remote host -> https", "example.com", "https://example.com"}, + {"bare localhost host:port -> http", "localhost:4318", "http://localhost:4318"}, + {"bare localhost -> http", "localhost", "http://localhost"}, + {"bare ipv4 loopback -> http", "127.0.0.1:4318", "http://127.0.0.1:4318"}, + {"bare ipv6 loopback -> http", "[::1]:4318", "http://[::1]:4318"}, + {"explicit https preserved", "https://example.com:4318", "https://example.com:4318"}, + {"explicit http preserved", "http://localhost:4318", "http://localhost:4318"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, normalizeOTLPEndpoint(tt.endpoint)) + }) + } +} + func TestIsLocalhostEndpoint(t *testing.T) { t.Parallel() diff --git a/cmd/root/sandbox.go b/cmd/root/sandbox.go index 8a506138a..c163ed05e 100644 --- a/cmd/root/sandbox.go +++ b/cmd/root/sandbox.go @@ -18,6 +18,7 @@ import ( "github.com/docker/docker-agent/pkg/environment" "github.com/docker/docker-agent/pkg/paths" "github.com/docker/docker-agent/pkg/sandbox" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // runInSandbox delegates the current command to a Docker sandbox. @@ -68,15 +69,30 @@ func runInSandbox(ctx context.Context, cmd *cobra.Command, args []string, runCon envFlags = append(envFlags, "-e", envModelsGateway+"="+gateway) } + // Wrap the sandbox exec in a span so the host side captures timing + // and exit code, and inject W3C trace context via env vars so the + // agent process spawned inside the sandbox container chains its + // own spans onto this parent. + ctx, sbxSpan := genai.StartSandboxExec(ctx, genai.SandboxOptions{ + Runtime: "docker", + Container: name, + }) + defer sbxSpan.End() + envFlags = append(envFlags, genai.InjectSandboxEnv(ctx)...) 
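+	// Illustrative only; the exact variable name below is an assumption
+	// rather than something InjectSandboxEnv is known to emit. The returned
+	// slice is expected to follow the same "-e", "KEY=VALUE" pairing used
+	// above, e.g. []string{"-e", "TRACEPARENT=00-<trace-id>-<span-id>-01"},
+	// so BuildExecCmd forwards it unchanged as docker exec env flags.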
+ dockerCmd := backend.BuildExecCmd(ctx, name, wd, dockerAgentArgs, envFlags, envVars) slog.Debug("Executing in sandbox", "name", name, "args", dockerCmd.Args) if err := dockerCmd.Run(); err != nil { if exitErr, ok := errors.AsType[*exec.ExitError](err); ok { + sbxSpan.SetExitCode(exitErr.ExitCode()) + sbxSpan.RecordError(err, "") return cli.StatusError{StatusCode: exitErr.ExitCode()} } + sbxSpan.RecordError(err, "") return fmt.Errorf("docker sandbox exec failed: %w", err) } + sbxSpan.SetExitCode(0) return nil } diff --git a/go.mod b/go.mod index c050f2778..02eb88cdf 100644 --- a/go.mod +++ b/go.mod @@ -61,8 +61,11 @@ require ( github.com/yuin/goldmark v1.8.2 github.com/zclconf/go-cty v1.18.1 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/log v0.19.0 go.opentelemetry.io/otel/trace v1.43.0 golang.org/x/image v0.39.0 golang.org/x/oauth2 v0.36.0 @@ -234,9 +237,9 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 // indirect - go.opentelemetry.io/otel/log v0.16.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/log v0.19.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.yaml.in/yaml/v4 v4.0.0-rc.4 golang.org/x/crypto v0.50.0 // indirect diff --git a/go.sum b/go.sum index 7a79c0304..6164d4149 100644 --- a/go.sum +++ b/go.sum @@ -539,22 +539,28 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8V go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 h1:HIBTQ3VO5aupLKjC90JgMqpezVXwFuq6Ryjn0/izoag= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0/go.mod h1:ji9vId85hMxqfvICA0Jt8JqEdrXaAkcpkI9HPXya0ro= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 h1:NOyNnS19BF2SUDApbOKbDtWZ0IK7b8FJ2uAGdIWOGb0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0/go.mod h1:VL6EgVikRLcJa9ftukrHu/ZkkhFBSo1lzvdBC9CF1ss= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= -go.opentelemetry.io/otel/log v0.16.0 h1:DeuBPqCi6pQwtCK0pO4fvMB5eBq6sNxEnuTs88pjsN4= -go.opentelemetry.io/otel/log v0.16.0/go.mod h1:rWsmqNVTLIA8UnwYVOItjyEZDbKIkMxdQunsIhpUMes= +go.opentelemetry.io/otel/log v0.19.0 h1:KUZs/GOsw79TBBMfDWsXS+KZ4g2Ckzksd1ymzsIEbo4= +go.opentelemetry.io/otel/log v0.19.0/go.mod h1:5DQYeGmxVIr4n0/BcJvF4upsraHjg6vudJJpnkL6Ipk= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= -go.opentelemetry.io/otel/sdk/log v0.16.0 h1:e/b4bdlQwC5fnGtG3dlXUrNOnP7c8YLVSpSfEBIkTnI= -go.opentelemetry.io/otel/sdk/log v0.16.0/go.mod h1:JKfP3T6ycy7QEuv3Hj8oKDy7KItrEkus8XJE6EoSzw4= +go.opentelemetry.io/otel/sdk/log v0.19.0 h1:scYVLqT22D2gqXItnWiocLUKGH9yvkkeql5dBDiXyko= +go.opentelemetry.io/otel/sdk/log v0.19.0/go.mod h1:vFBowwXGLlW9AvpuF7bMgnNI95LiW10szrOdvzBHlAg= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0 h1:BEbF7ZBB6qQloV/Ub1+3NQoOUnVtcGkU3XX4Ws3GQfk= +go.opentelemetry.io/otel/sdk/log/logtest v0.19.0/go.mod h1:Lua81/3yM0wOmoHTokLj9y9ADeA02v1naRrVrkAZuKk= go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= diff --git a/pkg/a2a/adapter.go b/pkg/a2a/adapter.go index 333083dc6..3be77917e 100644 --- a/pkg/a2a/adapter.go +++ b/pkg/a2a/adapter.go @@ -8,6 +8,8 @@ import ( "strings" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/adk/agent" "google.golang.org/adk/model" adksession "google.golang.org/adk/session" @@ -17,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" + cgenai "github.com/docker/docker-agent/pkg/telemetry/genai" ) // newDockerAgentAdapter creates a new ADK agent adapter from a docker agent team and agent name. @@ -43,6 +46,21 @@ func newDockerAgentAdapter(t *team.Team, agentName string) (agent.Agent, error) // runDockerAgent executes a docker agent and returns ADK session events func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, a *dagent.Agent) iter.Seq2[*adksession.Event, error] { return func(yield func(*adksession.Event, error) bool) { + // Decorate the inbound `a2a.message` SERVER span (created by + // otelhttp.NewHandler in server.go) with the GenAI semconv + // invoke_agent shape so dashboards can recognise A2A traffic as + // agent invocations rather than generic JSON-RPC POSTs. The + // runtime.session span we open below is the child that records + // the actual work; this annotation makes the parent searchable + // via gen_ai.operation.name="invoke_agent". 
+ if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String(cgenai.AttrOperationName, cgenai.OperationInvokeAgent), + attribute.String(cgenai.AttrAgentName, agentName), + attribute.String(cgenai.AttrAgentNameRuntime, agentName), + ) + } + // Extract user message from the ADK context userContent := ctx.UserContent() message := contentToMessage(userContent) @@ -60,6 +78,13 @@ func runDockerAgent(ctx agent.InvocationContext, t *team.Team, agentName string, // Create runtime rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), + // Match the tracer scope used by `cmd/root/run.go` so + // MCP / A2A / API spans share the same instrumentation + // scope as the CLI's runtime spans. Without this option + // `LocalRuntime.startSpan` sees a nil tracer and silently + // returns no-op spans for runtime.session, runtime.stream, + // runtime.tool.call, runtime.fallback, runtime.run_skill, + // hook events, and so on. runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { diff --git a/pkg/a2a/server.go b/pkg/a2a/server.go index c9fa93081..8914b8f9b 100644 --- a/pkg/a2a/server.go +++ b/pkg/a2a/server.go @@ -14,6 +14,7 @@ import ( "github.com/a2aproject/a2a-go/a2asrv" "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "google.golang.org/adk/runner" "google.golang.org/adk/server/adka2a" "google.golang.org/adk/session" @@ -104,8 +105,26 @@ func Run(ctx context.Context, agentFilename, agentName string, runConfig *config })) e.Use(middleware.RequestLogger()) - e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(a2asrv.NewStaticAgentCardHandler(agentCard))) - e.POST(agentPath, echo.WrapHandler(a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)))) + // Wrap both A2A endpoints with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming requests. The agent runtime started inside + // `runDockerAgent` then chains its spans onto the calling agent's + // trace, and the `gen_ai.conversation.id` baggage seeded by the + // caller flows through into our local runtime spans without + // per-call plumbing. The agent-card endpoint is included so + // discovery requests carry the same trace context as the + // downstream invocation — propagation is uniform across all + // public surfaces of the server. + cardHandler := otelhttp.NewHandler( + a2asrv.NewStaticAgentCardHandler(agentCard), + "a2a.agent_card", + ) + jsonrpcHandler := otelhttp.NewHandler( + a2asrv.NewJSONRPCHandler(a2asrv.NewHandler(executor)), + "a2a.message", + ) + e.GET(a2asrv.WellKnownAgentCardPath, echo.WrapHandler(cardHandler)) + e.POST(agentPath, echo.WrapHandler(jsonrpcHandler)) if err := e.Server.Serve(ln); err != nil && ctx.Err() == nil { slog.Error("Failed to start server", "error", err) diff --git a/pkg/acp/agent.go b/pkg/acp/agent.go index 06b8a4879..5de115fca 100644 --- a/pkg/acp/agent.go +++ b/pkg/acp/agent.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/coder/acp-go-sdk" + "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -144,6 +145,9 @@ func (a *Agent) NewSession(ctx context.Context, params acp.NewSessionRequest) (a rt, err := runtime.New(a.team, runtime.WithCurrentAgent(defaultAgent.Name()), runtime.WithSessionStore(a.sessionStore), + // Match the CLI tracer scope; without this the ACP-mode + // runtime's `startSpan` is a no-op for every runtime.* span. 
+ runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { return acp.NewSessionResponse{}, fmt.Errorf("failed to create runtime: %w", err) diff --git a/pkg/chatserver/runtime_pool.go b/pkg/chatserver/runtime_pool.go index d79f03448..397d13513 100644 --- a/pkg/chatserver/runtime_pool.go +++ b/pkg/chatserver/runtime_pool.go @@ -4,6 +4,8 @@ import ( "errors" "sync" + "go.opentelemetry.io/otel" + "github.com/docker/docker-agent/pkg/runtime" "github.com/docker/docker-agent/pkg/team" ) @@ -56,7 +58,13 @@ func (p *runtimePool) Get(agent string) (runtime.Runtime, error) { if rt := p.takeIdle(agent); rt != nil { return rt, nil } - rt, err := runtime.New(p.team, runtime.WithCurrentAgent(agent)) + // Match the tracer scope used by the CLI; without this the + // pooled chatserver runtimes are tracer-less so all `runtime.*` + // spans go silent in OpenAI-compatible chat-completions mode. + rt, err := runtime.New(p.team, + runtime.WithCurrentAgent(agent), + runtime.WithTracer(otel.Tracer("cagent")), + ) if err != nil { return nil, err } diff --git a/pkg/chatserver/server.go b/pkg/chatserver/server.go index 60ab69d51..aa2a1bbdf 100644 --- a/pkg/chatserver/server.go +++ b/pkg/chatserver/server.go @@ -36,6 +36,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/openai/openai-go/v3" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/runtime" @@ -125,14 +126,23 @@ func Run(ctx context.Context, agentFilename string, opts Options, ln net.Listene return err } - httpServer := &http.Server{ - Handler: newRouter(&server{ + // Wrap with otelhttp so incoming /v1/chat/completions requests + // (including SSE streams) extract the caller's trace context. + // otelhttp ends the span when the response body is closed, so + // SSE streaming responses get a span that covers the full + // stream duration. + handler := otelhttp.NewHandler( + newRouter(&server{ team: t, policy: policy, conversations: newConversationStore(opts.ConversationsMaxSessions, conversationTTL(opts)), conversationLocks: newConversationLockSet(), runtimes: newRuntimePool(t, opts.MaxIdleRuntimes), }, opts), + "chatserver", + ) + httpServer := &http.Server{ + Handler: handler, ReadHeaderTimeout: 30 * time.Second, } return serve(ctx, httpServer, ln) diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go index 38ae652fd..391536aee 100644 --- a/pkg/evaluation/judge.go +++ b/pkg/evaluation/judge.go @@ -13,6 +13,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/model/provider" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // relevancePrompt is the prompt template for the judge model to evaluate responses. @@ -155,10 +156,34 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria [] for i, r := range rawResults { if r.err != nil { errs = append(errs, fmt.Errorf("checking %q: %w", criteria[i], r.err)) + // Emit gen_ai.evaluation.result with error.type so the + // failed checks show up alongside the successful ones in + // log-based dashboards. Set ScoreLabel="error" so + // dashboards that GROUP BY label still surface these + // rows (otherwise the missing label silently drops them). 
+ genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: "error", + ErrorType: genai.ClassifyError(r.err), + }) continue } results[i].Passed = r.passed results[i].Reason = r.reason + + score := 0.0 + label := "failed" + if r.passed { + score = 1.0 + label = "passed" + } + genai.EmitEvaluationResult(ctx, genai.EvaluationResult{ + Name: "relevance", + ScoreLabel: label, + ScoreValue: score, + HasScoreValue: true, + Explanation: r.reason, + }) } if len(errs) > 0 { diff --git a/pkg/hooks/executor.go b/pkg/hooks/executor.go index e6b0e0b8f..fb85905e1 100644 --- a/pkg/hooks/executor.go +++ b/pkg/hooks/executor.go @@ -10,6 +10,13 @@ import ( "regexp" "strings" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Executor dispatches configured hooks. Hook types are resolved against @@ -134,6 +141,27 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) return &Result{Allowed: true}, nil } + // Single span per Dispatch call covers every hook the event matched. + // Custom name `hook.{event}` because there is no GenAI semconv for + // arbitrary user-defined lifecycle hooks; we surface the event type, + // matched hook count, and session/agent identifiers so dashboards can + // split by event class without parsing span events. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/hooks").Start( + ctx, + "hook."+string(event), + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String("cagent.hook.event", string(event)), + attribute.Int("cagent.hook.count", len(hooks)), + attribute.String("cagent.agent.name", input.AgentName), + attribute.String("gen_ai.conversation.id", input.SessionID), + ), + ) + if input.ToolName != "" { + span.SetAttributes(attribute.String("gen_ai.tool.name", input.ToolName)) + } + defer span.End() + input.HookEventName = event if input.Cwd == "" { input.Cwd = e.workingDir @@ -143,6 +171,8 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) inputJSON, err := input.ToJSON() if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) return nil, fmt.Errorf("failed to serialize hook input: %w", err) } @@ -153,7 +183,57 @@ func (e *Executor) Dispatch(ctx context.Context, event EventType, input *Input) } wg.Wait() - return aggregate(results, event), nil + final := aggregate(results, event) + annotateHookSpan(span, event, final) + return final, nil +} + +// annotateHookSpan stamps the aggregated verdict onto the hook.{event} +// span so dashboards can answer "did the hook block this?" and "why?" +// without re-running the hook. Prior to this the span only carried the +// event type and hook count — a denied call looked identical to an +// allowed one. The verdict booleans and short reason are unconditional +// (they're decisions, not content); free-text fields that may contain +// PII or LLM output (Message, AdditionalContext, SystemMessage, +// Summary) are gated on the GenAI content-capture opt-in. 
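+// For illustration (values are hypothetical): a denied permission-request
+// hook ends up with cagent.hook.allowed=false, cagent.hook.exit_code set to
+// the hook's exit status, cagent.hook.decision="deny",
+// cagent.hook.decision_reason carrying the short reason, and
+// cagent.hook.permission_allowed=false, with cagent.hook.message attached
+// only when content capture is enabled.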
+func annotateHookSpan(span trace.Span, event EventType, r *Result) { + if span == nil || r == nil { + return + } + attrs := []attribute.KeyValue{ + attribute.Bool("cagent.hook.allowed", r.Allowed), + attribute.Int("cagent.hook.exit_code", r.ExitCode), + } + if r.Decision != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision", string(r.Decision))) + } + if r.DecisionReason != "" { + attrs = append(attrs, attribute.String("cagent.hook.decision_reason", r.DecisionReason)) + } + if event == EventPermissionRequest { + attrs = append(attrs, attribute.Bool("cagent.hook.permission_allowed", r.PermissionAllowed)) + } + if r.ModifiedInput != nil { + attrs = append(attrs, attribute.Bool("cagent.hook.modified_input", true)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.Bool("cagent.hook.summary_provided", true)) + } + if genai.IsContentCaptureEnabled() { + if r.Message != "" { + attrs = append(attrs, attribute.String("cagent.hook.message", r.Message)) + } + if r.AdditionalContext != "" { + attrs = append(attrs, attribute.String("cagent.hook.additional_context", r.AdditionalContext)) + } + if r.SystemMessage != "" { + attrs = append(attrs, attribute.String("cagent.hook.system_message", r.SystemMessage)) + } + if r.Summary != "" { + attrs = append(attrs, attribute.String("cagent.hook.summary", r.Summary)) + } + } + span.SetAttributes(attrs...) } // hooksFor returns the deduplicated list of hooks that should run for diff --git a/pkg/hooks/handler.go b/pkg/hooks/handler.go index 2d5a2974a..ea8276157 100644 --- a/pkg/hooks/handler.go +++ b/pkg/hooks/handler.go @@ -14,6 +14,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/shellpath" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // Handler executes a single hook invocation. It is built by a @@ -188,7 +189,19 @@ type commandHandler struct { func (h *commandHandler) Run(ctx context.Context, input []byte) (HandlerResult, error) { cmd := exec.CommandContext(ctx, h.shell, append(h.shellArgs, h.command)...) cmd.Dir = h.workingDir - cmd.Env = h.env + // Expand nil to os.Environ() so the child inherits the parent env + // (matching the pre-OTel cmd.Env=h.env=nil behaviour), and copy + // into a fresh backing array so concurrent hooks don't race on a + // shared slice when adding the trace-context vars. + base := h.env + if base == nil { + base = os.Environ() + } + traceEnv := genai.InjectTraceContextEnv(ctx) + envCopy := make([]string, 0, len(base)+len(traceEnv)) + envCopy = append(envCopy, base...) + envCopy = append(envCopy, traceEnv...) + cmd.Env = envCopy cmd.Stdin = bytes.NewReader(input) var stdout, stderr bytes.Buffer diff --git a/pkg/httpclient/client.go b/pkg/httpclient/client.go index bb256c7b7..b4d9e7bd5 100644 --- a/pkg/httpclient/client.go +++ b/pkg/httpclient/client.go @@ -6,8 +6,8 @@ import ( "maps" "net/http" "net/url" - "os" "runtime" + "sync/atomic" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -41,13 +41,66 @@ func NewHTTPClient(ctx context.Context, opts ...Opt) *http.Client { rt := newTransport(ctx) return &http.Client{ - Transport: &userAgentTransport{ + Transport: WrapWithOTel(&userAgentTransport{ httpOptions: httpOptions, rt: rt, - }, + }), } } +// otelEnabled tracks whether the OTel SDK has been initialised in this +// process. `cmd/root/otel.go:initOTelSDK` calls `SetOTelEnabled(true)` +// on success; nothing else flips this flag. 
Gating on a single source +// of truth (rather than re-reading `OTEL_EXPORTER_OTLP_ENDPOINT`) +// avoids the previous mismatch where the SDK could be initialised +// without the HTTP wrap, or the HTTP wrap could fire without the SDK +// initialising the propagator. +var otelEnabled atomic.Bool + +// SetOTelEnabled toggles the gate consulted by WrapWithOTel. Called by +// `initOTelSDK` after providers and the propagator are wired so HTTP +// clients start injecting `traceparent` only once the rest of the SDK +// can actually use the resulting spans. +func SetOTelEnabled(enabled bool) { + otelEnabled.Store(enabled) +} + +// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry has +// been enabled via `SetOTelEnabled` (called by `initOTelSDK`), or rt +// unchanged otherwise. Gating avoids per-request span allocation on +// the no-OTel path and stops sending a `traceparent` header to +// upstream LLM providers that have no use for it. Exposed so callers +// that build their own transports outside of `NewHTTPClient` can opt +// into the same gating without duplicating the check. +func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { + if !otelEnabled.Load() { + return rt + } + return otelhttp.NewTransport(rt) +} + +// TracedDefaultClient returns an `http.Client` equivalent to +// `http.DefaultClient` but with the default transport wrapped via +// `WrapWithOTel`. Use as a drop-in replacement at call sites that +// previously did `http.DefaultClient.Do(req)` so OAuth metadata fetches, +// fetch-tool requests, registry probes, and similar one-off HTTP calls +// chain into the active trace. +func TracedDefaultClient() *http.Client { + return &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} +} + +// TracedClient returns a configurable `http.Client` with the default +// transport already wrapped via `WrapWithOTel`. The supplied options +// (timeout, redirect policy, jar, etc.) are applied after construction. +// Convenience wrapper for short-lived clients with custom timeouts. +func TracedClient(opts ...func(*http.Client)) *http.Client { + c := &http.Client{Transport: WrapWithOTel(http.DefaultTransport)} + for _, opt := range opts { + opt(c) + } + return c +} + func WithHeader(key, value string) Opt { return func(o *HTTPOptions) { o.Header.Set(key, value) @@ -109,15 +162,7 @@ func WithQuery(query url.Values) Opt { } } -// newTransport returns an HTTP transport with automatic gzip compression -// disabled and using Docker Desktop proxy if available. -// -// When OpenTelemetry is enabled (i.e. OTEL_EXPORTER_OTLP_ENDPOINT is set, -// matching the gating in initOTelSDK), the transport is wrapped with -// otelhttp so each outbound request emits a CLIENT span and the W3C -// traceparent header is injected. When OTel is disabled, the bare -// transport is returned so we don't allocate per-request spans nor send -// a traceparent header to upstream LLM providers. +// newTransport returns an HTTP transport with automatic gzip compression disabled and using Docker Desktop proxy if available. func newTransport(ctx context.Context) http.RoundTripper { // Get the base transport with Desktop proxy support from remote package rt := remote.NewTransport(ctx) @@ -131,19 +176,7 @@ func newTransport(ctx context.Context) http.RoundTripper { t.DisableCompression() } - return WrapWithOTel(rt) -} - -// WrapWithOTel returns rt wrapped with otelhttp when OpenTelemetry is -// enabled (OTEL_EXPORTER_OTLP_ENDPOINT set, matching the gating in -// cmd/root/otel.go), or rt unchanged otherwise. 
Exposed so callers that -// build their own transports outside of NewHTTPClient can opt into the -// same env-gated instrumentation without duplicating the gating logic. -func WrapWithOTel(rt http.RoundTripper) http.RoundTripper { - if os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") == "" { - return rt - } - return otelhttp.NewTransport(rt) + return rt } type userAgentTransport struct { diff --git a/pkg/mcp/server.go b/pkg/mcp/server.go index 9a5a0a22f..583aa01ba 100644 --- a/pkg/mcp/server.go +++ b/pkg/mcp/server.go @@ -11,6 +11,7 @@ import ( "slices" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel" "github.com/docker/docker-agent/pkg/agent" @@ -19,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/team" "github.com/docker/docker-agent/pkg/teamloader" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/version" ) @@ -61,10 +63,17 @@ func StartHTTPServer(ctx context.Context, agentFilename, agentName string, runCo fmt.Printf("MCP HTTP server listening on http://%s\n", ln.Addr()) + // Wrap with otelhttp so the MCP-over-HTTP transport extracts + // `traceparent` / `baggage` from incoming requests just like the + // stdio transport extracts them from `params._meta`. Without this + // HTTP-mode MCP clients lose trace context at the boundary. httpServer := &http.Server{ - Handler: mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { - return server - }, nil), + Handler: otelhttp.NewHandler( + mcp.NewStreamableHTTPHandler(func(_ *http.Request) *mcp.Server { + return server + }, nil), + "mcp.http", + ), } errCh := make(chan error, 1) @@ -158,7 +167,25 @@ func createMCPServer(ctx context.Context, agentFilename, agentName string, runCo } func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mcp.CallToolRequest, ToolInput) (*mcp.CallToolResult, ToolOutput, error) { - return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (*mcp.CallToolResult, ToolOutput, error) { + return func(ctx context.Context, req *mcp.CallToolRequest, input ToolInput) (result *mcp.CallToolResult, output ToolOutput, err error) { + // Extract W3C trace context from `params._meta` (per the OTel + // MCP semconv) so the SERVER span chains onto the calling + // CLIENT span. Then start a `tools/call {agent}` SERVER span + // covering the full handler execution. + if req != nil && req.Params != nil { + ctx = otelmcp.ExtractMeta(ctx, req.Params.Meta) + } + ctx, span := otelmcp.StartServer(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + ToolName: agentName, + }) + defer func() { + if err != nil { + span.RecordError(err, "") + } + span.End() + }() + slog.Debug("MCP tool called", "agent", agentName, "message", input.Message) ag, err := t.Agent(agentName) @@ -179,6 +206,9 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc rt, err := runtime.New(t, runtime.WithCurrentAgent(agentName), runtime.WithNonInteractive(true), + // See pkg/a2a/adapter.go for rationale — without this + // the runtime's startSpan is a no-op when cagent runs as + // an MCP server, so all our runtime.* spans go silent. 
runtime.WithTracer(otel.Tracer("cagent")), ) if err != nil { @@ -191,11 +221,11 @@ func CreateToolHandler(t *team.Team, agentName string) func(context.Context, *mc return nil, ToolOutput{}, fmt.Errorf("agent execution failed: %w", err) } - result := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") + response := cmp.Or(sess.GetLastAssistantMessageContent(), "No response from agent") - slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(result)) + slog.Debug("Agent execution completed", "agent", agentName, "response_length", len(response)) - return nil, ToolOutput{Response: result}, nil + return nil, ToolOutput{Response: response}, nil } } diff --git a/pkg/memory/database/sqlite/sqlite.go b/pkg/memory/database/sqlite/sqlite.go index e1e349893..cc2409729 100644 --- a/pkg/memory/database/sqlite/sqlite.go +++ b/pkg/memory/database/sqlite/sqlite.go @@ -6,10 +6,40 @@ import ( "fmt" "strings" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/memory/database" "github.com/docker/docker-agent/pkg/sqliteutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) +// memoryDataSourceID is the `gen_ai.data_source.id` value used on +// retrieval-shaped memory operations (SearchMemories) so observability-svc +// can group "agent recalled this memory" timeline entries the same way it +// groups RAG retrievals. +const memoryDataSourceID = "memory" + +// startMemorySpan opens a small INTERNAL span for a memory CRUD operation. +// op is recorded as `cagent.memory.op` and the span name is +// `memory.{op}`. Conversation id flows in via baggage so the span lands +// on the right session timeline. 
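+// The CRUD methods below use it as memory.add, memory.list, memory.delete
+// and memory.update; SearchMemories instead goes through the GenAI
+// retrieval span so recalls group with RAG retrievals.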
+func startMemorySpan(ctx context.Context, op string) (context.Context, trace.Span) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/memory/database/sqlite") + attrs := []attribute.KeyValue{ + attribute.String("cagent.memory.op", op), + } + if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "memory."+op, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) +} + type MemoryDatabase struct { db *sql.DB } @@ -40,15 +70,25 @@ func NewMemoryDatabase(path string) (database.Database, error) { } func (m *MemoryDatabase) AddMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "add") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } _, err := m.db.ExecContext(ctx, "INSERT INTO memories (id, created_at, memory, category) VALUES (?, ?, ?, ?)", memory.ID, memory.CreatedAt, memory.Memory, memory.Category) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory, error) { + ctx, span := startMemorySpan(ctx, "list") + defer span.End() + rows, err := m.db.QueryContext(ctx, "SELECT id, created_at, memory, COALESCE(category, '') FROM memories") if err != nil { return nil, err @@ -73,11 +113,37 @@ func (m *MemoryDatabase) GetMemories(ctx context.Context) ([]database.UserMemory } func (m *MemoryDatabase) DeleteMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "delete") + defer span.End() + _, err := m.db.ExecContext(ctx, "DELETE FROM memories WHERE id = ?", memory.ID) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } return err } -func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) ([]database.UserMemory, error) { +func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category string) (results []database.UserMemory, err error) { + // SearchMemories is the retrieval shape per the OTel GenAI semconv: + // the agent is recalling stored memories filtered by query/category. + // Use the spec'd `retrieval {data_source.id}` span so this lands on + // the same dashboard row as RAG retrievals. + ctx, retSpan := genai.StartRetrieval(ctx, "sqlite", memoryDataSourceID, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + if category != "" { + retSpan.SetAttributes(attribute.String("cagent.memory.category", category)) + } + + // Assign to the named returns (not local shadows) so the deferred + // span closure observes the live error and result count regardless + // of which return path fires. var conditions []string var args []any @@ -102,30 +168,35 @@ func (m *MemoryDatabase) SearchMemories(ctx context.Context, query, category str stmt += " WHERE " + strings.Join(conditions, " AND ") } - rows, err := m.db.QueryContext(ctx, stmt, args...) + var rows *sql.Rows + rows, err = m.db.QueryContext(ctx, stmt, args...) 
if err != nil { return nil, err } defer rows.Close() - var memories []database.UserMemory for rows.Next() { var memory database.UserMemory - err := rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category) - if err != nil { + // gocritic suggests `:=` here, but we want to assign to the + // named return `err` so the deferred span closure observes + // the failure. nolint pragma documents the intent. + if err = rows.Scan(&memory.ID, &memory.CreatedAt, &memory.Memory, &memory.Category); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - memories = append(memories, memory) + results = append(results, memory) } - if err := rows.Err(); err != nil { + if err = rows.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return nil, err } - return memories, nil + return results, nil } func (m *MemoryDatabase) UpdateMemory(ctx context.Context, memory database.UserMemory) error { + ctx, span := startMemorySpan(ctx, "update") + defer span.End() + if memory.ID == "" { return database.ErrEmptyID } diff --git a/pkg/model/provider/anthropic/client.go b/pkg/model/provider/anthropic/client.go index 115274458..db82bdbe8 100644 --- a/pkg/model/provider/anthropic/client.go +++ b/pkg/model/provider/anthropic/client.go @@ -14,6 +14,10 @@ import ( "github.com/anthropics/anthropic-sdk-go/option" "github.com/anthropics/anthropic-sdk-go/packages/param" "github.com/anthropics/anthropic-sdk-go/packages/ssestream" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/config/latest" @@ -22,6 +26,7 @@ import ( "github.com/docker/docker-agent/pkg/model/provider/base" "github.com/docker/docker-agent/pkg/model/provider/options" "github.com/docker/docker-agent/pkg/model/provider/providerutil" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -696,7 +701,30 @@ func countAnthropicTokens( messages []anthropic.MessageParam, system []anthropic.TextBlockParam, anthropicTools []anthropic.ToolUnionParam, -) (int64, error) { +) (count int64, err error) { + // Token counting is a blocking API call to Anthropic that fires + // on the context-overflow retry path. Span it so the latency is + // attributable when the retry stalls. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.tokens.count", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes( + attribute.String(genai.AttrProviderName, genai.ProviderAnthropic), + attribute.String(genai.AttrRequestModel, model), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if count > 0 { + span.SetAttributes(attribute.Int64("cagent.anthropic.tokens.counted", count)) + } + span.End() + }() + params := anthropic.MessageCountTokensParams{ Model: model, Messages: messages, diff --git a/pkg/model/provider/anthropic/files.go b/pkg/model/provider/anthropic/files.go index 98417abd4..015f102d2 100644 --- a/pkg/model/provider/anthropic/files.go +++ b/pkg/model/provider/anthropic/files.go @@ -15,6 +15,10 @@ import ( "time" "github.com/anthropics/anthropic-sdk-go" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/chat" ) @@ -78,7 +82,25 @@ func NewFileManager(clientFn func(context.Context) (anthropic.Client, error)) *F // Files are deduplicated by content hash AND MIME type, so identical files with // different extensions will be uploaded separately. // Concurrent calls for the same file will wait for a single upload to complete. -func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (*UploadedFile, error) { +func (fm *FileManager) GetOrUpload(ctx context.Context, filePath string) (result *UploadedFile, err error) { + // Span the whole upload — large files take seconds to minutes + // over slow links and previously the latency was completely + // dark. cache_hit=true paths are short-lived siblings; the + // network upload path is the long branch. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider/anthropic").Start( + ctx, + "anthropic.files.get_or_upload", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attribute.String("cagent.file.path", filePath)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + absPath, err := filepath.Abs(filePath) if err != nil { return nil, fmt.Errorf("failed to get absolute path: %w", err) diff --git a/pkg/model/provider/factory.go b/pkg/model/provider/factory.go index 5ca4fdd8a..22c78288d 100644 --- a/pkg/model/provider/factory.go +++ b/pkg/model/provider/factory.go @@ -71,7 +71,16 @@ func createDirectProvider(ctx context.Context, cfg *latest.ModelConfig, env envi slog.Error("Unknown provider type", "type", providerType) return nil, fmt.Errorf("unknown provider type: %s", providerType) } - return factory(ctx, enhancedCfg, env, opts...) + p, err := factory(ctx, enhancedCfg, env, opts...) + if err != nil { + return nil, err + } + // Wrap leaf providers with the GenAI semconv tracer so every chat + // completion emits a `chat {model}` CLIENT span and the standard + // gen_ai.client.* metrics. The rule-based router constructed by + // createRuleBasedRouter is left bare — its routed targets go through + // resolveRoutedModel → createDirectProvider and end up wrapped here. + return instrumentProvider(p), nil } // providerFactory builds a Provider from a fully-resolved ModelConfig. 
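A hedged sketch of what the wrapping means for call sites; the cfg value is a placeholder, and the capability-mirroring and Unwrap behaviour are the ones described in pkg/model/provider/instrument.go below:

	p, err := createDirectProvider(ctx, cfg, environment.NewNoEnvProvider())
	if err != nil {
		return err
	}
	// The wrapper mirrors the leaf's interface set, so a chat-only provider
	// still fails this assertion and RAG keeps its sequential fallback.
	if ep, ok := p.(EmbeddingProvider); ok {
		_, _ = ep.CreateEmbedding(ctx, "query text") // emits an `embeddings {model}` span
	}
	// Code that needs the concrete leaf type reaches through the wrapper:
	if u, ok := p.(interface{ Unwrap() Provider }); ok {
		p = u.Unwrap()
	}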
diff --git a/pkg/model/provider/factory_test.go b/pkg/model/provider/factory_test.go index 3f849f786..339b86323 100644 --- a/pkg/model/provider/factory_test.go +++ b/pkg/model/provider/factory_test.go @@ -108,8 +108,9 @@ func TestCreateDirectProvider_DispatchByType(t *testing.T) { t.Run(tt.name, func(t *testing.T) { p, err := createDirectProvider(t.Context(), tt.cfg, environment.NewNoEnvProvider()) require.NoError(t, err) - fp, ok := p.(*fakeProvider) - require.True(t, ok, "expected fakeProvider, got %T", p) + leaf := unwrapProvider(p) + fp, ok := leaf.(*fakeProvider) + require.True(t, ok, "expected fakeProvider, got %T", leaf) assert.Equal(t, tt.expectID, fp.id) }) } diff --git a/pkg/model/provider/instrument.go b/pkg/model/provider/instrument.go new file mode 100644 index 000000000..92c44e42b --- /dev/null +++ b/pkg/model/provider/instrument.go @@ -0,0 +1,309 @@ +package provider + +import ( + "context" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/model/provider/base" + "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" + "github.com/docker/docker-agent/pkg/tools" +) + +// unwrapProvider returns the leaf provider underneath any number of +// instrumentation wrappers. Used by tests and by code paths that need to +// reach back to the concrete implementation (e.g. capability assertions +// that the wrappers do not transparently forward). +func unwrapProvider(p Provider) Provider { + for { + u, ok := p.(interface{ Unwrap() Provider }) + if !ok { + return p + } + p = u.Unwrap() + } +} + +// instrumentProvider wraps the leaf provider so every chat completion is +// surrounded by a GenAI semconv-compliant span and the matching client +// metrics. The wrapper is added once at the createDirectProvider boundary +// — the rule-based router (createRuleBasedRouter) is left bare because it +// dispatches to providers that are themselves already wrapped, so a +// single chat span is emitted per call regardless of routing depth. +// +// To avoid changing the apparent capability of the inner provider, the +// wrapper that is returned satisfies exactly the same set of interfaces +// that the inner provider satisfies — chat-only, chat+rerank, +// chat+embed+rerank, etc. RAG callers do `p.(EmbeddingProvider)` and rely +// on `ok=false` to fall back to sequential processing; if the wrapper +// always implemented EmbeddingProvider that fallback would silently +// disappear. +func instrumentProvider(p Provider) Provider { + if p == nil { + return nil + } + + tc := &tracedChat{inner: p} + + bep, isBatchEmbed := p.(BatchEmbeddingProvider) + ep, isEmbed := p.(EmbeddingProvider) + rp, isRerank := p.(RerankingProvider) + + switch { + case isBatchEmbed && isRerank: + return &tracedBatchEmbedRerank{tracedChat: tc, batchEmbed: bep, rerank: rp} + case isBatchEmbed: + return &tracedBatchEmbed{tracedChat: tc, batchEmbed: bep} + case isEmbed && isRerank: + return &tracedEmbedRerank{tracedChat: tc, embed: ep, rerank: rp} + case isEmbed: + return &tracedEmbed{tracedChat: tc, embed: ep} + case isRerank: + return &tracedRerank{tracedChat: tc, rerank: rp} + default: + return tc + } +} + +// tracedChat is the base wrapper. It satisfies just Provider and is +// embedded by every richer wrapper. CreateChatCompletionStream is the +// only method that adds behaviour — everything else delegates. 
+type tracedChat struct {
+	inner Provider
+}
+
+func (t *tracedChat) ID() string { return t.inner.ID() }
+func (t *tracedChat) BaseConfig() base.Config { return t.inner.BaseConfig() }
+
+// Unwrap returns the wrapped provider. Tests and any other caller that
+// needs the leaf type (e.g. for type assertions on internal helper
+// methods) can use the standard unwrap pattern:
+//
+//	if u, ok := p.(interface{ Unwrap() Provider }); ok { p = u.Unwrap() }
+func (t *tracedChat) Unwrap() Provider { return t.inner }
+
+func (t *tracedChat) CreateChatCompletionStream(ctx context.Context, messages []chat.Message, requestTools []tools.Tool) (chat.MessageStream, error) {
+	cfg := t.inner.BaseConfig()
+	req := genai.ChatRequest{
+		Provider: genai.ProviderNameForConfig(cfg.ModelConfig.Provider),
+		Model:    cfg.ModelConfig.Model,
+		Stream:   true,
+	}
+	// Populate sampling parameters from the resolved model config so the
+	// `gen_ai.request.max_tokens` / `temperature` / `top_p` / `top_k`
+	// attributes the GenAI semconv conditionally requires actually land
+	// on the span. Without this, the helper's gated emission paths were
+	// unreachable. Pointer fields distinguish "explicitly set" from
+	// "unset"; the matching Has* flags carry that signal through.
+	if mc := cfg.ModelConfig.MaxTokens; mc != nil {
+		req.MaxTokens = int(*mc)
+	}
+	if temp := cfg.ModelConfig.Temperature; temp != nil {
+		req.Temperature = *temp
+		req.HasTemperature = true
+	}
+	if tp := cfg.ModelConfig.TopP; tp != nil {
+		req.TopP = *tp
+		req.HasTopP = true
+	}
+	chatCtx, span := genai.StartChat(ctx, req)
+
+	// Opt-in capture of request content. Helpers internally check the
+	// `OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` env var and
+	// no-op when unset, so the cost on the default path is the
+	// function-call overhead and nothing else.
+	genai.SetInputMessages(span, messages)
+	genai.SetToolDefinitions(span, requestTools)
+
+	stream, err := t.inner.CreateChatCompletionStream(chatCtx, messages, requestTools)
+	if err != nil {
+		span.RecordError(err, genai.ClassifyError(err))
+		span.End()
+		return nil, err
+	}
+	return genai.WrapStream(span, stream), nil
+}
+
+// embeddingRequestForConfig builds an EmbeddingRequest from the inner
+// provider's BaseConfig — same shape as the chat path so the spec
+// `gen_ai.provider.name` / `gen_ai.request.model` attributes use the
+// canonical names.
+func (t *tracedChat) embeddingRequestForConfig(batchSize int) genai.EmbeddingRequest {
+	cfg := t.inner.BaseConfig()
+	return genai.EmbeddingRequest{
+		Provider:  genai.ProviderNameForConfig(cfg.ModelConfig.Provider),
+		Model:     cfg.ModelConfig.Model,
+		BatchSize: batchSize,
+	}
+}
+
+// rerankSpan opens a `rerank` CLIENT span. There is no spec-defined
+// rerank span yet; the operation is closely related to retrieval but
+// distinct enough to warrant its own name. Custom attributes use the
+// `cagent.*` namespace.
+func (t *tracedChat) rerankSpan(ctx context.Context, docCount int) (context.Context, trace.Span) {
+	cfg := t.inner.BaseConfig()
+	tracer := otel.Tracer("github.com/docker/docker-agent/pkg/model/provider")
+	attrs := []attribute.KeyValue{
+		attribute.String(genai.AttrProviderName, genai.ProviderNameForConfig(cfg.ModelConfig.Provider)),
+		attribute.String(genai.AttrRequestModel, cfg.ModelConfig.Model),
+		attribute.Int("cagent.rerank.document_count", docCount),
+	}
+	// Carry `gen_ai.conversation.id` from baggage like every other
+	// span helper in the branch.
The chat / embedding / retrieval / + // fallback / sandbox / MCP starters all do this; rerank was the + // odd one out, leaving rerank latency unattributable in + // per-conversation dashboards. + if convID := genai.ConversationIDFromContext(ctx); convID != "" { + attrs = append(attrs, attribute.String(genai.AttrConversationID, convID)) + } + return tracer.Start(ctx, "rerank", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) +} + +// wrapEmbedding wraps a single-input embedding call with a spec +// `embeddings {model}` span. Records token usage and dimension count on +// success; classifies errors on failure. +func wrapEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.EmbeddingResult, error)) (*base.EmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + span.SetDimensions(len(res.Embedding)) + } + return res, nil +} + +// wrapBatchEmbedding wraps a batch embedding call. Records the total +// input tokens across the batch and the per-vector dimensionality. +func wrapBatchEmbedding(ctx context.Context, req genai.EmbeddingRequest, fn func(context.Context) (*base.BatchEmbeddingResult, error)) (*base.BatchEmbeddingResult, error) { + ctx, span := genai.StartEmbedding(ctx, req) + defer span.End() + res, err := fn(ctx) + if err != nil { + span.RecordError(err, "") + return nil, err + } + if res != nil { + span.SetInputTokens(res.InputTokens) + if len(res.Embeddings) > 0 { + span.SetDimensions(len(res.Embeddings[0])) + } + } + return res, nil +} + +// wrapRerank wraps a Rerank call with a `rerank` CLIENT span that +// captures document count and error classification. +func (t *tracedChat) wrapRerank(ctx context.Context, query string, documents []types.Document, criteria string, fn func(context.Context, string, []types.Document, string) ([]float64, error)) ([]float64, error) { + ctx, span := t.rerankSpan(ctx, len(documents)) + defer span.End() + scores, err := fn(ctx, query, documents, criteria) + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + span.SetAttributes(attribute.String("error.type", genai.ClassifyError(err))) + return nil, err + } + return scores, nil +} + +// tracedRerank adds RerankingProvider while still satisfying just Provider +// at the chat layer. +type tracedRerank struct { + *tracedChat + + rerank RerankingProvider +} + +func (t *tracedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedEmbed satisfies EmbeddingProvider. +type tracedEmbed struct { + *tracedChat + + embed EmbeddingProvider +} + +func (t *tracedEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +// tracedEmbedRerank satisfies EmbeddingProvider and RerankingProvider. 
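// Illustrative caller sketch, not part of this patch: the capability probe
// the wrapper family above is built to keep honest. Because each traced*
// type implements exactly the interfaces its leaf implements, ok stays
// false for chat-only providers even after instrumentation, so RAG's
// sequential-processing fallback path is preserved.
func canBatchEmbed(p Provider) bool {
	_, ok := p.(BatchEmbeddingProvider)
	return ok
}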
+type tracedEmbedRerank struct { + *tracedChat + + embed EmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.embed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} + +// tracedBatchEmbed satisfies BatchEmbeddingProvider (which embeds +// EmbeddingProvider). +type tracedBatchEmbed struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider +} + +func (t *tracedBatchEmbed) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbed) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +// tracedBatchEmbedRerank satisfies BatchEmbeddingProvider and +// RerankingProvider — the broadest combination, used by openai and dmr. +type tracedBatchEmbedRerank struct { + *tracedChat + + batchEmbed BatchEmbeddingProvider + rerank RerankingProvider +} + +func (t *tracedBatchEmbedRerank) CreateEmbedding(ctx context.Context, text string) (*base.EmbeddingResult, error) { + return wrapEmbedding(ctx, t.embeddingRequestForConfig(0), func(ctx context.Context) (*base.EmbeddingResult, error) { + return t.batchEmbed.CreateEmbedding(ctx, text) + }) +} + +func (t *tracedBatchEmbedRerank) CreateBatchEmbedding(ctx context.Context, texts []string) (*base.BatchEmbeddingResult, error) { + return wrapBatchEmbedding(ctx, t.embeddingRequestForConfig(len(texts)), func(ctx context.Context) (*base.BatchEmbeddingResult, error) { + return t.batchEmbed.CreateBatchEmbedding(ctx, texts) + }) +} + +func (t *tracedBatchEmbedRerank) Rerank(ctx context.Context, query string, documents []types.Document, criteria string) ([]float64, error) { + return t.wrapRerank(ctx, query, documents, criteria, t.rerank.Rerank) +} diff --git a/pkg/rag/manager.go b/pkg/rag/manager.go index 17e77675f..40b051a52 100644 --- a/pkg/rag/manager.go +++ b/pkg/rag/manager.go @@ -11,11 +11,17 @@ import ( "slices" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/rag/database" "github.com/docker/docker-agent/pkg/rag/fusion" "github.com/docker/docker-agent/pkg/rag/rerank" "github.com/docker/docker-agent/pkg/rag/strategy" "github.com/docker/docker-agent/pkg/rag/types" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // ToolConfig represents tool-specific configuration @@ -143,7 +149,23 @@ func New(_ context.Context, name string, config Config, strategyEvents <-chan ty // Initialize indexes all documents using all configured strategies // Each strategy indexes its own document set (shared + strategy-specific) // Strategies are initialized in parallel for better performance -func (m *Manager) Initialize(ctx context.Context) error { 
+func (m *Manager) Initialize(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.initialize", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String(genai.AttrDataSourceID, m.name), + attribute.Int("cagent.rag.num_strategies", len(m.strategies)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + slog.Debug("[RAG Manager] Starting initialization", "rag_name", m.name, "num_strategies", len(m.strategies)) @@ -211,7 +233,20 @@ func (m *Manager) Initialize(ctx context.Context) error { // Query searches for relevant documents using all configured strategies // If multiple strategies are configured, results are combined using the fusion strategy -func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchResult, error) { +func (m *Manager) Query(ctx context.Context, query string) (results []database.SearchResult, err error) { + // Start a `retrieval {rag_name}` span per the OTel GenAI semconv. + // The query text itself is sensitive so we never capture it on the + // span here — content capture is gated by a separate environment + // variable in a later commit and emitted via a span event then. + ctx, retSpan := genai.StartRetrieval(ctx, "rag", m.name, false, "") + defer func() { + if err != nil { + retSpan.RecordError(err, "") + } + retSpan.SetResultCount(len(results)) + retSpan.End() + }() + slog.Debug("[RAG Manager] Starting query", "rag_name", m.name, "num_strategies", len(m.strategies), @@ -228,7 +263,11 @@ func (m *Manager) Query(ctx context.Context, query string) ([]database.SearchRes "strategy_limit", strategyCfg.Limit, "strategy_threshold", strategyCfg.Threshold) - results, err := strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) + // Assign to the function's named returns (note `=`, not + // `:=`) so the deferred span closure sees the live values + // even if a future change replaces the explicit + // `return X, Y` form below with a bare `return`. 
+ results, err = strategyImpl.Query(ctx, query, strategyCfg.Limit, strategyCfg.Threshold) if err != nil { slog.Error("[RAG Manager] Strategy query failed", "rag_name", m.name, @@ -431,7 +470,20 @@ func getStrategyNames(stratMap map[string]strategy.Strategy) []string { } // CheckAndReindexChangedFiles checks for file changes and re-indexes if needed -func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { +func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.reindex", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.CheckAndReindexChangedFiles(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { @@ -442,7 +494,20 @@ func (m *Manager) CheckAndReindexChangedFiles(ctx context.Context) error { } // StartFileWatcher starts monitoring files and directories for changes -func (m *Manager) StartFileWatcher(ctx context.Context) error { +func (m *Manager) StartFileWatcher(ctx context.Context) (err error) { + tracer := otel.Tracer("github.com/docker/docker-agent/pkg/rag") + ctx, span := tracer.Start(ctx, "rag.file_watcher.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String(genai.AttrDataSourceID, m.name)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + for strategyName, strategyImpl := range m.strategies { strategyCfg := m.strategyConfigs[strategyName] if err := strategyImpl.StartFileWatcher(ctx, strategyCfg.Docs, strategyCfg.Chunking); err != nil { diff --git a/pkg/runtime/agent_delegation.go b/pkg/runtime/agent_delegation.go index 0f46f280e..e8c5e39fe 100644 --- a/pkg/runtime/agent_delegation.go +++ b/pkg/runtime/agent_delegation.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" agenttool "github.com/docker/docker-agent/pkg/tools/builtin/agent" "github.com/docker/docker-agent/pkg/tools/builtin/handoff" @@ -408,11 +409,34 @@ func (r *LocalRuntime) handleTaskTransfer(ctx context.Context, sess *session.Ses slog.Debug("Transferring task to agent", "from_agent", a.Name(), "to_agent", params.Agent, "task", params.Task) - ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes( - attribute.String("from.agent", a.Name()), - attribute.String("to.agent", params.Agent), - attribute.String("session.id", sess.ID), - )) + delegationAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name identifies the target agent of the invoke_agent + // operation per the OTel GenAI semconv (Required). cagent.agent.name + // is the same value but in our internal namespace; we emit both so + // spec-aware backends and existing cagent dashboards both see it. 
+ attribute.String(genai.AttrAgentName, params.Agent), + attribute.String("cagent.delegation.from_agent", a.Name()), + attribute.String("cagent.delegation.to_agent", params.Agent), + attribute.String("cagent.delegation.kind", "transfer_task"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, params.Agent), + } + if params.Task != "" { + // Task length is bounded enough to be useful as a span + // attribute for debugging "agent X transferred which task + // to Y". The full task body lands on the sub-session's + // runtime.session span when content capture is opt-in. + delegationAttrs = append(delegationAttrs, attribute.Int("cagent.delegation.task_length", len(params.Task))) + } + if genai.EmitLegacyAttributes() { + delegationAttrs = append(delegationAttrs, + attribute.String("from.agent", a.Name()), + attribute.String("to.agent", params.Agent), + attribute.String("session.id", sess.ID), + ) + } + ctx, span := r.startSpan(ctx, "runtime.task_transfer", trace.WithAttributes(delegationAttrs...)) defer span.End() return r.runForwarding(ctx, sess, evts, delegationRequest{ @@ -449,6 +473,26 @@ func (r *LocalRuntime) handleHandoff(ctx context.Context, sess *session.Session, return nil, err } + // Handoff is in-place agent swap (same session, different agent + // from the next turn). Span name keeps the runtime.* family; + // attributes mirror the transfer_task span shape so dashboards + // can union both delegation kinds. Take the returned ctx so + // `executeOnAgentSwitchHooks` and any of its children parent + // onto this span instead of bypassing it. + ctx, span := r.startSpan(ctx, "runtime.handoff", trace.WithAttributes( + attribute.String(genai.AttrOperationName, genai.OperationInvokeAgent), + // gen_ai.agent.name — Required by OTel GenAI semconv on invoke_agent + // spans; identifies the agent being handed off to. See task_transfer + // for the rationale of dual-emitting alongside cagent.agent.name. + attribute.String(genai.AttrAgentName, next.Name()), + attribute.String("cagent.delegation.from_agent", ca), + attribute.String("cagent.delegation.to_agent", next.Name()), + attribute.String("cagent.delegation.kind", "handoff"), + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, next.Name()), + )) + defer span.End() + r.executeOnAgentSwitchHooks(ctx, currentAgent, sess.ID, ca, next.Name(), agentSwitchKindHandoff) r.setCurrentAgent(next.Name()) handoffMessage := "The agent " + ca + " handed off the conversation to you. " + diff --git a/pkg/runtime/cache.go b/pkg/runtime/cache.go index 3e5e5a307..7448e418b 100644 --- a/pkg/runtime/cache.go +++ b/pkg/runtime/cache.go @@ -10,6 +10,7 @@ import ( "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) // BuiltinCacheResponse is the name of the builtin stop hook that persists @@ -63,7 +64,10 @@ func (r *LocalRuntime) tryReplayCachedResponse( if question == "" { return false } + _, cacheSpan := genai.RecordCacheLookup(ctx, "") cached, ok := c.Lookup(question) + cacheSpan.SetHit(ok && cached != "") + cacheSpan.End() // Treat empty stored values as misses: cache_response only stores // non-empty responses, so an empty entry only surfaces if the JSON // file was hand-edited or downgraded from a future version. 
Replaying @@ -99,7 +103,7 @@ func (r *LocalRuntime) tryReplayCachedResponse( // (handled inside [cache.Cache.Store]), which makes the replay path — // where [LocalRuntime.tryReplayCachedResponse] fires stop hooks for the // cached answer — free of redundant disk writes. -func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { +func (r *LocalRuntime) cacheResponseBuiltin(ctx context.Context, in *hooks.Input, _ []string) (*hooks.Output, error) { if in == nil || in.AgentName == "" || in.LastUserMessage == "" || strings.TrimSpace(in.StopResponse) == "" { return nil, nil @@ -111,7 +115,16 @@ func (r *LocalRuntime) cacheResponseBuiltin(_ context.Context, in *hooks.Input, return nil, nil } if c := a.Cache(); c != nil { + // Thread the active context so the cache.store span chains + // onto the surrounding stop-hook trace instead of starting a + // detached one. Mark the operation as a successful write so + // the `cagent.cache.requests{operation="store"}` counter is + // incremented — without SetHit the store path would never + // register on the metric. + _, storeSpan := genai.RecordCacheStore(ctx, "") c.Store(in.LastUserMessage, in.StopResponse) + storeSpan.SetHit(true) + storeSpan.End() } return nil, nil } diff --git a/pkg/runtime/compactor/compactor.go b/pkg/runtime/compactor/compactor.go index 721edd2d7..cc52030d5 100644 --- a/pkg/runtime/compactor/compactor.go +++ b/pkg/runtime/compactor/compactor.go @@ -24,6 +24,11 @@ import ( "fmt" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/compaction" @@ -104,7 +109,39 @@ type LLMArgs struct { // Returns (nil, nil) when the model returns an empty summary; callers // should treat that as "compaction was a no-op" and skip the apply // step. -func RunLLM(ctx context.Context, args LLMArgs) (*Result, error) { +func RunLLM(ctx context.Context, args LLMArgs) (result *Result, err error) { + // One INTERNAL `compaction` span covers the LLM-driven summarization + // strategy end-to-end. The inner LLM call gets its own `chat {model}` + // CLIENT child span via the provider decorator, so this parent span + // is a useful aggregate boundary (context limit, summary tokens, + // outcome) without duplicating per-call timing data. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/runtime/compactor").Start( + ctx, + "compaction", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.Int64("cagent.compaction.context_limit", args.ContextLimit), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + if result != nil { + // `Result.InputTokens` actually holds the compaction + // sub-session's *output* token count (the summary length) + // per the field's doc — name the span attribute by what the + // value is, not by what the source struct field is named. 
+ span.SetAttributes( + attribute.Int("cagent.compaction.summary_output_tokens", int(result.InputTokens)), + attribute.Float64("cagent.compaction.cost", result.Cost), + attribute.Int("cagent.compaction.first_kept_entry", result.FirstKeptEntry), + ) + } + span.End() + }() + if args.RunAgent == nil { return nil, errors.New("compactor: RunAgent is required") } diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go index 8b0780aab..ee539e2a1 100644 --- a/pkg/runtime/fallback.go +++ b/pkg/runtime/fallback.go @@ -14,6 +14,7 @@ import ( "github.com/docker/docker-agent/pkg/modelerrors" "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -237,6 +238,14 @@ func (e *fallbackExecutor) execute( modelChain := buildModelChain(primaryModel, fallbackModels) startIndex := e.chainStartIndex(a, len(fallbackModels)) + // One runtime.fallback span wraps the whole chain. Each per-model + // CreateChatCompletionStream call below opens its own `chat {model}` + // CLIENT child span via the provider decorator, so the fallback span + // is a useful aggregate boundary (total attempts, final model, + // terminal outcome) without duplicating per-model timing data. + ctx, fbSpan := genai.StartFallback(ctx, a.Name(), primaryModel.ID(), startIndex > 0) + defer fbSpan.End() + var lastErr error primaryFailedWithNonRetryable := false hasFallbacks := len(fallbackModels) > 0 @@ -252,14 +261,17 @@ func (e *fallbackExecutor) execute( for attempt := range maxAttempts { // Check context before each attempt if ctx.Err() != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } + fbSpan.IncrementAttempt() // Apply backoff before retry (not on first attempt of each model) if attempt > 0 { backoffDelay := backoff.Calculate(attempt - 1) logRetryBackoff(a.Name(), modelEntry.provider.ID(), attempt, backoffDelay) if !backoff.SleepWithContext(ctx, backoffDelay) { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, ctx.Err() } } @@ -294,6 +306,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -317,6 +330,7 @@ func (e *fallbackExecutor) execute( lastErr = err decision, retErr := e.classifyAttemptError(ctx, err, a, modelEntry, attempt, hasFallbacks, &primaryFailedWithNonRetryable) if retErr != nil { + fbSpan.SetOutcome(genai.FallbackOutcomeContextCanceled) return streamResult{}, nil, retErr } if decision == retryDecisionBreak { @@ -326,6 +340,8 @@ func (e *fallbackExecutor) execute( } e.recordSuccess(a, modelEntry, primaryFailedWithNonRetryable) + fbSpan.SetFinalModel(modelEntry.provider.ID()) + fbSpan.SetOutcome(genai.FallbackOutcomeSuccess) return res, modelEntry.provider, nil } } @@ -339,12 +355,17 @@ func (e *fallbackExecutor) execute( prefix = "all models failed" } wrapped := fmt.Errorf("%s: %w", prefix, lastErr) + fbSpan.RecordError(wrapped, "") + fbSpan.SetOutcome(genai.FallbackOutcomeFailed) if modelerrors.IsContextOverflowError(lastErr) { return streamResult{}, nil, modelerrors.NewContextOverflowError(wrapped) } return streamResult{}, nil, wrapped } - return streamResult{}, nil, errors.New("model failed with unknown error") + 
unknownErr := errors.New("model failed with unknown error") + fbSpan.RecordError(unknownErr, "") + fbSpan.SetOutcome(genai.FallbackOutcomeFailed) + return streamResult{}, nil, unknownErr } // retryDecision is the outcome of handleModelError. diff --git a/pkg/runtime/loop.go b/pkg/runtime/loop.go index 96f2f4f7e..e366f0de8 100644 --- a/pkg/runtime/loop.go +++ b/pkg/runtime/loop.go @@ -20,6 +20,7 @@ import ( "github.com/docker/docker-agent/pkg/modelsdev" "github.com/docker/docker-agent/pkg/runtime/toolexec" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" bgagent "github.com/docker/docker-agent/pkg/tools/builtin/agent" "github.com/docker/docker-agent/pkg/tools/builtin/handoff" @@ -179,10 +180,32 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, ctx = httpclient.ContextWithSessionID(ctx, sess.ID) r.telemetry.RecordSessionStart(ctx, r.CurrentAgentName(), sess.ID) - ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes( - attribute.String("agent", r.CurrentAgentName()), - attribute.String("session.id", sess.ID), - )) + // Seed `gen_ai.conversation.id` into baggage at the session + // boundary. Every span the runtime, providers, MCP client, RAG, + // sandbox, evaluation, hooks, and (downstream) any subprocess + // or remote service create from here on will pick it up + // automatically without per-helper plumbing — and the value + // rides over W3C `baggage` so it crosses MCP / sandbox / + // HTTP boundaries too. + ctx = genai.WithConversationID(ctx, sess.ID) + + // runtime.session is the root span for one stream. gen_ai.* keys + // are emitted alongside the legacy `agent` / `session.id` keys + // so existing dashboards keep matching while spec-aware tooling + // can filter by `gen_ai.conversation.id` and + // `cagent.agent.name`. Legacy keys drop out under + // OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental. + sessionAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, r.CurrentAgentName()), + } + if genai.EmitLegacyAttributes() { + sessionAttrs = append(sessionAttrs, + attribute.String("agent", r.CurrentAgentName()), + attribute.String("session.id", sess.ID), + ) + } + ctx, sessionSpan := r.startSpan(ctx, "runtime.session", trace.WithAttributes(sessionAttrs...)) defer sessionSpan.End() // Swap in this stream's events channel for elicitation and save the @@ -213,6 +236,12 @@ func (r *LocalRuntime) runStreamLoop(ctx context.Context, sess *session.Session, } agentTools = filterExcludedTools(agentTools, sess.ExcludedTools) + // Record the catalogue size on the session span — answers "how + // many tools could this turn actually use?" without having to + // walk into per-toolset spans. Stamped after exclusion filters + // so the count matches what was offered to the model. 
+ sessionSpan.SetAttributes(attribute.Int("cagent.agent.tools.count", len(agentTools))) + events <- ToolsetInfo(len(agentTools), false, a.Name()) messages := sess.GetMessages(a) @@ -445,10 +474,17 @@ func (r *LocalRuntime) runTurn( toolModelOverride *string, events chan Event, ) (ctrl turnControl) { - streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes( - attribute.String("agent", a.Name()), - attribute.String("session.id", sess.ID), - )) + streamAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrConversationID, sess.ID), + attribute.String(genai.AttrAgentNameRuntime, a.Name()), + } + if genai.EmitLegacyAttributes() { + streamAttrs = append(streamAttrs, + attribute.String("agent", a.Name()), + attribute.String("session.id", sess.ID), + ) + } + streamCtx, streamSpan := r.startSpan(ctx, "runtime.stream", trace.WithAttributes(streamAttrs...)) // streamSpan ends inline at the natural points (success path before // recordAssistantMessage, error path after handleStreamError) so its // duration tracks the model call only, not the whole iteration. The @@ -600,6 +636,15 @@ func (r *LocalRuntime) runTurn( "Agent terminated: detected %d consecutive identical calls to %s. "+ "This indicates a degenerate loop where the model is not making progress.", consecutive, toolName) + // Mark the session span as Error so loop-termination shows up + // in trace status / error-rate dashboards instead of blending + // in with normal completions. + sessionSpan.SetAttributes( + attribute.String("error.type", "loop_detected"), + attribute.String("cagent.session.terminated_by", "loop_detector"), + attribute.Int("cagent.loop.consecutive_calls", consecutive), + ) + sessionSpan.SetStatus(codes.Error, errMsg) events <- Error(errMsg) r.notifyError(ctx, a, sess.ID, errMsg) loopDetector.Reset() diff --git a/pkg/runtime/skill_runner.go b/pkg/runtime/skill_runner.go index 71e9a7c6d..6ca31c6a6 100644 --- a/pkg/runtime/skill_runner.go +++ b/pkg/runtime/skill_runner.go @@ -10,6 +10,7 @@ import ( "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/builtin/skills" ) @@ -49,11 +50,37 @@ func (r *LocalRuntime) handleRunSkill(ctx context.Context, sess *session.Session // Open the span before any pre-delegation work so model resolution // (inside WithAgentModel) is recorded under runtime.run_skill rather // than the parent session span. - ctx, span := r.startSpan(ctx, "runtime.run_skill", trace.WithAttributes( - attribute.String("agent", ca), - attribute.String("skill", prepared.SkillName), - attribute.String("session.id", sess.ID), - )) + // + // Skills are workflow-shaped (a coordinated process the agent + // orchestrates), so the GenAI semconv `invoke_workflow` operation + // applies. Emit it via gen_ai.* attrs alongside the legacy keys + // for back-compat. 
+ skillAttrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationInvokeWorkflow), + attribute.String(genai.AttrWorkflowName, prepared.SkillName), + attribute.String(genai.AttrAgentNameRuntime, ca), + attribute.String(genai.AttrConversationID, sess.ID), + } + if genai.EmitLegacyAttributes() { + skillAttrs = append(skillAttrs, + attribute.String("agent", ca), + attribute.String("skill", prepared.SkillName), + attribute.String("session.id", sess.ID), + ) + } + // Span name follows the GenAI agent semconv pattern + // `invoke_workflow {workflow.name}` so spec-aware backends + // classify the span as a workflow invocation. SpanKindInternal is + // passed explicitly per spec rather than relying on the SDK + // default — keeps intent clear and immune to default changes. + spanName := genai.OperationInvokeWorkflow + if prepared.SkillName != "" { + spanName = genai.OperationInvokeWorkflow + " " + prepared.SkillName + } + ctx, span := r.startSpan(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(skillAttrs...), + ) defer span.End() slog.Debug("Running skill as sub-agent", diff --git a/pkg/runtime/toolexec/dispatcher.go b/pkg/runtime/toolexec/dispatcher.go index 1d1636eb7..21cd1050a 100644 --- a/pkg/runtime/toolexec/dispatcher.go +++ b/pkg/runtime/toolexec/dispatcher.go @@ -19,6 +19,7 @@ import ( "github.com/docker/docker-agent/pkg/hooks" "github.com/docker/docker-agent/pkg/session" "github.com/docker/docker-agent/pkg/telemetry" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -30,19 +31,21 @@ const ( ApprovalDecisionDeny = "deny" ApprovalDecisionCanceled = "canceled" - ApprovalSourceYolo = "yolo" - ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" - ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" - ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" - ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" - ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" - ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" - ApprovalSourceReadOnlyHint = "readonly_hint" - ApprovalSourceUserApproved = "user_approved" - ApprovalSourceUserApprovedSession = "user_approved_session" - ApprovalSourceUserApprovedTool = "user_approved_tool" - ApprovalSourceUserRejected = "user_rejected" - ApprovalSourceContextCanceled = "context_canceled" + ApprovalSourceYolo = "yolo" + ApprovalSourceSessionPermissionsAllow = "session_permissions_allow" + ApprovalSourceSessionPermissionsDeny = "session_permissions_deny" + ApprovalSourceTeamPermissionsAllow = "team_permissions_allow" + ApprovalSourceTeamPermissionsDeny = "team_permissions_deny" + ApprovalSourcePreToolUseHookAllow = "pre_tool_use_hook_allow" + ApprovalSourcePreToolUseHookDeny = "pre_tool_use_hook_deny" + ApprovalSourcePermissionRequestHookDeny = "permission_request_hook_deny" + ApprovalSourcePermissionRequestHookAllow = "permission_request_hook_allow" + ApprovalSourceReadOnlyHint = "readonly_hint" + ApprovalSourceUserApproved = "user_approved" + ApprovalSourceUserApprovedSession = "user_approved_session" + ApprovalSourceUserApprovedTool = "user_approved_tool" + ApprovalSourceUserRejected = "user_rejected" + ApprovalSourceContextCanceled = "context_canceled" ) // CallOutcome captures the verdicts of a single tool invocation as @@ -245,13 +248,25 @@ type call struct { // and approval bookkeeping lives here so the call lifecycle is visible // at a glance. 
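// Illustrative sketch, not part of this patch: the shape of the gate these
// call sites assume. genai.EmitLegacyAttributes is not shown in this diff;
// the sketch only mirrors the behaviour the surrounding comments describe —
// legacy keys stay on by default and drop out when the OTel stability
// opt-in selects the experimental GenAI conventions.
func emitLegacyAttributes() bool {
	optIn := os.Getenv("OTEL_SEMCONV_STABILITY_OPT_IN")
	return !strings.Contains(optIn, "gen_ai_latest_experimental")
}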
func (c *call) run(ctx context.Context) CallOutcome { - ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("tool.type", string(c.tc.Type)), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + // gen_ai.* attributes are always emitted (spec-compliant). Legacy + // attribute names are added only when the OTel stability flag is + // at its default — `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental` + // drops the legacy keys. Tool type is "function" because every tool + // presented here is an LLM-callable function (transfer_task / + // handoff are runtime-managed but still appear as functions to the + // model). + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, "runtime.tool.call", trace.WithAttributes(attrs...)) defer span.End() slog.Debug("Processing tool call", "agent", c.a.Name(), "tool", c.tc.Function.Name, "session_id", c.sess.ID) @@ -422,9 +437,17 @@ func (c *call) applyHookModifiedInput(result *hooks.Result) { } // notifyApproval forwards the resolved approval decision to the -// HookDispatcher, when one is configured. Centralised so the nil-guard -// stays in one place. +// HookDispatcher, when one is configured. Also stamps the decision + +// source on the active runtime.tool.call span so denied / canceled +// calls are visible in trace dashboards (without it, denied tool calls +// are indistinguishable from user-canceled ones at the span level). func (c *call) notifyApproval(ctx context.Context, decision, source string) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.approval.decision", decision), + attribute.String("cagent.approval.source", source), + ) + } if c.d.Hooks == nil { return } @@ -529,6 +552,12 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if !result.Allowed { slog.Debug("Tool denied by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.Message) + // Stamp the deny on the runtime.tool.call span via notifyApproval + // before returning. Without this the span would end with status + // Ok and no cagent.approval.* attrs — denied-by-hook calls would + // look identical to successful ones in trace dashboards, while + // pre_tool_use deny does emit the attrs. Symmetry matters. + c.notifyApproval(ctx, ApprovalDecisionDeny, ApprovalSourcePermissionRequestHookDeny) rejectMsg := "The tool call was rejected by a permission_request hook." 
if reason := strings.TrimSpace(result.Message); reason != "" { rejectMsg += " Reason: " + reason @@ -539,6 +568,7 @@ func (c *call) runPermissionRequestHook(ctx context.Context, runTool func() Call if result.PermissionAllowed { slog.Debug("Tool auto-approved by permission_request hook", "tool", toolName, "session_id", c.sess.ID, "reason", result.AdditionalContext) + c.notifyApproval(ctx, ApprovalDecisionAllow, ApprovalSourcePermissionRequestHookAllow) return runTool(), true } @@ -618,14 +648,28 @@ func (c *call) runHandler(ctx context.Context, handler ToolHandler) { // translation, and session message persistence. It is the only place // where a tool actually runs. func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx context.Context) (*tools.ToolCallResult, time.Duration, error)) *tools.ToolCallResult { - ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes( - attribute.String("tool.name", c.tc.Function.Name), - attribute.String("agent", c.a.Name()), - attribute.String("session.id", c.sess.ID), - attribute.String("tool.call_id", c.tc.ID), - )) + attrs := []attribute.KeyValue{ + attribute.String(genai.AttrOperationName, genai.OperationExecuteTool), + attribute.String(genai.AttrToolName, c.tc.Function.Name), + attribute.String(genai.AttrToolType, "function"), + attribute.String(genai.AttrToolCallID, c.tc.ID), + attribute.String(genai.AttrAgentNameRuntime, c.a.Name()), + attribute.String(genai.AttrConversationID, c.sess.ID), + } + attrs = append(attrs, genai.LegacyToolAttributes( + c.tc.Function.Name, string(c.tc.Type), c.a.Name(), c.sess.ID, c.tc.ID, + )...) + ctx, span := c.d.startSpan(ctx, spanName, trace.WithAttributes(attrs...)) defer span.End() + // gen_ai.tool.call.arguments capture is gated on the same opt-in as + // chat content (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`) + // because tool arguments commonly carry the same PII / secrets as the + // chat history that produced them (file paths, API tokens, prompts). + if genai.IsContentCaptureEnabled() && c.tc.Function.Arguments != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallArguments, c.tc.Function.Arguments)) + } + c.em.EmitToolCall(c.tc, c.tool, c.a.Name()) res, duration, err := exec(ctx) @@ -647,6 +691,14 @@ func (c *call) invoke(ctx context.Context, spanName string, exec func(ctx contex // path through Dispatch's `exec.Has(event)` short-circuit. res.Output = c.applyToolResponseTransform(ctx, res.Output, false) + // gen_ai.tool.call.result captures the post-transform output so the + // span matches what the LLM actually saw on the next turn (any + // redact_secrets / scrubber rewrite is reflected). Same content-capture + // gating as arguments above. 
+ if genai.IsContentCaptureEnabled() && res != nil && res.Output != "" { + span.SetAttributes(attribute.String(genai.AttrToolCallResult, res.Output)) + } + c.em.EmitToolCallResponse(c.tc.ID, c.tool, res, res.Output, c.a.Name()) c.recordToolResponse(res) return res diff --git a/pkg/server/server.go b/pkg/server/server.go index dd33b1290..030cdbb1a 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -14,6 +14,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/config" @@ -80,8 +81,14 @@ func New(ctx context.Context, sessionStore session.Store, runConfig *config.Runt } func (s *Server) Serve(ctx context.Context, ln net.Listener) error { + // Wrap the Echo handler with otelhttp so the configured W3C + // propagator extracts `traceparent` / `tracestate` / `baggage` + // from incoming API requests. Without this the API server's + // runtime spans (already wired via `WithTracer` in the session + // manager) start fresh trace ids per request rather than + // chaining onto the calling client's trace. srv := http.Server{ - Handler: s.e, + Handler: otelhttp.NewHandler(s.e, "agent-api"), } if err := srv.Serve(ln); err != nil && ctx.Err() == nil { diff --git a/pkg/server/session_manager.go b/pkg/server/session_manager.go index 0b6d82605..7d6130440 100644 --- a/pkg/server/session_manager.go +++ b/pkg/server/session_manager.go @@ -12,6 +12,9 @@ import ( "time" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/api" "github.com/docker/docker-agent/pkg/concurrent" @@ -402,12 +405,30 @@ func (sm *SessionManager) generateTitle(ctx context.Context, sess *session.Sessi } } -func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (runtime.Runtime, *sessiontitle.Generator, error) { +func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.Session, agentFilename, currentAgent string, rc *config.RuntimeConfig) (_ runtime.Runtime, _ *sessiontitle.Generator, err error) { // Caller (RunSession) holds sm.mux and has already verified that no // active runtime exists for this session. This function is purely a // constructor: it must not touch sm.runtimeSessions, otherwise it would // briefly publish a half-initialised activeRuntimes (e.g. without the // cancel func) that other goroutines could observe. + // + // Every call is a cold-path construction (caller short-circuits + // cached hits), so a span here attributes per-request first-use + // latency (team load + runtime construction) without adding noise + // on warm paths. 
+ ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/server").Start( + ctx, "session.runtime_init", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("gen_ai.conversation.id", sess.ID)), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + t, err := sm.loadTeam(ctx, agentFilename, rc) if err != nil { return nil, nil, err @@ -427,6 +448,9 @@ func (sm *SessionManager) runtimeForSession(ctx context.Context, sess *session.S runtime.WithCurrentAgent(currentAgent), runtime.WithManagedOAuth(false), runtime.WithSessionStore(sm.sessionStore), + // Match the tracer scope used by the CLI; without this the + // API-server runtime's startSpan is a no-op so all the + // runtime.* spans go silent in HTTP-server mode. runtime.WithTracer(otel.Tracer("cagent")), } run, err := runtime.New(t, opts...) diff --git a/pkg/sessiontitle/generator.go b/pkg/sessiontitle/generator.go index be8b33166..21f0a8ff9 100644 --- a/pkg/sessiontitle/generator.go +++ b/pkg/sessiontitle/generator.go @@ -13,10 +13,16 @@ import ( "strings" "time" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/model/provider" "github.com/docker/docker-agent/pkg/model/provider/options" + "github.com/docker/docker-agent/pkg/telemetry/genai" ) const ( @@ -56,7 +62,7 @@ func New(model provider.Provider, fallbackModels ...provider.Provider) *Generato // CreateChatCompletionStream, avoiding the overhead of spinning up a nested // runtime, and falls back to the next model on failure. // Returns an empty string if no models or messages are configured. -func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (string, error) { +func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages []string) (title string, err error) { if g == nil || len(g.models) == 0 || len(userMessages) == 0 { return "", nil } @@ -67,6 +73,27 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages // the originating session. ctx = httpclient.ContextWithSessionID(ctx, sessionID) + // Wrap the whole title-generation in a span so the boundary is + // visible on the session timeline. The inner per-attempt LLM + // calls each get their own `chat {model}` CLIENT child span via + // the provider decorator. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/sessiontitle").Start( + ctx, + "sessiontitle.generate", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes( + attribute.String(genai.AttrConversationID, sessionID), + attribute.Int("cagent.sessiontitle.candidate_count", len(g.models)), + ), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + // Apply timeout to prevent hanging on slow or unresponsive models. ctx, cancel := context.WithTimeout(ctx, titleGenerationTimeout) defer cancel() @@ -77,7 +104,10 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages var lastErr error for idx, baseModel := range g.models { - if err := ctx.Err(); err != nil { + // Assign to the named-return `err` so a context cancellation + // is observed by the deferred span closure as a recorded + // error rather than silently slipping through. 
+ if err = ctx.Err(); err != nil { //nolint:gocritic // assigns to named return `err` for deferred span observability return "", err } diff --git a/pkg/teamloader/teamloader.go b/pkg/teamloader/teamloader.go index f28ceea8b..bbe4a8c6f 100644 --- a/pkg/teamloader/teamloader.go +++ b/pkg/teamloader/teamloader.go @@ -13,6 +13,11 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/agent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/config/latest" @@ -88,7 +93,23 @@ func Load(ctx context.Context, agentSource config.Source, runConfig *config.Runt // LoadWithConfig loads an agent team and returns both the team and config info // needed for runtime model switching. -func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (*LoadResult, error) { +func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *config.RuntimeConfig, opts ...Opt) (result *LoadResult, err error) { + // Cold-start path: parses config, resolves model aliases, may pull + // referenced sub-agents over the network, and starts every toolset. + // All synchronous from the caller's perspective. The span makes the + // breakdown attributable when first-use latency is high. + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/teamloader").Start( + ctx, "teamloader.load", + trace.WithSpanKind(trace.SpanKindInternal), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + var loadOpts loadOptions loadOpts.toolsetRegistry = NewDefaultToolsetRegistry() @@ -103,6 +124,12 @@ func LoadWithConfig(ctx context.Context, agentSource config.Source, runConfig *c if err != nil { return nil, err } + if cfg != nil { + span.SetAttributes( + attribute.Int("cagent.teamloader.agent_count", len(cfg.Agents)), + attribute.Int("cagent.teamloader.model_count", len(cfg.Models)), + ) + } // Resolve model aliases (e.g., "claude-sonnet-4-5" -> "claude-sonnet-4-5-20250929") // This ensures the API uses the pinned model version. The original name is preserved diff --git a/pkg/telemetry/genai/attrs.go b/pkg/telemetry/genai/attrs.go new file mode 100644 index 000000000..48e0b82f4 --- /dev/null +++ b/pkg/telemetry/genai/attrs.go @@ -0,0 +1,105 @@ +package genai + +// Attribute keys defined by the OTel GenAI semantic conventions. All are +// Development stability — declared as constants here so call sites depend +// on a stable local symbol rather than a moving upstream import path. 
+const ( + AttrOperationName = "gen_ai.operation.name" + AttrProviderName = "gen_ai.provider.name" + AttrConversationID = "gen_ai.conversation.id" + AttrOutputType = "gen_ai.output.type" + + AttrAgentName = "gen_ai.agent.name" + AttrAgentID = "gen_ai.agent.id" + AttrAgentDescription = "gen_ai.agent.description" + AttrAgentVersion = "gen_ai.agent.version" + + AttrWorkflowName = "gen_ai.workflow.name" + + AttrRequestModel = "gen_ai.request.model" + AttrRequestStream = "gen_ai.request.stream" + AttrRequestMaxTokens = "gen_ai.request.max_tokens" + AttrRequestTemperature = "gen_ai.request.temperature" + AttrRequestTopP = "gen_ai.request.top_p" + AttrRequestTopK = "gen_ai.request.top_k" + AttrRequestFrequencyPenalty = "gen_ai.request.frequency_penalty" + AttrRequestPresencePenalty = "gen_ai.request.presence_penalty" + AttrRequestStopSequences = "gen_ai.request.stop_sequences" + AttrRequestChoiceCount = "gen_ai.request.choice.count" + AttrRequestSeed = "gen_ai.request.seed" + AttrRequestEncodingFormats = "gen_ai.request.encoding_formats" + + AttrResponseModel = "gen_ai.response.model" + AttrResponseID = "gen_ai.response.id" + AttrResponseFinishReasons = "gen_ai.response.finish_reasons" + AttrResponseTimeToFirstChunk = "gen_ai.response.time_to_first_chunk" + + AttrUsageInputTokens = "gen_ai.usage.input_tokens" + AttrUsageOutputTokens = "gen_ai.usage.output_tokens" + AttrUsageCacheReadInputTokens = "gen_ai.usage.cache_read.input_tokens" + AttrUsageCacheCreationInputTokens = "gen_ai.usage.cache_creation.input_tokens" + AttrUsageReasoningOutputTokens = "gen_ai.usage.reasoning.output_tokens" + + AttrTokenType = "gen_ai.token.type" + + AttrToolName = "gen_ai.tool.name" + AttrToolCallID = "gen_ai.tool.call.id" + AttrToolType = "gen_ai.tool.type" + AttrToolDescription = "gen_ai.tool.description" + AttrToolDefinitions = "gen_ai.tool.definitions" + AttrToolCallArguments = "gen_ai.tool.call.arguments" + AttrToolCallResult = "gen_ai.tool.call.result" + + AttrInputMessages = "gen_ai.input.messages" + AttrOutputMessages = "gen_ai.output.messages" + AttrSystemInstructions = "gen_ai.system_instructions" + + AttrPromptName = "gen_ai.prompt.name" + + AttrDataSourceID = "gen_ai.data_source.id" + AttrEmbeddingsDimensionCount = "gen_ai.embeddings.dimension.count" + AttrRetrievalDocuments = "gen_ai.retrieval.documents" + AttrRetrievalQueryText = "gen_ai.retrieval.query.text" + + AttrEvaluationName = "gen_ai.evaluation.name" + AttrEvaluationScoreLabel = "gen_ai.evaluation.score.label" + AttrEvaluationScoreValue = "gen_ai.evaluation.score.value" + AttrEvaluationExplanation = "gen_ai.evaluation.explanation" +) + +// Operation names — values for AttrOperationName. +const ( + OperationChat = "chat" + OperationTextCompletion = "text_completion" + OperationGenerateContent = "generate_content" + OperationEmbeddings = "embeddings" + OperationCreateAgent = "create_agent" + OperationInvokeAgent = "invoke_agent" + OperationInvokeWorkflow = "invoke_workflow" + OperationExecuteTool = "execute_tool" + OperationRetrieval = "retrieval" +) + +// Token types — values for AttrTokenType when recording the token usage +// histogram. Spec defines `input` and `output`; we use the cache_read / +// cache_creation / reasoning variants to mirror the per-token-type +// usage attributes for richer breakdowns. 
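// Illustrative sketch, not part of this patch: recording the
// gen_ai.client.token.usage histogram once per token type with the
// constants declared below. tokenUsage is assumed to be a previously
// created metric.Int64Histogram and model the resolved request model name.
tokenUsage.Record(ctx, inputTokens, metric.WithAttributes(
	attribute.String(AttrTokenType, TokenTypeInput),
	attribute.String(AttrRequestModel, model),
))
tokenUsage.Record(ctx, outputTokens, metric.WithAttributes(
	attribute.String(AttrTokenType, TokenTypeOutput),
	attribute.String(AttrRequestModel, model),
))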
+const ( + TokenTypeInput = "input" + TokenTypeOutput = "output" + TokenTypeCacheRead = "cache_read.input" + TokenTypeCacheCreation = "cache_creation.input" + TokenTypeReasoning = "reasoning.output" +) + +// Provider names — values for AttrProviderName. Names follow the values +// defined in the provider-specific GenAI semconv pages. +const ( + ProviderAnthropic = "anthropic" + ProviderOpenAI = "openai" + ProviderAWSBedrock = "aws.bedrock" + ProviderGCPVertexAI = "gcp.vertex_ai" + ProviderGCPGenAI = "gcp.gen_ai" + ProviderAzureAI = "azure.ai.inference" + ProviderDMR = "docker.dmr" +) diff --git a/pkg/telemetry/genai/content.go b/pkg/telemetry/genai/content.go new file mode 100644 index 000000000..b7d09cc24 --- /dev/null +++ b/pkg/telemetry/genai/content.go @@ -0,0 +1,207 @@ +package genai + +import ( + "encoding/json" + "os" + "strings" + + "go.opentelemetry.io/otel/attribute" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/tools" +) + +// EnvCaptureMessageContent is the OTel-recommended environment variable +// that toggles capture of GenAI request/response content as span +// attributes. Default is off because chat history routinely contains +// PII, secrets, internal documents, and other content that should not +// be exported by default. +// +// Recognised truthy values: "true", "1", "yes", "on" (case-insensitive). +const EnvCaptureMessageContent = "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT" + +// IsContentCaptureEnabled reports whether the OTel content-capture +// opt-in is set. Read on every call so tests and feature flags can +// flip the value at runtime. +func IsContentCaptureEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv(EnvCaptureMessageContent))) { + case "true", "1", "yes", "on": + return true + default: + return false + } +} + +// messagePart matches the OTel GenAI semconv message part schema +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/non-normative/examples-llm-calls/). +// +// Field choice per spec: +// - "text" parts use Content +// - "uri" parts use URI (and may set MimeType / Modality) +// - "tool_call" / "tool_call_response" parts use ID, Name, Arguments, +// Result +type messagePart struct { + Type string `json:"type"` + Content string `json:"content,omitempty"` + URI string `json:"uri,omitempty"` + MimeType string `json:"mime_type,omitempty"` + Modality string `json:"modality,omitempty"` + ID string `json:"id,omitempty"` + Name string `json:"name,omitempty"` + Arguments any `json:"arguments,omitempty"` + Result any `json:"result,omitempty"` +} + +type structuredMessage struct { + Role string `json:"role"` + Parts []messagePart `json:"parts"` +} + +// SetInputMessages serialises chat history into `gen_ai.input.messages` +// per the OTel GenAI examples schema (role + parts) and attaches it to +// the span. System messages are removed from the array and emitted +// separately as `gen_ai.system_instructions` per the spec. +// +// No-op when content capture is disabled or the span is nil. 
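// Illustrative test sketch, not part of this patch: the opt-in is re-read
// on every call, so t.Setenv is enough to flip capture within a single
// test. require is github.com/stretchr/testify/require.
func TestContentCaptureOptIn(t *testing.T) {
	t.Setenv(EnvCaptureMessageContent, "on")
	require.True(t, IsContentCaptureEnabled())
	t.Setenv(EnvCaptureMessageContent, "off")
	require.False(t, IsContentCaptureEnabled())
}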
+func SetInputMessages(span *ChatSpan, messages []chat.Message) { + if span == nil || !IsContentCaptureEnabled() { + return + } + + var systemInstructions []structuredMessage + var input []structuredMessage + for i := range messages { + msg := messageToStructured(&messages[i]) + if messages[i].Role == chat.MessageRoleSystem { + systemInstructions = append(systemInstructions, msg) + continue + } + input = append(input, msg) + } + + if len(systemInstructions) > 0 { + if encoded, err := json.Marshal(systemInstructions); err == nil { + span.SetAttributes(attribute.String(AttrSystemInstructions, string(encoded))) + } + } + if len(input) > 0 { + if encoded, err := json.Marshal(input); err == nil { + span.SetAttributes(attribute.String(AttrInputMessages, string(encoded))) + } + } +} + +// SetOutputMessages serialises the assembled response into +// `gen_ai.output.messages`. Use after streaming has completed and the +// final assistant message is known. +func SetOutputMessages(span *ChatSpan, content, reasoning string, toolCalls []tools.ToolCall) { + if span == nil || !IsContentCaptureEnabled() { + return + } + parts := []messagePart{} + if reasoning != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: reasoning}) + } + if content != "" { + parts = append(parts, messagePart{Type: "text", Content: content}) + } + for _, tc := range toolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if len(parts) == 0 { + return + } + out := []structuredMessage{{Role: "assistant", Parts: parts}} + if encoded, err := json.Marshal(out); err == nil { + span.SetAttributes(attribute.String(AttrOutputMessages, string(encoded))) + } +} + +// SetToolDefinitions serialises the tool definitions presented to the +// model into `gen_ai.tool.definitions`. +func SetToolDefinitions(span *ChatSpan, toolDefs []tools.Tool) { + if span == nil || !IsContentCaptureEnabled() || len(toolDefs) == 0 { + return + } + encoded, err := json.Marshal(toolDefs) + if err != nil { + return + } + span.SetAttributes(attribute.String(AttrToolDefinitions, string(encoded))) +} + +// messageToStructured converts a chat.Message to the spec-shaped +// structured message representation. Multi-content messages produce one +// part per content block; tool calls and tool results map to their +// respective part types. +func messageToStructured(m *chat.Message) structuredMessage { + role := string(m.Role) + parts := []messagePart{} + + switch { + case len(m.MultiContent) > 0: + for _, mc := range m.MultiContent { + switch mc.Type { + case chat.MessagePartTypeText: + if mc.Text != "" { + parts = append(parts, messagePart{Type: "text", Content: mc.Text}) + } + case chat.MessagePartTypeImageURL: + if mc.ImageURL != nil && mc.ImageURL.URL != "" { + parts = append(parts, messagePart{ + Type: "uri", + URI: mc.ImageURL.URL, + Modality: "image", + }) + } + case chat.MessagePartTypeFile: + if mc.File != nil { + p := messagePart{Type: "file", ID: mc.File.FileID} + if mc.File.MimeType != "" { + p.MimeType = mc.File.MimeType + } + parts = append(parts, p) + } + } + } + case m.ToolCallID != "": + // Tool result messages: the entire content is the tool's + // response payload, encoded as a single tool_call_response + // part. Skip the default text/reasoning branch so we don't + // also emit a duplicate `text` part with the same payload. 
+ default: + if m.ReasoningContent != "" { + parts = append(parts, messagePart{Type: "reasoning", Content: m.ReasoningContent}) + } + if m.Content != "" { + parts = append(parts, messagePart{Type: "text", Content: m.Content}) + } + } + + for _, tc := range m.ToolCalls { + parts = append(parts, messagePart{ + Type: "tool_call", + ID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + if m.ToolCallID != "" { + // Per the OTel GenAI semconv example schema, tool_call_response + // parts carry the payload in `result`, not `content` (which is + // reserved for `text`/`reasoning` parts). Spec-aware backends + // look for the `result` key when decoding tool responses. + parts = append(parts, messagePart{ + Type: "tool_call_response", + ID: m.ToolCallID, + Result: m.Content, + }) + } + + return structuredMessage{Role: role, Parts: parts} +} diff --git a/pkg/telemetry/genai/conversation.go b/pkg/telemetry/genai/conversation.go new file mode 100644 index 000000000..06b0edf4d --- /dev/null +++ b/pkg/telemetry/genai/conversation.go @@ -0,0 +1,52 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/baggage" +) + +// baggageKeyConversationID matches the GenAI semconv attribute key for +// the conversation identifier so the value flows transparently through +// the W3C `baggage` header alongside `traceparent`. Any downstream +// service or subprocess running OTel auto-instrumentation will pick it +// up without per-helper plumbing. +const baggageKeyConversationID = "gen_ai.conversation.id" + +// WithConversationID returns a context that carries the conversation id +// in OTel baggage. Spans created later in the chain — including ones in +// helper packages that have no direct access to the session — read it +// via ConversationIDFromContext and attach `gen_ai.conversation.id` +// automatically. Empty id is a no-op. +func WithConversationID(ctx context.Context, id string) context.Context { + if id == "" { + return ctx + } + member, err := baggage.NewMember(baggageKeyConversationID, id) + if err != nil { + return ctx + } + bag, err := baggage.FromContext(ctx).SetMember(member) + if err != nil { + return ctx + } + return baggage.ContextWithBaggage(ctx, bag) +} + +// ConversationIDFromContext returns the conversation id stored in the +// context's baggage, or "" when none has been seeded. +func ConversationIDFromContext(ctx context.Context) string { + return baggage.FromContext(ctx).Member(baggageKeyConversationID).Value() +} + +// conversationAttribute returns the gen_ai.conversation.id attribute +// from baggage when present, or zero-value KeyValue when absent. Helper +// for span starters so they can append it in one line. +func conversationAttribute(ctx context.Context) (attribute.KeyValue, bool) { + id := ConversationIDFromContext(ctx) + if id == "" { + return attribute.KeyValue{}, false + } + return attribute.String(AttrConversationID, id), true +} diff --git a/pkg/telemetry/genai/doc.go b/pkg/telemetry/genai/doc.go new file mode 100644 index 000000000..61bf90dd0 --- /dev/null +++ b/pkg/telemetry/genai/doc.go @@ -0,0 +1,15 @@ +// Package genai provides OpenTelemetry instrumentation helpers that follow +// the GenAI semantic conventions +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/). 
+// +// The package is structured so that callers — provider clients, the agent +// runtime, MCP clients — describe what they are doing in domain terms and +// the helpers produce the spec-conformant spans, metrics, and log records. +// Centralising the OTel surface here lets us upgrade the semantic +// conventions in one place and keeps the call sites compact. +// +// All gen_ai.* attributes are Development stability per the spec. Attribute +// keys are declared as constants in this package rather than imported from +// go.opentelemetry.io/otel/semconv to insulate callers from the upstream +// reorganisations the GenAI conventions are still going through. +package genai diff --git a/pkg/telemetry/genai/embedding.go b/pkg/telemetry/genai/embedding.go new file mode 100644 index 000000000..a83ad752e --- /dev/null +++ b/pkg/telemetry/genai/embedding.go @@ -0,0 +1,176 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// EmbeddingRequest carries the inputs needed to start an +// `embeddings {model}` span per the OTel GenAI semantic conventions. +type EmbeddingRequest struct { + Provider string + Model string + // BatchSize is the number of input texts in the embedding call, + // recorded as `cagent.embeddings.batch_size`. Zero means + // single-input. + BatchSize int + // EncodingFormats is the optional list of requested output + // encodings (e.g. "float", "base64") per the GenAI semconv. + // Recorded as `gen_ai.request.encoding_formats` when non-empty. + EncodingFormats []string +} + +// EmbeddingSpan handles the lifecycle of an embedding span and the +// matching `gen_ai.client.operation.duration` / `gen_ai.client.token.usage` +// metric records. +type EmbeddingSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + inputTokens int64 + dimensions int + errType string +} + +// StartEmbedding begins a CLIENT-kind `embeddings {model}` span and +// records the spec-required `gen_ai.operation.name=embeddings`, +// `gen_ai.provider.name`, and `gen_ai.request.model` attributes. 
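+//
+// Sketch of the expected call pattern (the provider call and response
+// fields are illustrative, not part of this package):
+//
+//	ctx, span := genai.StartEmbedding(ctx, genai.EmbeddingRequest{
+//		Provider:  genai.ProviderOpenAI,
+//		Model:     "text-embedding-3-small",
+//		BatchSize: len(inputs),
+//	})
+//	defer span.End()
+//	resp, err := client.Embed(ctx, inputs)
+//	if err != nil {
+//		span.RecordError(err, "")
+//		return nil, err
+//	}
+//	span.SetInputTokens(resp.Usage.InputTokens)
+//	span.SetDimensions(len(resp.Vectors[0]))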
+func StartEmbedding(ctx context.Context, req EmbeddingRequest) (context.Context, *EmbeddingSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationEmbeddings + if req.Model != "" { + name = OperationEmbeddings + " " + req.Model + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, req.Provider), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.BatchSize > 1 { + attrs = append(attrs, attribute.Int("cagent.embeddings.batch_size", req.BatchSize)) + } + if len(req.EncodingFormats) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestEncodingFormats, req.EncodingFormats)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + return ctx, &EmbeddingSpan{ + span: span, + provider: req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetInputTokens records the number of input tokens consumed by the +// embedding call. Emitted as `gen_ai.usage.input_tokens` on the span +// and as the `gen_ai.client.token.usage` metric at End time. +func (s *EmbeddingSpan) SetInputTokens(n int64) { + if s == nil { + return + } + s.mu.Lock() + s.inputTokens = n + s.mu.Unlock() +} + +// SetDimensions records the dimensionality of the resulting embedding +// vector(s). Emitted as `gen_ai.embeddings.dimension.count`. +func (s *EmbeddingSpan) SetDimensions(d int) { + if s == nil { + return + } + s.mu.Lock() + s.dimensions = d + s.mu.Unlock() +} + +// RecordError marks the span as failed and stores `error.type` for the +// duration metric. +func (s *EmbeddingSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the duration + token-usage metrics. +func (s *EmbeddingSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + inputTokens := s.inputTokens + dimensions := s.dimensions + errType := s.errType + s.mu.Unlock() + + if inputTokens > 0 { + s.span.SetAttributes(attribute.Int64(AttrUsageInputTokens, inputTokens)) + } + if dimensions > 0 { + s.span.SetAttributes(attribute.Int(AttrEmbeddingsDimensionCount, dimensions)) + } + s.span.End() + + insts := getInstruments() + if insts == nil { + return + } + commonAttrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationEmbeddings), + attribute.String(AttrProviderName, s.provider), + } + if s.model != "" { + commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model)) + } + durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + if errType != "" { + durationAttrs = append(durationAttrs, attribute.String("error.type", errType)) + } + if insts.clientOperationDuration != nil { + insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(durationAttrs...), + ) + } + if inputTokens > 0 && insts.clientTokenUsage != nil { + tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...) 
+ tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, TokenTypeInput)) + insts.clientTokenUsage.Record(s.metricCtx, inputTokens, + metric.WithAttributes(tokenAttrs...), + ) + } +} diff --git a/pkg/telemetry/genai/errors.go b/pkg/telemetry/genai/errors.go new file mode 100644 index 000000000..8d1f7db18 --- /dev/null +++ b/pkg/telemetry/genai/errors.go @@ -0,0 +1,85 @@ +package genai + +import ( + "context" + "errors" + "net" + "strings" + + "go.opentelemetry.io/otel/attribute" +) + +// ErrorTypeOther is the OTel-mandated fallback for `error.type` when no +// classifier matches. The spec requires `_OTHER` rather than a Go type +// name so backends can rely on a bounded cardinality. +const ErrorTypeOther = "_OTHER" + +// ClassifyError maps a provider error to a low-cardinality `error.type` +// value suitable for span and metric attributes. Falls back to +// `_OTHER` (the spec-defined sentinel) when the error does not match any +// known pattern. +// +// Spec leaves the value space open for callers — these strings are picked +// for cross-provider comparability on dashboards. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "context length") || strings.Contains(msg, "context_length"): + // Bare "max_tokens" matches too eagerly: validation errors + // like `max_tokens must be > 0` and "model X does not + // support max_tokens" both contain the token but are not + // context overflows. Stick to the unambiguous phrases. + return "context_length_exceeded" + case strings.Contains(msg, "rate limit") || strings.Contains(msg, "429"): + return "rate_limit" + case strings.Contains(msg, "401") || strings.Contains(msg, "unauthorized") || strings.Contains(msg, "authentication"): + return "auth" + case strings.Contains(msg, "403") || strings.Contains(msg, "forbidden") || strings.Contains(msg, "permission"): + return "forbidden" + case strings.Contains(msg, "content policy") || strings.Contains(msg, "content filter") || strings.Contains(msg, "safety"): + return "content_policy" + } + + var netErr net.Error + if errors.As(err, &netErr) { + if netErr.Timeout() { + return "network_timeout" + } + return "network" + } + + return ErrorTypeOther +} + +// applyExtraAttribute converts a StreamAttributer KeyValue into an OTel +// attribute and applies it to the span. Unsupported value types are +// dropped silently — telemetry must never crash request paths. 
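+//
+// For example, a provider stream returning
+// KeyValue{Key: "openai.response.service_tier", Value: "default"} (key
+// purely illustrative) lands on the chat span as a string attribute,
+// while a KeyValue carrying an unsupported type — say, a struct — is
+// silently skipped.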
+func applyExtraAttribute(span *ChatSpan, kv KeyValue) { + if span == nil || kv.Key == "" { + return + } + switch v := kv.Value.(type) { + case string: + span.SetAttributes(attribute.String(kv.Key, v)) + case bool: + span.SetAttributes(attribute.Bool(kv.Key, v)) + case int: + span.SetAttributes(attribute.Int(kv.Key, v)) + case int64: + span.SetAttributes(attribute.Int64(kv.Key, v)) + case float64: + span.SetAttributes(attribute.Float64(kv.Key, v)) + case []string: + span.SetAttributes(attribute.StringSlice(kv.Key, v)) + } +} diff --git a/pkg/telemetry/genai/evaluation.go b/pkg/telemetry/genai/evaluation.go new file mode 100644 index 000000000..4d1673efa --- /dev/null +++ b/pkg/telemetry/genai/evaluation.go @@ -0,0 +1,64 @@ +package genai + +import ( + "context" + + "go.opentelemetry.io/otel/log" + "go.opentelemetry.io/otel/log/global" +) + +// EvaluationResult describes one evaluation outcome that should be emitted +// as a `gen_ai.evaluation.result` log record per the OTel GenAI semconv +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/). +type EvaluationResult struct { + // Name is the evaluation metric — e.g. "relevance", "factuality", + // "tool_calls_f1". Required. + Name string + + // ScoreLabel is the human-readable verdict — e.g. "passed", + // "failed", "satisfactory". Optional but commonly set. + ScoreLabel string + + // ScoreValue is the numeric score (commonly 0.0–1.0). Optional. + ScoreValue float64 + HasScoreValue bool + + // Explanation is a free-form reason for the score. Optional. + Explanation string + + // ErrorType is set when the evaluation itself failed (e.g. the + // judge model errored out). Mirrors the spec's `error.type` field. + ErrorType string +} + +// EmitEvaluationResult emits a `gen_ai.evaluation.result` log record. The +// record links to the active span via the supplied context so dashboards +// can join evaluation outcomes back onto the operation that produced +// them. No-op when no logger provider is configured. 
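+//
+// Minimal sketch of a caller-side emission (metric name and score values
+// are illustrative):
+//
+//	genai.EmitEvaluationResult(ctx, genai.EvaluationResult{
+//		Name:          "relevance",
+//		ScoreLabel:    "passed",
+//		ScoreValue:    0.92,
+//		HasScoreValue: true,
+//		Explanation:   "answer cites the retrieved document",
+//	})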
+func EmitEvaluationResult(ctx context.Context, result EvaluationResult) { + logger := global.GetLoggerProvider().Logger(instrumentationName) + + var rec log.Record + rec.SetEventName("gen_ai.evaluation.result") + rec.SetSeverity(log.SeverityInfo) + rec.SetSeverityText("INFO") + + rec.AddAttributes(log.String(AttrEvaluationName, result.Name)) + if result.ScoreLabel != "" { + rec.AddAttributes(log.String(AttrEvaluationScoreLabel, result.ScoreLabel)) + } + if result.HasScoreValue { + rec.AddAttributes(log.Float64(AttrEvaluationScoreValue, result.ScoreValue)) + } + if result.Explanation != "" { + rec.AddAttributes(log.String(AttrEvaluationExplanation, result.Explanation)) + } + if result.ErrorType != "" { + rec.AddAttributes(log.String("error.type", result.ErrorType)) + } + if convID := ConversationIDFromContext(ctx); convID != "" { + rec.AddAttributes(log.String(AttrConversationID, convID)) + } + + logger.Emit(ctx, rec) +} diff --git a/pkg/telemetry/genai/genai_test.go b/pkg/telemetry/genai/genai_test.go new file mode 100644 index 000000000..692d41212 --- /dev/null +++ b/pkg/telemetry/genai/genai_test.go @@ -0,0 +1,156 @@ +package genai + +import ( + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/docker/docker-agent/pkg/chat" +) + +func TestProviderNameForConfig(t *testing.T) { + t.Parallel() + tests := []struct { + in string + want string + }{ + {"openai", ProviderOpenAI}, + {"openai_chatcompletions", ProviderOpenAI}, + {"openai_responses", ProviderOpenAI}, + {"anthropic", ProviderAnthropic}, + {"amazon-bedrock", ProviderAWSBedrock}, + {"google", ProviderGCPGenAI}, + {"vertexai", ProviderGCPVertexAI}, + {"azure", ProviderAzureAI}, + {"dmr", ProviderDMR}, + {"custom-provider", "custom-provider"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ProviderNameForConfig(tt.in)) + }) + } +} + +func TestClassifyError(t *testing.T) { + t.Parallel() + tests := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"context canceled", context.Canceled, "context_canceled"}, + {"context deadline", context.DeadlineExceeded, "deadline_exceeded"}, + {"rate limit", errors.New("HTTP 429 Too Many Requests"), "rate_limit"}, + {"context length", errors.New("context_length_exceeded: prompt too large"), "context_length_exceeded"}, + {"unauthorized", errors.New("HTTP 401 Unauthorized"), "auth"}, + {"forbidden", errors.New("HTTP 403 Forbidden"), "forbidden"}, + {"content policy", errors.New("response blocked by content filter"), "content_policy"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ClassifyError(tt.err)) + }) + } +} + +// fakeStream produces a fixed sequence of chunks then EOF. 
+type fakeStream struct { + chunks []chat.MessageStreamResponse + idx int + closed bool +} + +func (f *fakeStream) Recv() (chat.MessageStreamResponse, error) { + if f.idx >= len(f.chunks) { + return chat.MessageStreamResponse{}, io.EOF + } + r := f.chunks[f.idx] + f.idx++ + return r, nil +} + +func (f *fakeStream) Close() { f.closed = true } + +func TestStartChatAndWrapStream(t *testing.T) { + t.Parallel() + + stream := &fakeStream{ + chunks: []chat.MessageStreamResponse{ + { + ID: "resp-1", + Model: "claude-sonnet-4", + Choices: []chat.MessageStreamChoice{ + {Delta: chat.MessageDelta{Content: "hello"}}, + }, + }, + { + Choices: []chat.MessageStreamChoice{ + {FinishReason: chat.FinishReasonStop}, + }, + Usage: &chat.Usage{ + InputTokens: 100, + OutputTokens: 50, + CachedInputTokens: 20, + CacheWriteTokens: 10, + }, + }, + }, + } + + ctx, span := StartChat(t.Context(), ChatRequest{ + Provider: ProviderAnthropic, + Model: "claude-sonnet-4", + Stream: true, + MaxTokens: 4096, + }) + require.NotNil(t, span) + require.NotNil(t, ctx) + + wrapped := WrapStream(span, stream) + + // Drain the stream. + for { + resp, err := wrapped.Recv() + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + _ = resp + } + wrapped.Close() + assert.True(t, stream.closed) + + // Re-closing should be a no-op (the wrapper guards against + // double-Close, which would otherwise emit two End() calls). + wrapped.Close() +} + +func TestWrapStreamNilSpanReturnsOriginal(t *testing.T) { + t.Parallel() + s := &fakeStream{} + got := WrapStream(nil, s) + assert.Same(t, s, got) +} + +func TestServerAddressFromURL(t *testing.T) { + t.Parallel() + host, port := ServerAddressFromURL("https://api.anthropic.com:443/v1/messages") + assert.Equal(t, "api.anthropic.com", host) + assert.Equal(t, 443, port) + + host, port = ServerAddressFromURL("https://api.openai.com/v1/chat/completions") + assert.Equal(t, "api.openai.com", host) + assert.Equal(t, 0, port) + + host, port = ServerAddressFromURL("") + assert.Empty(t, host) + assert.Equal(t, 0, port) +} diff --git a/pkg/telemetry/genai/metrics.go b/pkg/telemetry/genai/metrics.go new file mode 100644 index 000000000..01f8d90f8 --- /dev/null +++ b/pkg/telemetry/genai/metrics.go @@ -0,0 +1,80 @@ +package genai + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// instrumentationName identifies this package as the OTel instrumentation +// scope for spans, metrics, and log records it produces. +const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/genai" + +// metricBucketsDuration matches the spec for `gen_ai.client.operation.duration` +// (and related per-chunk timing histograms). +var metricBucketsDuration = []float64{ + 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92, +} + +// metricBucketsTokenUsage matches the spec for `gen_ai.client.token.usage`. +var metricBucketsTokenUsage = []float64{ + 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, +} + +// instruments holds the lazily-initialised metric instruments. Resolved on +// first use because the global MeterProvider is set at SDK init time, which +// may run after package-level var initialisation in some contexts. 
+type instruments struct { + clientOperationDuration metric.Float64Histogram + clientOperationTTFC metric.Float64Histogram + clientOperationTimePerChunk metric.Float64Histogram + clientTokenUsage metric.Int64Histogram +} + +var ( + instOnce sync.Once + inst *instruments +) + +// getInstruments resolves and caches the package-level meter instruments. +// Histogram creation rarely fails in practice; when one does we keep the +// instruments that did succeed and leave the failed one nil. Call sites +// already nil-check each instrument, so a partial set is functional — +// previously a single early return left every metric permanently +// disabled, which surprised production debugging when one bucket +// configuration tripped a registration error. +func getInstruments() *instruments { + instOnce.Do(func() { + meter := otel.Meter(instrumentationName) + i := &instruments{} + + i.clientOperationDuration, _ = meter.Float64Histogram( + "gen_ai.client.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("GenAI operation duration."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTTFC, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_to_first_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time to receive the first chunk of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientOperationTimePerChunk, _ = meter.Float64Histogram( + "gen_ai.client.operation.time_per_output_chunk", + metric.WithUnit("s"), + metric.WithDescription("Time between consecutive output chunks of a streaming GenAI response."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + i.clientTokenUsage, _ = meter.Int64Histogram( + "gen_ai.client.token.usage", + metric.WithUnit("{token}"), + metric.WithDescription("Number of tokens used in a GenAI client request, broken down by token type."), + metric.WithExplicitBucketBoundaries(metricBucketsTokenUsage...), + ) + + inst = i + }) + return inst +} diff --git a/pkg/telemetry/genai/provider_names.go b/pkg/telemetry/genai/provider_names.go new file mode 100644 index 000000000..8076583f4 --- /dev/null +++ b/pkg/telemetry/genai/provider_names.go @@ -0,0 +1,28 @@ +package genai + +// ProviderNameForConfig maps the project's internal provider type strings +// (the values used in agent YAML and resolved by +// pkg/model/provider.resolveProviderType) to the GenAI semconv provider +// names defined in the per-provider semantic conventions. Unknown +// providers fall through unchanged so dashboards still receive a value +// rather than empty string. 
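+//
+// For example, "openai_responses" and "openai_chatcompletions" both map
+// to ProviderOpenAI, while an unrecognised value such as "my-gateway"
+// is returned unchanged.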
+func ProviderNameForConfig(internalName string) string { + switch internalName { + case "openai", "openai_chatcompletions", "openai_responses": + return ProviderOpenAI + case "anthropic": + return ProviderAnthropic + case "amazon-bedrock": + return ProviderAWSBedrock + case "google": + return ProviderGCPGenAI + case "vertexai", "google-vertex": + return ProviderGCPVertexAI + case "azure", "azure-openai": + return ProviderAzureAI + case "dmr": + return ProviderDMR + default: + return internalName + } +} diff --git a/pkg/telemetry/genai/runtime.go b/pkg/telemetry/genai/runtime.go new file mode 100644 index 000000000..628b21c1c --- /dev/null +++ b/pkg/telemetry/genai/runtime.go @@ -0,0 +1,367 @@ +package genai + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// Custom (non-spec) attribute keys for runtime-side observability that has +// no GenAI semconv equivalent yet (fallback chain, response cache, +// approval pipeline). Kept under the `cagent.` namespace so they are +// clearly distinguishable from the spec-defined `gen_ai.*` and `mcp.*` +// attributes when scrolling through a span. +const ( + AttrFallbackPrimaryModel = "cagent.fallback.primary_model" + AttrFallbackFinalModel = "cagent.fallback.final_model" + AttrFallbackAttempts = "cagent.fallback.attempts" + AttrFallbackOutcome = "cagent.fallback.outcome" + AttrFallbackInCooldown = "cagent.fallback.in_cooldown" + + AttrCacheHit = "cagent.cache.hit" + AttrCacheBacking = "cagent.cache.backing" + + AttrAgentNameRuntime = "cagent.agent.name" + + AttrRetrievalResultCount = "cagent.retrieval.result_count" + + AttrSandboxRuntime = "cagent.sandbox.runtime" + AttrSandboxImage = "cagent.sandbox.image" + AttrSandboxContainer = "cagent.sandbox.container" + AttrSandboxExitCode = "cagent.sandbox.exit_code" +) + +// FallbackOutcome values for AttrFallbackOutcome. +const ( + FallbackOutcomeSuccess = "success" + FallbackOutcomeFailed = "failed" + FallbackOutcomeContextCanceled = "context_canceled" +) + +// FallbackSpan is the handle for an in-flight runtime.fallback span. +type FallbackSpan struct { + span trace.Span + startedAt time.Time + + mu sync.Mutex + attempts int + final string + outcome string + errType string + ended bool +} + +// StartFallback begins a runtime.fallback span covering the whole fallback +// chain for one agent turn. Each per-model attempt produces its own +// `chat {model}` CLIENT child span (created by the provider decorator). +// Attributes set up front: primary model name, agent name, in-cooldown +// flag. The caller updates final model / attempts / outcome through the +// returned handle and calls End to flush. 
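+//
+// Sketch of the runtime-side loop (the candidate iteration and provider
+// call are illustrative):
+//
+//	ctx, fb := genai.StartFallback(ctx, agentName, primary, inCooldown)
+//	defer fb.End()
+//	for _, m := range candidates {
+//		fb.IncrementAttempt()
+//		resp, err := callModel(ctx, m)
+//		if err != nil {
+//			fb.RecordError(err, "")
+//			continue
+//		}
+//		fb.SetFinalModel(m)
+//		fb.SetOutcome(genai.FallbackOutcomeSuccess)
+//		return resp, nil
+//	}
+//	fb.SetOutcome(genai.FallbackOutcomeFailed)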
+func StartFallback(ctx context.Context, agentName, primaryModel string, inCooldown bool) (context.Context, *FallbackSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{ + attribute.String(AttrAgentNameRuntime, agentName), + attribute.Bool(AttrFallbackInCooldown, inCooldown), + } + if primaryModel != "" { + attrs = append(attrs, attribute.String(AttrFallbackPrimaryModel, primaryModel)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "runtime.fallback", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &FallbackSpan{ + span: span, + startedAt: time.Now(), + } +} + +// IncrementAttempt counts one attempt against the chain. Called once per +// (model × retry) iteration so the final span carries the total count. +func (s *FallbackSpan) IncrementAttempt() { + if s == nil { + return + } + s.mu.Lock() + s.attempts++ + s.mu.Unlock() +} + +// SetFinalModel records the model that ultimately served the response. +// Called on the success path; not called on full-failure paths so the +// attribute remains absent and dashboards can distinguish the cases. +func (s *FallbackSpan) SetFinalModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.final = model + s.mu.Unlock() +} + +// RecordError stores an error and an error.type label for the metric. +func (s *FallbackSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// SetOutcome records the terminal outcome of the chain. Use one of the +// FallbackOutcome* constants. +func (s *FallbackSpan) SetOutcome(outcome string) { + if s == nil || outcome == "" { + return + } + s.mu.Lock() + s.outcome = outcome + s.mu.Unlock() +} + +// End closes the span and flushes accumulated attributes. +func (s *FallbackSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + final := s.final + outcome := s.outcome + attempts := s.attempts + s.mu.Unlock() + + if final != "" { + s.span.SetAttributes(attribute.String(AttrFallbackFinalModel, final)) + } + if outcome != "" { + s.span.SetAttributes(attribute.String(AttrFallbackOutcome, outcome)) + } + s.span.SetAttributes(attribute.Int(AttrFallbackAttempts, attempts)) + s.span.End() +} + +// RetrievalSpan handles a retrieval-operation span lifecycle. +type RetrievalSpan struct { + span trace.Span + startedAt time.Time + + mu sync.Mutex + resultCount int + errType string + ended bool +} + +// StartRetrieval begins a `retrieval {data_source.id}` span per the OTel +// GenAI semconv. providerName identifies the retrieval backend +// ("sqlite", "rag", an embedding-provider name) and is Required by the +// spec for retrieval operations. dataSourceID identifies the corpus / +// index / collection being queried; queryText is captured only when +// the caller has confirmed the content-capture opt-in. 
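+//
+// Sketch of a retrieval call site (the data-source id and store API are
+// illustrative):
+//
+//	ctx, rs := genai.StartRetrieval(ctx, "sqlite", "memories",
+//		genai.IsContentCaptureEnabled(), query)
+//	defer rs.End()
+//	docs, err := store.Search(ctx, query)
+//	if err != nil {
+//		rs.RecordError(err, "")
+//		return nil, err
+//	}
+//	rs.SetResultCount(len(docs))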
+func StartRetrieval(ctx context.Context, providerName, dataSourceID string, captureQuery bool, queryText string) (context.Context, *RetrievalSpan) { + tracer := otel.Tracer(instrumentationName) + name := OperationRetrieval + if dataSourceID != "" { + name = OperationRetrieval + " " + dataSourceID + } + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationRetrieval), + } + if providerName != "" { + attrs = append(attrs, attribute.String(AttrProviderName, providerName)) + } + if dataSourceID != "" { + attrs = append(attrs, attribute.String(AttrDataSourceID, dataSourceID)) + } + if captureQuery && queryText != "" { + attrs = append(attrs, attribute.String(AttrRetrievalQueryText, queryText)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &RetrievalSpan{span: span, startedAt: time.Now()} +} + +// SetAttributes adds extra attributes to the retrieval span. Use for +// retrieval-specific extensions (corpus filter, category, fusion mode, +// etc.) that don't have a dedicated setter. +func (s *RetrievalSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResultCount records how many documents the retrieval returned. +func (s *RetrievalSpan) SetResultCount(n int) { + if s == nil { + return + } + s.mu.Lock() + s.resultCount = n + s.mu.Unlock() +} + +// RecordError marks the retrieval span as failed. +func (s *RetrievalSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the retrieval span and flushes the result count. +func (s *RetrievalSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + count := s.resultCount + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrRetrievalResultCount, count)) + s.span.End() +} + +// CacheRequest counter — records every cache lookup with `result=hit|miss` +// and a `backing` attribute for memory-only vs file-backed caches. +var ( + cacheCounterOnce sync.Once + cacheCounter metric.Int64Counter +) + +func getCacheCounter() metric.Int64Counter { + cacheCounterOnce.Do(func() { + meter := otel.Meter(instrumentationName) + c, err := meter.Int64Counter( + "cagent.cache.requests", + metric.WithUnit("{request}"), + metric.WithDescription("Number of response-cache lookups, broken down by hit/miss."), + ) + if err != nil { + return + } + cacheCounter = c + }) + return cacheCounter +} + +// RecordCacheLookup increments the cache counter and returns a small span +// describing the lookup. Callers `defer span.End()` and the helper sets +// `cagent.cache.hit` from the value returned by SetHit. +func RecordCacheLookup(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.lookup", "lookup", backing) +} + +// RecordCacheStore is the Store-side counterpart of RecordCacheLookup. 
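+//
+// The lookup side is used as follows (backing label and cache API are
+// illustrative); the store side is identical minus the SetHit call:
+//
+//	ctx, cs := genai.RecordCacheLookup(ctx, "file")
+//	defer cs.End()
+//	entry, ok := cache.Get(key)
+//	cs.SetHit(ok)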
+func RecordCacheStore(ctx context.Context, backing string) (context.Context, *CacheSpan) { + return startCacheSpan(ctx, "cache.store", "store", backing) +} + +func startCacheSpan(ctx context.Context, spanName, op, backing string) (context.Context, *CacheSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if backing != "" { + attrs = append(attrs, attribute.String(AttrCacheBacking, backing)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, spanName, + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &CacheSpan{span: span, metricCtx: ctx, backing: backing, op: op} +} + +// CacheSpan handles cache-operation span lifecycle. +type CacheSpan struct { + span trace.Span + // metricCtx carries the active span context so counter Add calls + // produce span-context exemplars (drill Mimir bucket → Tempo + // trace). Without this the counter measurement gets only the + // resource attributes. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + backing string + op string + + mu sync.Mutex + hit bool + set bool +} + +// SetHit records whether the lookup found an entry. Increments the +// cache counter immediately so the metric reflects the result even if End +// is called late. +func (s *CacheSpan) SetHit(hit bool) { + if s == nil { + return + } + s.mu.Lock() + s.hit = hit + s.set = true + s.mu.Unlock() + s.span.SetAttributes(attribute.Bool(AttrCacheHit, hit)) + + if c := getCacheCounter(); c != nil { + result := "miss" + if hit { + result = "hit" + } + attrs := []attribute.KeyValue{ + attribute.String("result", result), + attribute.String("operation", s.op), + } + if s.backing != "" { + attrs = append(attrs, attribute.String(AttrCacheBacking, s.backing)) + } + // Use the active context so the counter measurement carries + // the span exemplar — drill from Mimir bucket → Tempo trace + // works for cache operations the same way it does for chat. + c.Add(s.metricCtx, 1, metric.WithAttributes(attrs...)) + } +} + +// End closes the cache span. +func (s *CacheSpan) End() { + if s == nil { + return + } + s.span.End() +} diff --git a/pkg/telemetry/genai/sandbox.go b/pkg/telemetry/genai/sandbox.go new file mode 100644 index 000000000..4b97d7fc0 --- /dev/null +++ b/pkg/telemetry/genai/sandbox.go @@ -0,0 +1,231 @@ +package genai + +import ( + "context" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" +) + +// envCarrier adapts an env-var key/value map to OTel's TextMapCarrier so +// the configured propagator can write traceparent / tracestate / baggage +// into a subprocess's environment. Keys are uppercased on Set to match +// the convention subprocess-propagation tools (otel-cli, OTel SDKs) +// expect. 
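+//
+// With the composite TraceContext+Baggage propagator configured at SDK
+// init, the injected keys come out as TRACEPARENT, plus TRACESTATE and
+// BAGGAGE when those values are non-empty.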
+type envCarrier map[string]string + +func (c envCarrier) Get(key string) string { return c[strings.ToUpper(key)] } +func (c envCarrier) Set(key, value string) { c[strings.ToUpper(key)] = value } +func (c envCarrier) Keys() []string { + keys := make([]string, 0, len(c)) + for k := range c { + keys = append(keys, k) + } + return keys +} + +var _ propagation.TextMapCarrier = envCarrier{} + +// InjectSandboxEnv returns docker-style `-e KEY=VALUE` flags carrying the +// W3C trace context for the current span so the agent process spawned +// inside a sandbox container inherits the parent trace. Anything OTel- +// aware running in the container — another agent, an HTTP client with +// otelhttp transport, otel-cli — auto-parents its spans onto the active +// CLIENT span on the host side. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. +func InjectSandboxEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + flags := make([]string, 0, 2*len(carrier)) + for k, v := range carrier { + flags = append(flags, "-e", k+"="+v) + } + return flags +} + +// InjectTraceContextEnv returns `KEY=VALUE` env-var strings carrying the +// W3C trace context for the current span. Use to extend `exec.Cmd.Env` +// for direct subprocess spawns (hook scripts, LSP servers) so OTel-aware +// children chain onto the active span. Companion to `InjectSandboxEnv`, +// which formats for `docker -e`. +// +// Returns nil when no propagator is configured or when the active context +// has no span context to inject. +func InjectTraceContextEnv(ctx context.Context) []string { + carrier := envCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + if len(carrier) == 0 { + return nil + } + out := make([]string, 0, len(carrier)) + for k, v := range carrier { + out = append(out, k+"="+v) + } + return out +} + +// SandboxSpan handles the lifecycle of a sandbox.exec span and the +// matching sandbox.exec.duration histogram. Use to wrap the actual +// `docker sandbox exec` (or equivalent) subprocess invocation so the +// host side has timing, exit code, runtime kind, and image information +// alongside the inherited child trace from inside the sandbox. +type SandboxSpan struct { + span trace.Span + // metricCtx carries the active span context so histogram Record + // calls produce span-context exemplars (drill Mimir → Tempo). + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + startedAt time.Time + runtime string + + mu sync.Mutex + exitCode int + hasExit bool + errType string + ended bool +} + +// SandboxOptions configures the attributes set on a sandbox.exec span at +// creation time. All fields are optional except Runtime. +type SandboxOptions struct { + // Runtime is a short label identifying the sandbox backend (e.g. + // `"docker"`). Recorded as `cagent.sandbox.runtime` and used as a + // histogram label, so callers should keep the set of values small + // and stable. + Runtime string + + // Image is the container/pod image when known. + Image string + + // Container is the container/pod identifier when known. + Container string + + // AgentName is the agent being executed in the sandbox. + AgentName string +} + +// StartSandboxExec opens a `sandbox.exec` INTERNAL span. Runtime kind is +// set up front; exit code and error info attach via the returned handle. 
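+//
+// Sketch of wrapping a sandbox invocation (command assembly and exit-code
+// extraction are illustrative):
+//
+//	ctx, ss := genai.StartSandboxExec(ctx, genai.SandboxOptions{
+//		Runtime: "docker",
+//		Image:   image,
+//	})
+//	defer ss.End()
+//	args := append([]string{"sandbox", "exec"}, genai.InjectSandboxEnv(ctx)...)
+//	err := runDocker(ctx, args...)
+//	ss.SetExitCode(exitCodeOf(err))
+//	if err != nil {
+//		ss.RecordError(err, "")
+//	}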
+func StartSandboxExec(ctx context.Context, opts SandboxOptions) (context.Context, *SandboxSpan) { + tracer := otel.Tracer(instrumentationName) + attrs := []attribute.KeyValue{} + if opts.Runtime != "" { + attrs = append(attrs, attribute.String(AttrSandboxRuntime, opts.Runtime)) + } + if opts.Image != "" { + attrs = append(attrs, attribute.String(AttrSandboxImage, opts.Image)) + } + if opts.Container != "" { + attrs = append(attrs, attribute.String(AttrSandboxContainer, opts.Container)) + } + if opts.AgentName != "" { + attrs = append(attrs, attribute.String(AttrAgentNameRuntime, opts.AgentName)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + ctx, span := tracer.Start(ctx, "sandbox.exec", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attrs...), + ) + return ctx, &SandboxSpan{span: span, metricCtx: ctx, startedAt: time.Now(), runtime: opts.Runtime} +} + +// SetExitCode records the subprocess exit code as +// `cagent.sandbox.exit_code`. Set zero on success. +func (s *SandboxSpan) SetExitCode(code int) { + if s == nil { + return + } + s.mu.Lock() + s.exitCode = code + s.hasExit = true + s.mu.Unlock() + s.span.SetAttributes(attribute.Int(AttrSandboxExitCode, code)) +} + +// RecordError marks the span as failed. +func (s *SandboxSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the sandbox.exec.duration histogram. +func (s *SandboxSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + errType := s.errType + s.mu.Unlock() + + s.span.End() + + hist := getSandboxDurationHistogram() + if hist == nil { + return + } + attrs := []attribute.KeyValue{} + if s.runtime != "" { + // Partitions the histogram by sandbox backend so dashboards + // can compare exec latency across runtimes when more than + // one is wired up. + attrs = append(attrs, attribute.String(AttrSandboxRuntime, s.runtime)) + } + if errType != "" { + attrs = append(attrs, attribute.String("error.type", errType)) + } + // Use the active context so the histogram measurement carries the + // span exemplar — drill from Mimir bucket → Tempo trace. 
+ hist.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(attrs...), + ) +} + +var ( + sandboxDurationOnce sync.Once + sandboxDurationHist metric.Float64Histogram +) + +func getSandboxDurationHistogram() metric.Float64Histogram { + sandboxDurationOnce.Do(func() { + meter := otel.Meter(instrumentationName) + h, err := meter.Float64Histogram( + "cagent.sandbox.exec.duration", + metric.WithUnit("s"), + metric.WithDescription("Time the host side spent waiting for a sandbox exec invocation to complete."), + metric.WithExplicitBucketBoundaries(metricBucketsDuration...), + ) + if err != nil { + return + } + sandboxDurationHist = h + }) + return sandboxDurationHist +} diff --git a/pkg/telemetry/genai/span.go b/pkg/telemetry/genai/span.go new file mode 100644 index 000000000..9b0542973 --- /dev/null +++ b/pkg/telemetry/genai/span.go @@ -0,0 +1,418 @@ +package genai + +import ( + "context" + "net/url" + "slices" + "strconv" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" + tracenoop "go.opentelemetry.io/otel/trace/noop" +) + +// ChatRequest carries the inputs needed to start a `chat {model}` span and +// to record the matching client metrics. Provider-specific extensions +// (openai service tier, aws.bedrock guardrail, etc.) attach via +// ChatSpan.SetAttributes after the span has started. +type ChatRequest struct { + // Provider is the GenAI provider name. Use one of the Provider* + // constants. Set on the span at creation time per the per-provider + // semconv MUST clauses. + Provider string + + // Model is the requested model identifier. Empty model is allowed + // (some routers do not commit until inside the call) but produces a + // span name of just "chat". + Model string + + // Stream is true if the request is streaming. Recorded as + // gen_ai.request.stream. + Stream bool + + // ServerAddress / ServerPort identify the GenAI endpoint when known + // (helpful for routing-aware dashboards). Optional. + ServerAddress string + ServerPort int + + // Sampling parameters. Zero values are treated as unset and not + // recorded on the span. + MaxTokens int + Temperature float64 + TopP float64 + TopK float64 + FrequencyPenalty float64 + PresencePenalty float64 + Seed int + StopSequences []string + ChoiceCount int + + // HasTemperature / HasTopP / HasTopK / HasFreqPenalty / HasPresPenalty + // disambiguate "explicitly zero" from "unset" for the float params. + // Callers that use the zero value as meaningful must set these. + HasTemperature bool + HasTopP bool + HasTopK bool + HasFreqPenalty bool + HasPresPenalty bool +} + +// ServerAddressFromURL extracts host and port for the ServerAddress / +// ServerPort fields when callers have a full URL handy. +func ServerAddressFromURL(raw string) (string, int) { + if raw == "" { + return "", 0 + } + u, err := url.Parse(raw) + if err != nil || u.Host == "" { + return "", 0 + } + port, _ := strconv.Atoi(u.Port()) + return u.Hostname(), port +} + +// ChatSpan is the handle returned by StartChat. It wraps an OTel span and +// captures enough state to emit per-operation metrics on End. +type ChatSpan struct { + span trace.Span + provider string + model string + startedAt time.Time + // metricCtx carries the request context captured at StartChat + // time so metric Record / Add calls in End preserve the + // trace-to-metric exemplar link. 
Using context.Background() here + // would silently strip the active span context and break + // drill-from-metric-bucket-to-trace navigation in Tempo/Mimir. + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + + mu sync.Mutex + ended bool + responseModel string + finishReasons []string + usageRecorded bool + usage chatUsage + errType string + + // Streaming metrics: the first non-empty chunk timestamp and the + // previous chunk timestamp drive the time_to_first_chunk and + // time_per_output_chunk histograms. + firstChunkAt time.Time + prevChunkAt time.Time + chunkDurations []float64 +} + +type chatUsage struct { + inputTokens int64 + outputTokens int64 + cacheReadInput int64 + cacheCreationInput int64 + reasoningOutput int64 +} + +// StartChat begins a CLIENT-kind `chat {model}` span and records the +// required gen_ai.* request attributes. The returned context carries the +// new span; callers MUST call ChatSpan.End to flush the span and metrics. +func StartChat(ctx context.Context, req ChatRequest) (context.Context, *ChatSpan) { + tracer := otel.Tracer(instrumentationName) + + name := OperationChat + if req.Model != "" { + name = OperationChat + " " + req.Model + } + + attrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationChat), + attribute.String(AttrProviderName, req.Provider), + attribute.Bool(AttrRequestStream, req.Stream), + } + if req.Model != "" { + attrs = append(attrs, attribute.String(AttrRequestModel, req.Model)) + } + if req.ServerAddress != "" { + attrs = append(attrs, attribute.String("server.address", req.ServerAddress)) + if req.ServerPort > 0 { + attrs = append(attrs, attribute.Int("server.port", req.ServerPort)) + } + } + if req.MaxTokens > 0 { + attrs = append(attrs, attribute.Int(AttrRequestMaxTokens, req.MaxTokens)) + } + if req.HasTemperature { + attrs = append(attrs, attribute.Float64(AttrRequestTemperature, req.Temperature)) + } + if req.HasTopP { + attrs = append(attrs, attribute.Float64(AttrRequestTopP, req.TopP)) + } + if req.HasTopK { + attrs = append(attrs, attribute.Float64(AttrRequestTopK, req.TopK)) + } + if req.HasFreqPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestFrequencyPenalty, req.FrequencyPenalty)) + } + if req.HasPresPenalty { + attrs = append(attrs, attribute.Float64(AttrRequestPresencePenalty, req.PresencePenalty)) + } + if req.Seed != 0 { + attrs = append(attrs, attribute.Int(AttrRequestSeed, req.Seed)) + } + if len(req.StopSequences) > 0 { + attrs = append(attrs, attribute.StringSlice(AttrRequestStopSequences, req.StopSequences)) + } + if req.ChoiceCount > 0 && req.ChoiceCount != 1 { + attrs = append(attrs, attribute.Int(AttrRequestChoiceCount, req.ChoiceCount)) + } + if conv, ok := conversationAttribute(ctx); ok { + attrs = append(attrs, conv) + } + + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(attrs...), + ) + + return ctx, &ChatSpan{ + span: span, + provider: req.Provider, + model: req.Model, + startedAt: time.Now(), + metricCtx: ctx, + } +} + +// SetAttributes adds extra attributes to the span. Use for provider-specific +// fields (openai.*, aws.bedrock.*) and for response-side attributes the +// caller learns later. +func (s *ChatSpan) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// SetResponseModel records gen_ai.response.model. 
Some providers return a +// resolved model name that differs from the requested one (alias expansion, +// version pinning); both values are useful. +func (s *ChatSpan) SetResponseModel(model string) { + if s == nil || model == "" { + return + } + s.mu.Lock() + s.responseModel = model + s.mu.Unlock() + s.span.SetAttributes(attribute.String(AttrResponseModel, model)) +} + +// SetResponseID records gen_ai.response.id. +func (s *ChatSpan) SetResponseID(id string) { + if s == nil || id == "" { + return + } + s.span.SetAttributes(attribute.String(AttrResponseID, id)) +} + +// AddFinishReason accumulates a finish reason. The spec defines the +// attribute as a string array — multiple values are recorded once on End. +func (s *ChatSpan) AddFinishReason(reason string) { + if s == nil || reason == "" { + return + } + s.mu.Lock() + defer s.mu.Unlock() + if slices.Contains(s.finishReasons, reason) { + return + } + s.finishReasons = append(s.finishReasons, reason) +} + +// RecordUsage stores the token usage for emission as both span attributes +// and the gen_ai.client.token.usage histogram. Callers pass raw provider +// values; this package applies the spec-mandated Anthropic input-token sum +// (`input_tokens` reported by Anthropic excludes cached tokens, so the +// spec requires summing input + cache_read + cache_creation). +func (s *ChatSpan) RecordUsage(inputTokens, outputTokens, cacheReadInput, cacheCreationInput, reasoningOutput int64) { + if s == nil { + return + } + s.mu.Lock() + defer s.mu.Unlock() + s.usage.inputTokens = inputTokens + s.usage.outputTokens = outputTokens + s.usage.cacheReadInput = cacheReadInput + s.usage.cacheCreationInput = cacheCreationInput + s.usage.reasoningOutput = reasoningOutput + s.usageRecorded = true +} + +// MarkChunk records the timing of a streamed output chunk. The first call +// drives gen_ai.response.time_to_first_chunk (and the corresponding +// metric); subsequent calls accumulate per-chunk durations. +func (s *ChatSpan) MarkChunk() { + if s == nil { + return + } + now := time.Now() + s.mu.Lock() + defer s.mu.Unlock() + if s.firstChunkAt.IsZero() { + s.firstChunkAt = now + } else { + s.chunkDurations = append(s.chunkDurations, now.Sub(s.prevChunkAt).Seconds()) + } + s.prevChunkAt = now +} + +// RecordError marks the span as failed and stores error.type for the +// duration metric. errType should be a short, low-cardinality string — +// "rate_limit", "context_length_exceeded", "auth", "network", +// "context_canceled", or "_OTHER" as the spec-defined fallback. When +// errType is empty, ClassifyError(err) is called to derive a value, so +// callers that don't already have a classification can pass "" without +// losing it to the "_OTHER" bucket. +func (s *ChatSpan) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span, flushes accumulated finish reasons / usage / timing +// to the span, and records the duration and token-usage histograms. Safe +// to call multiple times; subsequent calls are no-ops. +func (s *ChatSpan) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + finishReasons := append([]string(nil), s.finishReasons...) 
+ usage := s.usage + usageRecorded := s.usageRecorded + errType := s.errType + firstChunkAt := s.firstChunkAt + chunkDurations := append([]float64(nil), s.chunkDurations...) + s.mu.Unlock() + + if len(finishReasons) > 0 { + s.span.SetAttributes(attribute.StringSlice(AttrResponseFinishReasons, finishReasons)) + } + if !firstChunkAt.IsZero() { + ttfc := firstChunkAt.Sub(s.startedAt).Seconds() + s.span.SetAttributes(attribute.Float64(AttrResponseTimeToFirstChunk, ttfc)) + } + if usageRecorded { + // Apply the spec-mandated Anthropic input-token math: Anthropic's + // API reports input_tokens excluding cache, but spec wants the + // inclusive total on gen_ai.usage.input_tokens. + spanInputTokens := usage.inputTokens + if s.provider == ProviderAnthropic { + spanInputTokens += usage.cacheReadInput + usage.cacheCreationInput + } + spanAttrs := []attribute.KeyValue{ + attribute.Int64(AttrUsageInputTokens, spanInputTokens), + attribute.Int64(AttrUsageOutputTokens, usage.outputTokens), + } + if usage.cacheReadInput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheReadInputTokens, usage.cacheReadInput)) + } + if usage.cacheCreationInput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageCacheCreationInputTokens, usage.cacheCreationInput)) + } + if usage.reasoningOutput > 0 { + spanAttrs = append(spanAttrs, attribute.Int64(AttrUsageReasoningOutputTokens, usage.reasoningOutput)) + } + s.span.SetAttributes(spanAttrs...) + } + + s.span.End() + + // Emit metrics. Failure to resolve instruments must not block span + // completion, so we silently skip when getInstruments returns nil. + insts := getInstruments() + if insts == nil { + return + } + + commonAttrs := []attribute.KeyValue{ + attribute.String(AttrOperationName, OperationChat), + attribute.String(AttrProviderName, s.provider), + } + if s.model != "" { + commonAttrs = append(commonAttrs, attribute.String(AttrRequestModel, s.model)) + } + + durationAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + if errType != "" { + durationAttrs = append(durationAttrs, attribute.String("error.type", errType)) + } + if insts.clientOperationDuration != nil { + insts.clientOperationDuration.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(durationAttrs...), + ) + } + + if !firstChunkAt.IsZero() && insts.clientOperationTTFC != nil { + insts.clientOperationTTFC.Record(s.metricCtx, firstChunkAt.Sub(s.startedAt).Seconds(), + metric.WithAttributes(commonAttrs...), + ) + } + if insts.clientOperationTimePerChunk != nil { + for _, d := range chunkDurations { + insts.clientOperationTimePerChunk.Record(s.metricCtx, d, + metric.WithAttributes(commonAttrs...), + ) + } + } + + if usageRecorded && insts.clientTokenUsage != nil { + recordTokenMetric := func(tokenType string, value int64) { + if value <= 0 { + return + } + tokenAttrs := append([]attribute.KeyValue(nil), commonAttrs...) + tokenAttrs = append(tokenAttrs, attribute.String(AttrTokenType, tokenType)) + insts.clientTokenUsage.Record(s.metricCtx, value, + metric.WithAttributes(tokenAttrs...), + ) + } + // Per-token-type metric data points use raw provider values so a + // backend summing across types reconstructs the true total + // without double-counting cached tokens. The Anthropic spec sum + // (input + cache_read + cache_creation) is only applied to the + // span attribute `gen_ai.usage.input_tokens` per the per-provider + // semconv MUST clause — see span attribute emission above. 
+ recordTokenMetric(TokenTypeInput, usage.inputTokens) + recordTokenMetric(TokenTypeOutput, usage.outputTokens) + recordTokenMetric(TokenTypeCacheRead, usage.cacheReadInput) + recordTokenMetric(TokenTypeCacheCreation, usage.cacheCreationInput) + recordTokenMetric(TokenTypeReasoning, usage.reasoningOutput) + } +} + +// Span returns the underlying OTel span so callers can attach span events +// or links when they need finer control than the helpers expose. Returns +// a real no-op span (not a struct embedding a nil trace.Span) when the +// receiver is nil so callers don't have to nil-check before invoking +// Span methods like AddEvent / SetAttributes. +func (s *ChatSpan) Span() trace.Span { + if s == nil { + return tracenoop.Span{} + } + return s.span +} diff --git a/pkg/telemetry/genai/stability.go b/pkg/telemetry/genai/stability.go new file mode 100644 index 000000000..021ce0450 --- /dev/null +++ b/pkg/telemetry/genai/stability.go @@ -0,0 +1,130 @@ +package genai + +import ( + "os" + "strings" + "sync" + + "go.opentelemetry.io/otel/attribute" +) + +// EnvSemconvStability is the OTel-defined environment variable that lets +// callers opt into experimental versions of the GenAI semantic +// conventions +// (https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/README.md). +// +// It is a comma-separated list of opt-in tokens. The only token defined +// for GenAI today is `gen_ai_latest_experimental` — when present, the +// instrumentation emits only the spec-defined `gen_ai.*` attributes and +// drops the legacy attribute names (e.g. `tool.name`, `agent`, +// `session.id`). +// +// Default behaviour (env var unset) is dual-emit: spans carry both the +// legacy keys and the `gen_ai.*` keys so existing dashboards keep +// working alongside spec-aware tooling. This matches the spec's +// recommendation that instrumentations not change the version of +// conventions they emit by default and instead require the opt-in for +// the new version. +const EnvSemconvStability = "OTEL_SEMCONV_STABILITY_OPT_IN" + +// stabilityToken is the spec-defined opt-in for the latest experimental +// GenAI conventions. +const stabilityToken = "gen_ai_latest_experimental" + +// Stability identifies which version of attribute names a span should +// emit. +type Stability int + +const ( + // StabilityDualEmit is the default: emit both legacy attribute + // names (`tool.name`, `agent`, `session.id`, ...) and the + // `gen_ai.*` keys, so existing dashboards continue working while + // spec-aware tooling sees the new values. + StabilityDualEmit Stability = iota + // StabilityGenAILatest is selected by + // `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Only + // the `gen_ai.*` attributes are emitted; the legacy keys are + // dropped. + StabilityGenAILatest +) + +var ( + stabilityMu sync.Mutex + stabilityOnce sync.Once + cachedStability Stability +) + +// CurrentStability returns the active stability mode. The result is +// computed once per process from the env var; tests that need to flip +// the mode at runtime should call ResetStabilityForTest first. +func CurrentStability() Stability { + stabilityMu.Lock() + once := &stabilityOnce + stabilityMu.Unlock() + + once.Do(func() { + raw := os.Getenv(EnvSemconvStability) + for tok := range strings.SplitSeq(raw, ",") { + // Spec: tokens are case-insensitive. 
+ if strings.EqualFold(strings.TrimSpace(tok), stabilityToken) { + stabilityMu.Lock() + cachedStability = StabilityGenAILatest + stabilityMu.Unlock() + return + } + } + stabilityMu.Lock() + cachedStability = StabilityDualEmit + stabilityMu.Unlock() + }) + + stabilityMu.Lock() + defer stabilityMu.Unlock() + return cachedStability +} + +// ResetStabilityForTest clears the cached stability value so a +// subsequent CurrentStability call re-reads the env var. Test-only — +// callers must ensure no other goroutine is in CurrentStability when +// this runs. The mutex protects the sync.Once and cache fields against +// other Reset calls and against the lock-protected segments of +// CurrentStability, but CurrentStability releases the mutex before +// invoking once.Do, so a concurrent reset there races on the +// sync.Once memory itself (flagged under -race). All in-tree usage is +// sequential (t.Setenv + t.Cleanup, no t.Parallel), so this is safe in +// practice; do not introduce parallel callers. +func ResetStabilityForTest() { + stabilityMu.Lock() + defer stabilityMu.Unlock() + stabilityOnce = sync.Once{} + cachedStability = StabilityDualEmit +} + +// EmitLegacyAttributes reports whether legacy (pre-semconv) attribute +// keys should be emitted. True when stability is StabilityDualEmit; +// false when the user has opted into `gen_ai_latest_experimental`. +func EmitLegacyAttributes() bool { + return CurrentStability() == StabilityDualEmit +} + +// LegacyToolAttributes returns the historic tool dispatcher attribute +// set (`tool.name`, `agent`, `session.id`, `tool.call_id`, +// `tool.type`) — but only when legacy emission is enabled. Returns nil +// otherwise so call sites can append unconditionally. +func LegacyToolAttributes(toolName, toolType, agentName, sessionID, callID string) []attribute.KeyValue { + if !EmitLegacyAttributes() { + return nil + } + attrs := []attribute.KeyValue{ + attribute.String("tool.name", toolName), + attribute.String("agent", agentName), + attribute.String("session.id", sessionID), + } + if toolType != "" { + attrs = append(attrs, attribute.String("tool.type", toolType)) + } + if callID != "" { + attrs = append(attrs, attribute.String("tool.call_id", callID)) + } + return attrs +} diff --git a/pkg/telemetry/genai/stability_test.go b/pkg/telemetry/genai/stability_test.go new file mode 100644 index 000000000..f89ee7991 --- /dev/null +++ b/pkg/telemetry/genai/stability_test.go @@ -0,0 +1,55 @@ +package genai + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCurrentStabilityDefault(t *testing.T) { + t.Setenv(EnvSemconvStability, "") + ResetStabilityForTest() + assert.Equal(t, StabilityDualEmit, CurrentStability()) + assert.True(t, EmitLegacyAttributes()) +} + +func TestCurrentStabilityGenAILatest(t *testing.T) { + t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) + assert.False(t, EmitLegacyAttributes()) +} + +func TestCurrentStabilityIgnoresUnrelatedTokens(t *testing.T) { + t.Setenv(EnvSemconvStability, "http,database") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityDualEmit, CurrentStability()) +} + +func TestCurrentStabilityCompositeList(t *testing.T) { + t.Setenv(EnvSemconvStability, "http, gen_ai_latest_experimental ,database") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) +} + +func 
TestCurrentStabilityCaseInsensitive(t *testing.T) { + t.Setenv(EnvSemconvStability, "GEN_AI_LATEST_EXPERIMENTAL") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Equal(t, StabilityGenAILatest, CurrentStability()) +} + +func TestLegacyToolAttributesGated(t *testing.T) { + t.Setenv(EnvSemconvStability, "gen_ai_latest_experimental") + ResetStabilityForTest() + t.Cleanup(ResetStabilityForTest) + assert.Empty(t, LegacyToolAttributes("shell", "function", "main", "sess1", "call1")) + + t.Setenv(EnvSemconvStability, "") + ResetStabilityForTest() + got := LegacyToolAttributes("shell", "function", "main", "sess1", "call1") + assert.NotEmpty(t, got) +} diff --git a/pkg/telemetry/genai/stream.go b/pkg/telemetry/genai/stream.go new file mode 100644 index 000000000..382597512 --- /dev/null +++ b/pkg/telemetry/genai/stream.go @@ -0,0 +1,255 @@ +package genai + +import ( + "errors" + "io" + "strings" + "sync" + + "github.com/docker/docker-agent/pkg/chat" + "github.com/docker/docker-agent/pkg/tools" +) + +// StreamAttributer is an optional interface that provider stream adapters +// may implement to surface provider-specific attributes to the chat span +// once the response is complete. The wrapper queries the underlying stream +// on Close (in addition to the per-chunk Recv path) and applies whatever +// attributes the provider chose to expose. Implementations are expected to +// be safe to call after Close. +type StreamAttributer interface { + GenAIStreamAttributes() []KeyValue +} + +// KeyValue is a re-exported attribute key/value pair used by the optional +// StreamAttributer interface so providers can implement it without +// importing go.opentelemetry.io/otel/attribute directly. The decorator +// converts these back into OTel attributes before applying them to the +// span. +type KeyValue struct { + Key string + Value any +} + +// WrapStream wraps a chat.MessageStream so that consuming the stream +// drives the lifecycle of a ChatSpan: per-chunk timing, response-level +// attributes (id / response.model / finish reasons), usage capture, and +// final span End on stream close or terminal error. +// +// The returned stream forwards all Recv/Close calls to the underlying +// stream verbatim and adds no other behaviour, so swapping it in is +// invisible to callers. +func WrapStream(span *ChatSpan, stream chat.MessageStream) chat.MessageStream { + if span == nil || stream == nil { + return stream + } + return &instrumentedStream{ + span: span, + inner: stream, + capture: IsContentCaptureEnabled(), + } +} + +type instrumentedStream struct { + span *ChatSpan + inner chat.MessageStream + + // mu guards the lifecycle flags and the streaming-state buffers + // so a Recv that errors concurrently with the consumer's Close + // does not race on the check-then-set in endOnce or + // double-apply attributes through SetOutputMessages. + mu sync.Mutex + + // ended is set when the span has been finalised (output flushed + // and `End` called). innerClosed is set when the inner stream's + // `Close` has been called. They are tracked separately so an + // error in `Recv` can end the span without preempting the + // caller's `Close` that releases the inner stream's resources. + ended bool + innerClosed bool + + // capture buffers the streamed deltas for emission as + // `gen_ai.output.messages` on Close. Filled only when content + // capture is opted in (`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true`) + // so the buffer cost stays out of the default request path. 
+ capture bool + contentBuf strings.Builder + reasoningBuf strings.Builder + pendingTools map[string]*tools.ToolCall + toolCallOrder []string +} + +func (s *instrumentedStream) Recv() (chat.MessageStreamResponse, error) { + resp, err := s.inner.Recv() + if err != nil { + // io.EOF is the normal stream terminator and is not an error + // for the span's purposes — End handles closing. + // For non-EOF errors we end the span here too: callers that + // abandon the stream after an error (a common pattern for + // network failures) would otherwise leak the span and skip the + // duration metric. Close remains idempotent so the canonical + // `defer Close()` path still works. + if !errors.Is(err, io.EOF) { + s.span.RecordError(err, ClassifyError(err)) + s.endOnce() + } + return resp, err + } + + // First chunk arrival is meaningful for the time_to_first_chunk + // metric. Mark on every Recv that produced any content so we cover + // cases where the provider opens with an empty preamble. + if hasChunkPayload(&resp) { + s.span.MarkChunk() + } + + if resp.ID != "" { + s.span.SetResponseID(resp.ID) + } + if resp.Model != "" { + s.span.SetResponseModel(resp.Model) + } + for i := range resp.Choices { + if resp.Choices[i].FinishReason != "" { + s.span.AddFinishReason(string(resp.Choices[i].FinishReason)) + } + } + if resp.Usage != nil { + s.span.RecordUsage( + resp.Usage.InputTokens, + resp.Usage.OutputTokens, + resp.Usage.CachedInputTokens, + resp.Usage.CacheWriteTokens, + resp.Usage.ReasoningTokens, + ) + } + + if s.capture { + s.mu.Lock() + s.bufferDeltas(&resp) + s.mu.Unlock() + } + return resp, nil +} + +// bufferDeltas accumulates content and tool-call deltas for the +// gen_ai.output.messages attribute. Tool calls arrive across multiple +// chunks (id once, name once, arguments in pieces), so we keep a map +// keyed by id and concatenate arguments as they stream in. +func (s *instrumentedStream) bufferDeltas(resp *chat.MessageStreamResponse) { + for i := range resp.Choices { + d := &resp.Choices[i].Delta + if d.Content != "" { + s.contentBuf.WriteString(d.Content) + } + if d.ReasoningContent != "" { + s.reasoningBuf.WriteString(d.ReasoningContent) + } + for j := range d.ToolCalls { + tc := &d.ToolCalls[j] + id := tc.ID + if id == "" { + // Provider didn't include the id on this delta — fall + // back to the most recent in-progress tool call. + if len(s.toolCallOrder) == 0 { + continue + } + id = s.toolCallOrder[len(s.toolCallOrder)-1] + } + if s.pendingTools == nil { + s.pendingTools = map[string]*tools.ToolCall{} + } + existing, ok := s.pendingTools[id] + if !ok { + existing = &tools.ToolCall{ID: id, Type: tc.Type} + s.pendingTools[id] = existing + s.toolCallOrder = append(s.toolCallOrder, id) + } + if tc.Function.Name != "" { + existing.Function.Name = tc.Function.Name + } + if tc.Function.Arguments != "" { + existing.Function.Arguments += tc.Function.Arguments + } + } + } +} + +func (s *instrumentedStream) Close() { + s.mu.Lock() + closeInner := !s.innerClosed + s.innerClosed = true + s.mu.Unlock() + if closeInner { + s.inner.Close() + } + s.endOnce() +} + +// endOnce flushes captured content, applies provider-supplied attributes, +// and ends the span — at most once per stream. Both the error path in +// `Recv` and the explicit `Close` path go through here so a stream that +// errors mid-flight still ends its span without waiting for the caller. 
+// `inner.Close` is intentionally NOT called here: leaving it to the +// explicit `Close` path keeps the contract that the wrapper releases +// the underlying stream exactly when the caller asks. +func (s *instrumentedStream) endOnce() { + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + // Snapshot the buffers under the lock so we don't race against a + // concurrent Recv writing more deltas. Release before calling out + // to the OTel SDK and the StreamAttributer hook to avoid holding + // the mutex across third-party code. + var ( + extras []KeyValue + captured bool + content string + reasoning string + collected []tools.ToolCall + streamAttrer StreamAttributer + ) + if attrer, ok := s.inner.(StreamAttributer); ok { + streamAttrer = attrer + } + if s.capture { + captured = true + content = s.contentBuf.String() + reasoning = s.reasoningBuf.String() + for _, id := range s.toolCallOrder { + if tc, ok := s.pendingTools[id]; ok { + collected = append(collected, *tc) + } + } + } + s.mu.Unlock() + + if streamAttrer != nil { + extras = streamAttrer.GenAIStreamAttributes() + } + for _, kv := range extras { + applyExtraAttribute(s.span, kv) + } + if captured { + SetOutputMessages(s.span, content, reasoning, collected) + } + s.span.End() +} + +// hasChunkPayload reports whether the response carries content that should +// count as an output chunk (text, reasoning, tool call, etc.). Empty +// keep-alive frames do not advance the per-chunk timing metrics. +func hasChunkPayload(resp *chat.MessageStreamResponse) bool { + for i := range resp.Choices { + d := &resp.Choices[i].Delta + if d.Content != "" || d.ReasoningContent != "" || d.ThinkingSignature != "" { + return true + } + if len(d.ToolCalls) > 0 || d.FunctionCall != nil { + return true + } + } + return false +} diff --git a/pkg/telemetry/mcp/attrs.go b/pkg/telemetry/mcp/attrs.go new file mode 100644 index 000000000..64a1d4138 --- /dev/null +++ b/pkg/telemetry/mcp/attrs.go @@ -0,0 +1,58 @@ +package mcp + +// MCP attribute keys defined by the OTel semantic conventions +// (https://opentelemetry.io/docs/specs/semconv/registry/attributes/mcp/). +// All are Development stability. +const ( + AttrMethodName = "mcp.method.name" + AttrProtocolVersion = "mcp.protocol.version" + AttrResourceURI = "mcp.resource.uri" + AttrSessionID = "mcp.session.id" +) + +// JSON-RPC attribute keys used alongside MCP spans for request id and +// response status when applicable. +const ( + AttrJSONRPCRequestID = "jsonrpc.request.id" + AttrJSONRPCProtocolVersion = "jsonrpc.protocol.version" + AttrRPCResponseStatusCode = "rpc.response.status_code" +) + +// gen_ai.* attribute keys that the MCP semconv overlays on MCP spans when +// applicable. These are duplicated here as constants so the MCP package +// doesn't depend on the genai package — keeping the two telemetry helpers +// compositional. +const ( + AttrGenAIOperationName = "gen_ai.operation.name" + AttrGenAIToolName = "gen_ai.tool.name" + AttrGenAIPromptName = "gen_ai.prompt.name" +) + +// Well-known MCP method names (https://modelcontextprotocol.io/specification). +// These match the values listed in the OTel semconv registry. 
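For orientation, a minimal sketch of how a provider integration adopts the stream decorator above, assuming the provider already has a chat.MessageStream implementation and an active *ChatSpan. The package and function names here are illustrative and not part of this patch; only WrapStream, ChatSpan, StreamAttributer, and chat.MessageStream come from the code above.

package provider // illustrative adapter package, not part of this patch

import (
	"github.com/docker/docker-agent/pkg/chat"
	"github.com/docker/docker-agent/pkg/telemetry/genai"
)

// streamCompletion shows the single integration point a provider needs:
// wrap the raw stream before handing it back to the runtime. WrapStream
// returns the stream untouched when either argument is nil, so the call
// site needs no telemetry-enabled/disabled branching. If the raw stream
// also implements genai.StreamAttributer, the attributes it exposes are
// applied to the chat span when the stream closes.
func streamCompletion(span *genai.ChatSpan, raw chat.MessageStream) chat.MessageStream {
	return genai.WrapStream(span, raw)
}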
+const ( + MethodInitialize = "initialize" + MethodPing = "ping" + MethodCompletionComplete = "completion/complete" + MethodPromptsList = "prompts/list" + MethodPromptsGet = "prompts/get" + MethodResourcesList = "resources/list" + MethodResourcesRead = "resources/read" + MethodResourcesSubscribe = "resources/subscribe" + MethodResourcesUnsub = "resources/unsubscribe" + MethodResourcesTemplates = "resources/templates/list" + MethodRootsList = "roots/list" + MethodSamplingCreate = "sampling/createMessage" + MethodToolsList = "tools/list" + MethodToolsCall = "tools/call" + MethodLoggingSetLevel = "logging/setLevel" + MethodElicitationCreate = "elicitation/create" +) + +// OperationExecuteTool is the gen_ai.operation.name value used on MCP +// tools/call spans per the spec. +const OperationExecuteTool = "execute_tool" + +// instrumentationName identifies this package as the OTel instrumentation +// scope for spans, metrics, and log records it produces. +const instrumentationName = "github.com/docker/docker-agent/pkg/telemetry/mcp" diff --git a/pkg/telemetry/mcp/conversation.go b/pkg/telemetry/mcp/conversation.go new file mode 100644 index 000000000..efeaad57f --- /dev/null +++ b/pkg/telemetry/mcp/conversation.go @@ -0,0 +1,19 @@ +package mcp + +import ( + "context" + + "go.opentelemetry.io/otel/baggage" +) + +// ConversationIDFromBaggage reads `gen_ai.conversation.id` from the +// context's W3C baggage. The MCP package mirrors the genai package's +// convention so MCP spans automatically carry the session id when the +// runtime has seeded it; the value also propagates across MCP server +// boundaries via the standard `baggage` header alongside `traceparent`. +// +// Exported so adjacent code (e.g. the MCP OAuth transport) can attach +// the same attribute to spans it creates directly via `otel.Tracer`. +func ConversationIDFromBaggage(ctx context.Context) string { + return baggage.FromContext(ctx).Member("gen_ai.conversation.id").Value() +} diff --git a/pkg/telemetry/mcp/doc.go b/pkg/telemetry/mcp/doc.go new file mode 100644 index 000000000..401f7472d --- /dev/null +++ b/pkg/telemetry/mcp/doc.go @@ -0,0 +1,13 @@ +// Package mcp provides OpenTelemetry instrumentation helpers that follow +// the OTel GenAI semantic conventions for the Model Context Protocol +// (https://opentelemetry.io/docs/specs/semconv/gen-ai/mcp/). +// +// MCP attributes use the `mcp.*` namespace (separate from `gen_ai.*`). +// Trace context propagates through the MCP `params._meta` field so that +// requests crossing client/server boundaries chain into a single trace. +// +// The package is structured so that callers describe what they are doing +// in MCP terms (method name, tool name, session id) and the helpers +// produce the spec-conformant spans, metrics, and propagation. All helpers +// are no-op-safe when telemetry is disabled. 
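A sketch of the client-side call pattern the package doc above describes, using StartClient, EnsureMeta, InjectMeta, RecordError, and End as defined later in this patch. The callMCP stub and the surrounding package are placeholders, not part of the patch.

package mcptransport // illustrative caller, not part of this patch

import (
	"context"

	"github.com/docker/docker-agent/pkg/telemetry/mcp"
)

// callMCP stands in for the real MCP transport call.
func callMCP(ctx context.Context, name string, args, meta map[string]any) (any, error) {
	return nil, nil
}

func callToolInstrumented(ctx context.Context, name string, args, meta map[string]any) (any, error) {
	ctx, span := mcp.StartClient(ctx, mcp.CallOptions{
		Method:   mcp.MethodToolsCall,
		ToolName: name,
	})
	defer span.End()

	meta = mcp.EnsureMeta(meta) // copy so the caller's own map never accumulates traceparent keys
	mcp.InjectMeta(ctx, meta)   // writes traceparent / tracestate / baggage for the server side

	res, err := callMCP(ctx, name, args, meta)
	if err != nil {
		span.RecordError(err, "") // empty errType lets ClassifyError pick the label
		return nil, err
	}
	return res, nil
}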
+package mcp diff --git a/pkg/telemetry/mcp/mcp_test.go b/pkg/telemetry/mcp/mcp_test.go new file mode 100644 index 000000000..5e5d78342 --- /dev/null +++ b/pkg/telemetry/mcp/mcp_test.go @@ -0,0 +1,99 @@ +package mcp + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/trace" + traceapi "go.opentelemetry.io/otel/trace" +) + +func TestEnsureMeta(t *testing.T) { + t.Parallel() + got := EnsureMeta(nil) + assert.NotNil(t, got) + assert.Empty(t, got) + + existing := map[string]any{"foo": "bar"} + got = EnsureMeta(existing) + assert.Equal(t, existing, got) +} + +func TestInjectExtractRoundTrip(t *testing.T) { + // Mutates the global OTel text-map propagator, so this test cannot + // run in parallel with other tests that read or modify it. + + // A propagator must be configured for inject/extract to do anything; + // install one for the duration of the test and put it back after. + prev := otel.GetTextMapPropagator() + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + t.Cleanup(func() { otel.SetTextMapPropagator(prev) }) + + // Start a sampled span so traceparent has a non-trivial trace id. + tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample())) + t.Cleanup(func() { _ = tp.Shutdown(t.Context()) }) + + parentCtx, parentSpan := tp.Tracer("test").Start(t.Context(), "parent") + defer parentSpan.End() + parentSC := traceapi.SpanContextFromContext(parentCtx) + + meta := map[string]any{} + InjectMeta(parentCtx, meta) + assert.Contains(t, meta, "traceparent", + "propagator should have written W3C traceparent into _meta") + + // Extract from a fresh context and verify the span context lines up + // with the parent we started with. + childCtx := ExtractMeta(t.Context(), meta) + extracted := traceapi.SpanContextFromContext(childCtx) + assert.Equal(t, parentSC.TraceID(), extracted.TraceID()) + assert.Equal(t, parentSC.SpanID(), extracted.SpanID()) +} + +func TestInjectMetaNilNoOp(t *testing.T) { + t.Parallel() + // Should not panic on a nil map. + InjectMeta(t.Context(), nil) +} + +func TestExtractMetaNilReturnsParent(t *testing.T) { + t.Parallel() + got := ExtractMeta(t.Context(), nil) + // Without trace context to extract we get back the same context. + assert.Equal(t, t.Context(), got) +} + +func TestStartClientReturnsActiveSpan(t *testing.T) { + // Mutates the global OTel tracer provider, so this test cannot run + // in parallel with other tests that read or modify it. 
+ + tp := trace.NewTracerProvider(trace.WithSampler(trace.AlwaysSample())) + t.Cleanup(func() { _ = tp.Shutdown(t.Context()) }) + prev := otel.GetTracerProvider() + otel.SetTracerProvider(tp) + t.Cleanup(func() { otel.SetTracerProvider(prev) }) + + ctx, span := StartClient(t.Context(), CallOptions{ + Method: MethodToolsCall, + ToolName: "search-web", + }) + defer span.End() + + sc := traceapi.SpanContextFromContext(ctx) + assert.True(t, sc.IsValid(), "context should carry an active span") +} + +func TestClassifyError(t *testing.T) { + t.Parallel() + assert.Empty(t, ClassifyError(nil)) + assert.Equal(t, "context_canceled", ClassifyError(context.Canceled)) + assert.Equal(t, "deadline_exceeded", ClassifyError(context.DeadlineExceeded)) + assert.Equal(t, "rpc_error", ClassifyError(errors.New("some other error"))) +} diff --git a/pkg/telemetry/mcp/metrics.go b/pkg/telemetry/mcp/metrics.go new file mode 100644 index 000000000..fab407f9d --- /dev/null +++ b/pkg/telemetry/mcp/metrics.go @@ -0,0 +1,56 @@ +package mcp + +import ( + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +// metricBuckets matches the spec's bucket boundaries for all four MCP +// duration histograms (mcp.client/server.operation.duration and +// mcp.client/server.session.duration). +var metricBuckets = []float64{ + 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, +} + +type instruments struct { + clientOperationDuration metric.Float64Histogram + serverOperationDuration metric.Float64Histogram + // mcp.{client,server}.session.duration histograms are defined by + // the spec but require a SessionSpan that tracks open/close at + // the transport layer. Wire those up alongside the transport + // instrumentation; until then registering them here would create + // always-empty time series in Mimir. +} + +var ( + instOnce sync.Once + inst *instruments +) + +func getInstruments() *instruments { + instOnce.Do(func() { + meter := otel.Meter(instrumentationName) + i := &instruments{} + + // Histogram registration rarely fails; on the rare miss we + // keep the successfully created instruments rather than + // abandoning the whole package — record sites nil-check. + i.clientOperationDuration, _ = meter.Float64Histogram( + "mcp.client.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("Time taken by an MCP client to send a request and receive its response."), + metric.WithExplicitBucketBoundaries(metricBuckets...), + ) + i.serverOperationDuration, _ = meter.Float64Histogram( + "mcp.server.operation.duration", + metric.WithUnit("s"), + metric.WithDescription("Time taken by an MCP server to handle a request and send its response."), + metric.WithExplicitBucketBoundaries(metricBuckets...), + ) + + inst = i + }) + return inst +} diff --git a/pkg/telemetry/mcp/propagation.go b/pkg/telemetry/mcp/propagation.go new file mode 100644 index 000000000..b0e62040b --- /dev/null +++ b/pkg/telemetry/mcp/propagation.go @@ -0,0 +1,92 @@ +package mcp + +import ( + "context" + "maps" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/propagation" +) + +// metaCarrier adapts an MCP `params._meta` map (which the MCP SDK exposes +// as `map[string]any`) to OTel's TextMapCarrier interface so the package's +// configured propagator can read and write trace context (`traceparent`, +// `tracestate`, `baggage`) the way it does for any HTTP carrier. 
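For reference, once InjectMeta has run with a sampled span and seeded conversation baggage, the carrier leaves W3C keys in `_meta` roughly like the following sketch. The IDs are made up; `tracestate` is only written when a vendor entry exists.

// Illustrative only: the concrete IDs below are invented.
var exampleMeta = map[string]any{
	"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01", // version-traceid-spanid-flags
	"baggage":     "gen_ai.conversation.id=sess-42",
}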
+type metaCarrier struct { + meta map[string]any +} + +func (c metaCarrier) Get(key string) string { + if c.meta == nil { + return "" + } + v, ok := c.meta[key] + if !ok { + return "" + } + if s, ok := v.(string); ok { + return s + } + return "" +} + +func (c metaCarrier) Set(key, value string) { + if c.meta == nil { + return + } + c.meta[key] = value +} + +func (c metaCarrier) Keys() []string { + if c.meta == nil { + return nil + } + keys := make([]string, 0, len(c.meta)) + for k, v := range c.meta { + if _, ok := v.(string); ok { + keys = append(keys, k) + } + } + return keys +} + +// InjectMeta writes the active trace context into the given MCP `_meta` +// map so the receiving server can extract it and parent its SERVER span +// onto our CLIENT span. Per the MCP semconv, the keys written are +// `traceparent`, `tracestate`, and `baggage` (W3C TraceContext + Baggage). +// +// If meta is nil, InjectMeta is a no-op — callers should ensure the map +// is non-nil before calling so the keys actually persist on the request. +func InjectMeta(ctx context.Context, meta map[string]any) { + if meta == nil { + return + } + otel.GetTextMapPropagator().Inject(ctx, metaCarrier{meta: meta}) +} + +// ExtractMeta reads trace context from the given MCP `_meta` map and +// returns a context with the parent span attached. Use on the server side +// to chain incoming spans onto the client's caller. +func ExtractMeta(ctx context.Context, meta map[string]any) context.Context { + if meta == nil { + return ctx + } + return otel.GetTextMapPropagator().Extract(ctx, metaCarrier{meta: meta}) +} + +// EnsureMeta returns a metadata map suitable for InjectMeta to write +// trace context into. When m is non-nil it is shallow-copied so an +// upstream caller that reuses the same request struct (e.g. on retry) +// does not see stale `traceparent` keys from a previous span injected +// into the map they own. When m is nil a fresh map is allocated. +func EnsureMeta(m map[string]any) map[string]any { + if m == nil { + return map[string]any{} + } + out := make(map[string]any, len(m)+3) + maps.Copy(out, m) + return out +} + +// Verify metaCarrier satisfies the propagator interface at compile time. +var _ propagation.TextMapCarrier = metaCarrier{} diff --git a/pkg/telemetry/mcp/span.go b/pkg/telemetry/mcp/span.go new file mode 100644 index 000000000..594ba99bd --- /dev/null +++ b/pkg/telemetry/mcp/span.go @@ -0,0 +1,247 @@ +package mcp + +import ( + "context" + "errors" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" +) + +// CallOptions describes an MCP request being made or handled. Used by +// both client- and server-side helpers so call sites depend on a single +// vocabulary. +type CallOptions struct { + // Method is the MCP method name (e.g. "tools/call"). Required. + Method string + + // Target is the low-cardinality target of the operation: tool name + // for tools/call, prompt name for prompts/get, etc. When set the + // span name becomes "{method} {target}"; otherwise just "{method}". + Target string + + // ToolName, when set, is recorded as gen_ai.tool.name and used as + // the default Target for tools/call. + ToolName string + + // PromptName, when set, is recorded as gen_ai.prompt.name and used + // as the default Target for prompts/get. 
+ PromptName string + + // ResourceURI, when set, is recorded as mcp.resource.uri and used + // as the default Target for resources/* methods. + ResourceURI string + + // SessionID identifies the MCP session and is recorded as + // mcp.session.id when set. + SessionID string + + // ProtocolVersion is recorded as mcp.protocol.version when set. + ProtocolVersion string + + // JSONRPCRequestID is recorded as jsonrpc.request.id when set + // (client-side requests; ignored for notifications). + JSONRPCRequestID string + + // ServerAddress / ServerPort identify the MCP endpoint when known. + ServerAddress string + ServerPort int +} + +// Span is the handle returned by StartClient / StartServer. It carries +// enough state to record `mcp.{client,server}.operation.duration` and to +// flush span attributes as the operation proceeds. +type Span struct { + span trace.Span + // metricCtx carries the active span context so the duration + // histogram measurement produces span-context exemplars (drill + // Mimir bucket → Tempo trace). + metricCtx context.Context //nolint:containedctx // intentional: needed for OTel exemplar attribution at End time + startedAt time.Time + method string + kind trace.SpanKind + + mu sync.Mutex + errType string + ended bool +} + +// StartClient begins a CLIENT-kind MCP span and returns a context carrying +// it. Callers MUST call Span.End to flush the span and metrics. +func StartClient(ctx context.Context, opts CallOptions) (context.Context, *Span) { + return startSpan(ctx, opts, trace.SpanKindClient) +} + +// StartServer begins a SERVER-kind MCP span. Use after extracting trace +// context from the incoming `params._meta` so the span chains onto the +// caller. Callers MUST call Span.End. +func StartServer(ctx context.Context, opts CallOptions) (context.Context, *Span) { + return startSpan(ctx, opts, trace.SpanKindServer) +} + +func startSpan(ctx context.Context, opts CallOptions, kind trace.SpanKind) (context.Context, *Span) { + tracer := otel.Tracer(instrumentationName) + + target := opts.Target + if target == "" { + switch { + case opts.ToolName != "": + target = opts.ToolName + case opts.PromptName != "": + target = opts.PromptName + case opts.ResourceURI != "": + target = opts.ResourceURI + } + } + + name := opts.Method + if name == "" { + name = "mcp" + } + if target != "" { + name = name + " " + target + } + + attrs := []attribute.KeyValue{ + attribute.String(AttrMethodName, opts.Method), + } + if opts.ToolName != "" { + attrs = append(attrs, + attribute.String(AttrGenAIToolName, opts.ToolName), + ) + if strings.HasPrefix(opts.Method, "tools/") { + attrs = append(attrs, attribute.String(AttrGenAIOperationName, OperationExecuteTool)) + } + } + if opts.PromptName != "" { + attrs = append(attrs, attribute.String(AttrGenAIPromptName, opts.PromptName)) + } + if opts.ResourceURI != "" { + attrs = append(attrs, attribute.String(AttrResourceURI, opts.ResourceURI)) + } + if opts.SessionID != "" { + attrs = append(attrs, attribute.String(AttrSessionID, opts.SessionID)) + } + if opts.ProtocolVersion != "" { + attrs = append(attrs, attribute.String(AttrProtocolVersion, opts.ProtocolVersion)) + } + if opts.JSONRPCRequestID != "" { + attrs = append(attrs, attribute.String(AttrJSONRPCRequestID, opts.JSONRPCRequestID)) + } + if opts.ServerAddress != "" { + attrs = append(attrs, attribute.String("server.address", opts.ServerAddress)) + if opts.ServerPort > 0 { + attrs = append(attrs, attribute.Int("server.port", opts.ServerPort)) + } + } + if conv := 
ConversationIDFromBaggage(ctx); conv != "" { + attrs = append(attrs, attribute.String("gen_ai.conversation.id", conv)) + } + + ctx, span := tracer.Start(ctx, name, + trace.WithSpanKind(kind), + trace.WithAttributes(attrs...), + ) + + return ctx, &Span{ + span: span, + metricCtx: ctx, + startedAt: time.Now(), + method: opts.Method, + kind: kind, + } +} + +// SetAttributes adds extra attributes to the span. Use for MCP extensions +// or for response-side attributes the caller learns later +// (e.g. rpc.response.status_code). +func (s *Span) SetAttributes(attrs ...attribute.KeyValue) { + if s == nil { + return + } + s.span.SetAttributes(attrs...) +} + +// RecordError marks the span as failed and stores error.type for the +// duration metric. errType should be a short, low-cardinality string; +// when empty, ClassifyError(err) supplies a value (one of +// "context_canceled", "deadline_exceeded", "rpc_error"). +func (s *Span) RecordError(err error, errType string) { + if s == nil || err == nil { + return + } + if errType == "" { + errType = ClassifyError(err) + } + s.mu.Lock() + s.errType = errType + s.mu.Unlock() + s.span.RecordError(err) + s.span.SetStatus(codes.Error, err.Error()) + s.span.SetAttributes(attribute.String("error.type", errType)) +} + +// End closes the span and records the operation duration metric. Safe to +// call multiple times; subsequent calls are no-ops. +func (s *Span) End() { + if s == nil { + return + } + s.mu.Lock() + if s.ended { + s.mu.Unlock() + return + } + s.ended = true + errType := s.errType + s.mu.Unlock() + + s.span.End() + + insts := getInstruments() + if insts == nil { + return + } + attrs := []attribute.KeyValue{ + attribute.String(AttrMethodName, s.method), + } + if errType != "" { + attrs = append(attrs, attribute.String("error.type", errType)) + } + + histogram := insts.clientOperationDuration + if s.kind == trace.SpanKindServer { + histogram = insts.serverOperationDuration + } + if histogram == nil { + return + } + // Use the span's started-at as the reference; we already snapshot + // errType under the lock above, so no additional locking is needed + // for the immutable startedAt field. + histogram.Record(s.metricCtx, time.Since(s.startedAt).Seconds(), + metric.WithAttributes(attrs...), + ) +} + +// ClassifyError maps an MCP error to a low-cardinality error.type value. +// MCP errors are often plain RPC errors; this helper picks reasonable +// labels for cancellation and falls back to the type name otherwise. +func ClassifyError(err error) string { + if err == nil { + return "" + } + switch { + case errors.Is(err, context.Canceled): + return "context_canceled" + case errors.Is(err, context.DeadlineExceeded): + return "deadline_exceeded" + } + return "rpc_error" +} diff --git a/pkg/toolinstall/registry.go b/pkg/toolinstall/registry.go index 1c53189ef..5529483b6 100644 --- a/pkg/toolinstall/registry.go +++ b/pkg/toolinstall/registry.go @@ -14,6 +14,8 @@ import ( "github.com/goccy/go-yaml" "github.com/natefinch/atomic" + + "github.com/docker/docker-agent/pkg/httpclient" ) // githubToken returns a GitHub personal access token from the environment, @@ -115,7 +117,7 @@ var ( // NewRegistry creates a new Registry with default settings. 
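The mirror-image server-side pattern for the pkg/telemetry/mcp helpers above, sketched with placeholder handler names; only ExtractMeta, StartServer, RecordError, and End are from this patch.

package mcpserver // illustrative handler, not part of this patch

import (
	"context"

	"github.com/docker/docker-agent/pkg/telemetry/mcp"
)

// dispatchTool stands in for the real tools/call handling.
func dispatchTool(ctx context.Context, toolName string) error { return nil }

func handleToolsCall(ctx context.Context, toolName, sessionID string, meta map[string]any) error {
	// Chain onto the caller's CLIENT span (and pick up its baggage) before
	// starting the SERVER span.
	ctx = mcp.ExtractMeta(ctx, meta)
	ctx, span := mcp.StartServer(ctx, mcp.CallOptions{
		Method:    mcp.MethodToolsCall,
		ToolName:  toolName,
		SessionID: sessionID,
	})
	defer span.End()

	if err := dispatchTool(ctx, toolName); err != nil {
		span.RecordError(err, "")
		return err
	}
	return nil
}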
func NewRegistry() *Registry { return &Registry{ - httpClient: http.DefaultClient, + httpClient: httpclient.TracedDefaultClient(), baseURL: registryBaseURL, cacheDir: RegistryDir(), } diff --git a/pkg/tools/builtin/agent/agent.go b/pkg/tools/builtin/agent/agent.go index d195695ea..ff2b3f07a 100644 --- a/pkg/tools/builtin/agent/agent.go +++ b/pkg/tools/builtin/agent/agent.go @@ -12,9 +12,14 @@ import ( "time" "github.com/google/uuid" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/session" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -295,6 +300,13 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // via HandleStop which calls cancel(). taskCtx, cancel := context.WithCancel(context.WithoutCancel(ctx)) + // Capture a link to the current trace so the background task's + // new root trace can be navigated back to the spawning agent in + // observability-svc. The parent span context comes from the + // active `runtime.tool.call` span; the link survives even after + // that span ends, while a child-span relationship would not. + parentSpanContext := trace.SpanContextFromContext(ctx) + t := &task{ id: taskID, agentName: params.Agent, @@ -308,9 +320,50 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall h.wg.Go(func() { defer cancel() + // Each background task starts its own trace (WithNewRoot) + // because it outlives the spawning request — making it a + // child would leave a span open after the parent ended. + // A span link preserves navigability from the spawning + // trace to the background task. + spanAttrs := []attribute.KeyValue{ + attribute.String("cagent.background_agent.task_id", taskID), + attribute.String("cagent.background_agent.agent", params.Agent), + } + // Stamp gen_ai.conversation.id directly: WithNewRoot resets the + // span context but baggage flows through context.WithoutCancel, + // so the id is reachable yet would not appear as a span attr + // without an explicit lift. + if convID := genai.ConversationIDFromContext(taskCtx); convID != "" { + spanAttrs = append(spanAttrs, attribute.String(genai.AttrConversationID, convID)) + } + startOpts := []trace.SpanStartOption{ + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithNewRoot(), + trace.WithAttributes(spanAttrs...), + } + if parentSpanContext.IsValid() { + startOpts = append(startOpts, trace.WithLinks(trace.Link{ + SpanContext: parentSpanContext, + Attributes: []attribute.KeyValue{ + attribute.String("cagent.link.kind", "spawned_from"), + }, + })) + } + // Static span name; the agent name lives in the + // `cagent.background_agent.agent` attribute. Putting the + // user-defined agent name into the span name itself would + // blow up Tempo's operation-name index when many agents are + // configured. 
+ tracedCtx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/builtin/agent").Start( + taskCtx, + "background_agent.run", + startOpts..., + ) + defer span.End() + slog.Debug("Starting background agent task", "task_id", taskID, "agent", params.Agent) - result := h.runner.RunAgent(taskCtx, RunParams{ + result := h.runner.RunAgent(tracedCtx, RunParams{ AgentName: params.Agent, Task: params.Task, ExpectedOutput: params.ExpectedOutput, @@ -321,12 +374,18 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall if result.ErrMsg != "" { t.errMsg = result.ErrMsg t.storeStatus(taskFailed) + span.SetStatus(codes.Error, result.ErrMsg) + span.SetAttributes( + attribute.String("error.type", "agent_error"), + attribute.String("cagent.background_agent.outcome", "failed"), + ) slog.Debug("Background agent task failed", "task_id", taskID, "agent", params.Agent, "error", result.ErrMsg) return } - if taskCtx.Err() != nil && t.loadStatus() == taskRunning { + if tracedCtx.Err() != nil && t.loadStatus() == taskRunning { t.storeStatus(taskStopped) + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "stopped")) slog.Debug("Background agent task stopped", "task_id", taskID) return } @@ -335,6 +394,7 @@ func (h *Handler) HandleRun(ctx context.Context, sess *session.Session, toolCall // always see the populated result field. t.result = result.Result if t.casStatus(taskRunning, taskCompleted) { + span.SetAttributes(attribute.String("cagent.background_agent.outcome", "completed")) slog.Debug("Background agent task completed", "task_id", taskID, "agent", params.Agent) } }) diff --git a/pkg/tools/builtin/api/api.go b/pkg/tools/builtin/api/api.go index 9b13d5d1b..5fd30e7a2 100644 --- a/pkg/tools/builtin/api/api.go +++ b/pkg/tools/builtin/api/api.go @@ -13,6 +13,7 @@ import ( "time" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/js" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" @@ -34,7 +35,7 @@ var ( func (t *Tool) callTool(ctx context.Context, toolCall tools.ToolCall) (*tools.ToolCallResult, error) { client := &http.Client{ Timeout: 30 * time.Second, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), } endpoint := t.config.Endpoint diff --git a/pkg/tools/builtin/deferred/deferred.go b/pkg/tools/builtin/deferred/deferred.go index e18b354fb..30f35938e 100644 --- a/pkg/tools/builtin/deferred/deferred.go +++ b/pkg/tools/builtin/deferred/deferred.go @@ -8,6 +8,9 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/tools" ) @@ -84,7 +87,7 @@ type AddToolArgs struct { Name string `json:"name" jsonschema:"The name of the tool to activate"` } -func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { +func (d *Toolset) handleSearchTool(ctx context.Context, args SearchToolArgs) (*tools.ToolCallResult, error) { query := strings.ToLower(args.Query) d.mu.RLock() @@ -103,6 +106,15 @@ func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*too } } + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "search_tool"), + attribute.String("cagent.tool.deferred.query", args.Query), + attribute.Int("cagent.tool.deferred.match_count", len(results)), + 
attribute.Int("cagent.tool.deferred.pool_size", len(d.deferredTools)), + ) + } + if len(results) == 0 { return tools.ResultError(fmt.Sprintf("No deferred tools found matching '%s'", args.Query)), nil } @@ -115,21 +127,37 @@ func (d *Toolset) handleSearchTool(_ context.Context, args SearchToolArgs) (*too return tools.ResultSuccess(fmt.Sprintf("Found %d deferred tool(s):\n%s", len(results), string(output))), nil } -func (d *Toolset) handleAddTool(_ context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { +func (d *Toolset) handleAddTool(ctx context.Context, args AddToolArgs) (*tools.ToolCallResult, error) { d.mu.Lock() defer d.mu.Unlock() + span := trace.SpanFromContext(ctx) + annotate := func(outcome string) { + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.deferred.op", "add_tool"), + attribute.String("cagent.tool.deferred.tool_name", args.Name), + attribute.String("cagent.tool.deferred.outcome", outcome), + attribute.Int("cagent.tool.deferred.activated_count", len(d.activatedTools)), + ) + } + if _, exists := d.activatedTools[args.Name]; exists { + annotate("already_active") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' is already active", args.Name)), nil } entry, exists := d.deferredTools[args.Name] if !exists { + annotate("not_found") return tools.ResultError(fmt.Sprintf("Tool '%s' not found.", args.Name)), nil } delete(d.deferredTools, args.Name) d.activatedTools[args.Name] = entry.tool + annotate("activated") return tools.ResultSuccess(fmt.Sprintf("Tool '%s' has been activated and is now available for use.\n\nDescription: %s", args.Name, entry.tool.Description)), nil } diff --git a/pkg/tools/builtin/fetch/fetch.go b/pkg/tools/builtin/fetch/fetch.go index eb1edd522..1c50d8f48 100644 --- a/pkg/tools/builtin/fetch/fetch.go +++ b/pkg/tools/builtin/fetch/fetch.go @@ -15,7 +15,10 @@ import ( htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" "github.com/k3a/html2text" "github.com/temoto/robotstxt" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/useragent" @@ -48,15 +51,55 @@ type ToolArgs struct { Format string `json:"format,omitempty"` } +// sanitizeFetchURLs strips query strings and userinfo from each URL so +// the resulting span attribute can ship by default without leaking +// signed-URL tokens, OAuth codes, or inline credentials. URLs that fail +// to parse are emitted as a sentinel rather than the raw string, since +// an unparseable URL could also carry sensitive material. +func sanitizeFetchURLs(urls []string) []string { + out := make([]string, len(urls)) + for i, raw := range urls { + u, err := url.Parse(raw) + if err != nil { + out[i] = "" + continue + } + u.RawQuery = "" + u.Fragment = "" + u.User = nil + out[i] = u.String() + } + return out +} + func (h *fetchHandler) CallTool(ctx context.Context, params ToolArgs) (*tools.ToolCallResult, error) { if len(params.URLs) == 0 { return nil, errors.New("at least one URL is required") } + // Decorate the active runtime.tool.handler span with the requested + // URLs. Strip query params and userinfo first: query strings often + // carry signed-URL tokens, OAuth codes, or session IDs, and userinfo + // carries credentials inline. The path stays intact so dashboards + // can still answer "which sites/endpoints did the agent hit?" 
— the + // HTTP CLIENT child span emitted by `httpclient.WrapWithOTel` below + // retains the full URL under `http.url` for callers that opt into + // that backend's full-URL capture. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + attrs := []attribute.KeyValue{ + attribute.Int("cagent.tool.fetch.url_count", len(params.URLs)), + attribute.StringSlice("cagent.tool.fetch.urls", sanitizeFetchURLs(params.URLs)), + } + if params.Format != "" { + attrs = append(attrs, attribute.String("cagent.tool.fetch.format", params.Format)) + } + span.SetAttributes(attrs...) + } + // Set timeout if specified client := &http.Client{ Timeout: h.timeout, - Transport: remote.NewTransport(ctx), + Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx)), // Re-check the domain allow/deny lists on every redirect: without this, // an allowed origin could redirect into a denied one and bypass the // policy. The 10-redirect cap mirrors the net/http default. diff --git a/pkg/tools/builtin/filesystem/filesystem.go b/pkg/tools/builtin/filesystem/filesystem.go index 1bd63ff57..e43605e2a 100644 --- a/pkg/tools/builtin/filesystem/filesystem.go +++ b/pkg/tools/builtin/filesystem/filesystem.go @@ -15,11 +15,49 @@ import ( "strings" "sync" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/chat" "github.com/docker/docker-agent/pkg/fsx" "github.com/docker/docker-agent/pkg/tools" ) +// annotateFilesystemSpan stamps the operation kind and target path +// onto the active runtime.tool.handler span. Paths ship unconditionally +// — they're the main signal of what the agent touched. Drop or hash +// `cagent.tool.filesystem.path` at the OTel collector if paths +// routinely reveal identifiers you don't want shipped. +func annotateFilesystemSpan(ctx context.Context, op, path string) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + attrs := []attribute.KeyValue{ + attribute.String("cagent.tool.filesystem.op", op), + } + if path != "" { + attrs = append(attrs, attribute.String("cagent.tool.filesystem.path", path)) + } + span.SetAttributes(attrs...) +} + +// maxFilesystemPathsAttr caps how many entries from args.Paths land on a +// span attribute. Many backends drop attributes over a few KiB and per- +// element string costs add up fast on a multi-hundred-path call. The +// path_count attribute (always recorded) preserves total fidelity. +const maxFilesystemPathsAttr = 32 + +// cappedPaths returns paths truncated to maxFilesystemPathsAttr entries. +// Callers should also record `path_count = len(paths)` separately so the +// truncation is visible. 
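A worked example of the URL sanitization above, phrased as a test that could sit alongside fetch.go (not part of this patch); the URLs are invented and the expected outputs follow from standard net/url behaviour.

package fetch // would live next to fetch.go; illustrative test only

import (
	"slices"
	"testing"
)

func TestSanitizeFetchURLsExample(t *testing.T) {
	got := sanitizeFetchURLs([]string{
		"https://user:app-token@api.example.com/v1/files/123?sig=abc123#body",
		"https://cdn.example.com/report.pdf",
	})
	want := []string{
		"https://api.example.com/v1/files/123", // userinfo, query, and fragment stripped
		"https://cdn.example.com/report.pdf",   // already clean, unchanged
	}
	if !slices.Equal(got, want) {
		t.Fatalf("sanitizeFetchURLs() = %v, want %v", got, want)
	}
}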
+func cappedPaths(paths []string) []string { + if len(paths) <= maxFilesystemPathsAttr { + return paths + } + return paths[:maxFilesystemPathsAttr] +} + const ( ToolNameReadFile = "read_file" ToolNameReadMultipleFiles = "read_multiple_files" @@ -604,6 +642,7 @@ func (t *Tool) shouldIgnorePath(path string) bool { // Handler implementations func (t *Tool) handleDirectoryTree(ctx context.Context, args DirectoryTreeArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "directory_tree", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -676,6 +715,7 @@ func (t *Tool) editFileHandler() tools.ToolHandler { } func (t *Tool) handleEditFile(ctx context.Context, args EditFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "edit_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -713,7 +753,8 @@ func (t *Tool) handleEditFile(ctx context.Context, args EditFileArgs) (*tools.To return tools.ResultSuccess("File edited successfully. Changes:\n" + strings.Join(changes, "\n")), nil } -func (t *Tool) handleListDirectory(_ context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleListDirectory(ctx context.Context, args ListDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "list_directory", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -754,7 +795,8 @@ func (t *Tool) handleListDirectory(_ context.Context, args ListDirectoryArgs) (* }, nil } -func (t *Tool) handleReadFile(_ context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleReadFile(ctx context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return &tools.ToolCallResult{ @@ -861,6 +903,13 @@ func (t *Tool) readImageFile(resolvedPath, originalPath string) (*tools.ToolCall } func (t *Tool) handleReadMultipleFiles(ctx context.Context, args ReadMultipleFilesArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "read_multiple_files", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } type PathContent struct { Path string `json:"path"` Content string `json:"content"` @@ -934,7 +983,8 @@ func (t *Tool) handleReadMultipleFiles(ctx context.Context, args ReadMultipleFil }, nil } -func (t *Tool) handleSearchFilesContent(_ context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleSearchFilesContent(ctx context.Context, args SearchFilesContentArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "search_files_content", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return tools.ResultError(err.Error()), nil @@ -1054,6 +1104,7 @@ func (t *Tool) handleSearchFilesContent(_ context.Context, args SearchFilesConte } func (t *Tool) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "write_file", args.Path) resolvedPath, err := t.resolveAndCheckPath(args.Path) if err != nil { return 
tools.ResultError(err.Error()), nil @@ -1076,7 +1127,14 @@ func (t *Tool) handleWriteFile(ctx context.Context, args WriteFileArgs) (*tools. return tools.ResultSuccess(fmt.Sprintf("File written successfully: %s (%d bytes)", args.Path, len(args.Content))), nil } -func (t *Tool) handleCreateDirectory(_ context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleCreateDirectory(ctx context.Context, args CreateDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "create_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) @@ -1092,7 +1150,14 @@ func (t *Tool) handleCreateDirectory(_ context.Context, args CreateDirectoryArgs return tools.ResultSuccess(strings.Join(results, "\n")), nil } -func (t *Tool) handleRemoveDirectory(_ context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { +func (t *Tool) handleRemoveDirectory(ctx context.Context, args RemoveDirectoryArgs) (*tools.ToolCallResult, error) { + annotateFilesystemSpan(ctx, "remove_directory", "") + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.filesystem.path_count", len(args.Paths)), + attribute.StringSlice("cagent.tool.filesystem.paths", cappedPaths(args.Paths)), + ) + } var results []string for _, path := range args.Paths { resolvedPath, err := t.resolveAndCheckPath(path) diff --git a/pkg/tools/builtin/lsp/lsp.go b/pkg/tools/builtin/lsp/lsp.go index ed1c42ace..3c3cca40b 100644 --- a/pkg/tools/builtin/lsp/lsp.go +++ b/pkg/tools/builtin/lsp/lsp.go @@ -19,6 +19,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/tools/lifecycle" @@ -460,12 +463,29 @@ type WorkspaceArgs struct{} // lspTool is a shorthand for constructing a tools.Tool with common LSP defaults. func lspTool(name, title, description string, readOnly bool, params any, handler tools.ToolHandler) tools.Tool { + // Wrap the handler so every LSP RPC stamps the LSP method name on + // the active runtime.tool.handler span. Single tool name = single + // LSP operation, so the gen_ai.tool.name attribute on the parent + // span is enough for filtering by RPC kind in dashboards. The + // `cagent.tool.lsp.tool` is redundant with gen_ai.tool.name but + // kept under the cagent.* namespace for symmetry with the other + // builtin tool annotations and so dashboards have a uniform + // `cagent.tool.{kind}.*` query surface across builtins. 
+ wrapped := func(ctx context.Context, tc tools.ToolCall) (*tools.ToolCallResult, error) { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.lsp.tool", name), + attribute.Bool("cagent.tool.lsp.read_only", readOnly), + ) + } + return handler(ctx, tc) + } return tools.Tool{ Name: name, Category: "lsp", Description: description, Parameters: params, - Handler: handler, + Handler: wrapped, Annotations: tools.ToolAnnotations{ Title: title, ReadOnlyHint: readOnly, diff --git a/pkg/tools/builtin/lsp/lsp_lifecycle.go b/pkg/tools/builtin/lsp/lsp_lifecycle.go index 4a7376497..bc8cf8469 100644 --- a/pkg/tools/builtin/lsp/lsp_lifecycle.go +++ b/pkg/tools/builtin/lsp/lsp_lifecycle.go @@ -12,6 +12,7 @@ import ( "sync" "github.com/docker/docker-agent/pkg/concurrent" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools/lifecycle" ) @@ -28,7 +29,7 @@ func (c *lspConnector) Connect(ctx context.Context) (lifecycle.Session, error) { h := c.h slog.Debug("Starting LSP server", "command", h.command, "args", h.args) - p, err := spawnLSPProcess(h) + p, err := spawnLSPProcess(ctx, h) if err != nil { return nil, err } @@ -73,14 +74,19 @@ type lspProcess struct { // kicks off a stderr-drain goroutine bound to the process lifetime. // Errors are mapped to typed lifecycle errors so the supervisor can // apply the right policy. -func spawnLSPProcess(h *lspHandler) (*lspProcess, error) { +func spawnLSPProcess(callerCtx context.Context, h *lspHandler) (*lspProcess, error) { // The process must outlive the caller's request context (which is // often cancelled when an HTTP/agent turn ends). The supervisor // calls Close to shut it down on Stop or restart. processCtx, processCancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(processCtx, h.command, h.args...) + // Inherit the caller's W3C trace context (the Connect call's + // `toolset.start` or per-request span) so an OTel-aware LSP server + // can chain its spans onto the agent trace. Most LSPs do not emit + // OTel today, so this is defensive parity with sandbox.exec. cmd.Env = append(os.Environ(), h.env...) + cmd.Env = append(cmd.Env, genai.InjectTraceContextEnv(callerCtx)...) 
cmd.Dir = h.workingDir stdin, err := cmd.StdinPipe() diff --git a/pkg/tools/builtin/openapi/openapi.go b/pkg/tools/builtin/openapi/openapi.go index 863098e30..97a7c2034 100644 --- a/pkg/tools/builtin/openapi/openapi.go +++ b/pkg/tools/builtin/openapi/openapi.go @@ -18,6 +18,7 @@ import ( v3 "github.com/pb33f/libopenapi/datamodel/high/v3" "go.yaml.in/yaml/v4" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/remote" "github.com/docker/docker-agent/pkg/tools" "github.com/docker/docker-agent/pkg/upstream" @@ -74,7 +75,7 @@ func (t *Tool) fetchSpec(ctx context.Context) (*v3.Document, error) { req.Header.Set("Accept", "application/json") setHeaders(req, t.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } @@ -423,7 +424,7 @@ func (h *openAPIHandler) callTool(ctx context.Context, params openAPICallArgs) ( req.Header.Set("Accept", "application/json") setHeaders(req, h.headers) - resp, err := (&http.Client{Timeout: httpTimeout, Transport: remote.NewTransport(ctx)}).Do(req) + resp, err := (&http.Client{Timeout: httpTimeout, Transport: httpclient.WrapWithOTel(remote.NewTransport(ctx))}).Do(req) if err != nil { return nil, fmt.Errorf("request failed: %w", err) } diff --git a/pkg/tools/builtin/shell/script_shell.go b/pkg/tools/builtin/shell/script_shell.go index 2eacd4259..061466067 100644 --- a/pkg/tools/builtin/shell/script_shell.go +++ b/pkg/tools/builtin/shell/script_shell.go @@ -11,6 +11,9 @@ import ( "slices" "strings" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/shellpath" "github.com/docker/docker-agent/pkg/tools" @@ -138,6 +141,17 @@ func (t *ScriptShellTool) execute(ctx context.Context, toolConfig *latest.Script } } + // Stamp the script_shell call shape onto the active span. Cmd + // ships unconditionally for the same reason as shell.RunShell — + // see that comment for the redact-at-collector guidance. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.script_shell.tool_name", toolCall.Function.Name), + attribute.String("cagent.tool.script_shell.cmd", toolConfig.Cmd), + attribute.String("cagent.tool.script_shell.cwd", cmp.Or(toolConfig.WorkingDir, ".")), + ) + } + shell, argsPrefix := shellpath.DetectShell() cmd := exec.CommandContext(ctx, shell, append(argsPrefix, toolConfig.Cmd)...) diff --git a/pkg/tools/builtin/shell/shell.go b/pkg/tools/builtin/shell/shell.go index 53d8beb61..96377bc24 100644 --- a/pkg/tools/builtin/shell/shell.go +++ b/pkg/tools/builtin/shell/shell.go @@ -16,6 +16,9 @@ import ( "sync/atomic" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/config" "github.com/docker/docker-agent/pkg/shellpath" @@ -199,6 +202,19 @@ func (h *shellHandler) RunShell(ctx context.Context, params RunShellArgs) (*tool cwd := h.resolveWorkDir(params.Cwd) + // Stamp the call shape (cmd, cwd, timeout) onto the active span. + // Cmd ships unconditionally — it's the main signal of what the + // agent actually did, and gating it on chat-content capture loses + // too much debug value. 
Drop or hash `cagent.tool.shell.cmd` at + // the OTel collector if commands routinely carry secrets. + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes( + attribute.String("cagent.tool.shell.cmd", params.Cmd), + attribute.Float64("cagent.tool.shell.timeout_seconds", timeout.Seconds()), + attribute.String("cagent.tool.shell.cwd", cwd), + ) + } + slog.Debug("Executing native shell command", "command", params.Cmd, "cwd", cwd) return h.runNativeCommand(timeoutCtx, ctx, params.Cmd, cwd, timeout), nil diff --git a/pkg/tools/builtin/todo/todo.go b/pkg/tools/builtin/todo/todo.go index 58dd73f98..450fa5a60 100644 --- a/pkg/tools/builtin/todo/todo.go +++ b/pkg/tools/builtin/todo/todo.go @@ -8,10 +8,43 @@ import ( "sync" "sync/atomic" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/concurrent" "github.com/docker/docker-agent/pkg/tools" ) +// annotateTodoSpan stamps the operation kind, batch size, and the +// resulting list size onto the active runtime.tool.handler span so a +// glance at a session shows when the agent was actually managing +// progress vs. just chatting. +func annotateTodoSpan(ctx context.Context, op string, batch, total, completed int) { + span := trace.SpanFromContext(ctx) + if !span.IsRecording() { + return + } + span.SetAttributes( + attribute.String("cagent.tool.todo.op", op), + attribute.Int("cagent.tool.todo.batch_size", batch), + attribute.Int("cagent.tool.todo.total", total), + attribute.Int("cagent.tool.todo.completed", completed), + ) +} + +// countCompleted returns how many todos in the current snapshot are +// marked completed. Cheap O(n) scan over a typically-tiny slice; called +// once per todo handler invocation for the span annotation. 
+func countCompleted(all []Todo) int { + n := 0 + for _, t := range all { + if t.Status == "completed" { + n++ + } + } + return n +} + const ( ToolNameCreateTodo = "create_todo" ToolNameCreateTodos = "create_todos" @@ -199,9 +232,11 @@ func (h *todoHandler) jsonResult(ctx context.Context, v any) (*tools.ToolCallRes func (h *todoHandler) createTodo(ctx context.Context, params CreateTodoArgs) (*tools.ToolCallResult, error) { created := h.addTodo(ctx, params.Description) + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todo", 1, len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodoOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -211,9 +246,11 @@ func (h *todoHandler) createTodos(ctx context.Context, params CreateTodosArgs) ( for _, desc := range params.Descriptions { created = append(created, h.addTodo(ctx, desc)) } + all := h.storage.All(ctx) + annotateTodoSpan(ctx, "create_todos", len(params.Descriptions), len(all), countCompleted(all)) return h.jsonResult(ctx, CreateTodosOutput{ Created: created, - AllTodos: h.storage.All(ctx), + AllTodos: all, Reminder: h.incompleteReminder(ctx), }) } @@ -246,6 +283,7 @@ func (h *todoHandler) updateTodos(ctx context.Context, params UpdateTodosArgs) ( result.AllTodos = h.storage.All(ctx) result.Reminder = h.incompleteReminder(ctx) + annotateTodoSpan(ctx, "update_todos", len(params.Updates), len(result.AllTodos), countCompleted(result.AllTodos)) return h.jsonResult(ctx, result) } @@ -283,6 +321,7 @@ func (h *todoHandler) listTodos(ctx context.Context, _ tools.ToolCall) (*tools.T if todos == nil { todos = []Todo{} } + annotateTodoSpan(ctx, "list_todos", 0, len(todos), countCompleted(todos)) out := ListTodosOutput{Todos: todos} out.Reminder = h.incompleteReminder(ctx) return h.jsonResult(ctx, out) diff --git a/pkg/tools/builtin/userprompt/userprompt.go b/pkg/tools/builtin/userprompt/userprompt.go index 421845a5e..aecf2ce68 100644 --- a/pkg/tools/builtin/userprompt/userprompt.go +++ b/pkg/tools/builtin/userprompt/userprompt.go @@ -6,6 +6,8 @@ import ( "fmt" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/tools" ) @@ -47,6 +49,14 @@ func (t *Tool) userPrompt(ctx context.Context, params Args) (*tools.ToolCallResu return tools.ResultError("user_prompt tool is not available in this context (no elicitation handler configured)"), nil } + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes( + attribute.Int("cagent.tool.user_prompt.message_length", len(params.Message)), + attribute.Bool("cagent.tool.user_prompt.has_schema", params.Schema != nil), + ) + } + var meta mcp.Meta if params.Title != "" { meta = mcp.Meta{"cagent/title": params.Title} @@ -68,6 +78,10 @@ func (t *Tool) userPrompt(ctx context.Context, params Args) (*tools.ToolCallResu Content: result.Content, } + if span.IsRecording() { + span.SetAttributes(attribute.String("cagent.tool.user_prompt.action", string(result.Action))) + } + responseJSON, err := json.Marshal(response) if err != nil { return nil, fmt.Errorf("failed to marshal response: %w", err) diff --git a/pkg/tools/codemode/exec.go b/pkg/tools/codemode/exec.go index 0d16b3035..df143b1f4 100644 --- a/pkg/tools/codemode/exec.go +++ b/pkg/tools/codemode/exec.go @@ -3,12 +3,17 @@ package codemode import ( "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "fmt" "slices" "github.com/dop251/goja" + 
"go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + "github.com/docker/docker-agent/pkg/telemetry/genai" "github.com/docker/docker-agent/pkg/tools" ) @@ -40,6 +45,29 @@ func (c *codeModeTool) runJavascript(ctx context.Context, script string) (Script vm := goja.New() tracker := &toolCallTracker{} + // Always stamp a hash + length so dashboards can correlate + // identical scripts ("model ran the same script 200 times this + // hour") without ever shipping the body. Codemode scripts are + // kilobyte-scale arbitrary JS — embedded auth tokens, pasted + // user data, and inline secrets are common — so the body itself + // is gated behind the GenAI content-capture opt-in. + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + sum := sha256.Sum256([]byte(script)) + span.SetAttributes( + attribute.String("cagent.tool.codemode.script_hash", hex.EncodeToString(sum[:])), + attribute.Int("cagent.tool.codemode.script_length", len(script)), + ) + if genai.IsContentCaptureEnabled() { + span.SetAttributes(attribute.String("cagent.tool.codemode.script", script)) + } + } + defer func() { + if span.IsRecording() { + span.SetAttributes(attribute.Int("cagent.tool.codemode.tool_call_count", len(tracker.calls))) + } + }() + // Inject console object to the help the LLM debug its own code. var ( stdOut bytes.Buffer diff --git a/pkg/tools/mcp/mcp.go b/pkg/tools/mcp/mcp.go index a0537fba3..34f9f3abf 100644 --- a/pkg/tools/mcp/mcp.go +++ b/pkg/tools/mcp/mcp.go @@ -16,6 +16,8 @@ import ( "time" "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "github.com/docker/docker-agent/pkg/config/latest" "github.com/docker/docker-agent/pkg/tools" @@ -33,6 +35,11 @@ type mcpClient interface { SetManagedOAuth(managed bool) SetToolListChangedHandler(handler func()) SetPromptListChangedHandler(handler func()) + // ServerAddress returns the connection identifier (URL for remote + // clients, executable name for stdio). Used by `Toolset.Start` to + // stamp `server.address` on the parent `toolset.start` span so + // initialize failures show which target produced them. + ServerAddress() string // Wait blocks until the underlying connection is closed by the server. // It returns nil if the connection was closed gracefully. Wait() error @@ -286,6 +293,19 @@ func (ts *Toolset) Start(ctx context.Context) error { if ts.supervisor == nil { return errors.New("toolset has no supervisor: must be created via NewToolsetCommand or NewRemoteToolset") } + // Stamp the connection identifier on the parent `toolset.start` + // span before doing anything else so an Initialize failure (e.g. + // the multi-replica MCP "session not found" 404 case) carries the + // target address as `server.address` — without this, the error + // message has the only clue and triage requires log greppage to + // match toolsets to URLs. 
+ if ts.mcpClient != nil { + if addr := ts.mcpClient.ServerAddress(); addr != "" { + if span := trace.SpanFromContext(ctx); span.IsRecording() { + span.SetAttributes(attribute.String("server.address", addr)) + } + } + } return ts.supervisor.Start(ctx) } diff --git a/pkg/tools/mcp/mcp_test.go b/pkg/tools/mcp/mcp_test.go index 8a80e6264..63be08bab 100644 --- a/pkg/tools/mcp/mcp_test.go +++ b/pkg/tools/mcp/mcp_test.go @@ -50,6 +50,8 @@ func (m *mockMCPClient) SetToolListChangedHandler(func()) {} func (m *mockMCPClient) SetPromptListChangedHandler(func()) {} +func (m *mockMCPClient) ServerAddress() string { return "mock://test" } + func (m *mockMCPClient) Wait() error { return nil } func (m *mockMCPClient) Close(context.Context) error { return nil } diff --git a/pkg/tools/mcp/oauth.go b/pkg/tools/mcp/oauth.go index fa3fb3b72..6385db666 100644 --- a/pkg/tools/mcp/oauth.go +++ b/pkg/tools/mcp/oauth.go @@ -16,9 +16,15 @@ import ( "time" mcpsdk "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -475,17 +481,42 @@ func (t *oauthTransport) getValidToken(ctx context.Context) *OAuthToken { slog.Debug("Attempting silent token refresh", "url", t.baseURL) - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + // Wrap the refresh path in a span so the latency and failure + // rate of silent OAuth token refreshes are visible — the user + // otherwise just sees a stalled MCP request with no obvious + // cause. Pull conversation id from baggage so observability-svc + // can attribute the refresh to the spawning session. 
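// otelmcp.ConversationIDFromBaggage is used below but not defined in this
// diff. A minimal sketch of its assumed shape, using
// go.opentelemetry.io/otel/baggage (the real helper lives in
// pkg/telemetry/mcp and may read a different baggage key):

func ConversationIDFromBaggage(ctx context.Context) string {
	// The zero Member's Value() is "", so callers can gate the
	// gen_ai.conversation.id span attribute on a non-empty result,
	// exactly as the refresh and flow spans below do.
	return baggage.FromContext(ctx).Member("gen_ai.conversation.id").Value()
}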
+ refreshAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + refreshAttrs = append(refreshAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, refreshSpan := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.token.refresh", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(refreshAttrs...), + ) + defer refreshSpan.End() + + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} authServer := cmp.Or(token.AuthServer, t.baseURL) metadata, err := o.getAuthorizationServerMetadata(ctx, authServer) if err != nil { slog.Debug("Failed to fetch auth server metadata for refresh", "auth_server", authServer, "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "metadata fetch failed") + refreshSpan.SetAttributes(attribute.String("error.type", "metadata")) return nil } newToken, err := RefreshAccessToken(ctx, metadata.TokenEndpoint, token.RefreshToken, token.ClientID, token.ClientSecret) if err != nil { slog.Debug("Token refresh failed, will require interactive auth", "error", err) + refreshSpan.RecordError(err) + refreshSpan.SetStatus(codes.Error, "refresh failed") + refreshSpan.SetAttributes(attribute.String("error.type", "refresh_token")) t.mu.Lock() t.refreshFailedAt = time.Now() t.mu.Unlock() @@ -546,24 +577,54 @@ func configuredScopes(c *latest.RemoteOAuthConfig) []string { } // handleOAuthFlow performs the OAuth flow when a 401 response is received -func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { +func (t *oauthTransport) handleOAuthFlow(ctx context.Context, authServer, wwwAuth string) (err error) { + kind := "unmanaged" if t.managed { - return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + kind = "managed" + } + // Interactive OAuth flows can take seconds to minutes (user + // switches to browser, completes the consent screen, comes + // back). The span makes that latency attributable and gives + // dashboards a way to count auth-failure rates by managed kind. 
+ flowAttrs := []attribute.KeyValue{ + attribute.String("cagent.oauth.base_url", t.baseURL), + attribute.String("cagent.oauth.kind", kind), } + if convID := otelmcp.ConversationIDFromBaggage(ctx); convID != "" { + flowAttrs = append(flowAttrs, attribute.String("gen_ai.conversation.id", convID)) + } + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools/mcp").Start( + ctx, + "oauth.flow", + trace.WithSpanKind(trace.SpanKindClient), + trace.WithAttributes(flowAttrs...), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() + if t.managed { + return t.handleManagedOAuthFlow(ctx, authServer, wwwAuth) + } return t.handleUnmanagedOAuthFlow(ctx, authServer, wwwAuth) } func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -585,7 +646,8 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) @@ -628,6 +690,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, scopes = t.oauthConfig.Scopes case authServerMetadata.RegistrationEndpoint != "": slog.Debug("Attempting dynamic client registration") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "dynamic_client_registration"))) clientID, clientSecret, err = RegisterClient(ctx, authServerMetadata, redirectURI, nil) if err != nil { slog.Debug("Dynamic registration failed", "error", err) @@ -676,6 +739,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Requesting authorization code", "url", authURL) + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "request_authorization_code"))) code, receivedState, err := RequestAuthorizationCode(ctx, authURL, callbackServer, state) if err != nil { @@ -687,6 +751,7 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, } slog.Debug("Exchanging authorization code for token") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "token_exchange"))) token, err := ExchangeCodeForToken( ctx, authServerMetadata.TokenEndpoint, @@ -720,15 +785,17 @@ func (t *oauthTransport) handleManagedOAuthFlow(ctx context.Context, authServer, // where the client handles the OAuth 
interaction instead of us func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServer, wwwAuth string) error { slog.Debug("Starting unmanaged OAuth flow for server", "url", t.baseURL) + span := trace.SpanFromContext(ctx) // Extract resource URL from WWW-Authenticate header resourceURL := cmp.Or(resourceMetadataFromWWWAuth(wwwAuth), authServer+"/.well-known/oauth-protected-resource") + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_protected_resource_metadata"))) resourceReq, err := http.NewRequestWithContext(ctx, http.MethodGet, resourceURL, http.NoBody) if err != nil { return err } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return err } @@ -750,7 +817,8 @@ func (t *oauthTransport) handleUnmanagedOAuthFlow(ctx context.Context, authServe resourceMetadata.AuthorizationServers = []string{authServer} } - oauth := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + oauth := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} + span.AddEvent("oauth.step", trace.WithAttributes(attribute.String("cagent.oauth.step", "fetch_authorization_server_metadata"))) authServerMetadata, err := oauth.getAuthorizationServerMetadata(ctx, resourceMetadata.AuthorizationServers[0]) if err != nil { return fmt.Errorf("failed to fetch authorization server metadata: %w", err) diff --git a/pkg/tools/mcp/oauth_helpers.go b/pkg/tools/mcp/oauth_helpers.go index ca9e862c8..768bec002 100644 --- a/pkg/tools/mcp/oauth_helpers.go +++ b/pkg/tools/mcp/oauth_helpers.go @@ -16,6 +16,7 @@ import ( "golang.org/x/oauth2" "github.com/docker/docker-agent/pkg/browser" + "github.com/docker/docker-agent/pkg/httpclient" ) // GenerateState generates a random state parameter for OAuth CSRF protection @@ -62,7 +63,7 @@ func ExchangeCodeForToken(ctx context.Context, tokenEndpoint, code, codeVerifier req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to exchange code for token: %w", err) } @@ -221,7 +222,7 @@ func RegisterClient(ctx context.Context, authMetadata *AuthorizationServerMetada } req.Header.Set("Content-Type", "application/json") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return "", "", fmt.Errorf("failed to register client: %w", err) } @@ -269,7 +270,7 @@ func RefreshAccessToken(ctx context.Context, tokenEndpoint, refreshToken, client } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - resp, err := http.DefaultClient.Do(req) + resp, err := httpclient.TracedDefaultClient().Do(req) if err != nil { return nil, fmt.Errorf("failed to refresh token: %w", err) } diff --git a/pkg/tools/mcp/oauth_login.go b/pkg/tools/mcp/oauth_login.go index 00d57c8fb..a71ddc2a2 100644 --- a/pkg/tools/mcp/oauth_login.go +++ b/pkg/tools/mcp/oauth_login.go @@ -11,6 +11,8 @@ import ( "time" "golang.org/x/oauth2" + + "github.com/docker/docker-agent/pkg/httpclient" ) // PerformOAuthLogin performs a standalone OAuth flow for the given MCP server URL. 
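// The httpclient helpers swapped in throughout this change (TracedClient,
// TracedDefaultClient, WrapWithOTel) are not part of the diff. A rough
// sketch of their assumed behaviour, not the actual pkg/httpclient code:
// TracedClient applies caller tweaks to a client whose transport is wrapped
// with otelhttp, TracedDefaultClient is the zero-tweak variant, and
// WrapWithOTel degrades to a pass-through when OTel is disabled. Assumes
// net/http, time, and the otelhttp contrib package are imported.

func TracedClient(tweaks ...func(*http.Client)) *http.Client {
	c := &http.Client{Transport: WrapWithOTel(http.DefaultTransport)}
	for _, tweak := range tweaks {
		tweak(c) // e.g. func(c *http.Client) { c.Timeout = 5 * time.Second }
	}
	return c
}

func TracedDefaultClient() *http.Client { return TracedClient() }

func WrapWithOTel(rt http.RoundTripper) http.RoundTripper {
	if !otelEnabled() { // hypothetical runtime gate; name assumed
		return rt
	}
	// otelhttp.NewTransport emits an HTTP CLIENT span per request and
	// injects traceparent via the globally registered propagator.
	return otelhttp.NewTransport(rt)
}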
@@ -19,7 +21,7 @@ import ( func PerformOAuthLogin(ctx context.Context, serverURL string) error { tokenStore := NewKeyringTokenStore() - o := &oauth{metadataClient: &http.Client{Timeout: 5 * time.Second}} + o := &oauth{metadataClient: httpclient.TracedClient(func(c *http.Client) { c.Timeout = 5 * time.Second })} // Derive the base origin (scheme + host) from the server URL. // The well-known endpoints live at the origin, not under the SSE/path. @@ -35,7 +37,7 @@ func PerformOAuthLogin(ctx context.Context, serverURL string) error { if err != nil { return fmt.Errorf("failed to create resource metadata request: %w", err) } - resp, err := http.DefaultClient.Do(resourceReq) + resp, err := httpclient.TracedDefaultClient().Do(resourceReq) if err != nil { return fmt.Errorf("failed to fetch protected resource metadata: %w", err) } diff --git a/pkg/tools/mcp/oauth_server.go b/pkg/tools/mcp/oauth_server.go index 5a355ccb4..527316a5d 100644 --- a/pkg/tools/mcp/oauth_server.go +++ b/pkg/tools/mcp/oauth_server.go @@ -12,6 +12,8 @@ import ( "strings" "sync" "time" + + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" ) // CallbackServer handles OAuth callback requests @@ -53,8 +55,12 @@ func NewCallbackServerOnPort(port int) (*CallbackServer, error) { mux := http.NewServeMux() mux.HandleFunc("/callback", cs.handleCallback) + // Wrap with otelhttp so the OAuth callback span chains onto the + // caller's trace when the OAuth provider preserves trace context + // in the redirect (most don't, but the wrap is harmless when + // they don't, and useful when they do). cs.server = &http.Server{ - Handler: mux, + Handler: otelhttp.NewHandler(mux, "oauth.callback"), ReadTimeout: 10 * time.Second, WriteTimeout: 10 * time.Second, } diff --git a/pkg/tools/mcp/reconnect_test.go b/pkg/tools/mcp/reconnect_test.go index 71ece482b..df0257a89 100644 --- a/pkg/tools/mcp/reconnect_test.go +++ b/pkg/tools/mcp/reconnect_test.go @@ -72,6 +72,7 @@ func (m *failingInitClient) SetOAuthSuccessHandler(func()) {} func (m *failingInitClient) SetManagedOAuth(bool) {} func (m *failingInitClient) SetToolListChangedHandler(func()) {} func (m *failingInitClient) SetPromptListChangedHandler(func()) {} +func (m *failingInitClient) ServerAddress() string { return "mock://failing" } func (m *failingInitClient) Wait() error { m.mu.Lock() diff --git a/pkg/tools/mcp/remote.go b/pkg/tools/mcp/remote.go index 805c3fe1a..42a7a4254 100644 --- a/pkg/tools/mcp/remote.go +++ b/pkg/tools/mcp/remote.go @@ -5,10 +5,12 @@ import ( "fmt" "log/slog" "net/http" + neturl "net/url" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" "github.com/docker/docker-agent/pkg/config/latest" + "github.com/docker/docker-agent/pkg/httpclient" "github.com/docker/docker-agent/pkg/upstream" ) @@ -31,6 +33,7 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } return &remoteMCPClient{ + sessionClient: sessionClient{serverAddress: sanitizeRemoteAddress(url)}, url: url, transportType: transportType, headers: headers, @@ -39,6 +42,26 @@ func newRemoteClient(url, transportType string, headers map[string]string, token } } +// sanitizeRemoteAddress extracts a span-safe identifier from an MCP URL +// before stamping it as `server.address`. The URL may legitimately +// contain credentials in userinfo (`https://user:token@host/`) or query +// params (`?api_key=...`); sending those to the trace backend would be +// a real exfiltration risk. 
OTel's semantic convention for +// `server.address` is the host (with optional port) anyway, so we keep +// only `u.Host` and drop everything else. +// +// Returns the empty string on parse failure or hostless URLs (file://, +// stdio commands, malformed input). The caller stamps `server.address` +// only when it's non-empty, so a sanitisation miss leaves the span +// without that attribute rather than leaking a raw URL. +func sanitizeRemoteAddress(rawURL string) string { + u, err := neturl.Parse(rawURL) + if err != nil || u.Host == "" { + return "" + } + return u.Host +} + func (c *remoteMCPClient) Initialize(ctx context.Context, _ *gomcp.InitializeRequest) (*gomcp.InitializeResult, error) { // Create HTTP client with OAuth support. We keep a reference to the // oauthTransport so we can enrich Connect errors with the server's own @@ -132,6 +155,16 @@ func (c *remoteMCPClient) SetManagedOAuth(managed bool) { // The oauthTransport is returned alongside the client so callers can inspect // the most recent server-side failure (via lastServerError) when Connect() // returns a bare HTTP-status error and we need to surface the actual cause. +// +// The transport chain wraps `httpclient.WrapWithOTel` outermost so every +// outbound MCP request injects W3C `traceparent` (and creates an HTTP +// CLIENT span). Without this wrap, the streamable-HTTP / SSE transports +// the gomcp SDK builds with our `*http.Client` send raw POST/GET requests +// that never chain onto the calling cagent span — the downstream MCP +// server's spans then live in a separate root trace, breaking end-to-end +// observability for any agent talking to a remote MCP. `WrapWithOTel` is +// a no-op when OTel is disabled at runtime, so the laptop-mode default +// stays unchanged. func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { base := c.headerTransport() @@ -145,7 +178,7 @@ func (c *remoteMCPClient) createHTTPClient() (*http.Client, *oauthTransport) { oauthConfig: c.oauthConfig, } - return &http.Client{Transport: oauthT}, oauthT + return &http.Client{Transport: httpclient.WrapWithOTel(oauthT)}, oauthT } func (c *remoteMCPClient) headerTransport() http.RoundTripper { diff --git a/pkg/tools/mcp/remote_test.go b/pkg/tools/mcp/remote_test.go index 98678fd5d..17d97c9b0 100644 --- a/pkg/tools/mcp/remote_test.go +++ b/pkg/tools/mcp/remote_test.go @@ -12,6 +12,38 @@ import ( "github.com/stretchr/testify/require" ) +// TestSanitizeRemoteAddress verifies that URLs with embedded credentials +// (basic-auth userinfo, query-string secrets) collapse to a host-only +// string before reaching the `server.address` span attribute. The point +// is exfiltration safety: a URL like `https://user:token@host/?api_key=…` +// would otherwise be replicated verbatim into every CLIENT span and +// shipped to the trace backend. 
+func TestSanitizeRemoteAddress(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + url string + want string + }{ + {name: "plain", url: "https://example.com/mcp", want: "example.com"}, + {name: "host with port", url: "https://example.com:8443/mcp", want: "example.com:8443"}, + {name: "userinfo stripped", url: "https://alice:s3cret@example.com/mcp", want: "example.com"}, + {name: "query stripped", url: "https://example.com/mcp?api_key=s3cret", want: "example.com"}, + {name: "userinfo and query stripped", url: "https://alice:s3cret@example.com:8443/mcp?api_key=x", want: "example.com:8443"}, + {name: "fragment stripped", url: "https://example.com/mcp#frag", want: "example.com"}, + {name: "hostless empty fallback", url: "not-a-url", want: ""}, + {name: "empty input", url: "", want: ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + got := sanitizeRemoteAddress(tc.url) + assert.Equal(t, tc.want, got, "sanitizeRemoteAddress(%q)", tc.url) + }) + } +} + // TestRemoteClientCustomHeaders verifies that custom headers passed to the remote // MCP client are actually applied to HTTP requests sent to the MCP server. func TestRemoteClientCustomHeaders(t *testing.T) { diff --git a/pkg/tools/mcp/session_client.go b/pkg/tools/mcp/session_client.go index 778ee1530..e2259142c 100644 --- a/pkg/tools/mcp/session_client.go +++ b/pkg/tools/mcp/session_client.go @@ -9,7 +9,9 @@ import ( "sync" gomcp "github.com/modelcontextprotocol/go-sdk/mcp" + "go.opentelemetry.io/otel/attribute" + otelmcp "github.com/docker/docker-agent/pkg/telemetry/mcp" "github.com/docker/docker-agent/pkg/tools" ) @@ -17,8 +19,16 @@ import ( // implementations. Both stdioMCPClient and remoteMCPClient embed it to avoid // duplicating the session-nil guards, notification handlers, and delegating // methods. +// +// `serverAddress` is captured at construction time (the remote URL for +// HTTP/SSE clients, the executable name for stdio clients) and stamped on +// every CLIENT-kind MCP span as the OTel `server.address` attribute. Without +// it, a `tools/list` failure span carries `mcp.method.name=tools/list` and +// nothing else identifying which target produced the error — useful in a +// single-MCP agent, useless in any agent wired to two or more. type sessionClient struct { session *gomcp.ClientSession + serverAddress string toolListChangedHandler func() promptListChangedHandler func() elicitationHandler tools.ElicitationHandler @@ -33,6 +43,15 @@ func (c *sessionClient) setSession(s *gomcp.ClientSession) { c.mu.Unlock() } +// ServerAddress returns the connection identifier captured at construction +// time (URL for remote clients, executable name for stdio). Exposed so +// the parent `toolset.start` span can stamp it as `server.address` — +// otherwise an Initialize failure surfaces the error message but no +// indication of which MCP target produced it. +func (c *sessionClient) ServerAddress() string { + return c.serverAddress +} + // getSession returns the current session under the read lock. 
func (c *sessionClient) getSession() *gomcp.ClientSession { c.mu.RLock() @@ -93,35 +112,140 @@ func (c *sessionClient) Close(context.Context) error { } func (c *sessionClient) ListTools(ctx context.Context, request *gomcp.ListToolsParams) iter.Seq2[*gomcp.Tool, error] { - if s := c.getSession(); s != nil { - return s.Tools(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Tool, error) bool) { + yield(nil, errors.New("session not initialized")) + } } + // Start the span and the underlying RPC inside the closure so a + // caller that obtains the iterator and never iterates does not + // leak the span (and the in-flight RPC). Span lifetime now equals + // iteration lifetime. return func(yield func(*gomcp.Tool, error) bool) { - yield(nil, errors.New("session not initialized")) + spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodToolsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + }) + defer span.End() + + // Stamp the tool count on the span when iteration finishes — + // answers "what did this server actually return?" without + // having to walk into the JSON-RPC payload. Counts only the + // tools the iterator yielded successfully; partial counts are + // preserved when the caller breaks out early. + var count int + defer func() { + span.SetAttributes(attribute.Int("cagent.mcp.tools.count", count)) + }() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for tool, err := range s.Tools(spanCtx, request) { + if err != nil { + // Record each error inline rather than only the + // last one — paginated lists may yield multiple + // failures and the trace should reflect them all. + span.RecordError(err, "") + } else if tool != nil { + count++ + } + if !yield(tool, err) { + return + } + } } } func (c *sessionClient) CallTool(ctx context.Context, request *gomcp.CallToolParams) (*gomcp.CallToolResult, error) { - if s := c.getSession(); s != nil { - return s.CallTool(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodToolsCall, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + } + if request != nil { + opts.ToolName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.CallTool(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } func (c *sessionClient) ListPrompts(ctx context.Context, request *gomcp.ListPromptsParams) iter.Seq2[*gomcp.Prompt, error] { - if s := c.getSession(); s != nil { - return s.Prompts(ctx, request) + s := c.getSession() + if s == nil { + return func(yield func(*gomcp.Prompt, error) bool) { + yield(nil, errors.New("session not initialized")) + } } return func(yield func(*gomcp.Prompt, error) bool) { - yield(nil, errors.New("session not initialized")) + // Span and RPC start at iteration time so an unused + // iterator never leaks either. 
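// The otelmcp helpers driving these spans (StartClient, CallOptions,
// EnsureMeta, InjectMeta) live in pkg/telemetry/mcp and are not shown in
// this diff. A minimal sketch of the assumed shapes, treating mcp.Meta as a
// plain string-keyed map carrier; the real code may differ:

// CallOptions carries the per-call identifiers stamped on each MCP CLIENT
// span: method, session, target, plus the tool or prompt name when relevant.
type CallOptions struct {
	Method        string // e.g. MethodToolsCall
	SessionID     string
	ServerAddress string
	ToolName      string // tools/call only
	PromptName    string // prompts/get only
}

// EnsureMeta allocates the request _meta map when the caller passed nil so
// InjectMeta always has somewhere to write.
func EnsureMeta(m mcp.Meta) mcp.Meta {
	if m == nil {
		m = mcp.Meta{}
	}
	return m
}

// InjectMeta copies the active trace context (traceparent, tracestate,
// baggage) into the request _meta via the globally registered propagator so
// the MCP server can continue the trace.
func InjectMeta(ctx context.Context, m mcp.Meta) {
	carrier := propagation.MapCarrier{}
	otel.GetTextMapPropagator().Inject(ctx, carrier)
	for k, v := range carrier {
		m[k] = v
	}
}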
+ spanCtx, span := otelmcp.StartClient(ctx, otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsList, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + }) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + for prompt, err := range s.Prompts(spanCtx, request) { + if err != nil { + span.RecordError(err, "") + } + if !yield(prompt, err) { + return + } + } } } func (c *sessionClient) GetPrompt(ctx context.Context, request *gomcp.GetPromptParams) (*gomcp.GetPromptResult, error) { - if s := c.getSession(); s != nil { - return s.GetPrompt(ctx, request) + s := c.getSession() + if s == nil { + return nil, errors.New("session not initialized") + } + opts := otelmcp.CallOptions{ + Method: otelmcp.MethodPromptsGet, + SessionID: s.ID(), + ServerAddress: c.serverAddress, + } + if request != nil { + opts.PromptName = request.Name + } + spanCtx, span := otelmcp.StartClient(ctx, opts) + defer span.End() + + if request != nil { + request.Meta = otelmcp.EnsureMeta(request.Meta) + otelmcp.InjectMeta(spanCtx, request.Meta) + } + + result, err := s.GetPrompt(spanCtx, request) + if err != nil { + span.RecordError(err, "") } - return nil, errors.New("session not initialized") + return result, err } // handleElicitationRequest forwards incoming elicitation requests from the MCP diff --git a/pkg/tools/mcp/stdio.go b/pkg/tools/mcp/stdio.go index 01e3fab25..454fb3139 100644 --- a/pkg/tools/mcp/stdio.go +++ b/pkg/tools/mcp/stdio.go @@ -22,10 +22,15 @@ type stdioMCPClient struct { func newStdioCmdClient(command string, args, env []string, cwd string) *stdioMCPClient { return &stdioMCPClient{ - command: command, - args: args, - env: env, - cwd: cwd, + // stdio has no real "server address" in the OTel HTTP sense; using + // the command as a stand-in keeps spans triageable when the agent + // has multiple stdio MCPs wired up. Span readers see the + // executable name (e.g. `foo-mcp-server`) on `server.address`. + sessionClient: sessionClient{serverAddress: command}, + command: command, + args: args, + env: env, + cwd: cwd, } } diff --git a/pkg/tools/startable.go b/pkg/tools/startable.go index f550a4553..67258ba6e 100644 --- a/pkg/tools/startable.go +++ b/pkg/tools/startable.go @@ -4,6 +4,11 @@ import ( "context" "fmt" "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) // Describer can be implemented by a ToolSet to provide a short, user-visible @@ -65,7 +70,7 @@ func (s *StartableToolSet) IsStarted() bool { // Concurrent callers block until the start attempt completes. // If start fails, a future call will retry. // If the underlying toolset doesn't implement Startable, this is a no-op. -func (s *StartableToolSet) Start(ctx context.Context) error { +func (s *StartableToolSet) Start(ctx context.Context) (err error) { s.mu.Lock() defer s.mu.Unlock() @@ -74,6 +79,32 @@ func (s *StartableToolSet) Start(ctx context.Context) error { } if startable, ok := As[Startable](s.ToolSet); ok { + // Span the toolset startup — MCP handshake, OAuth probes, + // tool discovery, etc. can take seconds to minutes and the + // "tools loading…" UI was previously unattributable. Only + // fires when the toolset has work to do; cheap toolsets + // without a Startable implementation skip the span entirely. + // Unwrap once so the kind attribute names the underlying toolset + // (e.g. 
*mcp.Toolset, *builtin.ShellTool) instead of the + // *tools.namedToolSet wrapper that every toolset gets in the + // registry — same pattern DescribeToolSet uses. + inner := s.ToolSet + if u, ok := inner.(Unwrapper); ok { + inner = u.Unwrap() + } + ctx, span := otel.Tracer("github.com/docker/docker-agent/pkg/tools").Start( + ctx, + "toolset.start", + trace.WithSpanKind(trace.SpanKindInternal), + trace.WithAttributes(attribute.String("cagent.toolset.kind", fmt.Sprintf("%T", inner))), + ) + defer func() { + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + } + span.End() + }() if err := startable.Start(ctx); err != nil { // Queue a warning ONLY on the first failure of a streak so // repeated retries don't re-queue duplicate warnings.
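// The Unwrapper interface used in the startup span above is defined
// elsewhere in pkg/tools and is not part of this diff; from its use here,
// its assumed shape is simply:

type Unwrapper interface {
	// Unwrap returns the toolset wrapped by a decorating ToolSet (such as
	// the registry's naming wrapper), so callers can report the concrete
	// underlying type.
	Unwrap() ToolSet
}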