From 1603030f694fd2d489191b6fe55b1499a08b0608 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 16 Jun 2026 00:32:01 -0700 Subject: [PATCH] [gateway] forward-port dynamic forward proxy h2/h2c upstream support with auto_sni These DFP improvements shipped on the v0.11.x release line (authored 2026-06-03 for the NetworkingProd1 incident) but were never forward-ported to main, which diverged from that line on 2025-10-01. This brings them to main, correct from the start: - Speak HTTP/2 upstream for h2/h2c Backends by emitting HttpProtocolOptions on the dynamic_forward_proxy cluster (previously HTTP/1.1 only, which defeated multiplexing and made the upstream connection cap easy to hit). - Remove Envoy's implicit 1024 max_connections/max_requests cap on http and dfp clusters via explicit circuit breakers. - Set auto_sni and auto_san_validation on the DFP cluster's HttpProtocolOptions. Envoy's dynamic_forward_proxy factory requires both once typed_extension_protocol_options are present (otherwise the cluster is rejected: "dynamic_forward_proxy cluster must have auto_sni and auto_san_validation true ..."). This derives the upstream SNI from the dynamically resolved host and validates the cert SAN against it, preserving TLS verification without allow_insecure_cluster_options. Forward-port of fee0145 + b4dcd07 plus the auto_sni fix shipped in v0.11.23. Co-Authored-By: Claude Opus 4.8 (1M context) --- pkg/gateway/xds/translator/cluster.go | 30 +++ .../xds/translator/dynamic_forward_proxy.go | 23 ++- .../translator/dynamic_forward_proxy_test.go | 182 ++++++++++++++++++ 3 files changed, 231 insertions(+), 4 deletions(-) create mode 100644 pkg/gateway/xds/translator/dynamic_forward_proxy_test.go diff --git a/pkg/gateway/xds/translator/cluster.go b/pkg/gateway/xds/translator/cluster.go index c36812f4..e5096cf8 100644 --- a/pkg/gateway/xds/translator/cluster.go +++ b/pkg/gateway/xds/translator/cluster.go @@ -8,6 +8,7 @@ package translator import ( "encoding/hex" "fmt" + "math" "sort" "time" @@ -336,6 +337,18 @@ func buildXdsClusterCircuitBreaker(circuitBreaker *ir.CircuitBreaker) *clusterv3 MaxRetries: &wrapperspb.UInt32Value{ Value: uint32(1024), }, + // Envoy applies implicit defaults of 1024 for max_connections and + // max_requests whenever any circuit breaker threshold is present. Since we + // always emit a threshold (for MaxRetries above), regular HTTP and dynamic + // forward proxy clusters would otherwise be silently capped at 1024 + // connections / parallel requests. Default both to unlimited; callers may + // override below. + MaxConnections: &wrapperspb.UInt32Value{ + Value: math.MaxUint32, + }, + MaxRequests: &wrapperspb.UInt32Value{ + Value: math.MaxUint32, + }, } if circuitBreaker != nil { @@ -530,6 +543,23 @@ func buildTypedExtensionProtocolOptions(args *xdsClusterArgs) map[string]*anypb. } } + // Envoy's dynamic_forward_proxy cluster factory rejects the cluster unless + // upstream_http_protocol_options enables both auto_sni and auto_san_validation + // (or allow_insecure_cluster_options is set). Envoy only auto-injects those + // defaults when the cluster carries no typed_extension_protocol_options; once we + // emit HttpProtocolOptions here (e.g. to speak HTTP/2 upstream for an h2/h2c + // Backend) that auto-injection is a no-op, so we must set them explicitly. + // Enabling them derives the upstream SNI from the dynamically resolved host and + // validates the upstream certificate SAN against it — preserving TLS verification + // for h2/TLS backends, and a no-op for cleartext upstreams. The same args.settings + // guard is used by addXdsCluster to detect the dynamic forward proxy path. + if len(args.settings) > 0 && args.settings[0].DynamicForwardProxy != nil { + protocolOptions.UpstreamHttpProtocolOptions = &corev3.UpstreamHttpProtocolOptions{ + AutoSni: true, + AutoSanValidation: true, + } + } + anyProtocolOptions, _ := anypb.New(&protocolOptions) extensionOptions := map[string]*anypb.Any{ diff --git a/pkg/gateway/xds/translator/dynamic_forward_proxy.go b/pkg/gateway/xds/translator/dynamic_forward_proxy.go index aebbdc1a..d85ec4b0 100644 --- a/pkg/gateway/xds/translator/dynamic_forward_proxy.go +++ b/pkg/gateway/xds/translator/dynamic_forward_proxy.go @@ -162,10 +162,11 @@ func (*dynamicForwardProxy) patchResources( } args := &xdsClusterArgs{ - name: r.Destination.Name, - settings: r.Destination.Settings, - timeout: r.Timeout, - tcpkeepalive: r.TCPKeepalive, + name: r.Destination.Name, + settings: r.Destination.Settings, + timeout: r.Timeout, + tcpkeepalive: r.TCPKeepalive, + circuitBreaker: r.CircuitBreaker, } if err := createDynamicForwardProxyCluster( @@ -213,6 +214,20 @@ func createDynamicForwardProxyCluster( }, } + // Apply circuit breaker limits so the dynamic forward proxy cluster is not + // silently capped at Envoy's implicit 1024 max_connections / max_requests + // defaults. + cluster.CircuitBreakers = buildXdsClusterCircuitBreaker(args.circuitBreaker) + + // Apply upstream HTTP protocol options based on the destination setting's + // protocol, mirroring buildXdsCluster. The custom DFP cluster bypasses the + // normal cluster builder, so without this an h2/h2c Backend would still be + // spoken to over HTTP/1.1 upstream. Returns nil (leaving Envoy's HTTP/1.1 + // default) when no HTTP/2 or other protocol option is required. + if epo := buildTypedExtensionProtocolOptions(args); epo != nil { + cluster.TypedExtensionProtocolOptions = epo + } + if setting.TLS != nil { if cluster.TransportSocket, err = buildXdsUpstreamTLSSocketWthCert(setting.TLS); err != nil { return fmt.Errorf("failed to build xds upstream tls socket: %w", err) diff --git a/pkg/gateway/xds/translator/dynamic_forward_proxy_test.go b/pkg/gateway/xds/translator/dynamic_forward_proxy_test.go new file mode 100644 index 00000000..b47c1a61 --- /dev/null +++ b/pkg/gateway/xds/translator/dynamic_forward_proxy_test.go @@ -0,0 +1,182 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package translator + +import ( + "testing" + + httpv3 "github.com/envoyproxy/go-control-plane/envoy/extensions/upstreams/http/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/utils/ptr" + + "github.com/apoxy-dev/apoxy/pkg/gateway/ir" + "github.com/apoxy-dev/apoxy/pkg/gateway/xds/types" +) + +// dfpClusterArgs builds the xdsClusterArgs that the dynamic forward proxy http +// filter passes to createDynamicForwardProxyCluster for a single Backend, mirroring +// (*dynamicForwardProxy).patchResources. +func dfpClusterArgs(name string, protocol ir.AppProtocol, family ir.DNSLookupFamily, tls *ir.TLSUpstreamConfig) *xdsClusterArgs { + return &xdsClusterArgs{ + name: name, + settings: []*ir.DestinationSetting{ + { + Protocol: protocol, + AddressType: ptr.To(ir.DYNAMIC_PROXY), + TLS: tls, + DynamicForwardProxy: &ir.DynamicForwardProxy{ + Name: name, + DNSLookupFamily: family, + }, + }, + }, + } +} + +// TestCreateDynamicForwardProxyCluster_ProtocolOptions reproduces the +// NetworkingProd1 dfp-backend-0e058897 rejection and asserts the fix. A +// core.apoxy.dev Backend with +// +// dynamicProxy: { dnsCacheConfig: { dnsLookupFamily: v4_only } } +// protocol: h2 +// +// translates (api/.../route.go) to a DestinationSetting with Protocol=HTTP2 and a +// system-trust-store upstream TLS config. Once v0.11.22 began emitting +// typed_extension_protocol_options on the DFP cluster (to speak HTTP/2 upstream), +// Envoy's dynamic_forward_proxy cluster factory rejected the cluster: +// +// "dynamic_forward_proxy cluster must have auto_sni and auto_san_validation true +// unless allow_insecure_cluster_options is set." +// +// because Envoy stops auto-injecting those defaults once the cluster carries typed +// HttpProtocolOptions. The cluster must therefore set both fields itself, while +// still honoring HTTP/2 and keeping its TLS transport socket (SAN validation +// preserved, no allow_insecure_cluster_options). +func TestCreateDynamicForwardProxyCluster_ProtocolOptions(t *testing.T) { + tests := []struct { + name string + clusterName string + protocol ir.AppProtocol + family ir.DNSLookupFamily + tls *ir.TLSUpstreamConfig + wantTypedOps bool // expect typed_extension_protocol_options on the cluster + wantHTTP2 bool + wantTLS bool + }{ + { + // Exact production repro: dfp-backend-0e058897, protocol h2, v4_only. + name: "h2-over-tls-v4only", + clusterName: "dfp-backend-0e058897", + protocol: ir.HTTP2, + family: ir.V4Only, + tls: &ir.TLSUpstreamConfig{UseSystemTrustStore: true}, + wantTypedOps: true, + wantHTTP2: true, + wantTLS: true, + }, + { + // h2c (cleartext HTTP/2) is also rejected by the same Envoy check once + // typed options are present, even without TLS. auto_sni/auto_san_validation + // are no-ops over cleartext but must still be set. + name: "h2c-cleartext", + clusterName: "dfp-backend-h2c", + protocol: ir.HTTP2, + family: ir.Auto, + tls: nil, + wantTypedOps: true, + wantHTTP2: true, + wantTLS: false, + }, + { + // Plain HTTP/1.1 DFP backend emits no typed options, so Envoy keeps its + // HTTP/1.1 upstream default and auto-injects auto_sni/auto_san_validation. + name: "http1-no-typed-options", + clusterName: "dfp-backend-http1", + protocol: ir.HTTP, + family: ir.Auto, + tls: nil, + wantTypedOps: false, + wantHTTP2: false, + wantTLS: false, + }, + { + // protocol: tls (BackendProtoTLS) -> ir.HTTP + system-trust TLS, no HTTP/2. + // Still emits no typed options, so the cluster relies on Envoy's auto-inject + // of auto_sni/auto_san_validation onto the legacy field and is not rejected. + // The TLS transport socket must still be present. + name: "tls-http1-no-typed-options", + clusterName: "dfp-backend-tls", + protocol: ir.HTTP, + family: ir.Auto, + tls: &ir.TLSUpstreamConfig{UseSystemTrustStore: true}, + wantTypedOps: false, + wantHTTP2: false, + wantTLS: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tCtx := new(types.ResourceVersionTable) + args := dfpClusterArgs(tc.clusterName, tc.protocol, tc.family, tc.tls) + + require.NoError(t, createDynamicForwardProxyCluster(args, tCtx)) + + cluster := findXdsCluster(tCtx, tc.clusterName) + require.NotNil(t, cluster, "expected DFP cluster %q to be created", tc.clusterName) + + if tc.wantTLS { + require.NotNil(t, cluster.TransportSocket, "expected upstream TLS transport socket") + } else { + assert.Nil(t, cluster.TransportSocket, "did not expect TLS transport socket") + } + + if !tc.wantTypedOps { + assert.Empty(t, cluster.TypedExtensionProtocolOptions, + "expected no typed extension protocol options (Envoy auto-injects auto_sni/auto_san_validation)") + return + } + + anyOpts, ok := cluster.TypedExtensionProtocolOptions[extensionOptionsKey] + require.True(t, ok, "expected %s in typed extension protocol options", extensionOptionsKey) + + var hpo httpv3.HttpProtocolOptions + require.NoError(t, anyOpts.UnmarshalTo(&hpo)) + + if tc.wantHTTP2 { + require.NotNil(t, hpo.GetExplicitHttpConfig().GetHttp2ProtocolOptions(), + "expected upstream HTTP/2 protocol options for %s backend", tc.protocol) + } + + // The contract Envoy's dynamic_forward_proxy factory enforces. + uhp := hpo.GetUpstreamHttpProtocolOptions() + require.NotNil(t, uhp, "DFP cluster must set upstream_http_protocol_options") + assert.True(t, uhp.GetAutoSni(), "auto_sni must be true on the DFP cluster") + assert.True(t, uhp.GetAutoSanValidation(), "auto_san_validation must be true on the DFP cluster") + }) + } +} + +// TestBuildTypedExtensionProtocolOptions_NonDFP_NoAutoSNI guards the scope of the +// fix: only dynamic forward proxy clusters get auto_sni/auto_san_validation forced +// on. A normal HTTP/2 cluster must be left untouched so its (possibly explicit) SNI +// handling is not silently overridden and SAN validation behavior is unchanged. +func TestBuildTypedExtensionProtocolOptions_NonDFP_NoAutoSNI(t *testing.T) { + args := &xdsClusterArgs{ + name: "static-h2", + settings: []*ir.DestinationSetting{{Protocol: ir.HTTP2}}, + } + + epo := buildTypedExtensionProtocolOptions(args) + require.Contains(t, epo, extensionOptionsKey) + + var hpo httpv3.HttpProtocolOptions + require.NoError(t, epo[extensionOptionsKey].UnmarshalTo(&hpo)) + + assert.Nil(t, hpo.GetUpstreamHttpProtocolOptions(), + "non-DFP clusters must not have upstream_http_protocol_options forced on") +}