From afb10b55f72b7c4a92ea8936548251c146651100 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 16 Jun 2026 00:43:04 -0700 Subject: [PATCH 1/2] [handler] reject duplicate routes, owner-aware removal, exact Dst keying (APO-653, APO-654, APO-663) The routing trie indexed networks by Dst via LPM and shared the inner srcTrie by prefix containment, which produced three operator-error defects: - APO-653 (S10): a second virtual network with an identical Src+Dst silently overwrote the first's egress slot. AddVirtualNetwork now rejects a route already owned by a different VNI. - APO-654 (S11): removing one network blackholed another that shared a route, and emptied trie nodes leaked. Removal is now owner-aware and reclaims a Dst entry once its last owner is gone. - APO-663 (S20): a Dst nested inside a broader network's Dst was located via LPM and landed in (clobbering) the broader entry. Management now keys an exact-match dstEntries map; the data-path LPM is unchanged. AddVirtualNetwork checks-then-adds and stores networkByID last; UpdateVirtualNetworkRoutes is atomic with rollback on conflict. The data-path lookup now holds the read lock across both the outer and inner Find, closing a latent unlocked second-tier race (see APO-675). Adds routing_trie_test.go covering duplicate-reject, sibling-preserved + empty-entry reclaim, nested-Dst exact keying, and atomic-update rollback; export_test.go gains RouteLookupForTest / DstEntryCountForTest seams. --- export_test.go | 28 +++++++ handler.go | 170 ++++++++++++++++++++++++++++++------------- inplace_transform.go | 17 +++-- routing_trie_test.go | 159 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 315 insertions(+), 59 deletions(-) create mode 100644 routing_trie_test.go diff --git a/export_test.go b/export_test.go index 8f828b3..bae2437 100644 --- a/export_test.go +++ b/export_test.go @@ -2,9 +2,37 @@ package icx import ( "fmt" + "net/netip" "time" ) +// RouteLookupForTest resolves (src, dst) through the data-path routing trie +// exactly as VirtToPhy does (LPM on Dst, then LPM on Src) and returns the matched +// VNI. It lets the routing-trie tests assert which network a packet resolves to +// without driving a full encap. +func (h *Handler) RouteLookupForTest(src, dst netip.Addr) (uint, bool) { + h.networksByAddressMu.RLock() + defer h.networksByAddressMu.RUnlock() + value := h.networksByAddress.Find(dst) + if value == nil { + return 0, false + } + srcValue := value.(*dstEntry).srcTrie.Find(src) + if srcValue == nil { + return 0, false + } + return srcValue.(*VirtualNetwork).ID, true +} + +// DstEntryCountForTest returns the number of distinct Dst prefixes registered in +// the routing trie, for asserting that emptied entries are reclaimed (APO-654) +// rather than leaking on add/remove cycles. +func (h *Handler) DstEntryCountForTest() int { + h.networksByAddressMu.RLock() + defer h.networksByAddressMu.RUnlock() + return len(h.dstEntries) +} + // InstallKeysForTest installs RX/TX ciphers under a single shared epoch (rxSPI == // txSPI == epoch) without the production monotonicity and distinct-key guards enforced // by UpdateVirtualNetworkSAs. diff --git a/handler.go b/handler.go index 1f73608..d2e999e 100644 --- a/handler.go +++ b/handler.go @@ -226,15 +226,34 @@ func WithClock(c Clock) HandlerOption { // Handler processes encapsulated Geneve traffic for one or more virtual // networks. It performs encryption/decryption, replay protection, address // validation, and translation between physical and virtual frame formats. +// dstEntry is the per-Dst-prefix routing entry. srcTrie is the data-path +// longest-prefix lookup over Src prefixes; owners is the exact-prefix ownership +// index the management plane uses for collision detection (APO-653) and +// owner-aware removal (APO-654). Both index the same routes by Src prefix; the +// two stay in lockstep because all mutations go through addRouteLocked / +// removeRouteLocked under networksByAddressMu. +type dstEntry struct { + srcTrie *iptrie.Trie // Src prefix (LPM) -> *VirtualNetwork (data path) + owners map[netip.Prefix]*VirtualNetwork // exact Src prefix -> owning vnet (management) +} + type Handler struct { - opts *handlerOptions - networkByID sync.Map // Maps VNI to network - networksByAddressMu sync.RWMutex // Protects networksByAddress - networksByAddress *iptrie.Trie // Two tier trie: dstAddr -> srcAddr -> network - proxyARP *proxyarp.ProxyARP - ndProxy *ndproxy.NDProxy - hdrPool *sync.Pool - clock Clock + opts *handlerOptions + networkByID sync.Map // Maps VNI to network + // networksByAddressMu protects both networksByAddress and dstEntries. + networksByAddressMu sync.RWMutex + // networksByAddress is the data-path lookup: Dst prefix (LPM) -> *dstEntry. + // LPM is correct here so a packet matches the most specific Dst route. + networksByAddress *iptrie.Trie + // dstEntries is the management-plane index: EXACT Dst prefix -> *dstEntry (the + // same entries networksByAddress holds). Add/Remove/Update must address a route + // by its exact prefix, not by LPM containment, or a nested Dst lands in the + // wrong entry (APO-663) — so they go through this map, never networksByAddress.Find. + dstEntries map[netip.Prefix]*dstEntry + proxyARP *proxyarp.ProxyARP + ndProxy *ndproxy.NDProxy + hdrPool *sync.Pool + clock Clock } // NewHandler returns a new Handler configured with the given options. @@ -265,6 +284,7 @@ func NewHandler(opts ...HandlerOption) (*Handler, error) { return &Handler{ opts: &options, networksByAddress: iptrie.NewTrie(), + dstEntries: make(map[netip.Prefix]*dstEntry), proxyARP: proxyarp.NewProxyARP(options.srcMAC), ndProxy: ndproxy.NewNDProxy(options.srcMAC), hdrPool: hdrPool, @@ -272,6 +292,60 @@ func NewHandler(opts ...HandlerOption) (*Handler, error) { }, nil } +// checkRouteCollisionsLocked returns an error if any of routes would route an +// exact (Dst, Src) prefix pair already owned by a different virtual network — +// rather than letting the insert silently overwrite it (APO-653). The caller +// must hold networksByAddressMu. +func (h *Handler) checkRouteCollisionsLocked(routes []Route, vnet *VirtualNetwork) error { + for _, route := range routes { + dst, src := route.Dst.Masked(), route.Src.Masked() + if de := h.dstEntries[dst]; de != nil { + if owner, ok := de.owners[src]; ok && owner != vnet { + return fmt.Errorf("route Src=%s Dst=%s already routed to VNI %d", src, dst, owner.ID) + } + } + } + return nil +} + +// addRouteLocked registers route -> vnet in both the data-path trie and the +// exact-match owner index. It must be called only after checkRouteCollisionsLocked +// has passed for these routes, so it never clobbers a different vnet's slot. The +// caller must hold networksByAddressMu. +func (h *Handler) addRouteLocked(route Route, vnet *VirtualNetwork) { + dst, src := route.Dst.Masked(), route.Src.Masked() + de := h.dstEntries[dst] + if de == nil { + de = &dstEntry{ + srcTrie: iptrie.NewTrie(), + owners: make(map[netip.Prefix]*VirtualNetwork), + } + h.dstEntries[dst] = de + h.networksByAddress.Insert(dst, de) + } + de.owners[src] = vnet + de.srcTrie.Insert(src, vnet) +} + +// removeRouteLocked unregisters route, but only if it is currently owned by vnet +// (APO-654) — so decommissioning one network never deletes a route a different +// network still owns. When a Dst entry's last route is removed it is dropped from +// both indexes so empty nodes do not accumulate (APO-654). The caller must hold +// networksByAddressMu. +func (h *Handler) removeRouteLocked(route Route, vnet *VirtualNetwork) { + dst, src := route.Dst.Masked(), route.Src.Masked() + de := h.dstEntries[dst] + if de == nil || de.owners[src] != vnet { + return + } + delete(de.owners, src) + de.srcTrie.Remove(src) + if len(de.owners) == 0 { + delete(h.dstEntries, dst) + h.networksByAddress.Remove(dst) + } +} + // AddVirtualNetwork adds a new network with the given VNI and remote address. func (h *Handler) AddVirtualNetwork(vni uint, remoteAddr *tcpip.FullAddress, allowedRoutes []Route) error { if _, exists := h.networkByID.Load(vni); exists { @@ -284,20 +358,21 @@ func (h *Handler) AddVirtualNetwork(vni uint, remoteAddr *tcpip.FullAddress, all AllowedRoutes: allowedRoutes, } - h.networkByID.Store(vni, vnet) - - // Insert all allowed routes for this vnet h.networksByAddressMu.Lock() + defer h.networksByAddressMu.Unlock() + + // Reject the whole add if any route would take over a slot already owned by a + // different network, instead of silently overwriting it (APO-653). Validate + // before mutating so a conflict leaves no partial state. + if err := h.checkRouteCollisionsLocked(allowedRoutes, vnet); err != nil { + return err + } for _, route := range allowedRoutes { - value := h.networksByAddress.Find(route.Dst.Addr()) - if value == nil { - value = iptrie.NewTrie() - h.networksByAddress.Insert(route.Dst, value) - } - srcTrie := value.(*iptrie.Trie) - srcTrie.Insert(route.Src, vnet) + h.addRouteLocked(route, vnet) } - h.networksByAddressMu.Unlock() + // Publish the network only after its routes are installed, so the VNI never + // exists with a half-applied routing table. + h.networkByID.Store(vni, vnet) return nil } @@ -312,15 +387,11 @@ func (h *Handler) RemoveVirtualNetwork(vni uint) error { h.networkByID.Delete(vni) - // Remove all allowed routes for this vnet + // Remove only the routes this vnet owns, so removing it never blackholes a + // different network that shares a Dst prefix (APO-654). h.networksByAddressMu.Lock() for _, route := range vnet.AllowedRoutes { - value := h.networksByAddress.Find(route.Dst.Addr()) - if value != nil { - srcTrie := value.(*iptrie.Trie) - srcTrie.Remove(route.Src) - // Library doesn't expose a way to easily check if the trie is empty. - } + h.removeRouteLocked(route, vnet) } h.networksByAddressMu.Unlock() @@ -335,30 +406,25 @@ func (h *Handler) UpdateVirtualNetworkRoutes(vni uint, allowedRoutes []Route) er } vnet := v.(*VirtualNetwork) - // Remove all old allowed routes for this vnet, then insert the new ones. h.networksByAddressMu.Lock() + defer h.networksByAddressMu.Unlock() + + // Remove this vnet's current routes, then validate the new set against the + // routes that remain (other networks). On conflict, restore the old routes so + // the update is atomic — it never leaves the network partially routed or + // silently steals another network's slot (APO-653). for _, route := range vnet.AllowedRoutes { - value := h.networksByAddress.Find(route.Dst.Addr()) - if value != nil { - srcTrie := value.(*iptrie.Trie) - srcTrie.Remove(route.Src) - // Library doesn't expose a way to easily check if the trie is empty. + h.removeRouteLocked(route, vnet) + } + if err := h.checkRouteCollisionsLocked(allowedRoutes, vnet); err != nil { + for _, route := range vnet.AllowedRoutes { + h.addRouteLocked(route, vnet) } + return err } - - // Insert all new allowed routes for this vnet for _, route := range allowedRoutes { - value := h.networksByAddress.Find(route.Dst.Addr()) - if value == nil { - value = iptrie.NewTrie() - h.networksByAddress.Insert(route.Dst, value) - } - srcTrie := value.(*iptrie.Trie) - srcTrie.Insert(route.Src, vnet) + h.addRouteLocked(route, vnet) } - h.networksByAddressMu.Unlock() - - // Update vnet state vnet.AllowedRoutes = allowedRoutes return nil @@ -909,22 +975,24 @@ func (h *Handler) VirtToPhy(virtFrame, phyFrame []byte) (int, bool) { } } - // Find the virtual network by the destination and source addresses. + // Find the virtual network by the destination then source address (both LPM). + // Hold the read lock across both lookups: networksByAddress and the inner + // srcTrie are not safe against a concurrent management-plane mutation, so the + // lock must span the whole two-tier lookup, not just the outer Find. h.networksByAddressMu.RLock() value := h.networksByAddress.Find(dstAddr) - h.networksByAddressMu.RUnlock() if value == nil { + h.networksByAddressMu.RUnlock() slog.Debug("Dropping frame with unknown destination address", slog.String("dstAddr", dstAddr.String())) return 0, false } - srcTrie := value.(*iptrie.Trie) - - value = srcTrie.Find(srcAddr) - if value == nil { + srcValue := value.(*dstEntry).srcTrie.Find(srcAddr) + h.networksByAddressMu.RUnlock() + if srcValue == nil { slog.Debug("Dropping frame with unknown source address", slog.String("srcAddr", srcAddr.String())) return 0, false } - vnet := value.(*VirtualNetwork) + vnet := srcValue.(*VirtualNetwork) hdr := h.hdrPool.Get().(*geneve.Header) defer func() { diff --git a/inplace_transform.go b/inplace_transform.go index 2dc3ade..9fc4ee9 100644 --- a/inplace_transform.go +++ b/inplace_transform.go @@ -7,7 +7,6 @@ import ( "net/netip" "time" - "github.com/phemmer/go-iptrie" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -436,22 +435,24 @@ func (h *Handler) VirtToPhyInPlace(buf []byte, off, length int) (int, int, bool) } } - // Find the virtual network by the destination and source addresses. + // Find the virtual network by the destination then source address (both LPM). + // Hold the read lock across both lookups: networksByAddress and the inner + // srcTrie are not safe against a concurrent management-plane mutation, so the + // lock must span the whole two-tier lookup, not just the outer Find. h.networksByAddressMu.RLock() value := h.networksByAddress.Find(dstAddr) - h.networksByAddressMu.RUnlock() if value == nil { + h.networksByAddressMu.RUnlock() slog.Debug("Dropping frame with unknown destination address", slog.String("dstAddr", dstAddr.String())) return dropWindowOffset, 0, false } - srcTrie := value.(*iptrie.Trie) - - value = srcTrie.Find(srcAddr) - if value == nil { + srcValue := value.(*dstEntry).srcTrie.Find(srcAddr) + h.networksByAddressMu.RUnlock() + if srcValue == nil { slog.Debug("Dropping frame with unknown source address", slog.String("srcAddr", srcAddr.String())) return dropWindowOffset, 0, false } - vnet := value.(*VirtualNetwork) + vnet := srcValue.(*VirtualNetwork) hdr := h.hdrPool.Get().(*geneve.Header) defer func() { diff --git a/routing_trie_test.go b/routing_trie_test.go new file mode 100644 index 0000000..6d42168 --- /dev/null +++ b/routing_trie_test.go @@ -0,0 +1,159 @@ +package icx + +import ( + "net/netip" + "testing" + + "github.com/stretchr/testify/require" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// These tests are the regression for the routing-trie operator-error family: +// APO-653 (S10) two vnets with identical Src+Dst silently overwrite each other, +// APO-654 (S11) removing one vnet blackholes another sharing a Dst (+ node leak), +// APO-663 (S20) the management plane located the per-Dst srcTrie by LPM, so a +// nested Dst landed (and clobbered) the broader entry. +// They drive the management API and assert routing via RouteLookupForTest, which +// mirrors the data-path lookup in VirtToPhy. + +func newRoutingTestHandler(t *testing.T) *Handler { + t.Helper() + h, err := NewHandler( + WithLocalAddr(&tcpip.FullAddress{Addr: tcpip.AddrFrom4([4]byte{127, 0, 0, 1}), Port: 6081}), + WithLayer3VirtFrames(), + ) + require.NoError(t, err) + return h +} + +func rt(srcCIDR, dstCIDR string) Route { + return Route{Src: netip.MustParsePrefix(srcCIDR), Dst: netip.MustParsePrefix(dstCIDR)} +} + +func remote(ip string) *tcpip.FullAddress { + return &tcpip.FullAddress{Addr: tcpip.AddrFrom4(netip.MustParseAddr(ip).As4()), Port: 6081} +} + +func lookup(t *testing.T, h *Handler, src, dst string) (uint, bool) { + t.Helper() + return h.RouteLookupForTest(netip.MustParseAddr(src), netip.MustParseAddr(dst)) +} + +// APO-653 (S10): a second network with an identical Src+Dst route must be +// rejected, not silently overwrite the first network's egress. +func TestAddRejectsDuplicateSrcDst(t *testing.T) { + h := newRoutingTestHandler(t) + require.NoError(t, h.AddVirtualNetwork(100, remote("10.0.0.1"), []Route{rt("192.168.1.0/24", "10.1.0.0/24")})) + + err := h.AddVirtualNetwork(200, remote("10.0.0.2"), []Route{rt("192.168.1.0/24", "10.1.0.0/24")}) + require.Error(t, err, "duplicate Src+Dst must be rejected, not silently overwrite") + + // The original network still owns the route. + vni, ok := lookup(t, h, "192.168.1.5", "10.1.0.5") + require.True(t, ok) + require.Equal(t, uint(100), vni) + + // The rejected add left no partial state: VNI 200 was never created. + require.Error(t, h.RemoveVirtualNetwork(200), "VNI 200 must not have been created") +} + +// APO-654 (S11): removing a network only removes the routes it owns, and an +// emptied Dst entry is reclaimed rather than leaking. +func TestRemovePreservesSiblingAndReclaimsEmptyEntry(t *testing.T) { + h := newRoutingTestHandler(t) + // Two networks share Dst=10.1.0.0/24 but have distinct Src prefixes. + require.NoError(t, h.AddVirtualNetwork(100, remote("10.0.0.1"), []Route{rt("192.168.1.0/24", "10.1.0.0/24")})) + require.NoError(t, h.AddVirtualNetwork(101, remote("10.0.0.2"), []Route{rt("192.168.2.0/24", "10.1.0.0/24")})) + require.Equal(t, 1, h.DstEntryCountForTest(), "a shared Dst is a single entry") + + // Removing 100 must not blackhole 101. + require.NoError(t, h.RemoveVirtualNetwork(100)) + _, ok := lookup(t, h, "192.168.1.5", "10.1.0.5") + require.False(t, ok, "removed network's route is gone") + vni, ok := lookup(t, h, "192.168.2.5", "10.1.0.5") + require.True(t, ok, "sibling sharing the Dst must still route") + require.Equal(t, uint(101), vni) + require.Equal(t, 1, h.DstEntryCountForTest(), "entry survives while a sibling owns it") + + // Removing the last owner reclaims the Dst entry (no unbounded node growth). + require.NoError(t, h.RemoveVirtualNetwork(101)) + require.Equal(t, 0, h.DstEntryCountForTest(), "emptied Dst entry must be reclaimed") + _, ok = lookup(t, h, "192.168.2.5", "10.1.0.5") + require.False(t, ok) +} + +// APO-663 (S20): a Dst nested inside another network's broader Dst gets its own +// entry (exact-match keying), so it neither clobbers the broad network nor +// depends on it. Uses the same Src for both to make the overwrite visible. +func TestNestedDstExactKeying(t *testing.T) { + h := newRoutingTestHandler(t) + require.NoError(t, h.AddVirtualNetwork(100, remote("10.0.0.1"), []Route{rt("172.16.0.0/16", "10.0.0.0/16")})) + require.NoError(t, h.AddVirtualNetwork(200, remote("10.0.0.2"), []Route{rt("172.16.0.0/16", "10.0.5.0/24")})) + require.Equal(t, 2, h.DstEntryCountForTest(), "a nested Dst must be its own entry") + + // The nested range resolves to the nested network (more specific Dst). + vni, ok := lookup(t, h, "172.16.0.1", "10.0.5.1") + require.True(t, ok) + require.Equal(t, uint(200), vni) + + // The broad range outside the nested one must still resolve to the broad + // network — the nested add did not clobber it (the S20+S10 misroute). + vni, ok = lookup(t, h, "172.16.0.1", "10.0.4.1") + require.True(t, ok) + require.Equal(t, uint(100), vni) + + // Removing the nested network leaves the broad one intact. + require.NoError(t, h.RemoveVirtualNetwork(200)) + vni, ok = lookup(t, h, "172.16.0.1", "10.0.4.1") + require.True(t, ok) + require.Equal(t, uint(100), vni) + require.Equal(t, 1, h.DstEntryCountForTest()) +} + +// APO-653 via Update: an update whose new routes collide with another network is +// rejected and rolls back atomically — the network keeps its original routes. +func TestUpdateRoutesAtomicOnConflict(t *testing.T) { + h := newRoutingTestHandler(t) + require.NoError(t, h.AddVirtualNetwork(1, remote("10.0.0.1"), []Route{rt("192.168.1.0/24", "10.1.0.0/24")})) + require.NoError(t, h.AddVirtualNetwork(2, remote("10.0.0.2"), []Route{rt("192.168.2.0/24", "10.2.0.0/24")})) + + // Update VNI 2 to a set colliding with VNI 1 -> reject + roll back. + err := h.UpdateVirtualNetworkRoutes(2, []Route{rt("192.168.1.0/24", "10.1.0.0/24")}) + require.Error(t, err) + + // VNI 2 keeps its original route; VNI 1 still owns the contested one. + vni, ok := lookup(t, h, "192.168.2.5", "10.2.0.5") + require.True(t, ok, "VNI 2's original route must survive a rejected update") + require.Equal(t, uint(2), vni) + vni, ok = lookup(t, h, "192.168.1.5", "10.1.0.5") + require.True(t, ok) + require.Equal(t, uint(1), vni) +} + +// A non-conflicting update swaps the route set and reclaims the old entry. +func TestUpdateRoutesSucceeds(t *testing.T) { + h := newRoutingTestHandler(t) + require.NoError(t, h.AddVirtualNetwork(1, remote("10.0.0.1"), []Route{rt("192.168.1.0/24", "10.1.0.0/24")})) + + require.NoError(t, h.UpdateVirtualNetworkRoutes(1, []Route{rt("192.168.9.0/24", "10.9.0.0/24")})) + + _, ok := lookup(t, h, "192.168.1.5", "10.1.0.5") + require.False(t, ok, "old route must be gone") + vni, ok := lookup(t, h, "192.168.9.5", "10.9.0.5") + require.True(t, ok) + require.Equal(t, uint(1), vni) + require.Equal(t, 1, h.DstEntryCountForTest(), "old Dst entry reclaimed, one new entry") +} + +// Re-adding the same network's routes (idempotent within one vnet) is allowed — +// the collision guard only rejects a *different* owner. +func TestAddVirtualNetworkDuplicateRouteSameVnetOK(t *testing.T) { + h := newRoutingTestHandler(t) + require.NoError(t, h.AddVirtualNetwork(1, remote("10.0.0.1"), []Route{ + rt("192.168.1.0/24", "10.1.0.0/24"), + rt("192.168.1.0/24", "10.1.0.0/24"), + })) + vni, ok := lookup(t, h, "192.168.1.5", "10.1.0.5") + require.True(t, ok) + require.Equal(t, uint(1), vni) +} From ceafa2c5fc39642224ca4daea77b65c42dffe28e Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 16 Jun 2026 00:43:05 -0700 Subject: [PATCH 2/2] [handler] add parallel routing-lookup contention benchmark (APO-675) b.RunParallel benchmark of the per-packet route lookup under the networksByAddressMu RWMutex versus a lock-free atomic.Pointer snapshot doing byte-identical trie work. Quantifies the readerCount cache-line bounce (~2.5x slower at 8-10 cores) that motivates the RCU fix tracked in APO-675. Benchmark only; not run under plain `go test`. --- routing_contention_bench_test.go | 161 +++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 routing_contention_bench_test.go diff --git a/routing_contention_bench_test.go b/routing_contention_bench_test.go new file mode 100644 index 0000000..d2fa883 --- /dev/null +++ b/routing_contention_bench_test.go @@ -0,0 +1,161 @@ +package icx + +// Benchmarks that empirically measure whether taking sync.RWMutex.RLock per +// packet on the route-lookup path is a multi-core scalability bottleneck. +// +// BenchmarkRouteLookupRLockParallel exercises the real per-packet path: +// networksByAddressMu.RLock(); two trie Finds; RUnlock (via RouteLookupForTest, +// the export_test.go seam used by VirtToPhy). Under b.RunParallel, RWMutex.RLock +// increments/decrements a shared readerCount atomic on every call; that cache +// line bounces between cores, so ns/op should RISE as -cpu grows even though +// readers never block each other. +// +// BenchmarkRouteLookupAtomicParallel is the CONTROL: it does the identical two +// trie Finds behind an atomic.Pointer snapshot load (a lock-free read), so it +// isolates the RWMutex cost from the raw trie-find cost. It should stay roughly +// flat (or even improve) as -cpu grows. + +import ( + "fmt" + "net/netip" + "sync/atomic" + "testing" + + "github.com/phemmer/go-iptrie" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// numBenchNetworks is the number of distinct virtual networks (and distinct +// routes) populated into the routing trie. ~256 spreads lookups across varied +// trie nodes so no single hot key dominates. +const numBenchNetworks = 256 + +// benchSink is a process-global accumulator the parallel loops publish into so +// the compiler cannot eliminate the lookup work as dead code. +var benchSink atomic.Uint64 + +// benchRoute returns the (src, dst) prefix pair and a pair of addresses that +// resolve inside those prefixes for network index i. Dst is spread across +// 10...0/24 and Src across 192.168..0/24, giving numBenchNetworks +// distinct routes that all coexist in the trie. +func benchRoute(i int) (srcPfx, dstPfx netip.Prefix, srcAddr, dstAddr netip.Addr) { + a := byte(i / 256) // 0 for i<256 + b := byte(i % 256) // 0..255 + c := byte(i % 256) // 0..255 — distinct Src /24 per network + dstPfx = netip.MustParsePrefix(fmt.Sprintf("10.%d.%d.0/24", a, b)) + srcPfx = netip.MustParsePrefix(fmt.Sprintf("192.168.%d.0/24", c)) + // Pick host .7 inside each /24 so the lookup lands on a real route. + dstAddr = netip.AddrFrom4([4]byte{10, a, b, 7}) + srcAddr = netip.AddrFrom4([4]byte{192, 168, c, 7}) + return +} + +// newBenchHandler builds a Handler populated with numBenchNetworks distinct +// routes via the production AddVirtualNetwork management API. +func newBenchHandler(tb testing.TB) *Handler { + tb.Helper() + h, err := NewHandler( + WithLocalAddr(&tcpip.FullAddress{Addr: tcpip.AddrFrom4([4]byte{127, 0, 0, 1}), Port: 6081}), + WithLayer3VirtFrames(), + ) + if err != nil { + tb.Fatalf("NewHandler: %v", err) + } + for i := 0; i < numBenchNetworks; i++ { + srcPfx, dstPfx, _, _ := benchRoute(i) + remote := &tcpip.FullAddress{ + Addr: tcpip.AddrFrom4([4]byte{10, 0, 0, byte(i % 256)}), + Port: 6081, + } + if err := h.AddVirtualNetwork(uint(i+1), remote, []Route{{Src: srcPfx, Dst: dstPfx}}); err != nil { + tb.Fatalf("AddVirtualNetwork(%d): %v", i+1, err) + } + } + return h +} + +// BenchmarkRouteLookupRLockParallel measures the real per-packet route lookup, +// which takes networksByAddressMu.RLock around two trie Finds. Run with +// -cpu=1,2,4,8(,16): rising ns/op is the signature of RWMutex.RLock cache-line +// contention on the shared readerCount atomic. +func BenchmarkRouteLookupRLockParallel(b *testing.B) { + h := newBenchHandler(b) + + // Sanity: confirm a representative lookup resolves before timing. + { + _, _, srcAddr, dstAddr := benchRoute(0) + if _, ok := h.RouteLookupForTest(srcAddr, dstAddr); !ok { + b.Fatalf("setup lookup did not resolve") + } + } + + b.ReportAllocs() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + // Per-goroutine counter so each iteration hits a different route + // rather than one hot key. + var n int + var sink uint + for pb.Next() { + _, _, srcAddr, dstAddr := benchRoute(n % numBenchNetworks) + vni, ok := h.RouteLookupForTest(srcAddr, dstAddr) + if ok { + sink += vni // consume the result to defeat dead-code elimination + } + n++ + } + // Publish sink so the optimizer cannot drop the loop body. + benchSink.Add(uint64(sink)) + }) +} + +// BenchmarkRouteLookupAtomicParallel is the CONTROL. It performs the identical +// two trie Finds (byDst.Find -> *dstEntry, then srcTrie.Find) behind an +// atomic.Pointer snapshot load with NO mutex. The snapshot is populated from the +// SAME tries the Handler built, so the per-lookup work is byte-for-byte the same +// as the RLock benchmark minus the lock. Under -cpu sweep it should stay roughly +// flat, isolating the raw trie-find cost from the RWMutex cost. +func BenchmarkRouteLookupAtomicParallel(b *testing.B) { + h := newBenchHandler(b) + + // Build the snapshot from the handler's own data-path trie so the control + // walks the identical structure. dstEntry (with its srcTrie field) is a + // package-internal type, reused directly here. + type snapshot struct { + byDst *iptrie.Trie + } + var ptr atomic.Pointer[snapshot] + ptr.Store(&snapshot{byDst: h.networksByAddress}) + + // Sanity: confirm the lock-free path resolves the same way. + { + _, _, srcAddr, dstAddr := benchRoute(0) + snap := ptr.Load() + v := snap.byDst.Find(dstAddr) + if v == nil { + b.Fatalf("setup atomic dst lookup did not resolve") + } + if sv := v.(*dstEntry).srcTrie.Find(srcAddr); sv == nil { + b.Fatalf("setup atomic src lookup did not resolve") + } + } + + b.ReportAllocs() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + var n int + var sink uint + for pb.Next() { + _, _, srcAddr, dstAddr := benchRoute(n % numBenchNetworks) + snap := ptr.Load() + v := snap.byDst.Find(dstAddr) + if v != nil { + if sv := v.(*dstEntry).srcTrie.Find(srcAddr); sv != nil { + sink += sv.(*VirtualNetwork).ID // consume to defeat DCE + } + } + n++ + } + benchSink.Add(uint64(sink)) + }) +}