From 5434d458a38e4fc247261ef13b4de3d986367f91 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 11:43:27 -0700 Subject: [PATCH 01/20] [handler] harden decap: anti-replay after auth (APO-645), empty-payload guard (APO-647) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes to both decap paths (PhyToVirt and PhyToVirtInPlace): APO-645 / S2 — verify the AEAD tag BEFORE touching the replay window. ValidateCounter both checks and ADVANCES the sliding window, and it has been running before rxCipher.Open since the initial commit. That let an off-path attacker who can spoof the outer 4-tuple advance the window with a forged high counter, after which the real peer's in-window packets are rejected as "behind window" — a remote DoS that needs no key. Move the replay check after a successful Open so only authenticated counters move the window (the same auth-then-replay order WireGuard uses). Tradeoff: a replayed frame is now decrypted before being rejected, which is the accepted industry choice. APO-647 / S4 — drop an empty decrypted payload before reading its version nibble. The min-size guards read ipPacket[0] first and only then check < IPvNMinimumSize, so they catch 1..19 bytes but still panic on exactly 0. An authenticated peer can send a non-OOB frame with an empty plaintext (valid GCM: empty body + tag). The encap path already had the len==0 guard; this adds the symmetric one to decap. The differential fuzzer can't reach this (it needs a valid tag), which is why it slipped past the earlier crash-hardening pass. 
--- handler.go | 27 ++++++++++++++++++++------- inplace_transform.go | 27 ++++++++++++++++++++------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/handler.go b/handler.go index e3f84cc..4e65493 100644 --- a/handler.go +++ b/handler.go @@ -500,13 +500,6 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { txCounter := binary.BigEndian.Uint64(nonce[4:]) - if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { - // Delayed packets can cause some uneccesary noise here. - slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) - vnet.Stats.RXReplayDrops.Add(1) - return 0 - } - var ipPacket []byte if h.opts.layer3 { ipPacket = virtFrame[:0] @@ -521,6 +514,18 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } + // Anti-replay AFTER authentication (APO-645/S2): ValidateCounter both checks + // and advances the sliding window, so it must run only on a packet whose tag + // has verified. Running it before Open let an attacker who can spoof the + // outer 4-tuple advance the window with a forged high counter and wedge the + // real peer (whose in-window counters are then rejected as "behind window"). + if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { + // Delayed packets can cause some unnecessary noise here. + slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) + vnet.Stats.RXReplayDrops.Add(1) + return 0 + } + // Is it an authenticated out-of-band message? if hdr.ProtocolType == 0 { slog.Debug("Dropping out-of-band message") @@ -531,6 +536,14 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } + // A non-OOB frame whose authenticated payload is empty has no version nibble; + // ipPacket[0] would panic. An authenticated peer can craft one. 
(APO-647/S4) + if len(ipPacket) == 0 { + slog.Warn("Dropping empty decrypted payload") + vnet.Stats.RXInvalidSrc.Add(1) + return 0 + } + ipVersion := ipPacket[0] >> 4 // Get the source address of the decrypted frame. diff --git a/inplace_transform.go b/inplace_transform.go index ff43c87..bd5d5f6 100644 --- a/inplace_transform.go +++ b/inplace_transform.go @@ -139,13 +139,6 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { txCounter := binary.BigEndian.Uint64(nonce[4:]) - if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { - // Delayed packets can cause some uneccesary noise here. - slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) - vnet.Stats.RXReplayDrops.Add(1) - return dropWindowOffset, 0 - } - // In-place decap: the ciphertext (payload[hdrLen:]) lives at ctStart within // buf; we open it onto itself at the SAME start (exact overlap), so the // plaintext is written over the ciphertext region. The AAD is the Geneve @@ -162,6 +155,18 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } + // Anti-replay AFTER authentication (APO-645/S2): ValidateCounter both checks + // and advances the sliding window, so it must run only on a packet whose tag + // has verified. Running it before Open let an attacker who can spoof the + // outer 4-tuple advance the window with a forged high counter and wedge the + // real peer (whose in-window counters are then rejected as "behind window"). + if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { + // Delayed packets can cause some unnecessary noise here. + slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) + vnet.Stats.RXReplayDrops.Add(1) + return dropWindowOffset, 0 + } + // Is it an authenticated out-of-band message? 
if hdr.ProtocolType == 0 { slog.Debug("Dropping out-of-band message") @@ -172,6 +177,14 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } + // A non-OOB frame whose authenticated payload is empty has no version nibble; + // ipPacket[0] would panic. An authenticated peer can craft one. (APO-647/S4) + if len(ipPacket) == 0 { + slog.Warn("Dropping empty decrypted payload") + vnet.Stats.RXInvalidSrc.Add(1) + return dropWindowOffset, 0 + } + ipVersion := ipPacket[0] >> 4 // Get the source address of the decrypted frame. From a124a358b68c9cf8292880f31eceff09773d2518 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 11:43:28 -0700 Subject: [PATCH 02/20] [forwarder] recover datapath transform panics into frame drops The processFrames loop holds runtime.LockOSThread and has no recovery, so a single panicking packet would tear down the whole queue goroutine (and, via errgroup, the forwarder). The transforms are written to drop malformed frames rather than panic, so this is a last-resort backstop: it converts any panic into a frame drop and keeps the queue running. It also contains the GCM inexact-overlap panic class the in-place aliasing contract depends on. The recovered-panic log is bounded to a single emission so a crafted frame cannot flood the logs. --- forwarder/forwarder.go | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/forwarder/forwarder.go b/forwarder/forwarder.go index 694a9dd..da39ffc 100644 --- a/forwarder/forwarder.go +++ b/forwarder/forwarder.go @@ -10,6 +10,7 @@ import ( "net" "os" "runtime" + "runtime/debug" "sync" "time" @@ -495,6 +496,32 @@ func (f *Forwarder) processFrames(ctx context.Context, queueID int) error { } } +// datapathPanicOnce bounds the recovered-panic log to a single emission so a +// crafted frame that trips an unguarded path cannot also flood the logs. 
Each +// such frame is still dropped and the queue keeps running. +var datapathPanicOnce sync.Once + +// safeTransform runs an in-place transform but converts a panic into a frame +// drop. The transforms are written to drop malformed frames rather than panic +// (see the length/IsValid guards in handler.go), so this is a last-resort +// backstop: the processFrames loop holds runtime.LockOSThread and has no other +// recovery, so a single panicking packet would otherwise tear down the whole +// queue goroutine (and, via errgroup, the forwarder). It also contains the GCM +// inexact-overlap panic class that the in-place aliasing contract relies on. +func safeTransform(fn inPlaceFn, buf []byte, off, length int) (outOff, outLen int, handled bool) { + defer func() { + if r := recover(); r != nil { + datapathPanicOnce.Do(func() { + slog.Error("recovered panic in datapath transform; dropping frame and continuing", + slog.Any("panic", r), + slog.String("stack", string(debug.Stack()))) + }) + outOff, outLen, handled = 0, 0, false + } + }() + return fn(buf, off, length) +} + // inPlaceFn transforms the packet at buf[off:off+length] in place and returns // the (offset, length) window of the output within buf, plus handled: true when // the handler produced an immediate local reply that must be transmitted back on @@ -561,7 +588,7 @@ func (f *Forwarder) forwardInPlace( continue } - outOff, outLen, handled := fn(buf, off, int(d.Len)) + outOff, outLen, handled := safeTransform(fn, buf, off, int(d.Len)) if outLen <= 0 { free = append(free, d) continue From 85f253da9f0c29adfac091948e5be1b255006732 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 11:43:28 -0700 Subject: [PATCH 03/20] [control] add PSP key derivation: SP 800-108 KDF + AES-CMAC (APO-648) First piece of the key-establishment control plane: the PSP-model key derivation that turns an authenticated, forward-secret session into per-Security-Association AEAD keys for the existing Geneve/AF_XDP data 
plane. - cmac.go: AES-CMAC (NIST SP 800-38B / RFC 4493) built on crypto/aes so the derivation stays inside the Go FIPS 140-3 module. - kdf.go: the SP 800-108 KDF and PSPVersion codepoints selecting AEAD (AES-GCM-128/256) and derived key size. - kdf_test.go: test vectors. --- control/cmac.go | 93 ++++++++++++++++++++++++++++++++++++++++ control/kdf.go | 78 ++++++++++++++++++++++++++++++++++ control/kdf_test.go | 100 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 271 insertions(+) create mode 100644 control/cmac.go create mode 100644 control/kdf.go create mode 100644 control/kdf_test.go diff --git a/control/cmac.go b/control/cmac.go new file mode 100644 index 0000000..a4702ea --- /dev/null +++ b/control/cmac.go @@ -0,0 +1,93 @@ +// Package control implements ICX's key-establishment control plane (a QUIC/mTLS +// channel) and the PSP-model key derivation that turns an authenticated, +// forward-secret session into per-Security-Association AEAD keys for the +// existing Geneve/AF_XDP data plane. +// +// This file implements AES-CMAC (NIST SP 800-38B / RFC 4493), the +// pseudorandom function underlying the PSP SP 800-108 key-derivation function +// (see kdf.go). CMAC is built directly on the FIPS-validated crypto/aes block +// cipher so the whole derivation stays inside the Go FIPS 140-3 module. +package control + +import ( + "crypto/aes" + "crypto/cipher" +) + +// cmacRb is the GF(2^128) reduction constant for a 128-bit block (RFC 4493 §2.3). +const cmacRb = 0x87 + +// aesCMAC computes the AES-CMAC of msg under key k. k must be a valid AES key +// (16, 24, or 32 bytes); the PSP KDF always uses a 32-byte (AES-256) master +// key. The tag is always 16 bytes (the AES block size). +func aesCMAC(k, msg []byte) ([]byte, error) { + block, err := aes.NewCipher(k) + if err != nil { + return nil, err + } + return cmacWithBlock(block, msg), nil +} + +// cmacWithBlock computes AES-CMAC using a pre-constructed block cipher. 
+func cmacWithBlock(block cipher.Block, msg []byte) []byte { + const bs = aes.BlockSize // 16 + + // Subkey generation (RFC 4493 §2.3): L = AES_K(0^128); K1 = dbl(L); + // K2 = dbl(K1). + l := make([]byte, bs) + block.Encrypt(l, l) + k1 := dbl(l) + k2 := dbl(k1) + + // Determine the number of blocks and whether the final block is complete. + n := (len(msg) + bs - 1) / bs + lastComplete := n != 0 && len(msg)%bs == 0 + if n == 0 { + n = 1 // empty message uses a single (padded) block + } + + // Final block: XOR with K1 if the last block is complete, else pad with + // 10* and XOR with K2. + last := make([]byte, bs) + if lastComplete { + xorInto(last, msg[(n-1)*bs:], k1) + } else { + rem := msg[(n-1)*bs:] + copy(last, rem) + last[len(rem)] = 0x80 + xorInto(last, last, k2) + } + + // CBC-MAC chain over all but the last block, then the final block. + x := make([]byte, bs) + y := make([]byte, bs) + for i := 0; i < n-1; i++ { + xorInto(y, x, msg[i*bs:(i+1)*bs]) + block.Encrypt(x, y) + } + xorInto(y, x, last) + block.Encrypt(x, y) + return x +} + +// dbl performs the GF(2^128) left-shift-and-reduce used in CMAC subkey +// generation: out = (in << 1), XOR'd with Rb if the high bit of in was set. +func dbl(in []byte) []byte { + out := make([]byte, len(in)) + var carry byte + for i := len(in) - 1; i >= 0; i-- { + out[i] = in[i]<<1 | carry + carry = in[i] >> 7 + } + if carry != 0 { + out[len(out)-1] ^= cmacRb + } + return out +} + +// xorInto writes a XOR b into dst. dst, a, and b must be the same length. +func xorInto(dst, a, b []byte) { + for i := range dst { + dst[i] = a[i] ^ b[i] + } +} diff --git a/control/kdf.go b/control/kdf.go new file mode 100644 index 0000000..65d6bb0 --- /dev/null +++ b/control/kdf.go @@ -0,0 +1,78 @@ +package control + +import ( + "encoding/binary" + "fmt" +) + +// PSPVersion is a PSP encryption-mode codepoint. It selects both the AEAD +// (AES-GCM-128 vs AES-GCM-256) and, via the KDF label, the size of the derived +// security-association key. 
+type PSPVersion uint8 + +const ( + // PSPv0 is AES-GCM-128: a 16-byte SA key. Required by every PSP + // implementation; the ICX default (zero churn to the [16]byte data plane). + PSPv0 PSPVersion = 0 + // PSPv1 is AES-GCM-256: a 32-byte SA key. The CNSA / 256-bit path. + PSPv1 PSPVersion = 1 +) + +// MasterKeyLen is the required length of a PSP master key (256 bits). PSP +// master keys are always AES-256 keys regardless of the SA key size. +const MasterKeyLen = 32 + +// label returns the 4-byte SP 800-108 label for the version: "Pv0\0" +// (0x50 0x76 0x30 0x00) for v0, "Pv1\0" for v1. The trailing NUL also serves as +// the SP 800-108 label/context separator. Per the spec, the version number may +// be OR'd into the third byte of the base label. +func (v PSPVersion) label() [4]byte { + return [4]byte{0x50, 0x76, 0x30 | byte(v), 0x00} +} + +// keyLen returns the derived SA key length in bytes for the version. +func (v PSPVersion) keyLen() int { + if v == PSPv1 { + return 32 + } + return 16 +} + +// DeriveSAKey derives a PSP security-association key from a 256-bit master key +// and a 32-bit SPI, exactly per the PSP Architecture Specification: a NIST +// SP 800-108 counter-mode KDF whose PRF is AES-CMAC (see cmac.go). Each PRF +// input block is the 16-byte concatenation +// +// counter(4) || label(4) || context=SPI(4) || length-in-bits(4) +// +// all in network byte order. A 128-bit key needs one block (counter=1); a +// 256-bit key needs two (counter=1, counter=2) concatenated. +// +// The caller is responsible for selecting which master key to pass based on the +// SPI's most-significant bit (the PSP master-key selector); the SPI is fed into +// the KDF context verbatim, MSB included, so the derivation is bound to it. 
+func DeriveSAKey(masterKey []byte, spi uint32, v PSPVersion) ([]byte, error) { + if len(masterKey) != MasterKeyLen { + return nil, fmt.Errorf("control: master key must be %d bytes, got %d", MasterKeyLen, len(masterKey)) + } + + keyLen := v.keyLen() + bitLen := uint32(keyLen * 8) + label := v.label() + blocks := (keyLen + 15) / 16 + + out := make([]byte, 0, blocks*16) + for i := 1; i <= blocks; i++ { + var in [16]byte + binary.BigEndian.PutUint32(in[0:4], uint32(i)) // counter + copy(in[4:8], label[:]) // label + binary.BigEndian.PutUint32(in[8:12], spi) // context = SPI + binary.BigEndian.PutUint32(in[12:16], bitLen) // length (bits) + mac, err := aesCMAC(masterKey, in[:]) + if err != nil { + return nil, err + } + out = append(out, mac...) + } + return out[:keyLen], nil +} diff --git a/control/kdf_test.go b/control/kdf_test.go new file mode 100644 index 0000000..6618750 --- /dev/null +++ b/control/kdf_test.go @@ -0,0 +1,100 @@ +package control + +import ( + "bytes" + "encoding/hex" + "strings" + "testing" +) + +func unhex(t *testing.T, s string) []byte { + t.Helper() + b, err := hex.DecodeString(strings.ReplaceAll(s, " ", "")) + if err != nil { + t.Fatalf("bad hex %q: %v", s, err) + } + return b +} + +// TestAESCMAC_RFC4493 validates the AES-CMAC PRF against the canonical RFC 4493 +// (= NIST SP 800-38B) test vectors: empty, one full block, a partial final +// block, and a multi-block message. 
+func TestAESCMAC_RFC4493(t *testing.T) { + key := unhex(t, "2b7e151628aed2a6abf7158809cf4f3c") + const ( + b1 = "6bc1bee22e409f96e93d7e117393172a" + b2 = "ae2d8a571e03ac9c9eb76fac45af8e51" + b3 = "30c81c46a35ce411e5fbc1191a0a52ef" + b4 = "f69f2445df4f9b17ad2b417be66c3710" + b3part8 = "30c81c46a35ce411" // first 8 bytes of b3 (40-byte message) + ) + cases := []struct { + name, msg, want string + }{ + {"len0", "", "bb1d6929e95937287fa37d129b756746"}, + {"len16", b1, "070a16b46b4d4144f79bdd9dd04a287c"}, + {"len40", b1 + b2 + b3part8, "dfa66747de9ae63030ca32611497c827"}, + {"len64", b1 + b2 + b3 + b4, "51f0bebf7e3b9d92fc49741779363cfe"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := aesCMAC(key, unhex(t, c.msg)) + if err != nil { + t.Fatal(err) + } + if want := unhex(t, c.want); !bytes.Equal(got, want) { + t.Fatalf("CMAC mismatch\n got %x\nwant %x", got, want) + } + }) + } +} + +// TestDeriveSAKey_PSPSpec validates the SP 800-108 KDF against the worked +// examples in the PSP Architecture Specification ("Examples of key +// derivation", p.7). +func TestDeriveSAKey_PSPSpec(t *testing.T) { + k0 := unhex(t, "34448a064292601b11a0978f56a2d34cf3fc35ede1a6bc04f8db3e5243a2b0ca") + k1 := unhex(t, "563952565d3a78ae773ec1b779f2f2d99f4a7f53a6fbb9b07d5b71f39364d739") + + cases := []struct { + name string + master []byte + spi uint32 + version PSPVersion + want string + }{ + { + name: "v0_spi_12345678_mk0", master: k0, spi: 0x12345678, version: PSPv0, + want: "96c22dc799198090b74b70ae468e4e30", + }, + { + // MSB set -> master key 1 selected by the caller. 
+ name: "v0_spi_9A345678_mk1", master: k1, spi: 0x9A345678, version: PSPv0, + want: "3946da2554eae46ad1ef77a64372edc4", + }, + { + name: "v1_spi_12345678_mk0", master: k0, spi: 0x12345678, version: PSPv1, + want: "2b7d72074e42ca334487f2990e3f8c4037e436f38283449b76463e9b7fb2e3de", + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := DeriveSAKey(c.master, c.spi, c.version) + if err != nil { + t.Fatal(err) + } + if want := unhex(t, c.want); !bytes.Equal(got, want) { + t.Fatalf("SA key mismatch\n got %x\nwant %x", got, want) + } + if len(got) != c.version.keyLen() { + t.Fatalf("key length = %d, want %d", len(got), c.version.keyLen()) + } + }) + } +} + +func TestDeriveSAKey_BadMasterKeyLen(t *testing.T) { + if _, err := DeriveSAKey(make([]byte, 16), 1, PSPv0); err == nil { + t.Fatal("expected error for 16-byte master key, got nil") + } +} From 4eae2248e8564d3935e91798494b663ab5b6ff88 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 12:40:27 -0700 Subject: [PATCH 04/20] [control] add QUIC/mTLS control plane with PSP SA negotiation (APO-648) Establish authenticated, forward-secret per-session keys over a dedicated QUIC/mTLS control channel, following the PSP security model while keeping the Geneve/AF_XDP data plane untouched. Resolves the static-PSK / no-forward-secrecy gap (APO-648/S5); the SPI-bound nonce that closes the GCM reuse window (S1) lands with the Phase 3 data-plane wiring. - identity.go: long-term ECDSA P-256 identities, WireGuard-style pinned raw public keys (self-signed leaf, SPKI pin), genkey/pubkey-ready marshalling. - tls.go: TLS 1.3-only config, ALPN icx-ctrl/1, pin verifier via VerifyConnection, RFC 8446 exporter -> 32-byte root secret. Under fips140=on the suite is AES-GCM + P-256/384 + SHA-2 + HKDF with no custom handshake crypto. - transport.go: QUIC Dial/Listen (Retry-based address validation is the handshake-flood defense), no 0-RTT so every handshake is a fresh ECDHE (forward secrecy). 
Root secret -> HKDF master keys; SA negotiation over a QUIC stream where each peer announces only its RX SPI and both derive every key locally -- no key material on the wire. A successful NegotiateSAs, not a successful Dial, is the fail-closed key-confirmation precondition. - sa.go/protocol.go: SPI allocation (MSB master-key selector, role-partitioned so tx key != rx key), SP800-108/AES-CMAC SA derivation, framed SA-offer. - kdf.go: PSP version now fails closed on unknown codepoints instead of defaulting to a 16-byte key. - _examples/keyexchange: runnable, self-verifying two-peer loopback demo that tracks the API; doubles as a smoke test (GODEBUG=fips140=on go run ...). Adds quic-go v0.59.1 (no crypto of its own; delegates to crypto/tls, so the FIPS boundary stays the Go module). --- _examples/keyexchange/main.go | 182 +++++++++++++++++++++++++ control/identity.go | 160 ++++++++++++++++++++++ control/identity_test.go | 82 ++++++++++++ control/kdf.go | 11 +- control/kdf_test.go | 7 + control/protocol.go | 104 ++++++++++++++ control/sa.go | 134 ++++++++++++++++++ control/sa_test.go | 118 ++++++++++++++++ control/tls.go | 111 +++++++++++++++ control/tls_test.go | 138 +++++++++++++++++++ control/transport.go | 246 ++++++++++++++++++++++++++++++++++ control/transport_test.go | 170 +++++++++++++++++++++++ go.mod | 10 +- go.sum | 25 ++-- 14 files changed, 1484 insertions(+), 14 deletions(-) create mode 100644 _examples/keyexchange/main.go create mode 100644 control/identity.go create mode 100644 control/identity_test.go create mode 100644 control/protocol.go create mode 100644 control/sa.go create mode 100644 control/sa_test.go create mode 100644 control/tls.go create mode 100644 control/tls_test.go create mode 100644 control/transport.go create mode 100644 control/transport_test.go diff --git a/_examples/keyexchange/main.go b/_examples/keyexchange/main.go new file mode 100644 index 0000000..2abfdad --- /dev/null +++ b/_examples/keyexchange/main.go @@ -0,0 +1,182 @@ +// 
Command keyexchange is a runnable demonstration of the ICX control plane: +// two peers establish a forward-secret, mutually-authenticated QUIC/mTLS +// session over loopback, derive PSP master keys from the TLS exporter, and +// negotiate per-direction Security Associations whose AES-GCM keys feed the +// Geneve/AF_XDP data plane. +// +// It runs both peers in one process and self-verifies the result, so it doubles +// as living documentation and a smoke test. Build under GODEBUG=fips140=on to +// confirm the whole exchange uses only FIPS-approved primitives: +// +// GODEBUG=fips140=on go run ./_examples/keyexchange +// +// This example tracks the control-plane API as it evolves; keep it building. +package main + +import ( + "context" + "crypto/sha256" + "crypto/tls" + "flag" + "fmt" + "log" + "net" + "time" + + "github.com/apoxy-dev/icx/control" +) + +func main() { + pspV1 := flag.Bool("v1", false, "use PSP v1 (AES-256-GCM) instead of v0 (AES-128-GCM)") + flag.Parse() + + version := control.PSPv0 + if *pspV1 { + version = control.PSPv1 + } + + if err := run(version); err != nil { + log.Fatalf("keyexchange demo failed: %v", err) + } +} + +func run(version control.PSPVersion) error { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + // 1. Long-term identities. In production each side holds its own private key + // and is configured with the peer's public key (--peer-key), WireGuard + // style. Here we mint both. + initiatorID, err := control.GenerateIdentity() + if err != nil { + return err + } + responderID, err := control.GenerateIdentity() + if err != nil { + return err + } + iFP, _ := initiatorID.Fingerprint() + rFP, _ := responderID.Fingerprint() + fmt.Printf("identities:\n initiator %s\n responder %s\n", iFP, rFP) + + // 2. Loopback UDP sockets (the control-plane port; AF_XDP owns the data port). 
+ srvConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + return err + } + defer srvConn.Close() + cliConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + return err + } + defer cliConn.Close() + + // 3. Responder listens (and pins the initiator's key). + ln, err := control.Listen(srvConn, responderID, initiatorID.PublicKey()) + if err != nil { + return err + } + defer ln.Close() + + type negResult struct { + sess *control.Session + sas *control.DirectionalSAs + err error + } + respCh := make(chan negResult, 1) + go func() { + sess, err := ln.Accept(ctx) + if err != nil { + respCh <- negResult{err: err} + return + } + sas, err := sess.NegotiateSAs(ctx, version) + respCh <- negResult{sess: sess, sas: sas, err: err} + }() + + // 4. Initiator dials (and pins the responder's key) — this is the TLS 1.3 + // handshake: mutual auth + ephemeral ECDHE (forward secrecy). + initSess, err := control.Dial(ctx, cliConn, ln.Addr(), initiatorID, responderID.PublicKey()) + if err != nil { + return fmt.Errorf("dial: %w", err) + } + defer initSess.Close() + + st := initSess.TLSState() + fmt.Printf("handshake: TLS %s, cipher %s, ALPN %q\n", + tlsVersionName(st.Version), tls.CipherSuiteName(st.CipherSuite), st.NegotiatedProtocol) + + // 5. Negotiate SAs (initiator side). + initSAs, err := initSess.NegotiateSAs(ctx, version) + if err != nil { + return fmt.Errorf("initiator NegotiateSAs: %w", err) + } + + r := <-respCh + if r.sess != nil { + defer r.sess.Close() + } + if r.err != nil { + return fmt.Errorf("responder side: %w", r.err) + } + respSAs := r.sas + + // 6. Report and verify. 
+ fmt.Printf("master keys agree: %v\n", initSess.MasterKeys() != nil && r.sess.MasterKeys() != nil) + fmt.Printf("SAs (PSP %s):\n", pspName(version)) + fmt.Printf(" initiator: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", + initSAs.Tx.SPI, fp(initSAs.Tx.Key), initSAs.Rx.SPI, fp(initSAs.Rx.Key)) + fmt.Printf(" responder: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", + respSAs.Tx.SPI, fp(respSAs.Tx.Key), respSAs.Rx.SPI, fp(respSAs.Rx.Key)) + + if !equal(initSAs.Tx.Key, respSAs.Rx.Key) || !equal(initSAs.Rx.Key, respSAs.Tx.Key) { + return fmt.Errorf("VERIFY FAILED: tx/rx keys do not cross-match between peers") + } + if equal(initSAs.Tx.Key, initSAs.Rx.Key) { + return fmt.Errorf("VERIFY FAILED: initiator tx and rx keys collided") + } + if len(initSAs.Tx.Key) != expectedKeyLen(version) { + return fmt.Errorf("VERIFY FAILED: key length %d, want %d", len(initSAs.Tx.Key), expectedKeyLen(version)) + } + + fmt.Println("VERIFY OK: cross-matched, tx≠rx, FIPS-suite handshake, keys never crossed the wire") + return nil +} + +func fp(key []byte) string { + sum := sha256.Sum256(key) + return fmt.Sprintf("%x", sum[:6]) +} + +func equal(a, b []byte) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func expectedKeyLen(v control.PSPVersion) int { + if v == control.PSPv1 { + return 32 + } + return 16 +} + +func pspName(v control.PSPVersion) string { + if v == control.PSPv1 { + return "v1/AES-256-GCM" + } + return "v0/AES-128-GCM" +} + +func tlsVersionName(v uint16) string { + if v == tls.VersionTLS13 { + return "1.3" + } + return fmt.Sprintf("%#x", v) +} diff --git a/control/identity.go b/control/identity.go new file mode 100644 index 0000000..93f80ce --- /dev/null +++ b/control/identity.go @@ -0,0 +1,160 @@ +package control + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/base64" + "encoding/pem" + "fmt" + 
"math/big" + "time" +) + +// Identity is a node's long-term signing key used to mutually authenticate the +// QUIC/mTLS control channel. It is an ECDSA P-256 key: a FIPS 186-approved +// signature algorithm in the Go FIPS 140-3 module, and a curve TLS 1.3 will use +// in FIPS mode. Peers authenticate each other WireGuard-style — by pinning the +// expected public key — rather than via a CA, so identities are self-signed. +// +// Note this signing key is distinct from the ephemeral ECDHE that TLS performs +// for forward secrecy; the identity only proves "who", the handshake provides +// the fresh per-session secret. +type Identity struct { + priv *ecdsa.PrivateKey +} + +// identityCertValidity is how long the self-signed identity certificate is +// nominally valid. Pinning ignores CA chains and (with the custom verifier) +// time validity, but a sane window keeps stricter stacks happy. +const identityCertValidity = 100 * 365 * 24 * time.Hour + +// GenerateIdentity creates a fresh ECDSA P-256 identity using crypto/rand. +func GenerateIdentity() (*Identity, error) { + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return nil, fmt.Errorf("control: generate identity: %w", err) + } + return &Identity{priv: priv}, nil +} + +// MarshalPrivatePEM encodes the identity private key as a PKCS#8 PEM block, +// suitable for writing to a 0600 key file. +func (id *Identity) MarshalPrivatePEM() ([]byte, error) { + der, err := x509.MarshalPKCS8PrivateKey(id.priv) + if err != nil { + return nil, fmt.Errorf("control: marshal private key: %w", err) + } + return pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: der}), nil +} + +// LoadIdentityPEM parses a PKCS#8 PEM private key produced by MarshalPrivatePEM. +// It rejects anything that is not an ECDSA P-256 key. 
+func LoadIdentityPEM(pemBytes []byte) (*Identity, error) { + block, _ := pem.Decode(pemBytes) + if block == nil { + return nil, fmt.Errorf("control: no PEM block in identity key") + } + key, err := x509.ParsePKCS8PrivateKey(block.Bytes) + if err != nil { + return nil, fmt.Errorf("control: parse identity key: %w", err) + } + priv, ok := key.(*ecdsa.PrivateKey) + if !ok || priv.Curve != elliptic.P256() { + return nil, fmt.Errorf("control: identity key must be ECDSA P-256") + } + return &Identity{priv: priv}, nil +} + +// PublicKey returns the identity's public key. +func (id *Identity) PublicKey() *ecdsa.PublicKey { + return &id.priv.PublicKey +} + +// PublicKeyString returns the base64(SPKI DER) encoding of the public key. This +// is the value distributed to peers and supplied via --peer-key (analogous to a +// WireGuard public key). +func (id *Identity) PublicKeyString() (string, error) { + return MarshalPublicKey(&id.priv.PublicKey) +} + +// Fingerprint returns a short, stable identifier for the public key: +// base64(SHA-256(SPKI DER)). Used as the certificate subject and in logs. +func (id *Identity) Fingerprint() (string, error) { + der, err := x509.MarshalPKIXPublicKey(&id.priv.PublicKey) + if err != nil { + return "", err + } + sum := sha256.Sum256(der) + return base64.RawStdEncoding.EncodeToString(sum[:]), nil +} + +// MarshalPublicKey encodes a public key as base64(SPKI DER). +func MarshalPublicKey(pub *ecdsa.PublicKey) (string, error) { + der, err := x509.MarshalPKIXPublicKey(pub) + if err != nil { + return "", fmt.Errorf("control: marshal public key: %w", err) + } + return base64.StdEncoding.EncodeToString(der), nil +} + +// ParsePublicKey decodes a base64(SPKI DER) public key (the --peer-key value) +// and verifies it is ECDSA P-256. 
+func ParsePublicKey(s string) (*ecdsa.PublicKey, error) { + der, err := base64.StdEncoding.DecodeString(s) + if err != nil { + return nil, fmt.Errorf("control: decode peer key: %w", err) + } + pub, err := x509.ParsePKIXPublicKey(der) + if err != nil { + return nil, fmt.Errorf("control: parse peer key: %w", err) + } + ec, ok := pub.(*ecdsa.PublicKey) + if !ok || ec.Curve != elliptic.P256() { + return nil, fmt.Errorf("control: peer key must be ECDSA P-256") + } + return ec, nil +} + +// TLSCertificate builds a self-signed leaf certificate for this identity, for +// use as the local end of the mTLS handshake. Authentication is by key pinning, +// not by chain validation, so the certificate is its own issuer. +func (id *Identity) TLSCertificate() (tls.Certificate, error) { + fp, err := id.Fingerprint() + if err != nil { + return tls.Certificate{}, err + } + // A fixed serial is fine: the cert is never chained or revoked, only pinned. + now := time.Now() + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "icx:" + fp}, + NotBefore: now.Add(-time.Hour), + NotAfter: now.Add(identityCertValidity), + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth}, + BasicConstraintsValid: true, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &id.priv.PublicKey, id.priv) + if err != nil { + return tls.Certificate{}, fmt.Errorf("control: create self-signed cert: %w", err) + } + leaf, err := x509.ParseCertificate(der) + if err != nil { + return tls.Certificate{}, fmt.Errorf("control: parse self-signed cert: %w", err) + } + return tls.Certificate{ + Certificate: [][]byte{der}, + PrivateKey: id.priv, + Leaf: leaf, + }, nil +} + +// PublicKeyEqual reports whether two ECDSA public keys are identical. 
+func PublicKeyEqual(a, b *ecdsa.PublicKey) bool { + return a != nil && b != nil && a.Equal(b) +} diff --git a/control/identity_test.go b/control/identity_test.go new file mode 100644 index 0000000..baac675 --- /dev/null +++ b/control/identity_test.go @@ -0,0 +1,82 @@ +package control + +import ( + "testing" +) + +func TestIdentityPrivatePEMRoundTrip(t *testing.T) { + id, err := GenerateIdentity() + if err != nil { + t.Fatal(err) + } + pemBytes, err := id.MarshalPrivatePEM() + if err != nil { + t.Fatal(err) + } + got, err := LoadIdentityPEM(pemBytes) + if err != nil { + t.Fatal(err) + } + if !PublicKeyEqual(id.PublicKey(), got.PublicKey()) { + t.Fatal("round-tripped identity public key differs") + } +} + +func TestPublicKeyStringRoundTrip(t *testing.T) { + id, err := GenerateIdentity() + if err != nil { + t.Fatal(err) + } + s, err := id.PublicKeyString() + if err != nil { + t.Fatal(err) + } + pub, err := ParsePublicKey(s) + if err != nil { + t.Fatal(err) + } + if !PublicKeyEqual(id.PublicKey(), pub) { + t.Fatal("parsed peer key differs from original") + } +} + +func TestParsePublicKeyRejectsGarbage(t *testing.T) { + if _, err := ParsePublicKey("not-base64!!"); err == nil { + t.Fatal("expected error for non-base64 peer key") + } + if _, err := ParsePublicKey("aGVsbG8="); err == nil { // valid base64, not a key + t.Fatal("expected error for non-SPKI peer key") + } +} + +func TestTLSCertificatePinsIdentityKey(t *testing.T) { + id, err := GenerateIdentity() + if err != nil { + t.Fatal(err) + } + cert, err := id.TLSCertificate() + if err != nil { + t.Fatal(err) + } + if cert.Leaf == nil { + t.Fatal("expected parsed Leaf on tls.Certificate") + } + // The leaf's public key must equal the identity's, so a pin against the + // identity key matches the certificate presented during the handshake. 
+ if !id.PublicKey().Equal(cert.Leaf.PublicKey) { + t.Fatal("leaf certificate public key does not match identity") + } +} + +func TestDistinctIdentitiesDiffer(t *testing.T) { + a, _ := GenerateIdentity() + b, _ := GenerateIdentity() + if PublicKeyEqual(a.PublicKey(), b.PublicKey()) { + t.Fatal("two generated identities must not collide") + } + fa, _ := a.Fingerprint() + fb, _ := b.Fingerprint() + if fa == fb || fa == "" { + t.Fatalf("fingerprints should be distinct and non-empty: %q %q", fa, fb) + } +} diff --git a/control/kdf.go b/control/kdf.go index 65d6bb0..3ddbde0 100644 --- a/control/kdf.go +++ b/control/kdf.go @@ -30,7 +30,13 @@ func (v PSPVersion) label() [4]byte { return [4]byte{0x50, 0x76, 0x30 | byte(v), 0x00} } -// keyLen returns the derived SA key length in bytes for the version. +// valid reports whether v is a supported PSP version. Callers must reject +// unsupported versions before deriving keys (fail-closed), so keyLen/label are +// never asked to map an unknown version. +func (v PSPVersion) valid() bool { return v == PSPv0 || v == PSPv1 } + +// keyLen returns the derived SA key length in bytes for the version. Only valid +// versions reach here (guarded by DeriveSAKey); v0 = 16, v1 = 32. 
func (v PSPVersion) keyLen() int { if v == PSPv1 { return 32 @@ -55,6 +61,9 @@ func DeriveSAKey(masterKey []byte, spi uint32, v PSPVersion) ([]byte, error) { if len(masterKey) != MasterKeyLen { return nil, fmt.Errorf("control: master key must be %d bytes, got %d", MasterKeyLen, len(masterKey)) } + if !v.valid() { + return nil, fmt.Errorf("control: unsupported PSP version %d", v) + } keyLen := v.keyLen() bitLen := uint32(keyLen * 8) diff --git a/control/kdf_test.go b/control/kdf_test.go index 6618750..af534e0 100644 --- a/control/kdf_test.go +++ b/control/kdf_test.go @@ -98,3 +98,10 @@ func TestDeriveSAKey_BadMasterKeyLen(t *testing.T) { t.Fatal("expected error for 16-byte master key, got nil") } } + +func TestDeriveSAKey_UnsupportedVersionFailsClosed(t *testing.T) { + mk := make([]byte, MasterKeyLen) + if _, err := DeriveSAKey(mk, 1, PSPVersion(7)); err == nil { + t.Fatal("expected error for unsupported PSP version, got nil (must fail closed, not default to 16 bytes)") + } +} diff --git a/control/protocol.go b/control/protocol.go new file mode 100644 index 0000000..082e550 --- /dev/null +++ b/control/protocol.go @@ -0,0 +1,104 @@ +package control + +import ( + "encoding/binary" + "fmt" + "io" +) + +// The control-plane SA-setup protocol runs over a bidirectional QUIC stream +// after the mTLS handshake. Because both peers derive the identical PSP master +// keys from the TLS exporter, no key material is ever exchanged — peers only +// announce the SPI on which each will RECEIVE, and derive every key locally. +// +// Frame = uint16 big-endian length prefix + payload. Messages are small and +// fixed today; the leading protocol-version + type bytes leave room to grow +// (lifetimes, capabilities, rekey signalling) without breaking the framing. + +const ( + // ProtocolVersion is the control-plane wire-protocol version. + ProtocolVersion = 1 + // maxFrameLen bounds a single control frame (these are tiny; the cap just + // stops a peer from forcing a large allocation). 
+ maxFrameLen = 4096 +) + +type msgType uint8 + +const ( + msgSAOffer msgType = 1 +) + +// saOffer announces the SPI on which the sender will RECEIVE data-plane traffic +// for the given PSP version. The peer derives the key for this SPI and uses it +// as its TX key; the sender uses it as its RX key. +type saOffer struct { + PSPVersion PSPVersion + RxSPI uint32 +} + +const saOfferLen = 1 + 1 + 1 + 4 // protoVer + type + pspVer + rxSPI + +func (o saOffer) marshal() []byte { + b := make([]byte, saOfferLen) + b[0] = ProtocolVersion + b[1] = byte(msgSAOffer) + b[2] = byte(o.PSPVersion) + binary.BigEndian.PutUint32(b[3:], o.RxSPI) + return b +} + +func parseSAOffer(b []byte) (saOffer, error) { + if len(b) != saOfferLen { + return saOffer{}, fmt.Errorf("control: SA offer wrong size %d, want %d", len(b), saOfferLen) + } + if b[0] != ProtocolVersion { + return saOffer{}, fmt.Errorf("control: unsupported protocol version %d", b[0]) + } + if msgType(b[1]) != msgSAOffer { + return saOffer{}, fmt.Errorf("control: expected SA offer, got message type %d", b[1]) + } + return saOffer{ + PSPVersion: PSPVersion(b[2]), + RxSPI: binary.BigEndian.Uint32(b[3:7]), + }, nil +} + +// writeFrame writes a length-prefixed control frame. +func writeFrame(w io.Writer, payload []byte) error { + if len(payload) > maxFrameLen { + return fmt.Errorf("control: frame too large (%d)", len(payload)) + } + var hdr [2]byte + binary.BigEndian.PutUint16(hdr[:], uint16(len(payload))) + if _, err := w.Write(hdr[:]); err != nil { + return err + } + _, err := w.Write(payload) + return err +} + +// readFrame reads a single length-prefixed control frame. 
+func readFrame(r io.Reader) ([]byte, error) { + var hdr [2]byte + if _, err := io.ReadFull(r, hdr[:]); err != nil { + return nil, err + } + n := binary.BigEndian.Uint16(hdr[:]) + if int(n) > maxFrameLen { + return nil, fmt.Errorf("control: frame too large (%d)", n) + } + buf := make([]byte, n) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, err + } + return buf, nil +} + +func readSAOffer(r io.Reader) (saOffer, error) { + b, err := readFrame(r) + if err != nil { + return saOffer{}, err + } + return parseSAOffer(b) +} diff --git a/control/sa.go b/control/sa.go new file mode 100644 index 0000000..a512175 --- /dev/null +++ b/control/sa.go @@ -0,0 +1,134 @@ +package control + +import ( + "crypto/hkdf" + "crypto/sha256" + "errors" + "fmt" + "sync" +) + +// numMasterKeys is the PSP master-key count: one active, one retained for +// in-flight SAs during rotation (the MSB of the SPI selects between them). +const numMasterKeys = 2 + +// MasterKeys holds the two 256-bit PSP master keys. They are seeded from the +// forward-secret TLS exporter (see ExportRootSecret) and live only in RAM; they +// are never persisted, so a recorded session cannot be decrypted once they are +// dropped — this is where the forward secrecy reaches the data plane. +type MasterKeys struct { + keys [numMasterKeys][MasterKeyLen]byte +} + +// masterKeyInfo domain-separates the master-key derivation from any other use +// of the root secret. +const masterKeyInfo = "icx psp master keys v1" + +// DeriveMasterKeys expands the TLS-exported root secret into the two PSP master +// keys via HKDF-SHA-256 (FIPS SP 800-56C). Both peers feed the identical root +// secret and therefore derive the identical master keys, so each can compute +// any SA key locally from its SPI — no key material ever crosses the wire. 
+func DeriveMasterKeys(rootSecret []byte) (*MasterKeys, error) { + if len(rootSecret) < RootSecretLen { + return nil, fmt.Errorf("control: root secret must be >= %d bytes, got %d", RootSecretLen, len(rootSecret)) + } + okm, err := hkdf.Key(sha256.New, rootSecret, nil, masterKeyInfo, numMasterKeys*MasterKeyLen) + if err != nil { + return nil, fmt.Errorf("control: derive master keys: %w", err) + } + mk := &MasterKeys{} + for i := range mk.keys { + copy(mk.keys[i][:], okm[i*MasterKeyLen:(i+1)*MasterKeyLen]) + } + return mk, nil +} + +// MasterKeyIndex returns which master key (0 or 1) an SPI selects: per PSP, the +// most-significant bit of the SPI. +func MasterKeyIndex(spi uint32) int { return int(spi >> 31) } + +// SA is a unidirectional PSP security association: an SPI, the derived AES-GCM +// key, and the PSP version (which fixes the key length / cipher). +type SA struct { + SPI uint32 + Key []byte + Version PSPVersion +} + +// DeriveSA derives the SA key for spi using the master key its MSB selects. +func (m *MasterKeys) DeriveSA(spi uint32, v PSPVersion) (*SA, error) { + if spi&spiLowMask == 0 { + return nil, errors.New("control: SPI low 31 bits must be non-zero (zero is reserved)") + } + key, err := DeriveSAKey(m.keys[MasterKeyIndex(spi)][:], spi, v) + if err != nil { + return nil, err + } + return &SA{SPI: spi, Key: key, Version: v}, nil +} + +// Role identifies which peer allocated an SPI. The two directions MUST use +// distinct SPIs, otherwise both directions would derive the same key +// (txKey == rxKey). Partitioning the SPI space by role guarantees distinctness +// even though both peers allocate independently from the shared master keys. 
+type Role uint8
+
+const (
+	Initiator Role = iota // canonical lower static key
+	Responder
+)
+
+// SPI bit layout (PSP keeps the SPI opaque except for the MSB master-key
+// selector; we additionally reserve one bit to partition by allocating role):
+//
+//	bit31       master-key index (PSP)
+//	bit30       allocating role (0=initiator, 1=responder)
+//	bits[29:0]  per-(index,role) counter, 1..2^30-1 (0 reserved)
+const (
+	spiRoleShift   = 30
+	spiCounterMask = (uint32(1) << spiRoleShift) - 1 // low 30 bits
+	spiLowMask     = uint32(0x7fffffff)              // low 31 bits (PSP: must be non-zero)
+)
+
+// MakeSPI composes an SPI from the active master-key index, the allocating role
+// and a per-(index,role) counter.
+func MakeSPI(masterKeyIndex int, role Role, counter uint32) (uint32, error) {
+	if masterKeyIndex < 0 || masterKeyIndex >= numMasterKeys {
+		return 0, fmt.Errorf("control: master key index must be 0..%d", numMasterKeys-1)
+	}
+	if role > Responder {
+		return 0, fmt.Errorf("control: invalid role %d", role)
+	}
+	if counter == 0 || counter > spiCounterMask {
+		return 0, fmt.Errorf("control: SPI counter out of range (1..%d)", spiCounterMask)
+	}
+	return uint32(masterKeyIndex)<<31 | uint32(role)<<spiRoleShift | counter, nil
+}
+
+// SPIAllocator issues SPIs for a single role. It keeps one counter per
+// master-key index; the mutex serializes concurrent Allocate calls.
+type SPIAllocator struct {
+	mu   sync.Mutex
+	role Role
+	next [numMasterKeys]uint32 // last counter issued for each master-key index
+}
+
+// NewSPIAllocator returns an allocator whose SPIs all carry role in the role bit.
+func NewSPIAllocator(role Role) *SPIAllocator {
+	return &SPIAllocator{role: role}
+}
+
+// Allocate returns the next unused SPI for masterKeyIndex in this allocator's
+// role partition. It fails for an out-of-range index and once all 2^30-1
+// counters for that index have been issued.
+func (a *SPIAllocator) Allocate(masterKeyIndex int) (uint32, error) {
+	if masterKeyIndex < 0 || masterKeyIndex >= numMasterKeys {
+		return 0, fmt.Errorf("control: master key index must be 0..%d", numMasterKeys-1)
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	// Test for exhaustion BEFORE advancing the counter. Advancing first lets
+	// repeated failed calls keep incrementing until the uint32 wraps back to 1,
+	// at which point already-issued SPIs (and therefore SA keys) are reissued.
+	// Checking first pins the counter at its maximum, so exhaustion is permanent.
+	if a.next[masterKeyIndex] >= spiCounterMask {
+		return 0, fmt.Errorf("control: SPI space exhausted for master key %d; rotate", masterKeyIndex)
+	}
+	a.next[masterKeyIndex]++
+	return MakeSPI(masterKeyIndex, a.role, a.next[masterKeyIndex])
+}
diff --git a/control/sa_test.go b/control/sa_test.go
new file mode 100644
index 0000000..d5cb64f
--- /dev/null
+++ b/control/sa_test.go
@@ -0,0 +1,118 @@
+package control
+
+import (
+	"bytes"
+	"testing"
+)
+
+func TestDeriveMasterKeysDeterministic(t *testing.T) {
+	root := bytes.Repeat([]byte{0xA5}, RootSecretLen)
+	a, err := DeriveMasterKeys(root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	b, err := DeriveMasterKeys(root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if a.keys != b.keys {
+		t.Fatal("master keys not deterministic for the same root secret")
+	}
+	if a.keys[0] == a.keys[1] {
+		t.Fatal("the two master keys must differ")
+	}
+}
+
+func TestDeriveMasterKeysSessionUnique(t *testing.T) {
+	a, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x01}, RootSecretLen))
+	b, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x02}, RootSecretLen))
+	if a.keys == b.keys {
+		t.Fatal("different root secrets must yield different master keys (per-session FS)")
+	}
+}
+
+func TestDeriveMasterKeysRejectsShortRoot(t *testing.T) {
+	if _, err := DeriveMasterKeys(make([]byte, RootSecretLen-1)); err == nil {
+		t.Fatal("expected error for short root secret")
+	}
+}
+
+func TestDeriveSAMatchesKDFAndSelectsMasterKey(t *testing.T) {
+	mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x5a}, RootSecretLen))
+
+	// MSB clear -> master key 0; MSB set -> master key 1.
+	spi0, _ := MakeSPI(0, Initiator, 7)
+	spi1, _ := MakeSPI(1, Initiator, 7)
+
+	sa0, err := mk.DeriveSA(spi0, PSPv0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	want0, _ := DeriveSAKey(mk.keys[0][:], spi0, PSPv0)
+	if !bytes.Equal(sa0.Key, want0) {
+		t.Fatal("DeriveSA(MSB=0) did not use master key 0")
+	}
+
+	sa1, err := mk.DeriveSA(spi1, PSPv0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	want1, _ := DeriveSAKey(mk.keys[1][:], spi1, PSPv0)
+	if !bytes.Equal(sa1.Key, want1) {
+		t.Fatal("DeriveSA(MSB=1) did not use master key 1")
+	}
+}
+
+// TestDirectionsNeverCollide is the txKey != rxKey guarantee: the two peers
+// allocate RX SPIs independently, but the role bit keeps them in disjoint
+// subspaces, so the same counter yields different SPIs and thus different keys.
+func TestDirectionsNeverCollide(t *testing.T) { + mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x33}, RootSecretLen)) + initAlloc := NewSPIAllocator(Initiator) + respAlloc := NewSPIAllocator(Responder) + + seen := map[uint32]bool{} + for i := 0; i < 1000; i++ { + is, err := initAlloc.Allocate(0) + if err != nil { + t.Fatal(err) + } + rs, err := respAlloc.Allocate(0) + if err != nil { + t.Fatal(err) + } + if is == rs { + t.Fatalf("initiator and responder allocated the same SPI %#x", is) + } + if seen[is] || seen[rs] { + t.Fatalf("SPI reuse detected at i=%d", i) + } + seen[is], seen[rs] = true, true + + txSA, _ := mk.DeriveSA(is, PSPv0) + rxSA, _ := mk.DeriveSA(rs, PSPv0) + if bytes.Equal(txSA.Key, rxSA.Key) { + t.Fatal("tx and rx SA keys collided") + } + } +} + +func TestMakeSPIValidation(t *testing.T) { + if _, err := MakeSPI(0, Initiator, 0); err == nil { + t.Fatal("counter 0 must be rejected") + } + if _, err := MakeSPI(2, Initiator, 1); err == nil { + t.Fatal("master key index 2 must be rejected") + } + if _, err := MakeSPI(0, Initiator, spiCounterMask+1); err == nil { + t.Fatal("counter overflow must be rejected") + } +} + +func TestDeriveSARejectsReservedSPI(t *testing.T) { + mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x01}, RootSecretLen)) + // SPI whose low 31 bits are zero (only the master-key bit set) is reserved. + if _, err := mk.DeriveSA(uint32(1)<<31, PSPv0); err == nil { + t.Fatal("expected error for reserved SPI (zero low 31 bits)") + } +} diff --git a/control/tls.go b/control/tls.go new file mode 100644 index 0000000..c20a14a --- /dev/null +++ b/control/tls.go @@ -0,0 +1,111 @@ +package control + +import ( + "crypto/ecdsa" + "crypto/tls" + "errors" + "fmt" +) + +// ALPN is the application-layer protocol name negotiated on the ICX control +// channel. A mismatch (e.g. a stray TLS client) fails the handshake. 
+const ALPN = "icx-ctrl/1" + +// exporterLabel is the RFC 5705 / RFC 8446 §7.5 exporter label used to derive +// the data-plane master-key seed from the completed TLS 1.3 handshake. Changing +// it is a breaking protocol change. +const exporterLabel = "EXPORTER-icx-master-v1" + +// exporterContext domain-separates the master-key seed from any other exporter +// use on the same connection. +var exporterContext = []byte("icx control plane master seed v1") + +// RootSecretLen is the length of the exported master-key seed (256-bit). +const RootSecretLen = 32 + +// pinVerifier returns a tls.Config.VerifyConnection callback that authenticates +// the peer WireGuard-style: the leaf certificate's public key must equal the +// pinned peer identity key. Chain/CA/hostname validation is intentionally not +// used (the certificates are self-signed); pinning is the whole trust model. +func pinVerifier(peerPub *ecdsa.PublicKey) func(tls.ConnectionState) error { + return func(cs tls.ConnectionState) error { + if len(cs.PeerCertificates) == 0 { + return errors.New("control: peer presented no certificate") + } + leafPub, ok := cs.PeerCertificates[0].PublicKey.(*ecdsa.PublicKey) + if !ok { + return errors.New("control: peer certificate key is not ECDSA") + } + if !leafPub.Equal(peerPub) { + return errors.New("control: peer key pin mismatch") + } + return nil + } +} + +// baseTLSConfig builds the shared TLS 1.3 configuration: our self-signed +// identity certificate, the pinned-peer verifier, TLS 1.3 only, the ICX ALPN, +// and FIPS-approved curves. In a fips140=on build the module further restricts +// the suite to AES-GCM + SHA-2 and disables X25519/ChaCha automatically, so the +// whole handshake stays inside the validated boundary. 
+func baseTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) { + if local == nil || peerPub == nil { + return nil, errors.New("control: local identity and peer key are required") + } + cert, err := local.TLSCertificate() + if err != nil { + return nil, err + } + return &tls.Config{ + Certificates: []tls.Certificate{cert}, + MinVersion: tls.VersionTLS13, + MaxVersion: tls.VersionTLS13, + NextProtos: []string{ALPN}, + CurvePreferences: []tls.CurveID{tls.CurveP256, tls.CurveP384}, + VerifyConnection: pinVerifier(peerPub), + }, nil +} + +// ServerTLSConfig builds the responder side of the control-plane mTLS: it +// requires (and pins) a client certificate. +func ServerTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) { + cfg, err := baseTLSConfig(local, peerPub) + if err != nil { + return nil, err + } + // Accept any presented client cert at the chain layer; pinVerifier (run via + // VerifyConnection) is what actually authenticates it. + cfg.ClientAuth = tls.RequireAnyClientCert + return cfg, nil +} + +// ClientTLSConfig builds the initiator side of the control-plane mTLS. +func ClientTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) { + cfg, err := baseTLSConfig(local, peerPub) + if err != nil { + return nil, err + } + // InsecureSkipVerify disables ONLY the default CA-chain/hostname checks, which + // are meaningless for a self-signed, pinned peer. It does NOT disable peer + // authentication: VerifyConnection (pinVerifier) still runs and fully + // authenticates the peer by its pinned public key. Without this flag the + // handshake would fail on the absent CA chain before pinning could run. + cfg.InsecureSkipVerify = true + cfg.ServerName = "icx" + return cfg, nil +} + +// ExportRootSecret derives the 32-byte data-plane master-key seed from a +// completed TLS 1.3 handshake via the RFC 8446 exporter. 
Both peers compute the +// identical value; it is the forward-secret root the PSP master keys are seeded +// from (see keys.go). It must only be called after the handshake completes. +func ExportRootSecret(cs tls.ConnectionState) ([]byte, error) { + if cs.Version != tls.VersionTLS13 { + return nil, fmt.Errorf("control: refusing to export from TLS version %#x (want 1.3)", cs.Version) + } + secret, err := cs.ExportKeyingMaterial(exporterLabel, exporterContext, RootSecretLen) + if err != nil { + return nil, fmt.Errorf("control: export keying material: %w", err) + } + return secret, nil +} diff --git a/control/tls_test.go b/control/tls_test.go new file mode 100644 index 0000000..267534a --- /dev/null +++ b/control/tls_test.go @@ -0,0 +1,138 @@ +package control + +import ( + "bytes" + "crypto/tls" + "errors" + "net" + "testing" + "time" +) + +// tlsHandshakeResult carries one side's post-handshake outcome. +type tlsHandshakeResult struct { + state tls.ConnectionState + err error +} + +// doHandshake runs a TLS 1.3 mTLS handshake between a client and server config +// over an in-memory pipe and returns both sides' results. 
+func doHandshake(t *testing.T, clientCfg, serverCfg *tls.Config) (client, server tlsHandshakeResult) {
+	t.Helper()
+	c, s := net.Pipe()
+	defer c.Close()
+	defer s.Close()
+
+	srvCh := make(chan tlsHandshakeResult, 1)
+	go func() {
+		conn := tls.Server(s, serverCfg)
+		_ = conn.SetDeadline(time.Now().Add(2 * time.Second))
+		err := conn.Handshake()
+		srvCh <- tlsHandshakeResult{state: conn.ConnectionState(), err: err}
+	}()
+
+	conn := tls.Client(c, clientCfg)
+	_ = conn.SetDeadline(time.Now().Add(2 * time.Second))
+	clientErr := conn.Handshake()
+	client = tlsHandshakeResult{state: conn.ConnectionState(), err: clientErr}
+	server = <-srvCh
+	return client, server
+}
+
+func mustConfigs(t *testing.T) (clientCfg, serverCfg *tls.Config, a, b *Identity) {
+	t.Helper()
+	a, err := GenerateIdentity() // client/initiator
+	if err != nil {
+		t.Fatal(err)
+	}
+	b, err = GenerateIdentity() // server/responder
+	if err != nil {
+		t.Fatal(err)
+	}
+	clientCfg, err = ClientTLSConfig(a, b.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	serverCfg, err = ServerTLSConfig(b, a.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	return clientCfg, serverCfg, a, b
+}
+
+func TestMTLSHandshakeSucceedsAndExportsSharedSecret(t *testing.T) {
+	clientCfg, serverCfg, _, _ := mustConfigs(t)
+
+	client, server := doHandshake(t, clientCfg, serverCfg)
+	if client.err != nil {
+		t.Fatalf("client handshake failed: %v", client.err)
+	}
+	if server.err != nil {
+		t.Fatalf("server handshake failed: %v", server.err)
+	}
+
+	if client.state.Version != tls.VersionTLS13 {
+		t.Fatalf("negotiated TLS version %#x, want 1.3", client.state.Version)
+	}
+	if client.state.NegotiatedProtocol != ALPN {
+		t.Fatalf("ALPN = %q, want %q", client.state.NegotiatedProtocol, ALPN)
+	}
+
+	cs, err := ExportRootSecret(client.state)
+	if err != nil {
+		t.Fatal(err)
+	}
+	ss, err := ExportRootSecret(server.state)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(cs) != RootSecretLen {
+		t.Fatalf("root secret len = %d, want %d", len(cs), RootSecretLen)
+	}
+	if !bytes.Equal(cs, ss) {
+		t.Fatalf("exported root secrets differ:\n client %x\n server %x", cs, ss)
+	}
+}
+
+func TestMTLSWrongClientPinRejected(t *testing.T) {
+	// Server pins a DIFFERENT key than the client actually holds.
+	client, _ := GenerateIdentity()
+	server, _ := GenerateIdentity()
+	imposter, _ := GenerateIdentity()
+
+	clientCfg, _ := ClientTLSConfig(client, server.PublicKey())
+	// Server expects `imposter`, but the client authenticates as `client`.
+	serverCfg, _ := ServerTLSConfig(server, imposter.PublicKey())
+
+	c, s := doHandshake(t, clientCfg, serverCfg)
+	if s.err == nil {
+		t.Fatal("server accepted a client whose key it did not pin")
+	}
+	if c.err == nil {
+		t.Fatal("client handshake should also fail when server rejects it")
+	}
+}
+
+func TestMTLSWrongServerPinRejected(t *testing.T) {
+	client, _ := GenerateIdentity()
+	server, _ := GenerateIdentity()
+	imposter, _ := GenerateIdentity()
+
+	// Client expects `imposter`, but the server authenticates as `server`.
+	clientCfg, _ := ClientTLSConfig(client, imposter.PublicKey())
+	serverCfg, _ := ServerTLSConfig(server, client.PublicKey())
+
+	c, _ := doHandshake(t, clientCfg, serverCfg)
+	if c.err == nil {
+		t.Fatal("client accepted a server whose key it did not pin")
+	}
+	// The failure must be the pin rejection itself. A handshake that merely ran
+	// into the pipe deadline would also be non-nil, so rule that out explicitly
+	// and then require the verifier's own message.
+	var netErr net.Error
+	if errors.As(c.err, &netErr) && netErr.Timeout() {
+		t.Fatalf("handshake timed out instead of being rejected: %v", c.err)
+	}
+	if !bytes.Contains([]byte(c.err.Error()), []byte("peer key pin mismatch")) {
+		t.Fatalf("expected a pin-mismatch error, got: %v", c.err)
+	}
+}
+
+func TestExportRootSecretRejectsZeroState(t *testing.T) {
+	if _, err := ExportRootSecret(tls.ConnectionState{}); err == nil {
+		t.Fatal("expected error exporting from a non-1.3 (zero) connection state")
+	}
+}
diff --git a/control/transport.go b/control/transport.go
new file mode 100644
index 0000000..0182a92
--- /dev/null
+++ b/control/transport.go
@@ -0,0 +1,246 @@
+package control
+
+import (
+	"bytes"
+	"context"
+	"crypto/ecdsa"
+	"crypto/tls"
+	"errors"
+	"fmt"
+	"net"
+	"time"
+
+	"github.com/quic-go/quic-go"
+)
+
+// appErrNormal is the QUIC application close code used for a clean shutdown.
+const appErrNormal quic.ApplicationErrorCode = 0
+
+// activeMasterKeyIndex is the master key used for new SAs in this first
+// generation. Master-key rotation (PSP's double-rotation) is layered on later;
+// for now both peers always use index 0.
+const activeMasterKeyIndex = 0
+
+// defaultQUICConfig is the control-plane QUIC configuration. Notably it does NOT
+// enable 0-RTT, so every (re)handshake is a full ECDHE exchange — fresh keys
+// per session, i.e. forward secrecy by construction.
+func defaultQUICConfig() *quic.Config {
+	return &quic.Config{
+		HandshakeIdleTimeout: 10 * time.Second,
+		MaxIdleTimeout:       30 * time.Second,
+		KeepAlivePeriod:      10 * time.Second,
+		MaxIncomingStreams:   4,
+	}
+}
+
+// Session is an established control-plane connection: an authenticated,
+// forward-secret QUIC/mTLS channel plus the PSP master keys derived from its
+// TLS exporter. From a Session, peers negotiate the per-direction SAs whose
+// keys feed the Geneve/AF_XDP data plane.
+type Session struct { + conn *quic.Conn + role Role + masterKeys *MasterKeys + rxAlloc *SPIAllocator +} + +// Dial establishes the initiator side of a control session to peerAddr over the +// already-bound UDP socket pconn, authenticating as local and pinning peerPub. +func Dial(ctx context.Context, pconn net.PacketConn, peerAddr net.Addr, local *Identity, peerPub *ecdsa.PublicKey) (*Session, error) { + tlsConf, err := ClientTLSConfig(local, peerPub) + if err != nil { + return nil, err + } + conn, err := quic.Dial(ctx, pconn, peerAddr, tlsConf, defaultQUICConfig()) + if err != nil { + return nil, fmt.Errorf("control: dial: %w", err) + } + return newSession(ctx, conn, Initiator) +} + +// Listener accepts inbound control sessions on a UDP socket. The underlying +// quic.Transport performs Retry-based source-address validation and enforces +// QUIC's 3x anti-amplification limit, which is the handshake-flood defense. +type Listener struct { + ln *quic.Listener + tr *quic.Transport +} + +// Listen returns a control-plane listener on pconn that authenticates as local +// and pins peerPub. +func Listen(pconn net.PacketConn, local *Identity, peerPub *ecdsa.PublicKey) (*Listener, error) { + tlsConf, err := ServerTLSConfig(local, peerPub) + if err != nil { + return nil, err + } + tr := &quic.Transport{Conn: pconn} + ln, err := tr.Listen(tlsConf, defaultQUICConfig()) + if err != nil { + _ = tr.Close() + return nil, fmt.Errorf("control: listen: %w", err) + } + return &Listener{ln: ln, tr: tr}, nil +} + +// Accept blocks until a peer completes the mTLS handshake, then returns the +// established session (responder role). +func (l *Listener) Accept(ctx context.Context) (*Session, error) { + conn, err := l.ln.Accept(ctx) + if err != nil { + return nil, err + } + return newSession(ctx, conn, Responder) +} + +// Addr returns the local address the listener is bound to. +func (l *Listener) Addr() net.Addr { return l.ln.Addr() } + +// Close tears down the listener and its transport. 
+func (l *Listener) Close() error { + err := l.ln.Close() + if cerr := l.tr.Close(); err == nil { + err = cerr + } + return err +} + +// newSession waits for the handshake, derives the master keys from the TLS +// exporter, and returns the ready session. +func newSession(ctx context.Context, conn *quic.Conn, role Role) (*Session, error) { + select { + case <-conn.HandshakeComplete(): + case <-ctx.Done(): + _ = conn.CloseWithError(appErrNormal, "handshake cancelled") + return nil, ctx.Err() + } + + // Assert the negotiated ALPN explicitly. TLS already fails the handshake when + // NextProtos don't overlap (both sides advertise only ALPN), but enforcing the + // invariant in code keeps it true if NextProtos is ever widened, and makes the + // guarantee auditable rather than implied. + tlsState := conn.ConnectionState().TLS + if tlsState.NegotiatedProtocol != ALPN { + _ = conn.CloseWithError(appErrNormal, "alpn mismatch") + return nil, fmt.Errorf("control: unexpected ALPN %q, want %q", tlsState.NegotiatedProtocol, ALPN) + } + + root, err := ExportRootSecret(tlsState) + if err != nil { + _ = conn.CloseWithError(appErrNormal, "exporter failure") + return nil, err + } + mk, err := DeriveMasterKeys(root) + if err != nil { + _ = conn.CloseWithError(appErrNormal, "key derivation failure") + return nil, err + } + return &Session{ + conn: conn, + role: role, + masterKeys: mk, + rxAlloc: NewSPIAllocator(role), + }, nil +} + +// Role reports whether this peer is the initiator or responder. +func (s *Session) Role() Role { return s.role } + +// MasterKeys returns the PSP master keys derived from this session. +func (s *Session) MasterKeys() *MasterKeys { return s.masterKeys } + +// TLSState returns the negotiated TLS connection state (version, cipher suite, +// peer certificate). Useful for logging and for asserting the FIPS suite. +func (s *Session) TLSState() tls.ConnectionState { return s.conn.ConnectionState().TLS } + +// Close cleanly shuts the session down. 
+func (s *Session) Close() error { return s.conn.CloseWithError(appErrNormal, "") } + +// DirectionalSAs is a peer's pair of simplex SAs for one session generation: +// Tx is what we encrypt outbound with (the peer's RX SPI), Rx is what we +// decrypt inbound with (our own RX SPI). +type DirectionalSAs struct { + Tx *SA + Rx *SA +} + +// NegotiateSAs runs the SA-setup exchange over a fresh QUIC stream and returns +// the tx/rx SAs for PSP version v. Each peer allocates and announces its own RX +// SPI; both then derive every key locally from the shared master keys. The +// initiator writes first, the responder replies, so there is no deadlock. +// +// This round-trip is also the mutual key-confirmation: in TLS 1.3 mutual auth +// the initiator's handshake completes before the responder verifies the +// initiator's certificate, so a successful Dial does NOT prove the peer accepted +// us. A peer that fails to pin us tears the connection down, which makes this +// exchange fail. Callers MUST therefore treat a successful NegotiateSAs — not a +// successful Dial/Accept — as the precondition for installing keys (fail-closed). +// +// Concurrency: NOT safe for unmatched concurrent calls on one Session. It pairs +// one initiator OpenStreamSync with one responder AcceptStream, so call it +// sequentially, or have both peers issue the same number of concurrent calls +// (≤ MaxIncomingStreams); a surplus initiator call blocks until a matching +// responder call or the ctx deadline. 
+func (s *Session) NegotiateSAs(ctx context.Context, v PSPVersion) (*DirectionalSAs, error) { + if !v.valid() { + return nil, fmt.Errorf("control: unsupported PSP version %d", v) + } + myRxSPI, err := s.rxAlloc.Allocate(activeMasterKeyIndex) + if err != nil { + return nil, err + } + offer := saOffer{PSPVersion: v, RxSPI: myRxSPI} + + var stream *quic.Stream + if s.role == Initiator { + stream, err = s.conn.OpenStreamSync(ctx) + } else { + stream, err = s.conn.AcceptStream(ctx) + } + if err != nil { + return nil, fmt.Errorf("control: open SA-setup stream: %w", err) + } + defer stream.Close() + if dl, ok := ctx.Deadline(); ok { + _ = stream.SetDeadline(dl) + } + + var peer saOffer + if s.role == Initiator { + if err := writeFrame(stream, offer.marshal()); err != nil { + return nil, fmt.Errorf("control: send SA offer: %w", err) + } + if peer, err = readSAOffer(stream); err != nil { + return nil, fmt.Errorf("control: read peer SA offer: %w", err) + } + } else { + if peer, err = readSAOffer(stream); err != nil { + return nil, fmt.Errorf("control: read peer SA offer: %w", err) + } + if err := writeFrame(stream, offer.marshal()); err != nil { + return nil, fmt.Errorf("control: send SA offer: %w", err) + } + } + + return s.deriveDirectional(v, myRxSPI, peer) +} + +// deriveDirectional derives the tx/rx SAs and enforces the txKey != rxKey +// invariant (the role-partitioned SPI space guarantees distinct SPIs, but we +// assert on the derived keys as a belt-and-suspenders check). 
+func (s *Session) deriveDirectional(v PSPVersion, myRxSPI uint32, peer saOffer) (*DirectionalSAs, error) { + if peer.PSPVersion != v { + return nil, fmt.Errorf("control: PSP version mismatch: local %d, peer %d", v, peer.PSPVersion) + } + rx, err := s.masterKeys.DeriveSA(myRxSPI, v) + if err != nil { + return nil, fmt.Errorf("control: derive rx SA: %w", err) + } + tx, err := s.masterKeys.DeriveSA(peer.RxSPI, v) + if err != nil { + return nil, fmt.Errorf("control: derive tx SA: %w", err) + } + if bytes.Equal(tx.Key, rx.Key) { + return nil, errors.New("control: tx and rx SA keys collided") + } + return &DirectionalSAs{Tx: tx, Rx: rx}, nil +} diff --git a/control/transport_test.go b/control/transport_test.go new file mode 100644 index 0000000..38cabb0 --- /dev/null +++ b/control/transport_test.go @@ -0,0 +1,170 @@ +package control + +import ( + "bytes" + "context" + "crypto/tls" + "net" + "testing" + "time" +) + +// loopbackPeers wires an initiator and responder over two loopback UDP sockets +// and returns their established sessions. 
+func loopbackPeers(t *testing.T) (initiator, responder *Session, cleanup func()) { + t.Helper() + idA, err := GenerateIdentity() // initiator + if err != nil { + t.Fatal(err) + } + idB, err := GenerateIdentity() // responder + if err != nil { + t.Fatal(err) + } + + srvConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + t.Fatal(err) + } + cliConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + t.Fatal(err) + } + + ln, err := Listen(srvConn, idB, idA.PublicKey()) + if err != nil { + t.Fatal(err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + + type res struct { + s *Session + err error + } + respCh := make(chan res, 1) + go func() { + s, err := ln.Accept(ctx) + respCh <- res{s, err} + }() + + initiator, err = Dial(ctx, cliConn, ln.ln.Addr(), idA, idB.PublicKey()) + if err != nil { + cancel() + t.Fatalf("dial: %v", err) + } + r := <-respCh + if r.err != nil { + cancel() + t.Fatalf("accept: %v", r.err) + } + responder = r.s + + cleanup = func() { + cancel() + _ = initiator.Close() + _ = responder.Close() + _ = ln.Close() + _ = cliConn.Close() + } + return initiator, responder, cleanup +} + +func TestControlSessionHandshakeAndSANegotiation(t *testing.T) { + initiator, responder, cleanup := loopbackPeers(t) + defer cleanup() + + // The handshake must be TLS 1.3 with an AES-GCM suite (FIPS-approved). + st := initiator.TLSState() + if st.Version != tls.VersionTLS13 { + t.Fatalf("TLS version %#x, want 1.3", st.Version) + } + switch st.CipherSuite { + case tls.TLS_AES_128_GCM_SHA256, tls.TLS_AES_256_GCM_SHA384: + default: + t.Fatalf("negotiated non-AES-GCM suite %#x", st.CipherSuite) + } + + // Both peers must derive identical master keys from the shared exporter. 
+ if initiator.MasterKeys().keys != responder.MasterKeys().keys { + t.Fatal("peers derived different master keys") + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + type res struct { + sas *DirectionalSAs + err error + } + rCh := make(chan res, 1) + go func() { + sas, err := responder.NegotiateSAs(ctx, PSPv0) + rCh <- res{sas, err} + }() + iSAs, err := initiator.NegotiateSAs(ctx, PSPv0) + if err != nil { + t.Fatalf("initiator NegotiateSAs: %v", err) + } + r := <-rCh + if r.err != nil { + t.Fatalf("responder NegotiateSAs: %v", r.err) + } + rSAs := r.sas + + // Cross-match: what the initiator transmits with == what the responder + // receives with, and vice versa. This holds only if both derived the same + // master keys and agreed on SPIs. + if !bytes.Equal(iSAs.Tx.Key, rSAs.Rx.Key) { + t.Fatal("initiator TX key != responder RX key") + } + if !bytes.Equal(iSAs.Rx.Key, rSAs.Tx.Key) { + t.Fatal("initiator RX key != responder TX key") + } + // Within each peer, tx and rx must differ (no key/SPI collision). + if bytes.Equal(iSAs.Tx.Key, iSAs.Rx.Key) { + t.Fatal("initiator tx and rx keys collided") + } + if iSAs.Tx.SPI != rSAs.Rx.SPI || iSAs.Rx.SPI != rSAs.Tx.SPI { + t.Fatal("SPIs did not cross-match between peers") + } + if MasterKeyIndex(iSAs.Tx.SPI) != activeMasterKeyIndex { + t.Fatalf("tx SPI selects master key %d, want %d", MasterKeyIndex(iSAs.Tx.SPI), activeMasterKeyIndex) + } +} + +func TestControlSessionRejectsUnpinnedPeer(t *testing.T) { + idA, _ := GenerateIdentity() + idB, _ := GenerateIdentity() + imposter, _ := GenerateIdentity() + + srvConn, _ := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + cliConn, _ := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + defer srvConn.Close() + defer cliConn.Close() + + // Responder pins `imposter`, but the initiator authenticates as idA. 
+ ln, err := Listen(srvConn, idB, imposter.PublicKey()) + if err != nil { + t.Fatal(err) + } + defer ln.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + go func() { _, _ = ln.Accept(ctx) }() + + // In TLS 1.3 mutual auth the client's handshake completes before the server + // verifies the client certificate, so Dial may return a session even though + // the responder will reject us. The security property is that we can never + // NEGOTIATE with a peer that does not pin us: the SA-setup round-trip is the + // mutual key-confirmation, and it must fail closed. + sess, err := Dial(ctx, cliConn, ln.Addr(), idA, idB.PublicKey()) + if err != nil { + return // rejected at dial — also acceptable + } + defer sess.Close() + if _, err := sess.NegotiateSAs(ctx, PSPv0); err == nil { + t.Fatal("SA negotiation succeeded against a responder that pinned a different key") + } +} diff --git a/go.mod b/go.mod index f890d00..98e708e 100644 --- a/go.mod +++ b/go.mod @@ -7,11 +7,12 @@ require ( github.com/cilium/ebpf v0.18.0 github.com/google/gopacket v1.1.19 github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 + github.com/quic-go/quic-go v0.59.1 github.com/safchain/ethtool v0.6.1 - github.com/stretchr/testify v1.10.0 + github.com/stretchr/testify v1.11.1 github.com/vishvananda/netlink v1.3.1 - golang.org/x/sync v0.15.0 - golang.org/x/sys v0.33.0 + golang.org/x/sync v0.16.0 + golang.org/x/sys v0.35.0 gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 ) @@ -21,7 +22,8 @@ require ( github.com/google/go-cmp v0.7.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect - golang.org/x/net v0.39.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/time v0.7.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 97b31f3..bab901a 100644 --- a/go.sum +++ b/go.sum @@ -20,40 +20,47 @@ 
github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 h1:C8IqpV7kfAyZD github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9/go.mod h1:dDLiSjNqdp8VjphLdGTx19OeAUsHOzhtc1FFJqpzWMU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/quic-go/quic-go v0.59.1 h1:0Gmua0HW1Tv7ANR7hUYwRyD0MG5OJfgvYSZasGZzBic= +github.com/quic-go/quic-go v0.59.1/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/safchain/ethtool v0.6.1 h1:mhRnXE1H8fV8TTXh/HdqE4tXtb57r//BQh5pPYMuM5k= github.com/safchain/ethtool v0.6.1/go.mod h1:JzoNbG8xeg/BeVeVoMCtCb3UPWoppZZbFpA+1WFh+M0= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= +go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= +go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 
+golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod 
h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 h1:BEymU11L8DZSC4GNK48JYIR8EcHs+gFxtg9YfYlp68c= From f7332abb343f80e872c4780d79cc237df0791d93 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 13:11:26 -0700 Subject: [PATCH 05/20] [handler] bind the AES-GCM nonce to the SPI and guard the key-install seam (APO-644) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reshape the data plane toward the PSP nonce model and harden the key-install path, keeping the Geneve/AF_XDP wire format unchanged. Nonce = epoch‖counter. The 12-byte GCM nonce was 0x00000000‖counter64; the high 4 bytes are now the 32-bit SPI (the value already carried in the Geneve key-epoch option). 
All four TX sites (VirtToPhy, ToPhy, and the in-place equivalents) write it before Seal; both decap sites verify nonce[:4] == the selected epoch before Open and drop + count RXDropsSPIMismatch on a mismatch. GCM already authenticates these bytes (nonce + header are in the tag), so the check is fast-path tamper-rejection and a precise drop reason rather than new authenticity. The decap parsers now also require the TxCounter option's declared 12-byte length before trusting its value as the nonce. Key-install seam. UpdateVirtualNetworkKeys is split into a guarded public seam over an unguarded installKeys. The seam fails closed on three invariants: a non-zero epoch (matching control/sa.go's reserved-SPI rule), a strictly increasing epoch, and rxKey != txKey. The last is load-bearing: both simplex directions share the single epoch, so the SPI prefix is identical inbound and outbound and the keys are the only thing separating the two directions' nonce spaces — equal keys would collide nonces under one key. Scope / honesty: - This closes in-process re-install/rotation reuse and adds RX SPI tamper rejection. The cross-restart static-key case (counter resets to 0 under a reused key) is NOT closed here: the monotonicity guard compares in-memory state only. Restart safety comes from ephemeral per-session keys (the control plane) or durable epoch/counter state — Phase 4. The doc on UpdateVirtualNetworkKeys states this and the per-VNI serialization precondition. - A single shared epoch cannot represent the control plane's distinct tx/rx SPIs; a TODO marks where Phase 4 wires them in. - Hard flag day: a Phase-3 receiver drops frames from a pre-Phase-3 sender (nonce[:4]=0 != epoch). Peers must upgrade together per VNI; preserving the old all-zero-prefix format would reopen the reuse hazard. Tests. 
The single-handler loopback tests (byte-equivalence, round-trip, fuzz, bench) use one shared key, which the distinct-key guard now forbids, so they route through a test-only InstallKeysForTest seam (export_test.go, compiled only under `go test`). The foreign-package forwarder crypto test is split into two peer handlers with distinct keys. Adds TestUpdateVirtualNetworkKeysGuards and TestRXRejectsSPINonceMismatch. The in-place vs cross-buffer fuzz oracle still agrees in both directions after the nonce binding. Reviewed adversarially (22-agent workflow): no must-fix; the confirmed findings (restart-reuse scoping, shared-epoch phrasing, flag-day, reserved-SPI, option length) are folded in as guards and precise comments above. --- export_test.go | 24 ++++++ forwarder/forwarder_crypto_test.go | 57 ++++++++----- handler.go | 102 ++++++++++++++++++++++- handler_test.go | 129 ++++++++++++++++++++++++++--- inplace_bench_test.go | 4 +- inplace_transform.go | 36 +++++++- inplace_transform_test.go | 9 +- 7 files changed, 322 insertions(+), 39 deletions(-) create mode 100644 export_test.go diff --git a/export_test.go b/export_test.go new file mode 100644 index 0000000..cef6163 --- /dev/null +++ b/export_test.go @@ -0,0 +1,24 @@ +package icx + +import ( + "fmt" + "time" +) + +// InstallKeysForTest installs RX/TX ciphers under epoch without the production +// monotonicity and distinct-key guards enforced by UpdateVirtualNetworkKeys. +// +// It exists only for in-process loopback tests that encrypt and decrypt on a +// single handler with one shared key (the byte-equivalence, round-trip, fuzz and +// benchmark harnesses). Real peers always derive distinct per-direction keys and +// strictly increasing SPIs, so the guarded UpdateVirtualNetworkKeys deliberately +// rejects that shape — hence this unguarded test seam. The file name ends in +// _test.go, so it is compiled only under `go test` and never ships in the +// production binary or public API. 
+func (h *Handler) InstallKeysForTest(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { + value, ok := h.networkByID.Load(vni) + if !ok { + return fmt.Errorf("VNI %d not found", vni) + } + return h.installKeys(value.(*VirtualNetwork), epoch, rxKey, txKey, expiresAt) +} diff --git a/forwarder/forwarder_crypto_test.go b/forwarder/forwarder_crypto_test.go index fe60924..9f96b27 100644 --- a/forwarder/forwarder_crypto_test.go +++ b/forwarder/forwarder_crypto_test.go @@ -30,16 +30,17 @@ import ( // forwarder decapsulates it in place and emits the recovered inner packet on the // virt interface, byte-for-byte. // -// Construction (single self-keyed handler, option (b1) from the test plan): -// - One *icx.Handler with a single shared key and one route whose Src and Dst -// both cover the inner addresses, so a frame its two-buffer VirtToPhy encaps -// can be decapped by its in-place PhyToVirtInPlace (decap looks the vnet up -// by VNI and validates the inner source against route.Dst; it does NOT check -// the outer underlay source, so a self-encap/decap loop is valid). -// - The encap is done OFFLINE with VirtToPhy to mint a real encrypted frame; -// the frame is injected via a raw AF_PACKET socket on the phy peer (the same -// XDP-redirect primitive TestForwarderRXHeadroom uses), and the decapped -// inner frame is read with a second raw socket on the virt peer. +// Construction (two peer handlers — the production model, since +// UpdateVirtualNetworkKeys rejects equal rx/tx keys): +// - encapH mints genuinely-encrypted frames OFFLINE with VirtToPhy using +// txKey == abKey; the forwarder's handler h decapsulates them in place with +// rxKey == abKey. Both share one VNI and a route whose Src and Dst cover the +// inner addresses, so encap routing and decap source validation both pass +// (decap looks the vnet up by VNI and validates the inner source against +// route.Dst; it does NOT check the outer underlay source). 
+// - The minted frame is injected via a raw AF_PACKET socket on the phy peer +// (the same XDP-redirect primitive TestForwarderRXHeadroom uses), and the +// decapped inner frame is read with a second raw socket on the virt peer. func TestForwarderCryptoRoundTrip(t *testing.T) { netAdmin, _ := permissions.IsNetAdmin() if !netAdmin { @@ -78,20 +79,38 @@ func TestForwarderCryptoRoundTrip(t *testing.T) { LinkAddr: tcpip.LinkAddress("\x02\x00\x00\x00\x0a\x02"), } + const vni uint = 0x1234 + prefix := netip.MustParsePrefix("10.99.0.0/24") + routes := []icx.Route{{Src: prefix, Dst: prefix}} + + // Two handlers model the two real peers. The A->B direction key (abKey) is + // shared, but each peer's own rx/tx keys differ — UpdateVirtualNetworkKeys + // rejects equal rx/tx keys, since in the shared-epoch nonce layout the key is + // the only thing separating the two directions' nonce spaces. + var abKey, encapRx, hTx [16]byte + copy(abKey[:], []byte("icx-roundtrip-k!")) + copy(encapRx[:], []byte("icx-encap-rxkey!")) + copy(hTx[:], []byte("icx-decap-txkey!")) + expires := time.Now().Add(time.Hour) + + // h: the forwarder's handler. It decapsulates inbound frames with rxKey=abKey. h, err := icx.NewHandler( icx.WithLocalAddr(localUnderlay), icx.WithVirtMAC(virtMAC), ) require.NoError(t, err) + require.NoError(t, h.AddVirtualNetwork(vni, remoteUnderlay, routes)) + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, abKey, hTx, expires)) - const vni uint = 0x1234 - prefix := netip.MustParsePrefix("10.99.0.0/24") - require.NoError(t, h.AddVirtualNetwork(vni, remoteUnderlay, - []icx.Route{{Src: prefix, Dst: prefix}})) - - var key [16]byte - copy(key[:], []byte("icx-roundtrip-k!")) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + // encapH: an offline peer used only to mint genuinely-encrypted frames with + // txKey=abKey (so h can decrypt them); it is never wired to the forwarder. 
+ encapH, err := icx.NewHandler( + icx.WithLocalAddr(localUnderlay), + icx.WithVirtMAC(virtMAC), + ) + require.NoError(t, err) + require.NoError(t, encapH.AddVirtualNetwork(vni, remoteUnderlay, routes)) + require.NoError(t, encapH.UpdateVirtualNetworkKeys(vni, 1, encapRx, abKey, expires)) // Build the forwarder with the REAL handler (not the identity pipe). fwd, err := forwarder.NewForwarder(h, @@ -162,7 +181,7 @@ func TestForwarderCryptoRoundTrip(t *testing.T) { // counter, so every injected frame carries a unique nonce and clears the // replay filter. phyBuf := make([]byte, 2048) - n, handled := h.VirtToPhy(virtFrame, phyBuf) + n, handled := encapH.VirtToPhy(virtFrame, phyBuf) require.Greater(t, n, 0, "offline encap produced no frame") require.False(t, handled) enc := phyBuf[:n] diff --git a/handler.go b/handler.go index 4e65493..7902cd8 100644 --- a/handler.go +++ b/handler.go @@ -4,6 +4,7 @@ import ( "crypto/aes" "crypto/cipher" "encoding/binary" + "errors" "fmt" "log/slog" "net" @@ -45,6 +46,10 @@ type Statistics struct { RXReplayDrops atomic.Uint64 // RXDecryptErrors is the number of received packets that failed decryption. RXDecryptErrors atomic.Uint64 + // RXDropsSPIMismatch is the number of received packets dropped because the + // SPI bound into the AEAD nonce (nonce[:4]) did not match the key epoch the + // frame selected — a malformed or tampered frame (APO-644). + RXDropsSPIMismatch atomic.Uint64 // RXInvalidSrc is the number of received packets with an invalid source address. RXInvalidSrc atomic.Uint64 // TXPackets is the number of transmitted packets. @@ -347,9 +352,27 @@ func (h *Handler) UpdateVirtualNetworkRoutes(vni uint, allowedRoutes []Route) er return nil } -// UpdateVirtualNetworkKeys sets/rotates the encryption keys for a virtual network. -// This must be called atleast once every 24 hours or after `replay.RekeyAfterMessages` -// messages. The epoch must be a monotonically increasing value. 
+// UpdateVirtualNetworkKeys sets/rotates the encryption keys for a virtual +// network. It must be called at least once every 24 hours or after +// replay.RekeyAfterMessages messages. +// +// epoch is the 32-bit SPI that selects this security association: it is carried +// in the Geneve key-epoch option and bound into the high 4 bytes of the AES-GCM +// nonce (nonce = epoch‖counter). Under the current shared-epoch model the same +// epoch is used for both simplex directions, so the SPI prefix does NOT separate +// them — the distinct rx/tx keys do (see the guards below). The SPI binding's +// value is RX tamper-rejection/auditability and forward-compatibility with +// per-direction SPIs. +// +// Three fail-closed guards require a non-zero epoch, a strictly increasing +// epoch, and rxKey != txKey. IMPORTANT: the monotonicity guard compares against +// in-memory state, so it holds only within a single process lifetime — it cannot +// detect an epoch reused across a restart. Restart safety therefore depends on +// the caller never reusing an (epoch, key) pair: with ephemeral per-session keys +// (the control plane, Phase 4) a restart yields fresh keys and is safe; with +// static persisted keys it is NOT safe absent durable epoch/counter state (the +// residual APO-644 case). Callers must also serialize installs per VNI; the +// guard→install sequence is not internally locked. func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { @@ -357,6 +380,41 @@ func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey } vnet := value.(*VirtualNetwork) + // Reserved-SPI guard: epoch 0 is reserved. Rejecting it keeps the data + // plane's accepted SPI space aligned with the control plane, which never + // emits an SPI whose low 31 bits are zero (control/sa.go), and refuses to + // write the all-zero nonce prefix that predated the SPI binding. 
+ if epoch == 0 { + return errors.New("epoch (SPI) must be non-zero") + } + + // Monotonicity guard: within this process the epoch (SPI) must strictly + // increase. Reinstalling under a live or older SPI would reset that SA's TX + // counter and replay window while its key — and thus its nonce space — has + // already been used (a GCM nonce-reuse hazard). This covers in-process + // re-install/rotation only; cross-restart safety is discussed in the doc above. + if cur := vnet.txCipher.Load(); cur != nil && epoch <= cur.epoch { + return fmt.Errorf("epoch must be monotonically increasing: new %d <= current %d", epoch, cur.epoch) + } + + // Distinct-key guard: both simplex directions share the epoch, so the SPI in + // the nonce is identical inbound and outbound. The only thing separating the + // two directions' (key, nonce) spaces is then the key itself; equal rx/tx + // keys would collide nonces under one key — catastrophic for AES-GCM. Real + // peers always derive distinct per-direction keys (control.DeriveSA over + // role-partitioned SPIs). + if rxKey == txKey { + return errors.New("rx and tx keys must differ: each direction requires its own key") + } + + return h.installKeys(vnet, epoch, rxKey, txKey, expiresAt) +} + +// installKeys builds and installs the RX/TX ciphers for epoch, applies the 30s +// grace period to the previous RX key, and sweeps expired RX keys. It is the +// unguarded mechanism behind UpdateVirtualNetworkKeys; the reserved-SPI, +// monotonicity, and distinct-key guards live in that caller. 
+func (h *Handler) installKeys(vnet *VirtualNetwork, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { // Set grace period (30s) on the previous RX key, if it exists if txCipher := vnet.txCipher.Load(); txCipher != nil { prevEpoch := txCipher.epoch @@ -470,7 +528,14 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { if opt.Class == geneve.ClassExperimental { switch opt.Type { case geneve.OptionTypeTxCounter: - nonce = opt.Value[:12] + // Require the declared 12-byte (Length=3) value so nonce[:4] (the + // SPI) and the counter are provably sender-written, not stale pooled + // bytes from a short/malformed option — keeps the SPI-mismatch drop + // attribution honest. A wrong length leaves nonce nil → the + // "Expected TX counter" drop below. + if opt.Length == 3 { + nonce = opt.Value[:12] + } case geneve.OptionTypeKeyEpoch: epoch = binary.BigEndian.Uint32(opt.Value[:4]) } @@ -498,6 +563,19 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } + // Verify the SPI bound into the nonce matches the epoch that selected this + // SA (nonce = SPI‖counter). A conformant sender always sets nonce[:4] to the + // key epoch; a mismatch is a malformed or tampered frame. GCM would also + // reject it at Open (the nonce and the header both feed the tag), but the + // explicit check makes the binding auditable and gives a precise drop reason. + // (APO-644) + if spi := binary.BigEndian.Uint32(nonce[:4]); spi != epoch { + slog.Debug("Dropping frame: nonce SPI does not match key epoch", + slog.Uint64("epoch", uint64(epoch)), slog.Uint64("nonceSPI", uint64(spi))) + vnet.Stats.RXDropsSPIMismatch.Add(1) + return 0 + } + txCounter := binary.BigEndian.Uint64(nonce[4:]) var ipPacket []byte @@ -792,7 +870,15 @@ func (h *Handler) VirtToPhy(virtFrame, phyFrame []byte) (int, bool) { return 0, false } + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. 
Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is letting RX reject a tampered/mismatched SPI and forward-compat + // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. + // TODO(phase4): carry the control plane's distinct tx/rx SPIs, which a single + // shared epoch cannot represent. Both halves must be written before Seal. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) switch ipVersion { @@ -911,7 +997,15 @@ func (h *Handler) ToPhy(phyFrame []byte) int { // Fill options: epoch + nonce/counter binary.BigEndian.PutUint32(hdr.Options[0].Value[:4], txCipher.epoch) + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is letting RX reject a tampered/mismatched SPI and forward-compat + // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. + // TODO(phase4): carry the control plane's distinct tx/rx SPIs, which a single + // shared epoch cannot represent. Both halves must be written before Seal. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) // Place Geneve payload inside outer UDP frame. 
diff --git a/handler_test.go b/handler_test.go index bbf6b97..ed0290f 100644 --- a/handler_test.go +++ b/handler_test.go @@ -13,6 +13,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" "github.com/apoxy-dev/icx" + "github.com/apoxy-dev/icx/udp" ) func TestHandler(t *testing.T) { @@ -47,7 +48,7 @@ func TestHandler(t *testing.T) { err = h.AddVirtualNetwork(0x12345, peerAddr, []icx.Route{{Src: wildcardPrefix, Dst: wildcardPrefix}}) require.NoError(t, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(t, err) virtFrame := makeIPv4UDPEthernetFrame(virtMAC) @@ -110,7 +111,7 @@ func TestHandler_Layer3(t *testing.T) { err = h.AddVirtualNetwork(0x12345, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(t, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(t, err) ipPacket := makeIPv4UDPPacket() @@ -151,7 +152,7 @@ func TestHandler_Layer3_IPv6(t *testing.T) { // Prefix contains src 2001:db8::1 privatePrefix := netip.MustParsePrefix("2001:db8::/64") require.NoError(t, h.AddVirtualNetwork(0x45678, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x45678, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(0x45678, 1, key, key, time.Now().Add(time.Hour))) ip6 := makeIPv6UDPPacket() phy := make([]byte, 1500) @@ -189,7 +190,7 @@ func TestUpdateVirtualNetworkRoutes(t *testing.T) { privatePrefix := netip.MustParsePrefix("192.168.1.0/24") err = h.AddVirtualNetwork(0x23456, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(t, err) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x23456, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, 
h.InstallKeysForTest(0x23456, 1, key, key, time.Now().Add(time.Hour))) virt := makeIPv4UDPEthernetFrame(tcpip.GetRandMacAddr()) phy := make([]byte, 1500) @@ -249,7 +250,7 @@ func TestKeyRotation(t *testing.T) { require.NoError(t, h.AddVirtualNetwork(vni, peer, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) // Epoch 1 - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k1, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, k1, k1, clk.Now().Add(time.Hour))) ip := makeIPv4UDPPacket() phy := make([]byte, 2000) @@ -267,7 +268,7 @@ func TestKeyRotation(t *testing.T) { epoch1B := append([]byte(nil), phy[:n]...) // Rotate to epoch 2; epoch 1 gets grace - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 2, k2, k2, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 2, k2, k2, clk.Now().Add(time.Hour))) // Within grace: one of the saved epoch-1 frames must decrypt. m := h.PhyToVirt(epoch1A, out) @@ -286,13 +287,13 @@ func TestKeyRotation(t *testing.T) { epoch2A := append([]byte(nil), phy[:n]...) // Rotate to epoch 3 (starts grace for epoch 2) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 3, k3, k3, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 3, k3, k3, clk.Now().Add(time.Hour))) // Let epoch-2 grace expire. clk.Advance(31 * time.Second) // Rotate to epoch 4; expired RX keys should be swept here - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 4, k4, k4, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 4, k4, k4, clk.Now().Add(time.Hour))) // The saved epoch-2 frame should now be rejected (no matching key after cleanup). m = h.PhyToVirt(epoch2A, out[:cap(out)]) @@ -307,6 +308,114 @@ func TestKeyRotation(t *testing.T) { require.Equal(t, ip, out[:m]) } +// TestUpdateVirtualNetworkKeysGuards exercises the three fail-closed guards on +// the production key-install seam: the epoch (SPI) must be non-zero, it must +// strictly increase, and the rx/tx keys must differ. 
+func TestUpdateVirtualNetworkKeysGuards(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x9999 + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var k1, k2 [16]byte + copy(k1[:], []byte("aaaaaaaaaaaaaaaa")) + copy(k2[:], []byte("bbbbbbbbbbbbbbbb")) + exp := time.Now().Add(time.Hour) + + // Equal rx/tx keys are rejected even on the first install. + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k1, exp), + "equal rx/tx keys must be rejected") + + // epoch 0 (reserved SPI) is rejected. + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 0, k1, k2, exp), + "epoch 0 (reserved SPI) must be rejected") + + // Distinct keys with a fresh, non-zero epoch succeed. + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k2, exp)) + + // Reinstalling the same epoch is rejected (must strictly increase). + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 1, k2, k1, exp), + "same epoch must be rejected (monotonicity)") + + // A higher epoch with distinct keys succeeds. + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 3, k2, k1, exp)) + + // A non-zero epoch lower than the current one is rejected (monotonicity). + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 2, k1, k2, exp), + "lower epoch must be rejected (monotonicity)") +} + +// TestRXRejectsSPINonceMismatch proves the RX side rejects a frame whose nonce +// SPI (nonce[:4]) does not match the key epoch it selected, on BOTH the +// cross-buffer and in-place decap paths, and that TX binds the SPI into the +// nonce in the first place (the offset sanity checks below). 
+func TestRXRejectsSPINonceMismatch(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x1234 + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var key [16]byte + copy(key[:], []byte("0123456789abcdef")) + // Loopback (single shared key) so this one handler both encaps and decaps; + // the production seam would reject equal rx/tx keys. + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) + + // Mint a real encrypted frame; TX binds epoch 1 into nonce[:4]. + ip := makeIPv4UDPPacket() + phy := make([]byte, 1500) + n, loop := h.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + + // The Geneve header sits at the UDP payload offset; its layout is + // base(8) + KeyEpoch option(hdr 4 + value 4) + TxCounter option(hdr 4 + value 12). + // So the key-epoch value and the nonce (= TxCounter value, nonce[:4] = SPI) + // live at these absolute offsets within the IPv4-underlay physical frame. + const geneveBase, optHdr, epochValLen = 8, 4, 4 + keyEpochOff := udp.PayloadOffsetIPv4 + geneveBase + optHdr + nonceOff := keyEpochOff + epochValLen + optHdr + require.Equal(t, uint32(1), binary.BigEndian.Uint32(phy[keyEpochOff:keyEpochOff+4]), + "sanity: key-epoch option should carry epoch 1") + require.Equal(t, uint32(1), binary.BigEndian.Uint32(phy[nonceOff:nonceOff+4]), + "TX must bind the SPI (epoch 1) into nonce[:4]") + + // Tamper nonce[:4] so it no longer matches the key epoch (which stays 1, so + // the RX side still selects the installed cipher and reaches the SPI check). + tampered := append([]byte(nil), phy[:n]...) 
+ tampered[nonceOff] = 0xFF + require.NotEqual(t, uint32(1), binary.BigEndian.Uint32(tampered[nonceOff:nonceOff+4])) + require.Equal(t, uint32(1), binary.BigEndian.Uint32(tampered[keyEpochOff:keyEpochOff+4]), + "key epoch must remain 1 so the SPI check, not the key lookup, rejects the frame") + + vnet, ok := h.GetVirtualNetwork(vni) + require.True(t, ok) + + // Cross-buffer decap drops it and counts an SPI mismatch. + out := make([]byte, 1500) + require.Zero(t, h.PhyToVirt(append([]byte(nil), tampered...), out)) + require.Equal(t, uint64(1), vnet.Stats.RXDropsSPIMismatch.Load()) + + // In-place decap drops it identically (placed at a non-zero offset). + buf := make([]byte, len(tampered)+256) + const off = 64 + copy(buf[off:off+len(tampered)], tampered) + gotOff, gotLen := h.PhyToVirtInPlace(buf, off, len(tampered)) + require.Zero(t, gotLen) + require.Zero(t, gotOff) + require.Equal(t, uint64(2), vnet.Stats.RXDropsSPIMismatch.Load()) +} + func TestARPRequest_Loopback(t *testing.T) { if testing.Verbose() { slog.SetLogLoggerLevel(slog.LevelDebug) @@ -378,7 +487,7 @@ func TestNeighborSolicitation_Loopback(t *testing.T) { privatePrefix := netip.MustParsePrefix("2001:db8::/64") require.NoError(t, h.AddVirtualNetwork(0x56789, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x56789, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(0x56789, 1, key, key, time.Now().Add(time.Hour))) nsFrame := makeIPv6NeighborSolicitationEthernetFrame() phy := make([]byte, 2000) @@ -450,7 +559,7 @@ func BenchmarkHandler(b *testing.B) { err = h.AddVirtualNetwork(vni, remoteAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(b, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(b, err) virtMAC := tcpip.GetRandMacAddr() diff --git 
a/inplace_bench_test.go b/inplace_bench_test.go index 99c4922..3a8090c 100644 --- a/inplace_bench_test.go +++ b/inplace_bench_test.go @@ -55,8 +55,8 @@ func newBenchEnv(b *testing.B, tc inplaceTestCase) *inplaceEnv { if err := h.AddVirtualNetwork(vni, remoteAddr, routes); err != nil { b.Fatalf("AddVirtualNetwork: %v", err) } - if err := h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour)); err != nil { - b.Fatalf("UpdateVirtualNetworkKeys: %v", err) + if err := h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour)); err != nil { + b.Fatalf("InstallKeysForTest: %v", err) } vnet, ok := h.GetVirtualNetwork(vni) if !ok { diff --git a/inplace_transform.go b/inplace_transform.go index bd5d5f6..02a3d96 100644 --- a/inplace_transform.go +++ b/inplace_transform.go @@ -109,7 +109,14 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { if opt.Class == geneve.ClassExperimental { switch opt.Type { case geneve.OptionTypeTxCounter: - nonce = opt.Value[:12] + // Require the declared 12-byte (Length=3) value so nonce[:4] (the + // SPI) and the counter are provably sender-written, not stale pooled + // bytes from a short/malformed option — keeps the SPI-mismatch drop + // attribution honest. A wrong length leaves nonce nil → the + // "Expected TX counter" drop below. Mirrors PhyToVirt exactly. + if opt.Length == 3 { + nonce = opt.Value[:12] + } case geneve.OptionTypeKeyEpoch: epoch = binary.BigEndian.Uint32(opt.Value[:4]) } @@ -137,6 +144,19 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } + // Verify the SPI bound into the nonce matches the epoch that selected this + // SA (nonce = SPI‖counter). A conformant sender always sets nonce[:4] to the + // key epoch; a mismatch is a malformed or tampered frame. 
GCM would also + // reject it at Open (the nonce and the header both feed the tag), but the + // explicit check makes the binding auditable and gives a precise drop reason. + // (APO-644). Mirrors PhyToVirt exactly to preserve byte-equivalence. + if spi := binary.BigEndian.Uint32(nonce[:4]); spi != epoch { + slog.Debug("Dropping frame: nonce SPI does not match key epoch", + slog.Uint64("epoch", uint64(epoch)), slog.Uint64("nonceSPI", uint64(spi))) + vnet.Stats.RXDropsSPIMismatch.Add(1) + return dropWindowOffset, 0 + } + txCounter := binary.BigEndian.Uint64(nonce[4:]) // In-place decap: the ciphertext (payload[hdrLen:]) lives at ctStart within @@ -471,7 +491,14 @@ func (h *Handler) VirtToPhyInPlace(buf []byte, off, length int) (int, int, bool) return dropWindowOffset, 0, false } + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is letting RX reject a tampered/mismatched SPI and forward-compat + // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. + // Must match the cross-buffer VirtToPhy/ToPhy nonce layout exactly. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) switch ipVersion { @@ -690,7 +717,14 @@ func (h *Handler) ToPhyInPlace(buf []byte, off int) (int, int) { // Fill options: epoch + nonce/counter binary.BigEndian.PutUint32(hdr.Options[0].Value[:4], txCipher.epoch) + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is letting RX reject a tampered/mismatched SPI and forward-compat + // with per-direction SPIs. 
The low 8 bytes are the per-SA monotonic counter. + // Must match the cross-buffer VirtToPhy/ToPhy nonce layout exactly. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) // Place Geneve payload inside outer UDP frame. diff --git a/inplace_transform_test.go b/inplace_transform_test.go index bb7ae13..955680d 100644 --- a/inplace_transform_test.go +++ b/inplace_transform_test.go @@ -185,10 +185,13 @@ func newInplaceEnv(t *testing.T, tc inplaceTestCase) *inplaceEnv { // Use a single key for both RX and TX so that frames encrypted with the TX // cipher can be decrypted with the RX cipher (the round-trip and decap - // equivalence tests run encap then decap on the same handler). + // equivalence tests run encap then decap on the same handler). This loopback + // shape requires the unguarded InstallKeysForTest seam: the production + // UpdateVirtualNetworkKeys rejects equal rx/tx keys (real peers use distinct + // per-direction keys). 
key := generateKey(t) require.NoError(t, h.AddVirtualNetwork(vni, remoteAddr, routes)) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) vnet, ok := h.GetVirtualNetwork(vni) require.True(t, ok) @@ -482,7 +485,7 @@ func newInplaceEnvKeepAlive(t *testing.T, tc inplaceTestCase, interval time.Dura }} key := generateKey(t) require.NoError(t, h.AddVirtualNetwork(vni, remoteAddr, routes)) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) vnet, ok := h.GetVirtualNetwork(vni) require.True(t, ok) From faaac88c888b536336e9e03a1fbab37c2af178ee Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 13:19:33 -0700 Subject: [PATCH 06/20] [cli] document enforced rx!=tx + restart nonce-reuse caveat (APO-644) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflect the key-install guards added in f7332ab: rx and tx must differ and the epoch must strictly increase within a process, so the CLI now refuses an INI whose rx==tx. Spell out that a restart re-reads the INI at epoch 1 with the TX counter reset to 0, so restarting against an unchanged key file reuses the AES-GCM nonce sequence — rotate to fresh keys or use a key-exchange mechanism. --- cli/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cli/README.md b/cli/README.md index 5a69e54..994ad28 100644 --- a/cli/README.md +++ b/cli/README.md @@ -5,6 +5,14 @@ ICX uses a pair of **ephemeral, per-session** symmetric keys for encrypting traffic. **Do not reuse keys** across sessions (to avoid nonce reuse risks). 
+ICX enforces two invariants when keys are installed and will refuse the +keys otherwise: `rx` and `tx` **must differ** (each direction needs its own key), +and the key epoch must **strictly increase** within a running process. Note that +a restart re-reads the INI and starts again at epoch 1 with the TX counter reset to +0, so **do not restart against an unchanged key file** — rotate to fresh keys +(below) or use a key-exchange mechanism; otherwise the AES-GCM nonce sequence is +reused under the same key. + In production, use a secure key exchange mechanism (e.g., IKEv2) to generate and distribute keys. From 8a052c3d48848495e2a288961b073c545ad2ccc0 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 15:31:16 -0700 Subject: [PATCH 07/20] [control] add control-plane tunnel orchestrator wiring SAs into the datapath (APO-648) Drive the QUIC/mTLS control plane into the data plane. A Tunnel establishes the session, negotiates the first SA pair fail-closed, and keeps keys fresh: the initiator rekeys on a timer (reacting to QUIC connection-context loss), the responder serves rekeys from a blocking accept loop. Any negotiation error is session-fatal and triggers reconnect-with-backoff (no retry on a dead session); a rejected install (epoch regression after a reconnect) is logged and swallowed so the data plane keeps its current keys and fails closed on their expiry. Bridge the control plane's distinct, role-partitioned per-direction SPIs onto the handler's single shared epoch via SharedEpoch (the initiator-allocated / bit30==0 SPI): both peers install the identical scalar epoch while deriving distinct per-direction keys from the distinct SPIs, so AES-GCM nonce uniqueness rests on the distinct-key guard. This needs no handler/seam change; carrying the genuine per-direction SPI on the wire is a future additive UpdateVirtualNetworkSAs.
Add SelectMode (static XOR control-plane, fail-closed, no silent fallback), CanonicalInitiator (deterministic role election by SPKI order, equal keys rejected), and Session.Context for prompt session-loss detection. Tests: SelectMode truth table; CanonicalInitiator ordering + equal-key reject; SharedEpoch agreement/validation; installSAs PSPv0/16-byte fail-closed and swallow-on-rejection; two-peer loopback bring-up, rekey, pin-mismatch fail-closed, and reconnect self-heal; plus an end-to-end control-plane->handler Geneve round-trip proving SharedEpoch interoperates with zero drops while the naive per-direction Tx.SPI epoch drops every frame. --- control/cp.go | 446 +++++++++++++++++++++++++++++++++++++++++++ control/cp_test.go | 375 ++++++++++++++++++++++++++++++++++++ control/transport.go | 5 + cp_wire_test.go | 174 +++++++++++++++++ 4 files changed, 1000 insertions(+) create mode 100644 control/cp.go create mode 100644 control/cp_test.go create mode 100644 cp_wire_test.go diff --git a/control/cp.go b/control/cp.go new file mode 100644 index 0000000..cdd137f --- /dev/null +++ b/control/cp.go @@ -0,0 +1,446 @@ +package control + +import ( + "bytes" + "context" + "crypto/ecdsa" + "crypto/x509" + "errors" + "fmt" + "log/slog" + "net" + "time" +) + +// This file is the control-plane orchestrator: it drives the QUIC/mTLS session +// (control/transport.go) and feeds the negotiated SAs into the data plane via an +// SAInstaller, so the CLI stays thin and the wiring is unit-testable off Linux. +// +// Data-plane epoch model (this build): the handler carries a SINGLE 32-bit epoch +// per security association for both simplex directions (handler.go), while the +// control plane allocates a DISTINCT, role-partitioned SPI per direction. 
We bridge +// the two with SharedEpoch: both peers install the same scalar epoch (the +// initiator-allocated SPI) but derive DISTINCT per-direction keys from the distinct +// SPIs, so AES-GCM nonce uniqueness rests on the keys differing (the handler's +// rxKey != txKey guard), exactly as documented at handler.go. Carrying the genuine +// per-direction SPI on the wire (true per-direction nonce spaces) is the additive +// UpdateVirtualNetworkSAs follow-up (Option C); it is intentionally out of scope here. + +// Mode is the keying mode selected from the CLI flags. +type Mode int + +const ( + ModeNone Mode = iota + // ModeStatic is the legacy static pre-shared keys loaded from an INI file. + ModeStatic + // ModeControlPlane is the QUIC/mTLS control plane with ephemeral, forward-secret, + // per-session keys. + ModeControlPlane +) + +func (m Mode) String() string { + switch m { + case ModeStatic: + return "static" + case ModeControlPlane: + return "control-plane" + default: + return "none" + } +} + +// SelectMode resolves the keying mode from which flags are set, fail-closed. +// Exactly one mode must be configured: either static keys (--key-file) OR the +// control plane (--identity-key AND --peer-key). Any other combination — both, the +// control plane half-configured, or nothing — is an error. There is deliberately no +// silent fallback from the control plane to static keys. 
+func SelectMode(hasKeyFile, hasIdentity, hasPeer bool) (Mode, error) { + cp := hasIdentity || hasPeer + switch { + case hasKeyFile && cp: + return ModeNone, errors.New("conflicting keying modes: set --key-file (static) OR --identity-key/--peer-key (control plane), not both") + case cp: + if !hasIdentity || !hasPeer { + return ModeNone, errors.New("control-plane mode requires both --identity-key and --peer-key") + } + return ModeControlPlane, nil + case hasKeyFile: + return ModeStatic, nil + default: + return ModeNone, errors.New("no keying configured: set --key-file (static) or --identity-key and --peer-key (control plane)") + } +} + +// CanonicalInitiator reports whether the local node is the control-plane initiator +// — the peer that dials. The role is elected deterministically from the two pinned +// identities so both ends agree with zero configuration (WireGuard-style): the node +// whose SubjectPublicKeyInfo DER sorts lower is the initiator, the other listens. +// Identical keys are rejected — a node must not tunnel to itself, and equal keys +// would make both ends pick the same role (double-dial / double-listen deadlock). +func CanonicalInitiator(localPub, peerPub *ecdsa.PublicKey) (bool, error) { + if localPub == nil || peerPub == nil { + return false, errors.New("control: nil identity key") + } + if PublicKeyEqual(localPub, peerPub) { + return false, errors.New("control: local and peer identity keys are identical; peers must have distinct keys") + } + l, err := x509.MarshalPKIXPublicKey(localPub) + if err != nil { + return false, fmt.Errorf("control: marshal local key: %w", err) + } + p, err := x509.MarshalPKIXPublicKey(peerPub) + if err != nil { + return false, fmt.Errorf("control: marshal peer key: %w", err) + } + return bytes.Compare(l, p) < 0, nil +} + +// roleBit is the SPI bit that encodes the allocating role (see sa.go): the +// initiator allocates SPIs with this bit clear, the responder with it set. 
+const roleBit = uint32(1) << spiRoleShift + +// SharedEpoch derives the single data-plane epoch both peers install for a +// negotiated SA generation. Of the two role-partitioned SPIs the peer holds +// ({Tx, Rx}), exactly one was allocated by the initiator (role bit clear); both +// peers select that one and compute the identical value, because the initiator's +// Rx SPI is the responder's Tx SPI. The epoch is what lands in the Geneve key-epoch +// option and nonce[:4]; the distinct per-direction KEYS still come from the distinct +// SPIs, so the two directions never share a (key, nonce) pair. +// +// This selection is well-defined only while the master-key index (SPI bit31) is 0 +// for both directions, which holds until master-key rotation is introduced; rotation +// is a known incompatibility for the shared-epoch bridge and is the trigger for the +// per-direction-SPI follow-up. +func SharedEpoch(sas *DirectionalSAs) (uint32, error) { + if sas == nil || sas.Tx == nil || sas.Rx == nil { + return 0, errors.New("control: nil SAs") + } + if MasterKeyIndex(sas.Tx.SPI) != 0 || MasterKeyIndex(sas.Rx.SPI) != 0 { + return 0, errors.New("control: SharedEpoch requires master-key index 0 (rotation not yet supported)") + } + txInitiator := sas.Tx.SPI&roleBit == 0 + rxInitiator := sas.Rx.SPI&roleBit == 0 + if txInitiator == rxInitiator { + return 0, fmt.Errorf("control: SAs are not role-partitioned (tx=%#08x rx=%#08x)", sas.Tx.SPI, sas.Rx.SPI) + } + if txInitiator { + return sas.Tx.SPI, nil + } + return sas.Rx.SPI, nil +} + +// SAInstaller installs a negotiated SA generation into the data plane. epoch is the +// shared data-plane epoch (see SharedEpoch); rxKey/txKey are the 16-byte AES-128 keys +// for the receive/transmit directions. The installer owns the key lifetime/expiry and +// is expected to enforce the handler's fail-closed guards (non-zero, strictly +// increasing epoch, rxKey != txKey). 
A returned error is treated as a rejected +// rotation, not a session failure. +type SAInstaller func(epoch uint32, rxKey, txKey [16]byte) error + +// Default lifecycle timings; overridable on Tunnel for tests. +const ( + defaultPerExchangeTimeout = 10 * time.Second + defaultReconnectBackoff = 5 * time.Second +) + +// Tunnel runs the control-plane lifecycle for one peer: it establishes the QUIC/mTLS +// session, performs the initial SA negotiation and install (fail-closed) in Bringup, +// and then keeps the SAs fresh in Run — the initiator drives rekeys on a timer, the +// responder serves them from an accept loop. A Tunnel is not safe for concurrent use; +// Bringup then Run are called once each, in that order. +type Tunnel struct { + local *Identity + peerPub *ecdsa.PublicKey + conn net.PacketConn + peerAddr net.Addr + rekeyIvl time.Duration + install SAInstaller + initiator bool + + // tunables (defaults set by NewTunnel; tests may override) + perExchangeTimeout time.Duration + reconnectBackoff time.Duration + + ln *Listener // responder only; persists across reconnects + sess *Session +} + +// TunnelConfig is the immutable configuration for a Tunnel. +type TunnelConfig struct { + // Local is this node's long-term identity (its private key). + Local *Identity + // PeerPub is the pinned public key of the single expected peer. + PeerPub *ecdsa.PublicKey + // Conn is the bound control-plane UDP socket (separate from the Geneve data port). + Conn net.PacketConn + // PeerAddr is the peer's control-plane address (peer IP + control port). + PeerAddr net.Addr + // RekeyInterval is how often the initiator negotiates a fresh SA generation. + RekeyInterval time.Duration +} + +// NewTunnel validates the config, elects the canonical role, and returns a Tunnel +// ready for Bringup. It does no I/O. 
+func NewTunnel(cfg TunnelConfig, install SAInstaller) (*Tunnel, error) { + if cfg.Local == nil || cfg.PeerPub == nil { + return nil, errors.New("control: tunnel requires local identity and peer key") + } + if cfg.Conn == nil || cfg.PeerAddr == nil { + return nil, errors.New("control: tunnel requires a control socket and peer address") + } + if install == nil { + return nil, errors.New("control: tunnel requires an SA installer") + } + if cfg.RekeyInterval <= 0 { + return nil, errors.New("control: rekey interval must be positive") + } + initiator, err := CanonicalInitiator(cfg.Local.PublicKey(), cfg.PeerPub) + if err != nil { + return nil, err + } + return &Tunnel{ + local: cfg.Local, + peerPub: cfg.PeerPub, + conn: cfg.Conn, + peerAddr: cfg.PeerAddr, + rekeyIvl: cfg.RekeyInterval, + install: install, + initiator: initiator, + perExchangeTimeout: defaultPerExchangeTimeout, + reconnectBackoff: defaultReconnectBackoff, + }, nil +} + +// Initiator reports the elected role (true = this node dials). +func (t *Tunnel) Initiator() bool { return t.initiator } + +// Bringup establishes the session and performs the first SA negotiation and install. +// It is synchronous and FAIL-CLOSED: it returns an error (and installs nothing) if +// the handshake, negotiation, or install fails, so the caller must not start the data +// plane until Bringup succeeds. +func (t *Tunnel) Bringup(ctx context.Context) error { + if err := t.establish(ctx); err != nil { + return fmt.Errorf("control: establish session: %w", err) + } + if err := t.negotiateAndInstall(ctx); err != nil { + t.closeSession() + return fmt.Errorf("control: initial SA negotiation: %w", err) + } + role := "responder" + if t.initiator { + role = "initiator" + } + slog.Info("control plane established", slog.String("role", role), + slog.String("peer", t.peerAddr.String())) + return nil +} + +// Run keeps the SAs fresh until ctx is cancelled. 
The initiator rekeys on its timer +// (and reacts promptly to session loss via the QUIC connection context); the +// responder serves rekeys from a blocking accept loop. A failed negotiation is +// session-fatal: the session is torn down and re-established (fresh, aligned +// allocators) rather than retried on a dead session. Control-plane failures are NOT +// returned: they drive reconnect-with-backoff indefinitely, so Run effectively +// returns only when ctx is cancelled (clean shutdown). If the control plane cannot +// re-establish, the data plane fails closed when the installed keys expire — Run does +// not proactively tear it down. Bringup must have succeeded first. +func (t *Tunnel) Run(ctx context.Context) error { + defer t.Close() + if t.initiator { + return t.runInitiator(ctx) + } + return t.runResponder(ctx) +} + +func (t *Tunnel) runInitiator(ctx context.Context) error { + ticker := time.NewTicker(t.rekeyIvl) + defer ticker.Stop() + for { + sessLost := t.sessionDone() + select { + case <-ctx.Done(): + return nil + case <-sessLost: + slog.Warn("control: session lost, reconnecting") + if err := t.reestablish(ctx); err != nil { + return err + } + case <-ticker.C: + exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) + err := t.negotiateAndInstall(exCtx) + cancel() + if err == nil { + continue + } + if ctx.Err() != nil { + return nil + } + // A monotonicity rejection (epoch regression) surfaces only after a + // reconnect reset the per-session SPI counter; installSAs logs it and + // returns nil, so any error here is a genuine session/transport failure. 
+ slog.Warn("control: rekey failed, reconnecting", slog.Any("error", err)) + if err := t.reestablish(ctx); err != nil { + return err + } + } + } +} + +func (t *Tunnel) runResponder(ctx context.Context) error { + for { + if ctx.Err() != nil { + return nil + } + // The accept loop blocks in NegotiateSAs' AcceptStream until the initiator + // drives the next rekey; the long-lived ctx (no per-exchange deadline) lets it + // wait across the whole interval, and QUIC's MaxIdleTimeout bounds a half-open + // exchange. Errors are session-fatal → reconnect. + if err := t.negotiateAndInstall(ctx); err != nil { + if ctx.Err() != nil { + return nil + } + slog.Warn("control: SA negotiation failed, reconnecting", slog.Any("error", err)) + if err := t.reestablish(ctx); err != nil { + return err + } + } + } +} + +// negotiateAndInstall runs one SA exchange on the live session and installs the +// result. installSAs swallows a rotation rejection (returns nil) so it does not look +// like a transport failure; a non-nil error here means the wire exchange failed. +func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { + sas, err := t.sess.NegotiateSAs(ctx, PSPv0) + if err != nil { + return err + } + return t.installSAs(sas) +} + +// installSAs validates the negotiated SAs fail-closed (PSPv0, 16-byte keys), +// computes the shared epoch, and hands them to the installer. A rejected rotation +// (e.g. the monotonicity guard refusing a regressed epoch after a reconnect) is +// logged and swallowed: the previously installed keys keep forwarding, and the data +// plane fails closed on their own expiry. Seamless reconnect/restart across the guard +// is the deferred durable epoch high-water work. 
+func (t *Tunnel) installSAs(sas *DirectionalSAs) error { + if sas.Tx.Version != PSPv0 || sas.Rx.Version != PSPv0 { + return fmt.Errorf("control: only PSPv0/AES-128 is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) + } + if len(sas.Rx.Key) != 16 || len(sas.Tx.Key) != 16 { + return fmt.Errorf("control: expected 16-byte SA keys (rx=%d tx=%d)", len(sas.Rx.Key), len(sas.Tx.Key)) + } + epoch, err := SharedEpoch(sas) + if err != nil { + return err + } + var rxKey, txKey [16]byte + copy(rxKey[:], sas.Rx.Key) + copy(txKey[:], sas.Tx.Key) + if err := t.install(epoch, rxKey, txKey); err != nil { + slog.Warn("control: SA install rejected; keeping current keys until they expire (seamless reconnect needs durable epoch high-water — deferred)", + slog.Uint64("epoch", uint64(epoch)), slog.Any("error", err)) + return nil + } + slog.Debug("control: installed SA generation", slog.Uint64("epoch", uint64(epoch))) + return nil +} + +// establish opens a fresh session: the initiator dials, the responder accepts on a +// listener it keeps across reconnects. +func (t *Tunnel) establish(ctx context.Context) error { + if t.initiator { + sess, err := Dial(ctx, t.conn, t.peerAddr, t.local, t.peerPub) + if err != nil { + return err + } + t.sess = sess + return nil + } + if t.ln == nil { + ln, err := Listen(t.conn, t.local, t.peerPub) + if err != nil { + return err + } + t.ln = ln + } + sess, err := t.ln.Accept(ctx) + if err != nil { + return err + } + t.sess = sess + return nil +} + +// reestablish tears down the dead session and re-establishes one, backing off +// between attempts so a persistent failure does not hot-loop. It returns an error +// only if ctx is cancelled while waiting. +func (t *Tunnel) reestablish(ctx context.Context) error { + t.closeSession() + for attempt := 0; ; attempt++ { + // Try immediately on the first attempt; back off only between retries so a + // transient loss recovers without an added backoff of latency. 
+ if attempt > 0 && !sleepCtx(ctx, t.reconnectBackoff) { + return ctx.Err() + } + if err := t.establish(ctx); err != nil { + if ctx.Err() != nil { + return ctx.Err() + } + slog.Warn("control: reconnect attempt failed", slog.Any("error", err)) + continue + } + // Re-key immediately on the new session so traffic resumes without waiting a + // full interval. A rejected install (regressed epoch) is swallowed by + // installSAs; a transport error drops back to another reconnect attempt. + exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) + err := t.negotiateAndInstall(exCtx) + cancel() + if err != nil && ctx.Err() == nil { + slog.Warn("control: post-reconnect negotiation failed", slog.Any("error", err)) + t.closeSession() + continue + } + return ctx.Err() + } +} + +// sessionDone returns the current session's done channel, or nil (which blocks +// forever in a select) if there is no live session. +func (t *Tunnel) sessionDone() <-chan struct{} { + if t.sess == nil { + return nil + } + return t.sess.Context().Done() +} + +func (t *Tunnel) closeSession() { + if t.sess != nil { + _ = t.sess.Close() + t.sess = nil + } +} + +// Close releases the session and (responder) listener. It is idempotent. +func (t *Tunnel) Close() error { + t.closeSession() + if t.ln != nil { + err := t.ln.Close() + t.ln = nil + return err + } + return nil +} + +// sleepCtx waits for d or until ctx is done. It reports false if ctx was cancelled. 
+func sleepCtx(ctx context.Context, d time.Duration) bool { + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-ctx.Done(): + return false + case <-timer.C: + return true + } +} diff --git a/control/cp_test.go b/control/cp_test.go new file mode 100644 index 0000000..bf9e79f --- /dev/null +++ b/control/cp_test.go @@ -0,0 +1,375 @@ +package control + +import ( + "bytes" + "context" + "crypto/x509" + "errors" + "net" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSelectMode(t *testing.T) { + cases := []struct { + name string + keyFile, identity, peer bool + want Mode + wantErr bool + }{ + {"static", true, false, false, ModeStatic, false}, + {"control-plane", false, true, true, ModeControlPlane, false}, + {"cp half (identity only)", false, true, false, ModeNone, true}, + {"cp half (peer only)", false, false, true, ModeNone, true}, + {"both modes", true, true, true, ModeNone, true}, + {"static + identity", true, true, false, ModeNone, true}, + {"nothing", false, false, false, ModeNone, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := SelectMode(tc.keyFile, tc.identity, tc.peer) + if tc.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + require.Equal(t, tc.want, got) + }) + } +} + +func TestCanonicalInitiator(t *testing.T) { + a, err := GenerateIdentity() + require.NoError(t, err) + b, err := GenerateIdentity() + require.NoError(t, err) + + aInit, err := CanonicalInitiator(a.PublicKey(), b.PublicKey()) + require.NoError(t, err) + bInit, err := CanonicalInitiator(b.PublicKey(), a.PublicKey()) + require.NoError(t, err) + // Exactly one side is the initiator, and both compute it consistently. + require.NotEqual(t, aInit, bInit) + + // Pin the rule, not just its antisymmetry: the node whose SPKI DER sorts lower is + // the initiator. (A flipped comparison would still pass the NotEqual check above.) 
+ aDER, err := x509.MarshalPKIXPublicKey(a.PublicKey()) + require.NoError(t, err) + bDER, err := x509.MarshalPKIXPublicKey(b.PublicKey()) + require.NoError(t, err) + require.Equal(t, bytes.Compare(aDER, bDER) < 0, aInit, "the lower SPKI must be the initiator") + + _, err = CanonicalInitiator(a.PublicKey(), a.PublicKey()) + require.Error(t, err, "identical keys must be rejected") + _, err = CanonicalInitiator(nil, b.PublicKey()) + require.Error(t, err) +} + +func TestSharedEpoch(t *testing.T) { + iSPI, err := MakeSPI(0, Initiator, 7) // role bit clear + require.NoError(t, err) + rSPI, err := MakeSPI(0, Responder, 3) // role bit set + require.NoError(t, err) + + // Initiator's view: Tx == peer (responder) Rx == rSPI; Rx == own == iSPI. + eInit, err := SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: rSPI}, Rx: &SA{SPI: iSPI}}) + require.NoError(t, err) + require.Equal(t, iSPI, eInit) + + // Responder's view: Tx == peer (initiator) Rx == iSPI; Rx == own == rSPI. + eResp, err := SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: iSPI}, Rx: &SA{SPI: rSPI}}) + require.NoError(t, err) + require.Equal(t, iSPI, eResp) + + // Both peers therefore install the identical epoch. + require.Equal(t, eInit, eResp) + + // Not role-partitioned (both initiator) → error. + _, err = SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: iSPI}, Rx: &SA{SPI: iSPI}}) + require.Error(t, err) + + // Master-key index != 0 is unsupported by the shared-epoch bridge. + hiSPI, err := MakeSPI(1, Initiator, 1) + require.NoError(t, err) + _, err = SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: rSPI}, Rx: &SA{SPI: hiSPI}}) + require.Error(t, err) +} + +// validV0SAs returns a role-partitioned, PSPv0 DirectionalSAs with distinct +// 16-byte keys, as NegotiateSAs would produce. 
+func validV0SAs() *DirectionalSAs {
+	// A fixture that silently carried a zero SPI would make the tests below fail for
+	// the wrong reason, so a MakeSPI failure aborts loudly instead of being dropped.
+	iSPI, err := MakeSPI(0, Initiator, 1)
+	if err != nil {
+		panic("validV0SAs: initiator SPI: " + err.Error())
+	}
+	rSPI, err := MakeSPI(0, Responder, 1)
+	if err != nil {
+		panic("validV0SAs: responder SPI: " + err.Error())
+	}
+
+	// Two distinct, deterministic 16-byte keys: rx = 0..15, tx = 100..115.
+	rx := make([]byte, 16)
+	tx := make([]byte, 16)
+	for i := range rx {
+		rx[i] = byte(i)
+		tx[i] = byte(i + 100)
+	}
+	return &DirectionalSAs{
+		Tx: &SA{SPI: rSPI, Key: tx, Version: PSPv0},
+		Rx: &SA{SPI: iSPI, Key: rx, Version: PSPv0},
+	}
+}
+
+// TestInstallSAsRejectsNonV0 checks that installSAs returns an error for an SA whose
+// version is not PSPv0 and never reaches the installer callback.
+func TestInstallSAsRejectsNonV0(t *testing.T) {
+	// Record the call instead of failing inside the callback: the assertion then runs
+	// on the test goroutine no matter where installSAs invokes the installer from,
+	// and it mirrors TestInstallSAsSwallowsRotationRejection below.
+	called := false
+	tn := &Tunnel{install: func(uint32, [16]byte, [16]byte) error {
+		called = true
+		return nil
+	}}
+	sas := validV0SAs()
+	sas.Tx.Version = PSPv1
+	sas.Tx.Key = make([]byte, 32)
+
+	err := tn.installSAs(sas)
+	require.False(t, called, "installer must not be called for a non-PSPv0 SA")
+	require.Error(t, err)
+}
+
+// TestInstallSAsSwallowsRotationRejection checks that an installer error is not
+// propagated by installSAs, while the installer itself is still invoked.
+func TestInstallSAsSwallowsRotationRejection(t *testing.T) {
+	called := false
+	tn := &Tunnel{install: func(uint32, [16]byte, [16]byte) error {
+		called = true
+		// Mimic the handler's monotonicity guard rejecting a regressed epoch.
+		return errors.New("epoch must be monotonically increasing")
+	}}
+	// A rejected rotation is logged and swallowed (the data plane keeps its current
+	// keys and fails closed on their own expiry); it must not look like a transport
+	// error to the run loop.
+	require.NoError(t, tn.installSAs(validV0SAs()))
+	require.True(t, called)
+}
+
+// twoTunnels wires an initiator and a responder Tunnel over loopback UDP, assigning
+// the canonical roles correctly, with tight timings for tests.
+func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration) (initT, respT *Tunnel, cleanup func()) {
+	t.Helper()
+	idA, err := GenerateIdentity()
+	require.NoError(t, err)
+	idB, err := GenerateIdentity()
+	require.NoError(t, err)
+
+	// Hand the initiator role to whichever identity CanonicalInitiator elects.
+	aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey())
+	require.NoError(t, err)
+	initID, respID := idA, idB
+	if !aInit {
+		initID, respID = idB, idA
+	}
+
+	// Every resource is registered with t.Cleanup as soon as it exists. The returned
+	// cleanup func only reaches the caller on success, so without this a failing
+	// require further down would leak the sockets (and the first tunnel). The extra
+	// Close after an explicit cleanup() is harmless: its error is discarded here.
+	respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = respConn.Close() })
+	initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = initConn.Close() })
+
+	initT, err = NewTunnel(TunnelConfig{
+		Local: initID, PeerPub: respID.PublicKey(), Conn: initConn,
+		PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey,
+	}, instInit)
+	require.NoError(t, err)
+	// Cleanups run last-in-first-out, so the tunnels close before their sockets.
+	t.Cleanup(func() { _ = initT.Close() })
+	require.True(t, initT.Initiator())
+
+	respT, err = NewTunnel(TunnelConfig{
+		Local: respID, PeerPub: initID.PublicKey(), Conn: respConn,
+		PeerAddr: initConn.LocalAddr(), RekeyInterval: rekey,
+	}, instResp)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = respT.Close() })
+	require.False(t, respT.Initiator())
+
+	// Tight timings keep the rekey/reconnect tests fast.
+	for _, tn := range []*Tunnel{initT, respT} {
+		tn.perExchangeTimeout = 2 * time.Second
+		tn.reconnectBackoff = 50 * time.Millisecond
+	}
+
+	// Kept for callers that want deterministic teardown before the test returns.
+	cleanup = func() {
+		_ = initT.Close()
+		_ = respT.Close()
+		_ = initConn.Close()
+		_ = respConn.Close()
+	}
+	return initT, respT, cleanup
+}
+
+// epochRecorder is a thread-safe SAInstaller that records the epochs it installs.
+type epochRecorder struct { + mu sync.Mutex + epochs []uint32 + keys map[uint32][2][16]byte // epoch -> {rx, tx} +} + +func newEpochRecorder() *epochRecorder { return &epochRecorder{keys: map[uint32][2][16]byte{}} } + +func (r *epochRecorder) install(epoch uint32, rxKey, txKey [16]byte) error { + r.mu.Lock() + defer r.mu.Unlock() + r.epochs = append(r.epochs, epoch) + r.keys[epoch] = [2][16]byte{rxKey, txKey} + return nil +} + +func (r *epochRecorder) snapshot() ([]uint32, map[uint32][2][16]byte) { + r.mu.Lock() + defer r.mu.Unlock() + out := append([]uint32(nil), r.epochs...) + cp := make(map[uint32][2][16]byte, len(r.keys)) + for k, v := range r.keys { + cp[k] = v + } + return out, cp +} + +func TestTunnelBringupAndRekey(t *testing.T) { + initRec, respRec := newEpochRecorder(), newEpochRecorder() + initT, respT, cleanup := twoTunnels(t, initRec.install, respRec.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + // Bring both peers up; the responder must be accepting before the initiator dials. + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + // Both installed exactly one (matching) generation in bring-up. + ie, ik := initRec.snapshot() + re, rk := respRec.snapshot() + require.Len(t, ie, 1) + require.Len(t, re, 1) + require.Equal(t, ie[0], re[0], "peers must install the same shared epoch") + require.NotZero(t, ie[0]) + // Cross-derivation: initiator TX == responder RX and vice versa. + require.Equal(t, ik[ie[0]][1], rk[re[0]][0], "initiator tx key != responder rx key") + require.Equal(t, ik[ie[0]][0], rk[re[0]][1], "initiator rx key != responder tx key") + // Within a peer, the two directions use distinct keys. + require.NotEqual(t, ik[ie[0]][0], ik[ie[0]][1]) + + // Run both peers and let the initiator drive a few rekeys. 
+ runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + require.Eventually(t, func() bool { + e, _ := initRec.snapshot() + return len(e) >= 3 + }, 10*time.Second, 20*time.Millisecond, "initiator should rekey several times") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + // Epochs strictly increase per peer, and the two peers agree generation-by-generation. + ie, _ = initRec.snapshot() + re, _ = respRec.snapshot() + for i := 1; i < len(ie); i++ { + require.Greater(t, ie[i], ie[i-1], "initiator epochs must strictly increase") + } + n := len(ie) + if len(re) < n { + n = len(re) + } + require.GreaterOrEqual(t, n, 2) + for i := 0; i < n; i++ { + require.Equal(t, ie[i], re[i], "peers disagree on epoch for generation %d", i) + } +} + +func TestTunnelBringupFailsClosedOnPinMismatch(t *testing.T) { + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + imposter, err := GenerateIdentity() + require.NoError(t, err) + + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer respConn.Close() + initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer initConn.Close() + + mustNotInstall := func(uint32, [16]byte, [16]byte) error { + t.Fatal("keys must never be installed on a pin failure") + return nil + } + + // The initiator pins the real responder, but the responder pins an imposter, so + // the SA-setup round-trip (the mutual key-confirmation) must fail closed. 
+ initT, err := NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, + PeerAddr: respConn.LocalAddr(), RekeyInterval: time.Second, + }, mustNotInstall) + require.NoError(t, err) + respT, err := NewTunnel(TunnelConfig{ + Local: respID, PeerPub: imposter.PublicKey(), Conn: respConn, + PeerAddr: initConn.LocalAddr(), RekeyInterval: time.Second, + }, mustNotInstall) + require.NoError(t, err) + defer initT.Close() + defer respT.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second) + defer cancel() + go func() { _ = respT.Bringup(ctx) }() + require.Error(t, initT.Bringup(ctx), "bring-up must fail when the peer does not pin us") +} + +// TestTunnelReconnects exercises the reestablish state machine: a session loss +// mid-run must tear the dead session down and re-establish a fresh one on both ends, +// resuming rotation, rather than wedging or hot-looping. +func TestTunnelReconnects(t *testing.T) { + initRec, respRec := newEpochRecorder(), newEpochRecorder() + initT, respT, cleanup := twoTunnels(t, initRec.install, respRec.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) + defer cancel() + + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + // Force a session loss before the run loops start: closing the initiator's session + // tears down both ends' QUIC connections, so the initiator detects loss via the + // connection context and the responder's next accept errors out. + require.NoError(t, initT.sess.Close()) + + runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + // Both peers must reconnect and resume installing matching epochs (well past the + // single bring-up generation), proving the reconnect path self-heals. 
+ require.Eventually(t, func() bool { + ie, _ := initRec.snapshot() + re, _ := respRec.snapshot() + return len(ie) >= 3 && len(re) >= 3 + }, 18*time.Second, 25*time.Millisecond, "peers must self-heal and resume rekeying after a session loss") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + // Agreement invariant: the initiator installs a generation only after reading the + // responder's offer, which the responder writes only after it has committed to + // installing — so every epoch the initiator installed must also have been + // installed by the responder. (The reverse can differ by one: a negotiation torn + // by the forced loss after the responder committed but before the initiator + // finished leaves the responder with an extra generation. Per-session allocators + // also reset across the reconnect, so epochs are not globally monotonic — agreement + // in this direction, not monotonicity, is the invariant.) + ie, _ := initRec.snapshot() + _, rk := respRec.snapshot() + for _, e := range ie { + require.Contains(t, rk, e, "responder never installed initiator epoch %d", e) + } +} diff --git a/control/transport.go b/control/transport.go index 0182a92..e4cb1e6 100644 --- a/control/transport.go +++ b/control/transport.go @@ -152,6 +152,11 @@ func (s *Session) MasterKeys() *MasterKeys { return s.masterKeys } // peer certificate). Useful for logging and for asserting the FIPS suite. func (s *Session) TLSState() tls.ConnectionState { return s.conn.ConnectionState().TLS } +// Context returns a context that is cancelled when the underlying QUIC connection +// closes (peer close, idle timeout, or transport error). RunTunnel selects on it to +// detect session loss promptly rather than waiting for the next rekey tick. +func (s *Session) Context() context.Context { return s.conn.Context() } + // Close cleanly shuts the session down. 
func (s *Session) Close() error { return s.conn.CloseWithError(appErrNormal, "") } diff --git a/cp_wire_test.go b/cp_wire_test.go new file mode 100644 index 0000000..c10d64f --- /dev/null +++ b/cp_wire_test.go @@ -0,0 +1,174 @@ +package icx_test + +import ( + "context" + "net" + "net/netip" + "testing" + "time" + + "github.com/stretchr/testify/require" + "gvisor.dev/gvisor/pkg/tcpip" + + "github.com/apoxy-dev/icx" + "github.com/apoxy-dev/icx/control" +) + +// This file proves the Phase 4 control-plane → data-plane bridge end to end: the +// shared-epoch chosen by control.SharedEpoch lets two independently-keyed handlers +// exchange Geneve traffic, and the naive alternative (each peer using its own Tx SPI +// as the epoch) provably drops every frame. The handler is cross-platform, so this +// runs without the AF_XDP forwarder. + +// negotiateLoopback brings up an initiator and a responder control session over +// loopback UDP and returns each peer's negotiated directional SAs. +func negotiateLoopback(t *testing.T) (iSAs, rSAs *control.DirectionalSAs) { + t.Helper() + idA, err := control.GenerateIdentity() + require.NoError(t, err) + idB, err := control.GenerateIdentity() + require.NoError(t, err) + + srv, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + cli, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + t.Cleanup(func() { _ = srv.Close(); _ = cli.Close() }) + + ln, err := control.Listen(srv, idB, idA.PublicKey()) + require.NoError(t, err) + t.Cleanup(func() { _ = ln.Close() }) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + type sres struct { + s *control.Session + err error + } + accCh := make(chan sres, 1) + go func() { + s, err := ln.Accept(ctx) + accCh <- sres{s, err} + }() + iSess, err := control.Dial(ctx, cli, ln.Addr(), idA, idB.PublicKey()) + require.NoError(t, err) + acc := <-accCh + require.NoError(t, acc.err) + rSess 
:= acc.s + t.Cleanup(func() { _ = iSess.Close(); _ = rSess.Close() }) + + type nres struct { + sas *control.DirectionalSAs + err error + } + negCh := make(chan nres, 1) + go func() { + sas, err := rSess.NegotiateSAs(ctx, control.PSPv0) + negCh <- nres{sas, err} + }() + iSAs, err = iSess.NegotiateSAs(ctx, control.PSPv0) + require.NoError(t, err) + neg := <-negCh + require.NoError(t, neg.err) + return iSAs, neg.sas +} + +func newPeerHandler(t *testing.T, vni uint, local, remote tcpip.Address) *icx.Handler { + t.Helper() + h, err := icx.NewHandler( + icx.WithLocalAddr(&tcpip.FullAddress{Addr: local, Port: 6081}), + icx.WithLayer3VirtFrames(), + ) + require.NoError(t, err) + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, &tcpip.FullAddress{Addr: remote, Port: 6081}, + []icx.Route{{Src: prefix, Dst: prefix}})) + return h +} + +func installSAs(t *testing.T, h *icx.Handler, vni uint, epoch uint32, sas *control.DirectionalSAs) { + t.Helper() + require.Len(t, sas.Rx.Key, 16) + require.Len(t, sas.Tx.Key, 16) + var rx, tx [16]byte + copy(rx[:], sas.Rx.Key) + copy(tx[:], sas.Tx.Key) + // Use the real guarded seam the production installer calls. 
+ require.NoError(t, h.UpdateVirtualNetworkKeys(vni, epoch, rx, tx, time.Now().Add(time.Hour))) +} + +func TestControlPlaneSharedEpochGeneveRoundTrip(t *testing.T) { + iSAs, rSAs := negotiateLoopback(t) + + eI, err := control.SharedEpoch(iSAs) + require.NoError(t, err) + eR, err := control.SharedEpoch(rSAs) + require.NoError(t, err) + require.Equal(t, eI, eR, "both peers must derive the identical shared epoch") + + const vni = 0x424344 + addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) + addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()) + hI := newPeerHandler(t, vni, addrA, addrB) + hR := newPeerHandler(t, vni, addrB, addrA) + installSAs(t, hI, vni, eI, iSAs) + installSAs(t, hR, vni, eR, rSAs) + + ip := makeIPv4UDPPacket() + phy := make([]byte, 1500) + out := make([]byte, 1500) + + // initiator -> responder + n, loop := hI.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + m := hR.PhyToVirt(phy[:n], out) + require.NotZero(t, m, "responder must decrypt initiator traffic") + require.Equal(t, ip, out[:m]) + + // responder -> initiator + n, loop = hR.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + m = hI.PhyToVirt(phy[:n], out) + require.NotZero(t, m, "initiator must decrypt responder traffic") + require.Equal(t, ip, out[:m]) + + vnR, ok := hR.GetVirtualNetwork(vni) + require.True(t, ok) + require.Zero(t, vnR.Stats.RXDropsNoKey.Load()) + require.Zero(t, vnR.Stats.RXDropsSPIMismatch.Load()) + require.Equal(t, uint64(1), vnR.Stats.RXPackets.Load()) +} + +func TestControlPlaneNaiveTxSPIEpochDropsTraffic(t *testing.T) { + iSAs, rSAs := negotiateLoopback(t) + + // The naive bridge — each peer installs under its OWN Tx SPI — gives the two + // peers different epochs (the SPIs are role-partitioned), so the receiver's + // rxCiphers lookup misses and every frame drops. This is exactly why SharedEpoch + // is required; assert the failure mode explicitly. 
+ require.NotEqual(t, iSAs.Tx.SPI, rSAs.Tx.SPI) + + const vni = 0x515253 + addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) + addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()) + hI := newPeerHandler(t, vni, addrA, addrB) + hR := newPeerHandler(t, vni, addrB, addrA) + installSAs(t, hI, vni, iSAs.Tx.SPI, iSAs) + installSAs(t, hR, vni, rSAs.Tx.SPI, rSAs) + + ip := makeIPv4UDPPacket() + phy := make([]byte, 1500) + out := make([]byte, 1500) + n, loop := hI.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + m := hR.PhyToVirt(phy[:n], out) + require.Zero(t, m, "naive per-direction Tx.SPI epoch must miss the receiver's rxCiphers and drop") + + vnR, ok := hR.GetVirtualNetwork(vni) + require.True(t, ok) + require.Equal(t, uint64(1), vnR.Stats.RXDropsNoKey.Load()) +} From 2ad8447067fdd8ef88f42d84871a4d278a84cc89 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 15:31:17 -0700 Subject: [PATCH 08/20] [cli] wire the control plane into the tunnel; add genkey/pubkey (APO-644) Add a control-plane keying mode alongside the legacy static INI keys, selected fail-closed: exactly one of --key-file or --identity-key/--peer-key, with no silent fallback between them. In control-plane mode the CLI loads the identity, pins the peer key, binds a dedicated control UDP socket, brings the tunnel up synchronously (the forwarder never starts without keys), and runs the rekey loop and forwarder under one errgroup sharing a cancel. SIGHUP static reload is gated to static mode so exactly one installer touches a VNI. This closes the production APO-644 path: ephemeral, forward-secret per-session keys mean a restart yields fresh keys, so the static-INI restart nonce-reuse hazard does not apply in control-plane mode. New commands: icx genkey (writes a P-256 identity, O_CREATE|O_EXCL 0600, --force to overwrite) and icx pubkey. New flags: --identity-key, --peer-key, --control-port, --peer-control-port, --rekey-interval, --require-fips. 
Reject --control-port == --port (the XDP filter would blackhole the control plane). Use SIGINT/SIGTERM for graceful shutdown. README documents the control plane as the recommended path, marks static keys legacy, and records the startup-ordering and one-sided-restart caveats. --- cli/README.md | 159 ++++++++++++++++------- cli/go.mod | 8 +- cli/go.sum | 22 ++-- cli/main.go | 346 +++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 446 insertions(+), 89 deletions(-) diff --git a/cli/README.md b/cli/README.md index 994ad28..7e6d881 100644 --- a/cli/README.md +++ b/cli/README.md @@ -1,20 +1,113 @@ # InterCloud eXpress (ICX) - CLI -## Usage +ICX encrypts tunnel traffic with AES-128-GCM. Keys can be established two ways: -ICX uses a pair of **ephemeral, per-session** symmetric keys for encrypting traffic. -**Do not reuse keys** across sessions (to avoid nonce reuse risks). +- **Control plane (recommended):** a QUIC/mTLS channel negotiates fresh, + forward-secret, per-session keys and rotates them automatically. This is the only + mode that is safe across restarts. +- **Static keys (legacy):** a pair of pre-shared keys loaded from an INI file and + rotated by hand via `SIGHUP`. Retained for compatibility; see the caveats below. -ICX enforces two invariants when keys are installed and will refuse the -key otherwise: `rx` and `tx` **must differ** (each direction needs its own key), -and the key epoch must **strictly increase** within a running process. Note that -a restart re-reads the INI starting again at epoch 1 with the TX counter reset to -0, so **do not restart against an unchanged key file** — rotate to fresh keys -(below) or use a key-exchange mechanism, otherwise the AES-GCM nonce sequence is -reused under the same key. +The two modes are mutually exclusive and fail closed: configure exactly one. ICX never +silently falls back from the control plane to static keys. 
-In production, use a secure key exchange mechanism (e.g., IKEv2) to -generate and distribute keys. +## Control plane (recommended) + +Each node has a long-term **identity key** (ECDSA P-256). Peers authenticate each other +WireGuard-style by pinning the expected public key — there is no CA. The control channel +runs on its own UDP port (`--control-port`, default `6082`), separate from the Geneve +data port (`--port`, default `6081`); the XDP filter only redirects the data port to +AF_XDP, so the control port rides the normal kernel stack. + +### 1) Generate an identity on each host + +```bash +# Host A +icx genkey --identity-key /etc/icx/identity.pem +# prints Host A's public key (base64) to stderr + +# Host B +icx genkey --identity-key /etc/icx/identity.pem +``` + +`genkey` refuses to overwrite an existing key file (pass `--force` to override). Recover +a public key at any time: + +```bash +icx pubkey --identity-key /etc/icx/identity.pem +``` + +### 2) Exchange public keys + +Distribute each host's public key to the other out of band. The value is what you pass +as the peer's `--peer-key` (it accepts the base64 string directly or a path to a file +containing it). + +### 3) Start ICX on both hosts + +Both hosts run the same command shape; the dialer/listener roles are elected +deterministically from the two public keys, so no extra configuration is needed. Use the +**same `--control-port`** on both ends. + +```bash +# Host A (peer is B's data address) +icx -i eth0 \ + --identity-key /etc/icx/identity.pem \ + --peer-key '' \ + 198.51.100.7:6081 + +# Host B (peer is A's data address) +icx -i eth0 \ + --identity-key /etc/icx/identity.pem \ + --peer-key '' \ + 203.0.113.2:6081 +``` + +ICX establishes the control plane (fail-closed: if the handshake or first negotiation +fails, the tunnel does not come up), installs the negotiated keys, and renegotiates a +fresh security association every `--rekey-interval` (default `2m`). 
Rotation is +make-before-break: the previous receive key is honored for a 30s grace period. + +Relevant flags: + +- `--identity-key PATH` — this node's identity private key. +- `--peer-key STR|PATH` — the peer's pinned public key. +- `--control-port PORT` — control-plane UDP port (default `6082`; must match on both ends). +- `--peer-control-port PORT` — peer's control port if it differs (defaults to `--control-port`). +- `--rekey-interval DUR` — SA rotation period (default `2m`). +- `--require-fips` — refuse to start unless the Go FIPS 140-3 module is active + (build/run with `GODEBUG=fips140=on`). + +### Operational notes + +**Startup ordering.** The peers elect dialer/listener roles from their keys; the dialer +retries the QUIC handshake only for the handshake window (~10s). If the listener is not up +within that window the dialer's process exits (fail-closed — no tunnel comes up). Start both +ends close together, and run under a supervisor (systemd `Restart=always`, a container +restart policy) so a larger startup skew self-heals on restart. Once established, the control +plane reconnects on its own indefinitely. + +**Restart / reconnect.** Control-plane keys are ephemeral, so a *clean* restart is safe: a +new session derives fresh keys, and a reset transmit counter cannot reuse a nonce under a +fresh key. But this build does not persist the per-session epoch high-water mark across a +restart, so a **one-sided** restart does not promptly recover: the survivor keeps the high +epoch from the old session, the restarted peer comes back at a low epoch, and the survivor's +monotonicity guard rejects it — breaking traffic in **both** directions. The link forwards on +the survivor's existing keys until they expire and only re-keys once the new session's epoch +counter climbs back above the survivor's retained high-water mark (many rekey intervals). In +practice, **cycle both peers** to recover immediately. 
Durable epoch state for seamless +one-sided restart is planned follow-up work. + +## Static keys (legacy) + +> Prefer the control plane. Static keys provide **no forward secrecy** and are **not safe +> across restarts**: a restart re-reads the INI starting again at epoch 1 with the TX +> counter reset to 0, so **do not restart against an unchanged key file** — rotate to +> fresh keys (below), otherwise the AES-GCM nonce sequence is reused under the same key. + +ICX enforces two invariants when keys are installed and refuses the key otherwise: `rx` +and `tx` **must differ** (each direction needs its own key), and the key epoch must +**strictly increase** within a running process. ### 1) Generate two one-time keys @@ -27,7 +120,7 @@ K_BA=$(openssl rand -hex 16) ### 2) Create an INI file on each host -Each host reads keys from an INI file at --key-file. The required format is: +Each host reads keys from an INI file at `--key-file`. The required format is: ```ini [keys] @@ -39,23 +132,7 @@ tx=<32 hex chars> # the key this host will TRANSMIT with expires=24h ``` -For Host A: - -```ini -[keys] -rx=${K_BA} -tx=${K_AB} -expires=24h -``` - -For Host B: - -```ini -[keys] -rx=${K_AB} -tx=${K_BA} -expires=24h -``` +For Host A `rx=${K_BA}`, `tx=${K_AB}`; for Host B `rx=${K_AB}`, `tx=${K_BA}`. ### 3) Start ICX on both hosts @@ -63,28 +140,14 @@ expires=24h icx -i --key-file=/path/to/icx.ini : ``` -#### Examples: - -```bash -# Host A -icx -i eth0 --key-file=/etc/icx/keys.ini 203.0.113.2:6081 - -# Host B -icx -i eth0 --key-file=/etc/icx/keys.ini 198.51.100.7:6081 -``` - -This creates an icx0 interface on both hosts, which you can use to securely -send and receive traffic over the ICX tunnel. 
- ### 4) Key rotation (SIGHUP) -To rotate keys, update the same INI file with new rx/tx values, then send -SIGHUP to the running process: +Update the same INI file with new rx/tx values, then send `SIGHUP`: ```bash pkill -HUP icx -# or: kill -HUP ``` -ICX will reload the INI, bump the epoch, and apply the new keys. If the -reloaded keys are identical to the current ones, the reload is refused (epoch unchanged). \ No newline at end of file +ICX reloads the INI, bumps the epoch, and applies the new keys. If the reloaded keys are +identical to the current ones, the reload is refused (epoch unchanged). `SIGHUP` reload is +only active in static mode. diff --git a/cli/go.mod b/cli/go.mod index 8af5765..9b3486c 100644 --- a/cli/go.mod +++ b/cli/go.mod @@ -9,6 +9,7 @@ require ( github.com/google/gopacket v1.1.19 github.com/urfave/cli/v2 v2.27.7 github.com/vishvananda/netlink v1.3.1 + golang.org/x/sync v0.16.0 gopkg.in/ini.v1 v1.67.0 gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 ) @@ -19,12 +20,13 @@ require ( github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/google/btree v1.1.2 // indirect github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 // indirect + github.com/quic-go/quic-go v0.59.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/safchain/ethtool v0.6.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sys v0.35.0 // indirect golang.org/x/time v0.7.0 // indirect ) diff --git a/cli/go.sum b/cli/go.sum index 9b021fd..2c4a788 100644 --- a/cli/go.sum +++ b/cli/go.sum @@ -22,14 +22,16 @@ github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 h1:C8IqpV7kfAyZD github.com/phemmer/go-iptrie 
v0.0.0-20240326174613-ba542f5282c9/go.mod h1:dDLiSjNqdp8VjphLdGTx19OeAUsHOzhtc1FFJqpzWMU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/quic-go/quic-go v0.59.1 h1:0Gmua0HW1Tv7ANR7hUYwRyD0MG5OJfgvYSZasGZzBic= +github.com/quic-go/quic-go v0.59.1/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/safchain/ethtool v0.6.1 h1:mhRnXE1H8fV8TTXh/HdqE4tXtb57r//BQh5pPYMuM5k= github.com/safchain/ethtool v0.6.1/go.mod h1:JzoNbG8xeg/BeVeVoMCtCb3UPWoppZZbFpA+1WFh+M0= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= @@ -38,23 +40,27 @@ github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zd github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod 
h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= +go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= diff --git a/cli/main.go b/cli/main.go index a445ee9..9836c0b 100644 --- a/cli/main.go +++ b/cli/main.go @@ -4,9 +4,12 @@ package main import ( "context" + "crypto/ecdsa" + "crypto/fips140" "encoding/hex" "errors" "fmt" + "io" "log/slog" "math" "net" @@ -23,9 +26,11 @@ import ( "github.com/google/gopacket/pcapgo" "github.com/urfave/cli/v2" "github.com/vishvananda/netlink" + "golang.org/x/sync/errgroup" "gvisor.dev/gvisor/pkg/tcpip" "github.com/apoxy-dev/icx" + "github.com/apoxy-dev/icx/control" "github.com/apoxy-dev/icx/filter" "github.com/apoxy-dev/icx/forwarder" "github.com/apoxy-dev/icx/mac" @@ -38,7 +43,12 @@ import ( func main() { app := &cli.App{ - Name: "icx", + Name: "icx", + Usage: "InterCloud eXpress — an AF_XDP Geneve L3 tunnel", + Commands: []*cli.Command{ + genkeyCommand(), + pubkeyCommand(), + }, Flags: []cli.Flag{ &cli.StringFlag{ Name: "log-level", @@ -46,11 +56,14 @@ func main() { Usage: "Set the logging level (debug, info, warn, error, fatal, panic)", Value: "info", }, + // NOTE: --interface and --key-file are intentionally NOT marked Required. + // urfave/cli enforces required root flags before dispatching to a + // subcommand, which would make `icx genkey`/`icx pubkey` unrunnable. They + // are validated inside run() instead. 
&cli.StringFlag{ - Name: "interface", - Aliases: []string{"i"}, - Usage: "Physical network interface to use", - Required: true, + Name: "interface", + Aliases: []string{"i"}, + Usage: "Physical network interface to use (required for the tunnel)", }, &cli.UintFlag{ Name: "vni", @@ -59,9 +72,34 @@ func main() { Value: 1, }, &cli.StringFlag{ - Name: "key-file", - Usage: "Path to INI file containing keys (rx, tx, optional expires)", - Required: true, + Name: "key-file", + Usage: "Path to INI file with static keys (rx, tx, optional expires). Legacy/static mode; prefer the control plane (--identity-key/--peer-key)", + }, + &cli.StringFlag{ + Name: "identity-key", + Usage: "Path to this node's identity private key (PKCS#8 PEM from `icx genkey`). Enables the control plane", + }, + &cli.StringFlag{ + Name: "peer-key", + Usage: "Peer's identity public key (base64 SPKI from `icx pubkey`, or a path to a file containing it). Enables the control plane", + }, + &cli.IntFlag{ + Name: "control-port", + Usage: "UDP port for the QUIC/mTLS control plane (must match on both peers)", + Value: 6082, + }, + &cli.IntFlag{ + Name: "peer-control-port", + Usage: "Peer's control-plane UDP port (defaults to --control-port)", + }, + &cli.DurationFlag{ + Name: "rekey-interval", + Usage: "How often the control plane negotiates a fresh security association", + Value: 2 * time.Minute, + }, + &cli.BoolFlag{ + Name: "require-fips", + Usage: "Refuse to start unless the Go FIPS 140-3 module is active (GODEBUG=fips140=on)", }, &cli.IntFlag{ Name: "port", @@ -105,6 +143,101 @@ func main() { if err := app.Run(os.Args); err != nil { slog.Error("Error running app", slog.Any("error", err)) + os.Exit(1) + } +} + +// genkeyCommand generates a fresh ECDSA P-256 identity private key. 
+func genkeyCommand() *cli.Command {
+	return &cli.Command{
+		Name:  "genkey",
+		Usage: "Generate a new ECDSA P-256 identity private key (PKCS#8 PEM)",
+		Flags: []cli.Flag{
+			&cli.StringFlag{
+				Name:    "identity-key",
+				Aliases: []string{"o"},
+				Usage:   "Path to write the private key (default: stdout)",
+			},
+			&cli.BoolFlag{
+				Name:  "force",
+				Usage: "Overwrite an existing key file",
+			},
+		},
+		Action: func(c *cli.Context) error {
+			id, err := control.GenerateIdentity()
+			if err != nil {
+				return err
+			}
+			pemBytes, err := id.MarshalPrivatePEM()
+			if err != nil {
+				return err
+			}
+
+			path := c.String("identity-key")
+			if path == "" {
+				_, err = os.Stdout.Write(pemBytes)
+				return err
+			}
+
+			// O_EXCL refuses to clobber an existing private key; --force switches to O_TRUNC instead.
+			flags := os.O_CREATE | os.O_EXCL | os.O_WRONLY
+			if c.Bool("force") {
+				flags = os.O_CREATE | os.O_TRUNC | os.O_WRONLY
+			}
+			f, err := os.OpenFile(path, flags, 0o600)
+			if err != nil {
+				return fmt.Errorf("create identity key %q (use --force to overwrite): %w", path, err)
+			}
+			_, err = f.Write(pemBytes) // the file is closed just below even when this write fails
+			if err = errors.Join(err, f.Close()); err != nil { // a Close error must not be dropped for a key file
+				return fmt.Errorf("write identity key %q: %w", path, err)
+			}
+			pub, err := id.PublicKeyString()
+			if err != nil {
+				return err
+			}
+			fmt.Fprintf(os.Stderr, "wrote identity key to %s\npublic key (share as the peer's --peer-key):\n%s\n", path, pub)
+			return nil
+		},
+	}
+}
+
+// pubkeyCommand derives the base64(SPKI) public key from an identity private key.
+func pubkeyCommand() *cli.Command { + return &cli.Command{ + Name: "pubkey", + Usage: "Print the public key (base64 SPKI) for an identity private key (from --identity-key or stdin)", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "identity-key", + Aliases: []string{"i"}, + Usage: "Path to the identity private key (default: read PEM from stdin)", + }, + }, + Action: func(c *cli.Context) error { + var ( + data []byte + err error + ) + if path := c.String("identity-key"); path != "" { + data, err = os.ReadFile(path) + } else { + data, err = io.ReadAll(os.Stdin) + } + if err != nil { + return err + } + id, err := control.LoadIdentityPEM(data) + if err != nil { + return err + } + pub, err := id.PublicKeyString() + if err != nil { + return err + } + fmt.Println(pub) + return nil + }, } } @@ -120,6 +253,23 @@ func run(c *cli.Context) error { } slog.SetLogLoggerLevel(level) + // Resolve the keying mode (static INI vs control plane) up front, fail-closed: + // exactly one must be configured, and there is no silent fallback between them. + keyFile := c.String("key-file") + identityKey := c.String("identity-key") + peerKey := c.String("peer-key") + mode, err := control.SelectMode(keyFile != "", identityKey != "", peerKey != "") + if err != nil { + return err + } + + if c.String("interface") == "" { + return errors.New("--interface is required") + } + if c.Bool("require-fips") && !fips140.Enabled() { + return errors.New("--require-fips set but the Go FIPS 140-3 module is not active; build and run with GODEBUG=fips140=on") + } + if cpuProfilePath := c.String("cpu-profile"); cpuProfilePath != "" { f, err := os.Create(cpuProfilePath) if err != nil { @@ -160,7 +310,9 @@ func run(c *cli.Context) error { } } - ctx, cancel := signal.NotifyContext(c.Context, os.Interrupt, os.Kill) + // SIGINT for interactive use, SIGTERM for systemd/container stop. (os.Kill / + // SIGKILL cannot be caught, so registering it would be a no-op.) 
+ ctx, cancel := signal.NotifyContext(c.Context, syscall.SIGINT, syscall.SIGTERM) defer cancel() isNetAdmin, err := permissions.IsNetAdmin() @@ -252,31 +404,88 @@ func run(c *cli.Context) error { {Src: netip.MustParsePrefix("::/0"), Dst: netip.MustParsePrefix("::/0")}, } - if err := h.AddVirtualNetwork(c.Uint("vni"), peerAddr, allRoutes); err != nil { + vni := c.Uint("vni") + if err := h.AddVirtualNetwork(vni, peerAddr, allRoutes); err != nil { return fmt.Errorf("failed to add virtual network: %w", err) } - var ( - epoch uint32 = 1 // initial epoch - rxKey, txKey [16]byte - expiresAt time.Time + // Keying: install the data-plane keys (and arrange ongoing rotation) according to + // the selected mode. tun is non-nil only in control-plane mode. + var tun *control.Tunnel + switch mode { + case control.ModeStatic: + if err := runStaticKeying(ctx, c, h, vni); err != nil { + return err + } + case control.ModeControlPlane: + var ctrlConn *net.UDPConn + tun, ctrlConn, err = startControlPlane(ctx, c, h, vni, peerUDPAddr) + if err != nil { + return err + } + defer func() { _ = tun.Close() }() + // The Tunnel only borrows the control socket; own its lifetime here. + defer func() { _ = ctrlConn.Close() }() + } + + fwd, err := forwarder.NewForwarder( + h, + forwarder.WithPhyName(phyName), + forwarder.WithVirtName(vethDev.Peer.Attrs().Name), + forwarder.WithPhyFilter(ingressFilter), + forwarder.WithPcapWriter(pcapWriter), ) + if err != nil { + return fmt.Errorf("failed to create forwarder: %w", err) + } + + // In control-plane mode, run the rekey loop and the forwarder under one errgroup + // sharing a cancel: a signal (ctx) or a forwarder error stops both. The control + // plane reconnects indefinitely on its own (Tunnel.Run does not return on CP + // failures), so it does not abort the forwarder; if it cannot re-establish, the + // data plane fails closed when the installed keys expire (see key lifetime below). 
+ if tun != nil { + g, gctx := errgroup.WithContext(ctx) + g.Go(func() error { return tun.Run(gctx) }) + g.Go(func() error { + if err := fwd.Start(gctx); err != nil && !errors.Is(err, context.Canceled) { + return fmt.Errorf("forwarder: %w", err) + } + return nil + }) + if err := g.Wait(); err != nil && !errors.Is(err, context.Canceled) { + return err + } + return nil + } + + if err := fwd.Start(ctx); err != nil && !errors.Is(err, context.Canceled) { + return fmt.Errorf("failed to start forwarder: %w", err) + } + return nil +} + +// runStaticKeying installs the static INI keys and starts the SIGHUP reload loop. +// This is the legacy path; it is gated to ModeStatic so a control-plane deployment +// never has a competing installer touching the same VNI. +func runStaticKeying(ctx context.Context, c *cli.Context, h *icx.Handler, vni uint) error { keyFile := c.String("key-file") - rxKey, txKey, expiresAt, err = loadKeysFromINI(keyFile) + epoch := uint32(1) // initial epoch + rxKey, txKey, expiresAt, err := loadKeysFromINI(keyFile) if err != nil { return err } - - if err := h.UpdateVirtualNetworkKeys(c.Uint("vni"), epoch, rxKey, txKey, expiresAt); err != nil { + if err := h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, expiresAt); err != nil { return fmt.Errorf("failed to update virtual network key: %w", err) } + slog.Warn("using static INI keys (legacy mode); prefer the control plane (--identity-key/--peer-key). " + + "A restart re-reads the INI at epoch 1 with the TX counter reset, so do not restart against an unchanged key file") var keyMu sync.Mutex sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGHUP) - // SIGHUP reload handler. 
go func() { for { select { @@ -305,7 +514,7 @@ func run(c *cli.Context) error { expiresAt = newExpires epoch++ - if err := h.UpdateVirtualNetworkKeys(c.Uint("vni"), epoch, rxKey, txKey, expiresAt); err != nil { + if err := h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, expiresAt); err != nil { slog.Error("Failed to apply reloaded keys", slog.Any("error", err)) } else { slog.Info("Reloaded keys", @@ -317,23 +526,100 @@ func run(c *cli.Context) error { } } }() + return nil +} - fwd, err := forwarder.NewForwarder( - h, - forwarder.WithPhyName(phyName), - forwarder.WithVirtName(vethDev.Peer.Attrs().Name), - forwarder.WithPhyFilter(ingressFilter), - forwarder.WithPcapWriter(pcapWriter), - ) +// startControlPlane builds the QUIC/mTLS control-plane tunnel and performs the +// initial, fail-closed SA negotiation and install. The returned Tunnel's Run drives +// ongoing rekeys; the caller is responsible for running it and closing the Tunnel. +func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni uint, peerUDPAddr *net.UDPAddr) (*control.Tunnel, *net.UDPConn, error) { + controlPort := c.Int("control-port") + if controlPort == c.Int("port") { + return nil, nil, errors.New("--control-port must differ from the Geneve data port (--port); the XDP filter redirects the data port to AF_XDP and would blackhole the control plane") + } + + ident, err := loadIdentity(c.String("identity-key")) if err != nil { - return fmt.Errorf("failed to create forwarder: %w", err) + return nil, nil, err + } + peerPub, err := readPeerKey(c.String("peer-key")) + if err != nil { + return nil, nil, err } - if err := fwd.Start(ctx); err != nil && !errors.Is(err, context.Canceled) { - return fmt.Errorf("failed to start forwarder: %w", err) + ctrlNet := "udp4" + if peerUDPAddr.IP.To4() == nil { + ctrlNet = "udp6" + } + pconn, err := net.ListenUDP(ctrlNet, &net.UDPAddr{Port: controlPort}) + if err != nil { + return nil, nil, fmt.Errorf("bind control socket on port %d: %w", 
controlPort, err) } - return nil + peerControlPort := c.Int("peer-control-port") + if peerControlPort == 0 { + peerControlPort = controlPort + } + peerControlAddr := &net.UDPAddr{IP: peerUDPAddr.IP, Port: peerControlPort} + + rekeyIvl := c.Duration("rekey-interval") + // Bound the installed-key lifetime: long enough that a few missed rekeys do not + // drop traffic, short enough to fail closed if rotation stops, capped at the + // handler's recommended 24h rekey ceiling. + keyLifetime := 4 * rekeyIvl + if keyLifetime < time.Hour { + keyLifetime = time.Hour + } + if keyLifetime > 24*time.Hour { + keyLifetime = 24 * time.Hour + } + + installer := func(epoch uint32, rxKey, txKey [16]byte) error { + return h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, time.Now().Add(keyLifetime)) + } + + tun, err := control.NewTunnel(control.TunnelConfig{ + Local: ident, + PeerPub: peerPub, + Conn: pconn, + PeerAddr: peerControlAddr, + RekeyInterval: rekeyIvl, + }, installer) + if err != nil { + _ = pconn.Close() + return nil, nil, err + } + + slog.Info("establishing control plane", + slog.String("peer-control", peerControlAddr.String()), + slog.Bool("initiator", tun.Initiator()), + slog.Duration("rekey-interval", rekeyIvl), + ) + if err := tun.Bringup(ctx); err != nil { + _ = tun.Close() + _ = pconn.Close() + return nil, nil, fmt.Errorf("control plane bring-up failed: %w", err) + } + return tun, pconn, nil +} + +// loadIdentity reads and parses this node's identity private key from a PEM file. +func loadIdentity(path string) (*control.Identity, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read identity key %q: %w", path, err) + } + return control.LoadIdentityPEM(data) +} + +// readPeerKey resolves a --peer-key value, which may be the base64(SPKI) string +// directly or a path to a file containing it. 
+func readPeerKey(val string) (*ecdsa.PublicKey, error) { + s := strings.TrimSpace(val) + if data, err := os.ReadFile(val); err == nil { + s = strings.TrimSpace(string(data)) + } + return control.ParsePublicKey(s) } // loadKeysFromINI loads rx/tx (hex, 16-byte each) and optional expires from an INI file. From 4f707b65f5da1e29d6ea8618601dd0f035513349 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 17:33:30 -0700 Subject: [PATCH 09/20] [handler] pin the per-epoch TX-counter reset invariant (APO-648) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Durable-epoch seeding makes the data-plane epoch climb monotonically across rekeys and restarts; AES-GCM nonce uniqueness then rests on each epoch install starting its own TX counter at zero (nonce = epoch‖counter). That reset already happens (installKeys stores a fresh transmitCipher), but nothing pinned it, so a future refactor that carried the counter across installs would silently reintroduce (key, nonce) reuse. Document the invariant at the install site and add a white-box test (TxCounterForTest seam + TestInstallResetsTxCounterPerEpoch) that installs two epochs and asserts the counter resets to zero and counts from 1 again. --- cp_wire_test.go | 45 +++++++++++++++++++++++++++++++++++++++++++++ export_test.go | 16 ++++++++++++++++ handler.go | 6 ++++++ 3 files changed, 67 insertions(+) diff --git a/cp_wire_test.go b/cp_wire_test.go index c10d64f..0105b96 100644 --- a/cp_wire_test.go +++ b/cp_wire_test.go @@ -142,6 +142,51 @@ func TestControlPlaneSharedEpochGeneveRoundTrip(t *testing.T) { require.Equal(t, uint64(1), vnR.Stats.RXPackets.Load()) } +// TestInstallResetsTxCounterPerEpoch pins the nonce-uniqueness invariant the durable- +// epoch seeding (control/epochstate.go) depends on: each new epoch install starts a +// FRESH transmit counter, so the AES-GCM nonce (epoch‖counter) never repeats even as +// epochs climb monotonically across rekeys/restarts. 
A refactor that carried the +// counter across installs would reuse a (key, nonce) pair and trip this test. +func TestInstallResetsTxCounterPerEpoch(t *testing.T) { + const vni = 0x334455 + addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) + addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()) + h := newPeerHandler(t, vni, addrA, addrB) + + var rx, tx [16]byte + for i := range rx { + rx[i], tx[i] = byte(i), byte(255-i) + } + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 100, rx, tx, time.Now().Add(time.Hour))) + c, ok := h.TxCounterForTest(vni) + require.True(t, ok) + require.Zero(t, c, "fresh install starts at counter 0") + + ip := makeIPv4UDPPacket() + phy := make([]byte, 1500) + n, loop := h.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + c, _ = h.TxCounterForTest(vni) + require.Equal(t, uint64(1), c, "first frame uses counter 1") + + // Install a NEW, higher epoch (as a rekey or a seeded post-restart generation + // would). The counter MUST reset to zero — no carryover. 
+ var rx2, tx2 [16]byte + for i := range rx2 { + rx2[i], tx2[i] = byte(i+1), byte(254-i) + } + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 200, rx2, tx2, time.Now().Add(time.Hour))) + c, _ = h.TxCounterForTest(vni) + require.Zero(t, c, "a new epoch must start a fresh zero counter (no carryover → no nonce reuse)") + + n, loop = h.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + c, _ = h.TxCounterForTest(vni) + require.Equal(t, uint64(1), c, "first frame under the new epoch counts from 1 again") +} + func TestControlPlaneNaiveTxSPIEpochDropsTraffic(t *testing.T) { iSAs, rSAs := negotiateLoopback(t) diff --git a/export_test.go b/export_test.go index cef6163..50e6442 100644 --- a/export_test.go +++ b/export_test.go @@ -22,3 +22,19 @@ func (h *Handler) InstallKeysForTest(vni uint, epoch uint32, rxKey, txKey [16]by } return h.installKeys(value.(*VirtualNetwork), epoch, rxKey, txKey, expiresAt) } + +// TxCounterForTest returns the active SA's current TX nonce counter for the VNI (and +// whether one is installed). It lets a test assert the per-epoch fresh-counter +// invariant — each new epoch install resets the counter to zero — which is what keeps +// the AES-GCM nonce (epoch‖counter) unique as epochs climb across rekeys/restarts. +func (h *Handler) TxCounterForTest(vni uint) (uint64, bool) { + value, ok := h.networkByID.Load(vni) + if !ok { + return 0, false + } + tc := value.(*VirtualNetwork).txCipher.Load() + if tc == nil { + return 0, false + } + return tc.counter.Load(), true +} diff --git a/handler.go b/handler.go index 7902cd8..dad66fe 100644 --- a/handler.go +++ b/handler.go @@ -464,6 +464,12 @@ func (h *Handler) installKeys(vnet *VirtualNetwork, epoch uint32, rxKey, txKey [ expiresAt: expiresAt, }) + // A fresh transmitCipher resets the TX counter to zero for the new epoch. 
This is + // load-bearing for nonce uniqueness: the AES-GCM nonce is epoch‖counter, so each + // epoch MUST begin its own counter at zero — the control plane's durable-epoch + // seeding makes epochs climb monotonically across rekeys/restarts, and the + // per-epoch counter reset is what keeps (key, nonce) pairs from ever repeating. + // A refactor that carried the counter across installs would reintroduce reuse. vnet.txCipher.Store(&transmitCipher{ AEAD: txCipher, epoch: epoch, From 0b0151f9e09df7e15b3ce436241d94b9728d1bb3 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 17:33:43 -0700 Subject: [PATCH 10/20] [control] add durable epoch high-water for seamless one-sided restart (APO-648) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-session SPI allocator resets on every (re)connect, so the shared data-plane epoch (the initiator-allocated SPI counter) regressed to 1 and the survivor's strictly-increasing epoch guard rejected it — breaking a transient reconnect, a responder restart, and an initiator restart alike. Carry an epoch high-water forward and seed each new session's allocator above it. Two layers, both initiator-only (only the initiator's SPI becomes the wire epoch): - In memory, always on: fixes a transient reconnect (a latent rekey-deadlock) and a responder restart for free. - Durably, opt-in via an EpochStore: a FileEpochStore persists the high-water with fsync + atomic-rename + dir-fsync, integrity-protected by an HMAC-SHA256 keyed from the identity's fixed-width private scalar and bound to the (local, peer) identity pin. Adds initiator-restart recovery; RequireState fails closed on a corrupt/unreadable/persistently-failing store. The margin is applied at seed time so it covers both the durable persistence gap (<=2 generations) and a torn-exchange reconnect lead (1 generation, where the responder committed an epoch the initiator never recorded). 
Persistence runs on a dedicated coalescing goroutine so a wedged fsync cannot freeze the rekey/ reconnect loop; stop() is grace-bounded for the same reason. SPI-space exhaustion and a stalled store (under RequireState) are terminal: Run fails closed instead of hot-looping. SeedFloor/ErrSPIExhausted on the allocator, Session.SeedRxFloor, and the Tunnel wiring (load at Bringup, seed in establish, persist on install, fatal paths). Tests cover the store format/corruption/round-trip, the persister, the allocator seed, reconnect monotonicity under an enforcing guard, the torn-exchange lead, the durable restart round-trip, and the responder ignoring its store. --- control/cp.go | 224 +++++++++++++++--- control/cp_test.go | 327 ++++++++++++++++++++++++++- control/epochstate.go | 451 +++++++++++++++++++++++++++++++++++++ control/epochstate_test.go | 391 ++++++++++++++++++++++++++++++++ control/sa.go | 39 +++- control/transport.go | 7 + 6 files changed, 1404 insertions(+), 35 deletions(-) create mode 100644 control/epochstate.go create mode 100644 control/epochstate_test.go diff --git a/control/cp.go b/control/cp.go index cdd137f..af35048 100644 --- a/control/cp.go +++ b/control/cp.go @@ -25,6 +25,13 @@ import ( // rxKey != txKey guard), exactly as documented at handler.go. Carrying the genuine // per-direction SPI on the wire (true per-direction nonce spaces) is the additive // UpdateVirtualNetworkSAs follow-up (Option C); it is intentionally out of scope here. +// +// Because the per-session SPI allocator resets on every (re)connect, the shared epoch +// would regress to 1 and the survivor's strictly-increasing epoch guard would reject +// it. The Tunnel carries an epoch high-water forward and seeds each new session's +// allocator above it (in memory always; durably via an EpochStore on the initiator) +// so reconnects and one-sided restarts keep the epoch monotonic — see +// control/epochstate.go. // Mode is the keying mode selected from the CLI flags. 
type Mode int @@ -157,9 +164,22 @@ type Tunnel struct { install SAInstaller initiator bool + // Durable/in-memory epoch high-water (initiator only; see control/epochstate.go). + // epochSeed seeds each new session's RX allocator so the shared epoch keeps + // increasing across a reconnect/restart instead of resetting to 1; installedHigh + // is the latest installed epoch (the persist target / stall reference); store and + // persist add durability for an initiator restart. + store EpochStore + requireState bool + epochSeed uint32 + installedHigh uint32 + persist *epochPersister + // tunables (defaults set by NewTunnel; tests may override) - perExchangeTimeout time.Duration - reconnectBackoff time.Duration + perExchangeTimeout time.Duration + reconnectBackoff time.Duration + maxStoreFailures int64 + persistStallTimeout time.Duration ln *Listener // responder only; persists across reconnects sess *Session @@ -177,6 +197,19 @@ type TunnelConfig struct { PeerAddr net.Addr // RekeyInterval is how often the initiator negotiates a fresh SA generation. RekeyInterval time.Duration + // EpochStore, when non-nil, persists the data-plane epoch high-water so a restart + // of the elected INITIATOR recovers seamlessly. It is consulted only when this + // node is the initiator — the responder's high-water is not load-bearing (the + // shared epoch is always the initiator-allocated SPI). A responder configured with + // a store leaves it inert. nil disables durable persistence (a transient reconnect + // and a responder restart still recover via the in-memory high-water; only a + // one-sided initiator restart needs the store). + EpochStore EpochStore + // RequireState makes durable epoch state fail closed instead of degrading: a + // corrupt/unreadable state file fails Bringup, and persistently failing/stalled + // stores fail Run. It requires EpochStore. It is an integrity tripwire against + // accidental corruption, NOT an anti-rollback/anti-deletion control. 
+ RequireState bool } // NewTunnel validates the config, elects the canonical role, and returns a Tunnel @@ -194,20 +227,27 @@ func NewTunnel(cfg TunnelConfig, install SAInstaller) (*Tunnel, error) { if cfg.RekeyInterval <= 0 { return nil, errors.New("control: rekey interval must be positive") } + if cfg.RequireState && cfg.EpochStore == nil { + return nil, errors.New("control: RequireState requires an EpochStore") + } initiator, err := CanonicalInitiator(cfg.Local.PublicKey(), cfg.PeerPub) if err != nil { return nil, err } return &Tunnel{ - local: cfg.Local, - peerPub: cfg.PeerPub, - conn: cfg.Conn, - peerAddr: cfg.PeerAddr, - rekeyIvl: cfg.RekeyInterval, - install: install, - initiator: initiator, - perExchangeTimeout: defaultPerExchangeTimeout, - reconnectBackoff: defaultReconnectBackoff, + local: cfg.Local, + peerPub: cfg.PeerPub, + conn: cfg.Conn, + peerAddr: cfg.PeerAddr, + rekeyIvl: cfg.RekeyInterval, + install: install, + initiator: initiator, + store: cfg.EpochStore, + requireState: cfg.RequireState, + perExchangeTimeout: defaultPerExchangeTimeout, + reconnectBackoff: defaultReconnectBackoff, + maxStoreFailures: defaultMaxStoreFailures, + persistStallTimeout: defaultPersistStallTimeout, }, nil } @@ -218,11 +258,22 @@ func (t *Tunnel) Initiator() bool { return t.initiator } // It is synchronous and FAIL-CLOSED: it returns an error (and installs nothing) if // the handshake, negotiation, or install fails, so the caller must not start the data // plane until Bringup succeeds. -func (t *Tunnel) Bringup(ctx context.Context) error { - if err := t.establish(ctx); err != nil { +func (t *Tunnel) Bringup(ctx context.Context) (err error) { + if err = t.loadEpochState(); err != nil { + return err + } + // loadEpochState may have started the persister goroutine; reap it if Bringup + // fails so a caller that drops the Tunnel on a Bringup error does not leak it. 
+ defer func() { + if err != nil && t.persist != nil { + t.persist.stop() + t.persist = nil + } + }() + if err = t.establish(ctx); err != nil { return fmt.Errorf("control: establish session: %w", err) } - if err := t.negotiateAndInstall(ctx); err != nil { + if err = t.negotiateAndInstall(ctx); err != nil { t.closeSession() return fmt.Errorf("control: initial SA negotiation: %w", err) } @@ -231,7 +282,49 @@ func (t *Tunnel) Bringup(ctx context.Context) error { role = "initiator" } slog.Info("control plane established", slog.String("role", role), - slog.String("peer", t.peerAddr.String())) + slog.String("peer", t.peerAddr.String()), + slog.Bool("durableEpochState", t.persist != nil)) + return nil +} + +// loadEpochState reads the durable epoch high-water and starts the persister. It is a +// no-op except on the initiator with a configured store — the responder's high-water +// is not load-bearing (see control/epochstate.go), so a responder configured with a +// store leaves it inert (and --require-state does not gate the responder). It runs at +// the start of Bringup so NewTunnel stays I/O-free. +func (t *Tunnel) loadEpochState() error { + if t.store == nil { + return nil + } + if !t.initiator { + slog.Info("control: durable epoch state inactive on this node (responder role; the shared epoch is initiator-driven). Ensure the elected initiator also has a state file") + return nil + } + hw, ok, err := t.store.Load() + if err != nil { + if t.requireState { + return fmt.Errorf("control: durable epoch state is unreadable and --require-state is set: %w", err) + } + slog.Error("control: epoch state unreadable; starting fresh — a one-sided initiator restart will not recover until state is re-persisted. 
Use --require-state to fail closed instead", + slog.Any("error", err)) + hw, ok = 0, false + } + start := uint32(0) + if ok { + // epochSeed/installedHigh track the EXACT high-water; the margin is applied at + // seed time in establish (see seedWithMargin), so it covers both the durable + // gap here and the torn-reconnect lead later. + t.epochSeed = hw + t.installedHigh = hw + start = hw + slog.Info("control: loaded durable epoch high-water", + slog.Uint64("highWater", uint64(hw)), slog.Uint64("seed", uint64(seedWithMargin(hw)))) + if seedWithMargin(hw) >= spiCounterMask-1 { + slog.Warn("control: epoch counter space is nearly exhausted; master-key rotation will be required (the control plane will fail closed when it runs out)", + slog.Uint64("highWater", uint64(hw)), slog.Uint64("ceiling", uint64(spiCounterMask))) + } + } + t.persist = newEpochPersister(t.store, start, time.Now().UnixNano()) return nil } @@ -256,6 +349,18 @@ func (t *Tunnel) runInitiator(ctx context.Context) error { ticker := time.NewTicker(t.rekeyIvl) defer ticker.Stop() for { + // A clean shutdown takes priority over the fail-closed tripwire: returning a + // fatal error here on the way out would mis-report a deliberate stop as a + // failure (non-zero exit). Mirror the ctx.Err() guards on the other terminal + // arms below. + if ctx.Err() != nil { + return nil + } + // Fail closed (only under --require-state) if durable persistence has fallen + // far enough behind that a restart could no longer recover. + if err := t.epochPersistFatal(time.Now()); err != nil { + return err + } sessLost := t.sessionDone() select { case <-ctx.Done(): @@ -275,9 +380,15 @@ func (t *Tunnel) runInitiator(ctx context.Context) error { if ctx.Err() != nil { return nil } - // A monotonicity rejection (epoch regression) surfaces only after a - // reconnect reset the per-session SPI counter; installSAs logs it and - // returns nil, so any error here is a genuine session/transport failure. 
+ if isFatalCP(err) { + // SPI-space exhaustion is terminal (master-key rotation, unsupported, + // is the only remedy); reconnecting would just hot-loop. Fail closed. + return err + } + // Epoch regression after a reconnect is now prevented by seeding the + // allocator from the epoch high-water (see installSAs / loadEpochState); + // installSAs still swallows a stray rejection, so any error here is a + // genuine session/transport failure. slog.Warn("control: rekey failed, reconnecting", slog.Any("error", err)) if err := t.reestablish(ctx); err != nil { return err @@ -319,11 +430,13 @@ func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { } // installSAs validates the negotiated SAs fail-closed (PSPv0, 16-byte keys), -// computes the shared epoch, and hands them to the installer. A rejected rotation -// (e.g. the monotonicity guard refusing a regressed epoch after a reconnect) is -// logged and swallowed: the previously installed keys keep forwarding, and the data -// plane fails closed on their own expiry. Seamless reconnect/restart across the guard -// is the deferred durable epoch high-water work. +// computes the shared epoch, and hands them to the installer. Seeding the allocator +// from the epoch high-water (see recordInstalled / loadEpochState) keeps the epoch +// strictly increasing across reconnects/restarts, so the monotonicity guard should +// accept every generation in normal operation. A rejection is still swallowed as +// defense-in-depth (e.g. with no durable state after a one-sided initiator restart, +// or a responder whose floor is not seeded): the previously installed keys keep +// forwarding and the data plane fails closed on their own expiry. 
func (t *Tunnel) installSAs(sas *DirectionalSAs) error { if sas.Tx.Version != PSPv0 || sas.Rx.Version != PSPv0 { return fmt.Errorf("control: only PSPv0/AES-128 is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) @@ -339,14 +452,35 @@ func (t *Tunnel) installSAs(sas *DirectionalSAs) error { copy(rxKey[:], sas.Rx.Key) copy(txKey[:], sas.Tx.Key) if err := t.install(epoch, rxKey, txKey); err != nil { - slog.Warn("control: SA install rejected; keeping current keys until they expire (seamless reconnect needs durable epoch high-water — deferred)", + slog.Warn("control: SA install rejected; keeping current keys until they expire (seed the epoch floor / configure --state-file for seamless recovery)", slog.Uint64("epoch", uint64(epoch)), slog.Any("error", err)) return nil } + t.recordInstalled(epoch) slog.Debug("control: installed SA generation", slog.Uint64("epoch", uint64(epoch))) return nil } +// recordInstalled advances the initiator's in-memory epoch high-water after a +// successful install and asks the persister to make it durable. It is initiator-only: +// only the initiator's allocator feeds the shared epoch, so only its high-water needs +// to be carried forward. The enqueue never blocks (the persister fsyncs off this +// goroutine). +func (t *Tunnel) recordInstalled(epoch uint32) { + if !t.initiator { + return + } + if epoch > t.epochSeed { + t.epochSeed = epoch + } + if epoch > t.installedHigh { + t.installedHigh = epoch + } + if t.persist != nil { + t.persist.request(t.installedHigh) + } +} + // establish opens a fresh session: the initiator dials, the responder accepts on a // listener it keeps across reconnects. 
func (t *Tunnel) establish(ctx context.Context) error { @@ -356,6 +490,17 @@ func (t *Tunnel) establish(ctx context.Context) error { return err } t.sess = sess + // Seed THIS session's allocator above the epoch high-water (plus a margin) so + // the shared epoch keeps climbing across the reconnect rather than resetting to + // 1, AND stays strictly above what the survivor retained even if the last + // exchange tore after the responder committed but before we recorded it (see + // seedWithMargin). A fresh start (no high-water) seeds 0 → epoch 1, unchanged. + // Only the initiator is seeded: SharedEpoch always selects the + // initiator-allocated SPI, so the responder's allocator is cosmetic to the wire + // epoch. + if t.epochSeed > 0 { + t.sess.SeedRxFloor(seedWithMargin(t.epochSeed)) + } return nil } if t.ln == nil { @@ -392,12 +537,18 @@ func (t *Tunnel) reestablish(ctx context.Context) error { continue } // Re-key immediately on the new session so traffic resumes without waiting a - // full interval. A rejected install (regressed epoch) is swallowed by - // installSAs; a transport error drops back to another reconnect attempt. + // full interval. The new session's allocator is seeded from the epoch + // high-water (establish), so the epoch keeps increasing and the install is + // accepted; a transport error drops back to another reconnect attempt. exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) err := t.negotiateAndInstall(exCtx) cancel() if err != nil && ctx.Err() == nil { + if isFatalCP(err) { + // Exhaustion is terminal — re-seeding the same exhausted floor would + // hot-loop. Surface it so Run returns and fails closed. + return err + } slog.Warn("control: post-reconnect negotiation failed", slog.Any("error", err)) t.closeSession() continue @@ -422,9 +573,28 @@ func (t *Tunnel) closeSession() { } } -// Close releases the session and (responder) listener. It is idempotent. 
+// epochPersistFatal reports a fatal error if durable persistence has degraded past +// the point of guaranteed recovery (only under --require-state; otherwise nil). +func (t *Tunnel) epochPersistFatal(now time.Time) error { + if t.persist == nil { + return nil + } + return t.persist.fatal(t.requireState, t.installedHigh, now, t.maxStoreFailures, t.persistStallTimeout) +} + +// isFatalCP reports whether err is a terminal, non-retryable control-plane error that +// must stop Run rather than drive a reconnect. +func isFatalCP(err error) bool { + return errors.Is(err, ErrSPIExhausted) || errors.Is(err, errEpochPersistStalled) +} + +// Close releases the session, stops the persister, and (responder) the listener. It is +// idempotent. func (t *Tunnel) Close() error { t.closeSession() + if t.persist != nil { + t.persist.stop() + } if t.ln != nil { err := t.ln.Close() t.ln = nil diff --git a/control/cp_test.go b/control/cp_test.go index bf9e79f..b432be1 100644 --- a/control/cp_test.go +++ b/control/cp_test.go @@ -143,6 +143,12 @@ func TestInstallSAsSwallowsRotationRejection(t *testing.T) { // twoTunnels wires an initiator and a responder Tunnel over loopback UDP, assigning // the canonical roles correctly, with tight timings for tests. func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration) (initT, respT *Tunnel, cleanup func()) { + return twoTunnelsWithStore(t, instInit, instResp, rekey, nil) +} + +// twoTunnelsWithStore is twoTunnels with an optional durable EpochStore on the +// INITIATOR (the only role for which durable state is load-bearing). 
+func twoTunnelsWithStore(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration, initStore EpochStore) (initT, respT *Tunnel, cleanup func()) { t.Helper() idA, err := GenerateIdentity() require.NoError(t, err) @@ -163,7 +169,7 @@ func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duratio initT, err = NewTunnel(TunnelConfig{ Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, - PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey, + PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey, EpochStore: initStore, }, instInit) require.NoError(t, err) require.True(t, initT.Initiator()) @@ -189,6 +195,36 @@ func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duratio return initT, respT, cleanup } +// guardInstaller is an SAInstaller that enforces the handler's strictly-increasing +// epoch guard (handler.go), so tests can detect a regressed/rejected epoch rather +// than the no-op epochRecorder which accepts everything. +type guardInstaller struct { + mu sync.Mutex + max uint32 + installed []uint32 + rejects int +} + +func newGuardInstaller() *guardInstaller { return &guardInstaller{} } + +func (g *guardInstaller) install(epoch uint32, _, _ [16]byte) error { + g.mu.Lock() + defer g.mu.Unlock() + if epoch <= g.max { + g.rejects++ + return errors.New("epoch must be monotonically increasing") + } + g.max = epoch + g.installed = append(g.installed, epoch) + return nil +} + +func (g *guardInstaller) snapshot() (installed []uint32, rejects int) { + g.mu.Lock() + defer g.mu.Unlock() + return append([]uint32(nil), g.installed...), g.rejects +} + // epochRecorder is a thread-safe SAInstaller that records the epochs it installs. type epochRecorder struct { mu sync.Mutex @@ -364,12 +400,295 @@ func TestTunnelReconnects(t *testing.T) { // installing — so every epoch the initiator installed must also have been // installed by the responder. 
(The reverse can differ by one: a negotiation torn // by the forced loss after the responder committed but before the initiator - // finished leaves the responder with an extra generation. Per-session allocators - // also reset across the reconnect, so epochs are not globally monotonic — agreement - // in this direction, not monotonicity, is the invariant.) + // finished leaves the responder with an extra generation.) ie, _ := initRec.snapshot() _, rk := respRec.snapshot() for _, e := range ie { require.Contains(t, rk, e, "responder never installed initiator epoch %d", e) } + + // Monotonicity invariant (Phase 5): seeding the new session's allocator from the + // in-memory epoch high-water means the shared epoch no longer resets to 1 across + // the reconnect — the initiator's installed epochs are now globally strictly + // increasing, which is what lets the survivor's monotonicity guard accept the + // post-reconnect generation. + for i := 1; i < len(ie); i++ { + require.Greater(t, ie[i], ie[i-1], "initiator epochs must be globally monotonic across the reconnect") + } +} + +// TestTunnelReconnectGuardNeverRejects is the recovery regression test the design +// review demanded: with installers that ENFORCE the handler's strictly-increasing +// epoch guard, a forced session loss must self-heal, keep installing generation after +// generation, AND never make either guard reject a regressed epoch. Without the +// epoch-floor seeding the post-reconnect epoch would reset to 1 and the guard would +// reject every generation (installs would stall at the single bring-up generation); +// without the seed MARGIN, a torn-after-responder-install exchange would re-offer an +// already-installed epoch and the responder's guard would reject it. The seed margin +// (>= the worst-case one-generation responder lead) is what makes zero rejections +// hold; the deterministic proof of that margin is TestReconnectSeedCoversTornLead. 
+func TestTunnelReconnectGuardNeverRejects(t *testing.T) { + initGuard, respGuard := newGuardInstaller(), newGuardInstaller() + initT, respT, cleanup := twoTunnels(t, initGuard.install, respGuard.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) + defer cancel() + + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + require.NoError(t, initT.sess.Close()) // force a session loss + + runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + // Progress well past the single bring-up generation on BOTH peers proves the guard + // keeps accepting because seeding keeps the epoch climbing across the reconnect. + require.Eventually(t, func() bool { + ig, _ := initGuard.snapshot() + rg, _ := respGuard.snapshot() + return len(ig) >= 5 && len(rg) >= 5 + }, 18*time.Second, 25*time.Millisecond, "peers must self-heal and keep installing under the guard") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + ig, iRej := initGuard.snapshot() + _, rRej := respGuard.snapshot() + require.Zero(t, iRej, "initiator guard must never reject (seeding keeps epochs monotonic)") + require.Zero(t, rRej, "responder guard must never reject (the seed margin covers a torn-exchange lead)") + for i := 1; i < len(ig); i++ { + require.Greater(t, ig[i], ig[i-1], "accepted epochs are strictly increasing") + } +} + +// TestReconnectSeedCoversTornLead is the deterministic guard for the in-memory +// reconnect seed margin: it simulates the torn-exchange race — the responder committed +// epoch E but the initiator failed to record it, so the initiator's high-water lags by +// one — and proves the next session's seed still produces an epoch strictly greater +// than E, so the responder's monotonicity guard accepts it (no black-hole). 
+func TestReconnectSeedCoversTornLead(t *testing.T) { + const responderMax = uint32(50) // responder installed up to 50 + initHighWater := responderMax - 1 // initiator recorded only 49 (torn after 50) + + a := NewSPIAllocator(Initiator) + a.SeedFloor(activeMasterKeyIndex, seedWithMargin(initHighWater)) + next, err := a.Allocate(activeMasterKeyIndex) + require.NoError(t, err) + require.Greater(t, next&spiCounterMask, responderMax, + "the seeded epoch must exceed the responder's retained max despite the one-generation lag") +} + +// TestTunnelSeedsFromDurableHighWater proves the initiator-restart recovery +// mechanism deterministically: a fresh initiator whose store already holds a high +// high-water (as a prior process would have left) seeds its FIRST epoch above it, so +// a survivor that retained that high-water still accepts the new SA. It also confirms +// the value is re-persisted. +func TestTunnelSeedsFromDurableHighWater(t *testing.T) { + const prior = uint32(1000) + store := &fakeEpochStore{} + store.set(prior) + + initRec, respRec := newEpochRecorder(), newEpochRecorder() + initT, respT, cleanup := twoTunnelsWithStore(t, initRec.install, respRec.install, 100*time.Millisecond, store) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + ie, _ := initRec.snapshot() + re, _ := respRec.snapshot() + require.Len(t, ie, 1) + // First epoch is seeded strictly above the durable high-water (+margin), so it + // exceeds anything a survivor retained from the pre-restart session. + require.Equal(t, seedWithMargin(prior)+1, ie[0]) + require.Greater(t, ie[0], prior) + require.Equal(t, ie[0], re[0], "both peers install the seeded epoch") + + // The new high-water is persisted durably for the next restart. 
+ require.Eventually(t, func() bool { + hw, ok := store.loaded() + return ok && hw == ie[0] + }, 2*time.Second, 5*time.Millisecond) +} + +// TestTunnelRequireStateFailsBringupOnCorruptState asserts the fail-closed Load +// policy: under RequireState a corrupt/unreadable store fails Bringup (on the +// initiator), while the default policy starts fresh. +func TestTunnelRequireStateFailsBringupOnCorruptState(t *testing.T) { + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer conn.Close() + + loadErr := errors.New("state corrupt") + tn, err := NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: conn, + PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, + EpochStore: &fakeEpochStore{loadErr: loadErr}, RequireState: true, + }, func(uint32, [16]byte, [16]byte) error { return nil }) + require.NoError(t, err) + require.True(t, tn.Initiator()) + defer tn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + // Bringup must fail closed at loadEpochState, before any session is established. 
+ err = tn.Bringup(ctx) + require.Error(t, err) + require.ErrorIs(t, err, loadErr) + require.Nil(t, tn.sess) +} + +func TestNewTunnelRequireStateNeedsStore(t *testing.T) { + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer conn.Close() + _, err = NewTunnel(TunnelConfig{ + Local: idA, PeerPub: idB.PublicKey(), Conn: conn, + PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, RequireState: true, + }, func(uint32, [16]byte, [16]byte) error { return nil }) + require.Error(t, err, "RequireState without an EpochStore must be rejected") +} + +// TestTunnelDurableRoundTripAcrossRestart is the true end-to-end durable-recovery +// proof: a real initiator persists its climbing high-water to a shared store, then a +// SECOND initiator instance on the SAME store (a simulated restart) reloads exactly +// what the first persisted and will seed strictly above it. +func TestTunnelDurableRoundTripAcrossRestart(t *testing.T) { + store := &fakeEpochStore{} + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + // Round 1: run a real pair until the initiator has persisted a couple generations. 
+ func() { + respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer respConn.Close() + initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer initConn.Close() + + t1, err := NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, + PeerAddr: respConn.LocalAddr(), RekeyInterval: 100 * time.Millisecond, EpochStore: store, + }, newEpochRecorder().install) + require.NoError(t, err) + require.True(t, t1.Initiator()) + r, err := NewTunnel(TunnelConfig{ + Local: respID, PeerPub: initID.PublicKey(), Conn: respConn, + PeerAddr: initConn.LocalAddr(), RekeyInterval: 100 * time.Millisecond, + }, newEpochRecorder().install) + require.NoError(t, err) + defer t1.Close() + defer r.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + brCh := make(chan error, 1) + go func() { brCh <- r.Bringup(ctx) }() + require.NoError(t, t1.Bringup(ctx)) + require.NoError(t, <-brCh) + + runCh := make(chan error, 2) + go func() { runCh <- t1.Run(ctx) }() + go func() { runCh <- r.Run(ctx) }() + require.Eventually(t, func() bool { + hw, ok := store.loaded() + return ok && hw >= 2 + }, 10*time.Second, 10*time.Millisecond, "initiator must persist a few generations") + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + }() + + persisted, ok := store.loaded() + require.True(t, ok) + require.GreaterOrEqual(t, persisted, uint32(2)) + + // Round 2: a fresh initiator (same identity + store) reloads what round 1 wrote. 
+ conn2, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer conn2.Close() + t2, err := NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: conn2, + PeerAddr: conn2.LocalAddr(), RekeyInterval: time.Second, EpochStore: store, + }, func(uint32, [16]byte, [16]byte) error { return nil }) + require.NoError(t, err) + require.True(t, t2.Initiator()) + defer t2.Close() + + require.NoError(t, t2.loadEpochState()) + require.Equal(t, persisted, t2.installedHigh, "second instance loads exactly what the first persisted") + require.Greater(t, seedWithMargin(t2.epochSeed), persisted, "and seeds strictly above it") +} + +// TestResponderIgnoresStore pins the initiator-only invariant: a responder configured +// with a store (and even --require-state) must never consult or write it, must not be +// gated by a load error, and must not start a persister. +func TestResponderIgnoresStore(t *testing.T) { + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer conn.Close() + + store := &fakeEpochStore{loadErr: errors.New("responder must never read this")} + respT, err := NewTunnel(TunnelConfig{ + Local: respID, PeerPub: initID.PublicKey(), Conn: conn, + PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, + EpochStore: store, RequireState: true, + }, func(uint32, [16]byte, [16]byte) error { return nil }) + require.NoError(t, err) + require.False(t, respT.Initiator()) + defer respT.Close() + + // On the responder, loadEpochState is a no-op: it must not Load (so the load error + // and RequireState do not gate it) and must not start a persister. 
+ require.NoError(t, respT.loadEpochState()) + require.Nil(t, respT.persist) + loads, stores := store.counts() + require.Zero(t, loads, "responder must never Load its store") + require.Zero(t, stores, "responder must never Store") } diff --git a/control/epochstate.go b/control/epochstate.go new file mode 100644 index 0000000..d30800d --- /dev/null +++ b/control/epochstate.go @@ -0,0 +1,451 @@ +package control + +import ( + "bytes" + "crypto/ecdsa" + "crypto/hkdf" + "crypto/hmac" + "crypto/sha256" + "crypto/x509" + "encoding/binary" + "errors" + "fmt" + "io/fs" + "log/slog" + "os" + "path/filepath" + "sync" + "sync/atomic" + "time" +) + +// This file implements the durable epoch high-water that lets a one-sided restart +// of the control-plane INITIATOR recover seamlessly (Phase 5 item #1). +// +// Why only the initiator: the shared data-plane epoch is the initiator-allocated +// SPI counter (see SharedEpoch). On every (re)connect the per-session SPI allocator +// resets to 1, so the survivor's strictly-increasing epoch guard (handler.go) would +// reject the regressed epoch. The Tunnel fixes this by seeding the new session's +// allocator above an epoch high-water it carries forward: in memory (covers a +// transient reconnect and a responder restart, where the surviving initiator never +// lost the value) and, when an EpochStore is configured, on durable storage (covers +// an initiator restart, where the value must outlive the process). The responder's +// high-water is not load-bearing, so durable state is consulted only on the +// initiator. +// +// Safety note: the epoch is a data-plane SELECTOR, not a nonce. Reusing an epoch +// VALUE across sessions is harmless because every QUIC session derives fresh master +// keys from a fresh ECDHE exporter (no 0-RTT), so the AES-GCM (key, nonce) pair never +// repeats. The persisted high-water therefore only needs integrity, not rollback +// resistance — see EpochStore. 
+ +// EpochStore persists the data-plane epoch high-water across process restarts. +type EpochStore interface { + // Load returns the persisted high-water. ok is false with a nil error ONLY when + // no state has been written yet (first run); every other condition — unreadable + // file, truncation, bad magic/version, identity-pair pin mismatch, MAC failure, + // out-of-range value — returns a non-nil error so a fail-closed caller + // (--require-state) can refuse to start rather than silently resetting to zero. + Load() (highWater uint32, ok bool, err error) + // Store durably writes highWater (fsync of the file, atomic rename into place, + // fsync of the directory). It may block on disk I/O; callers persist off the + // hot path (see epochPersister). + Store(highWater uint32) error +} + +// On-disk format (fixed 74 bytes). The MAC covers bytes [0:42); the mac occupies +// [42:74). A future non-zero flags value must be byte-identical on Store and Load. +const ( + epochStateMagic = "ICXE" + epochStateVersion = 1 + epochStateLen = 4 + 1 + 1 + 32 + 4 + 32 // magic|version|flags|pin|highWater|mac + + offVersion = 4 + offFlags = 5 + offPin = 6 + offHighWater = 38 + offMAC = 42 // == end of MAC-covered prefix + + // stateMACInfo domain-separates the epoch-state MAC key from any other use of + // the identity key. + stateMACInfo = "icx epoch-state hmac v1" +) + +// FileEpochStore is a file-backed EpochStore. The record is integrity-protected by +// an HMAC keyed from the local identity and bound (via the pin) to the exact +// (local, peer) identity pair, so a file cannot be silently swapped between tunnels +// or nodes. One file per (local, peer) tunnel; it must NOT be shared between +// processes (concurrent writers). +// +// What the MAC defends: accidental corruption/bit-rot (rejected on Load) and forgery +// of a chosen high-water by anyone without the identity key (HMAC). 
What it does NOT +// defend: rollback-replay of an older, validly-signed file, or deletion (an absent +// file is indistinguishable from a genuine first run). Those need a hardware +// monotonic counter and are out of scope; --require-state is an integrity tripwire +// against corruption, not an anti-rollback control. This is acceptable because, per +// the safety note above, even a rolled-back or absent high-water cannot cause AES-GCM +// nonce reuse on the data plane (keys are per-session ephemeral) — the only +// consequence is a transient one-sided-restart-style outage. +type FileEpochStore struct { + path string + macKey []byte + pin [32]byte +} + +// NewFileEpochStore builds a file-backed store at path, keyed from local and bound to +// the (local, peer) identity pair. +func NewFileEpochStore(path string, local *Identity, peerPub *ecdsa.PublicKey) (*FileEpochStore, error) { + if path == "" { + return nil, errors.New("control: epoch state path is empty") + } + if local == nil || peerPub == nil { + return nil, errors.New("control: epoch state requires local identity and peer key") + } + // Validate the parent directory up front so a missing/unwritable state directory + // fails fast at construction. Without this, os.ReadFile on a path whose parent is + // missing reports fs.ErrNotExist, which Load would (correctly) treat as a genuine + // first run — masking the misconfiguration so even --require-state starts happily + // and only fails closed several rekeys later when the first Store cannot create a + // temp file. 
+ dir := filepath.Dir(path) + info, err := os.Stat(dir) + if err != nil { + return nil, fmt.Errorf("control: epoch state directory %q: %w", dir, err) + } + if !info.IsDir() { + return nil, fmt.Errorf("control: epoch state directory %q is not a directory", dir) + } + macKey, err := stateMACKey(local) + if err != nil { + return nil, err + } + pin, err := identityPairPin(local.PublicKey(), peerPub) + if err != nil { + return nil, err + } + // Best-effort sweep of temp files orphaned by a crash between CreateTemp and rename + // in a prior run (safe at construction: no Store is in flight, and the store is + // not shared between processes). + if matches, gerr := filepath.Glob(filepath.Join(dir, ".icx-epoch-*.tmp")); gerr == nil { + for _, m := range matches { + _ = os.Remove(m) + } + } + return &FileEpochStore{path: path, macKey: macKey, pin: pin}, nil +} + +// stateMACKey derives the HMAC key from the identity. The IKM is the fixed-width +// (32-byte, left-zero-padded) P-256 private scalar — a mathematical value that is +// stable across Go releases — NOT the PKCS#8 DER, whose byte layout is an +// implementation detail a toolchain upgrade could perturb and thereby silently +// invalidate every previously-written MAC. Deriving the MAC key from the identity +// (rather than a separate secret) intentionally overloads one key across roles; the +// HKDF info string domain-separates it, and the consequence is that the epoch-state +// MAC lifetime equals the identity-key lifetime — rotating the identity resets the +// durable epoch state. 
+func stateMACKey(local *Identity) ([]byte, error) { + var scalar [32]byte + local.priv.D.FillBytes(scalar[:]) + key, err := hkdf.Key(sha256.New, scalar[:], nil, stateMACInfo, 32) + if err != nil { + return nil, fmt.Errorf("control: derive epoch-state MAC key: %w", err) + } + return key, nil +} + +// identityPairPin binds a state file to the exact (local, peer) pair so it cannot be +// confused with another tunnel's file even under the same identity. +func identityPairPin(local, peer *ecdsa.PublicKey) ([32]byte, error) { + var pin [32]byte + l, err := x509.MarshalPKIXPublicKey(local) + if err != nil { + return pin, fmt.Errorf("control: marshal local key: %w", err) + } + p, err := x509.MarshalPKIXPublicKey(peer) + if err != nil { + return pin, fmt.Errorf("control: marshal peer key: %w", err) + } + h := sha256.New() + h.Write(l) + h.Write(p) + copy(pin[:], h.Sum(nil)) + return pin, nil +} + +func (s *FileEpochStore) marshal(highWater uint32) []byte { + buf := make([]byte, epochStateLen) + copy(buf[0:offVersion], epochStateMagic) + buf[offVersion] = epochStateVersion + buf[offFlags] = 0 // reserved + copy(buf[offPin:offHighWater], s.pin[:]) + binary.BigEndian.PutUint32(buf[offHighWater:offMAC], highWater) + mac := hmac.New(sha256.New, s.macKey) + mac.Write(buf[:offMAC]) + copy(buf[offMAC:], mac.Sum(nil)) + return buf +} + +// Load reads and verifies the state file. See EpochStore.Load for the ok/err contract. +func (s *FileEpochStore) Load() (uint32, bool, error) { + buf, err := os.ReadFile(s.path) + if err != nil { + // fs.ErrNotExist is the ONLY signal that maps to "first run"; every other + // open/read error (EACCES, EIO, ENOTDIR, dangling symlink, ...) is a real + // failure so --require-state fails closed instead of starting fresh. 
+ if errors.Is(err, fs.ErrNotExist) { + return 0, false, nil + } + return 0, false, fmt.Errorf("control: read epoch state %q: %w", s.path, err) + } + // Length is validated strictly before any field slicing so a truncated or + // zero-length file returns a clean error rather than panicking. + if len(buf) != epochStateLen { + return 0, false, fmt.Errorf("control: epoch state %q is %d bytes, want %d (corrupt/truncated)", s.path, len(buf), epochStateLen) + } + if string(buf[0:offVersion]) != epochStateMagic { + return 0, false, fmt.Errorf("control: epoch state %q has bad magic", s.path) + } + if buf[offVersion] != epochStateVersion { + return 0, false, fmt.Errorf("control: epoch state %q has unsupported version %d", s.path, buf[offVersion]) + } + if !bytes.Equal(buf[offPin:offHighWater], s.pin[:]) { + return 0, false, fmt.Errorf("control: epoch state %q identity-pair pin mismatch (wrong peer or identity key)", s.path) + } + mac := hmac.New(sha256.New, s.macKey) + mac.Write(buf[:offMAC]) + if !hmac.Equal(buf[offMAC:], mac.Sum(nil)) { + return 0, false, fmt.Errorf("control: epoch state %q MAC verification failed (corrupt or tampered)", s.path) + } + hw := binary.BigEndian.Uint32(buf[offHighWater:offMAC]) + if hw > spiCounterMask { + return 0, false, fmt.Errorf("control: epoch state %q high-water %d exceeds max %d", s.path, hw, spiCounterMask) + } + return hw, true, nil +} + +// Store atomically and durably writes highWater. It writes a uniquely-named temp file +// in the target directory (so overlapping writers cannot collide on a shared temp +// name), fsyncs it, renames it over the target (atomic replace), then fsyncs the +// directory so the rename — the durable commit point — survives a crash. 
+func (s *FileEpochStore) Store(highWater uint32) (err error) { + buf := s.marshal(highWater) + dir := filepath.Dir(s.path) + tmp, err := os.CreateTemp(dir, ".icx-epoch-*.tmp") + if err != nil { + return fmt.Errorf("control: create temp epoch state in %q: %w", dir, err) + } + tmpName := tmp.Name() + committed := false + defer func() { + if !committed { + _ = tmp.Close() + _ = os.Remove(tmpName) + } + }() + if _, err = tmp.Write(buf); err != nil { + return fmt.Errorf("control: write epoch state: %w", err) + } + if err = tmp.Sync(); err != nil { + return fmt.Errorf("control: fsync epoch state: %w", err) + } + if err = tmp.Close(); err != nil { + return fmt.Errorf("control: close epoch state: %w", err) + } + if err = os.Rename(tmpName, s.path); err != nil { + return fmt.Errorf("control: rename epoch state into place: %w", err) + } + committed = true + if derr := fsyncDir(dir); derr != nil { + // The rename succeeded, so the new high-water IS in the file; only the + // durability of the directory ENTRY across a power loss is unconfirmed. On + // filesystems where directory fsync is unsupported (some overlay/network FS) + // this fails every time. Treat it as success for bookkeeping — returning an + // error here would wrongly count a durable write as a failure and could fail a + // healthy node closed under --require-state — but warn so a genuinely failing + // device is visible. + slog.Warn("control: epoch state written but directory fsync failed; value is durable, crash-durability of the rename is unconfirmed", + slog.Any("error", derr)) + } + return nil +} + +func fsyncDir(dir string) error { + d, err := os.Open(dir) + if err != nil { + return err + } + defer func() { _ = d.Close() }() + return d.Sync() +} + +// Tunables for the durable-epoch machinery. Defaults live on the Tunnel so tests can +// override them; the constants below are the shared, non-overridable parameters. 
+const ( + // epochSeedMargin is how far above the carried-forward high-water the initiator + // seeds each new session's allocator (see seedWithMargin). The worst case it must + // cover is the survivor holding ~2 generations beyond the initiator's known + // high-water (durable-restart persistence lag) and 1 generation beyond it on a + // torn-exchange reconnect, so a margin >= 2 is sufficient; 8 is slack against any + // future rekey pipelining. It is applied per session, so each reconnect/restart + // spends up to margin epochs — negligible against the 2^30 counter space (a + // reconnect every second for years before it matters). + epochSeedMargin = 8 + + // persistShutdownGrace bounds how long stop() waits for the persister goroutine to + // drain before abandoning it, so a wedged Store (uninterruptible fsync on a dying + // disk) cannot pin shutdown — and the --require-state fail-closed error can still + // reach the errgroup. + persistShutdownGrace = 3 * time.Second + + defaultMaxStoreFailures = 5 + defaultPersistStallTimeout = 60 * time.Second +) + +// errEpochPersistStalled is returned from Run (initiator, --require-state only) when +// durable persistence has fallen far enough behind that seamless restart recovery is +// no longer guaranteed. It is fatal/non-retryable: the operator asked to fail closed. +var errEpochPersistStalled = errors.New("control: durable epoch persistence is failing") + +// seedWithMargin computes the allocator seed floor for a carried-forward high-water: +// hw + epochSeedMargin, clamped to spiCounterMask-1. 
It is applied at SEED time (every +// new session on the initiator), so the margin covers BOTH gaps that can leave the +// seed at or below what the surviving peer already retained: +// +// - the durable persistence gap on an initiator restart (the on-disk value lags the +// survivor by up to ~2 generations); and +// - the torn-exchange lead on an in-memory reconnect: a session can tear after the +// responder committed epoch E but before the initiator recorded it (recordInstalled +// runs only on a successful install), leaving the initiator's high-water one behind +// the responder's. Without a margin the initiator would re-offer E and the +// responder's strictly-increasing guard would reject it — a one-generation +// data-plane black-hole. The margin (>= 2) seeds strictly above E, so the guard +// accepts. +// +// The clamp is one below the ceiling so the seeded allocator can still hand out the +// terminal counter spiCounterMask before exhausting (clamping to spiCounterMask itself +// would make the very first Allocate fail). Reaching the clamp is the exhaustion +// warning threshold. +func seedWithMargin(hw uint32) uint32 { + const ceil = spiCounterMask - 1 + if hw >= ceil || hw+epochSeedMargin > ceil { + return ceil + } + return hw + epochSeedMargin +} + +// epochPersister owns the EpochStore and writes to it from a single dedicated +// goroutine, so an fsync on a degraded disk never blocks the Tunnel's run loop (which +// also drives reconnect and rekey). Requests are coalesced through a one-slot mailbox +// — only the latest high-water matters — and the in-memory high-water remains the +// source of truth, so a late or dropped write merely widens the rollback gap (which +// the seed margin absorbs) rather than corrupting anything. 
+type epochPersister struct { + store EpochStore + reqCh chan uint32 + stopCh chan struct{} + doneCh chan struct{} + stopOnce sync.Once + + high atomic.Uint32 // last value successfully stored + failures atomic.Int64 // consecutive Store failures + lastOK atomic.Int64 // unix nanos of the last successful store (or start) +} + +func newEpochPersister(store EpochStore, startHigh uint32, nowNanos int64) *epochPersister { + p := &epochPersister{ + store: store, + reqCh: make(chan uint32, 1), + stopCh: make(chan struct{}), + doneCh: make(chan struct{}), + } + p.high.Store(startHigh) + p.lastOK.Store(nowNanos) + go p.run() + return p +} + +func (p *epochPersister) run() { + defer close(p.doneCh) + for { + select { + case <-p.stopCh: + // Best-effort final flush of any queued value on clean shutdown. + select { + case v := <-p.reqCh: + p.flush(v) + default: + } + return + case v := <-p.reqCh: + p.flush(v) + } + } +} + +func (p *epochPersister) flush(v uint32) { + if v <= p.high.Load() { + return // already durable (coalesced no-op) + } + if err := p.store.Store(v); err != nil { + n := p.failures.Add(1) + slog.Error("control: failed to persist epoch high-water; one-sided-restart recovery is degrading (set --require-state to fail closed)", + slog.Uint64("highWater", uint64(v)), slog.Int64("consecutiveFailures", n), slog.Any("error", err)) + return + } + p.failures.Store(0) + p.high.Store(v) + p.lastOK.Store(time.Now().UnixNano()) +} + +// request enqueues v as the latest high-water to persist, coalescing with any value +// still queued by keeping the LARGER of the two — so a request can never drop a higher +// high-water (requests are monotonic in normal use, but keeping the max is robust +// regardless). It never blocks (single producer: the Tunnel's run goroutine). 
+func (p *epochPersister) request(v uint32) { + for { + select { + case p.reqCh <- v: + return + case old := <-p.reqCh: + if old > v { + v = old + } + } + } +} + +// fatal reports whether durable persistence has degraded past the point of guaranteed +// recovery, but only under requireState (otherwise persistence is best-effort). +// target is the latest in-memory high-water the caller wants durable. It trips on +// SUSTAINED failure (>= maxFailures consecutive Store errors) or a HUNG store (un- +// persisted work with no progress for longer than stall). Intermittent slowness does +// not trip it, and need not: each success coalesces to and stores the latest +// high-water, so recovery never falls materially behind. +func (p *epochPersister) fatal(requireState bool, target uint32, now time.Time, maxFailures int64, stall time.Duration) error { + if !requireState { + return nil + } + if f := p.failures.Load(); f >= maxFailures { + return fmt.Errorf("%w: %d consecutive store failures", errEpochPersistStalled, f) + } + // Also catch a silently hung store (no error, no progress) once there is + // un-persisted work that has not advanced for too long. + if target > p.high.Load() && now.Sub(time.Unix(0, p.lastOK.Load())) > stall { + return fmt.Errorf("%w: durable high-water stalled > %s behind the live epoch", errEpochPersistStalled, stall) + } + return nil +} + +// stop signals the persister to exit and waits for it, but only up to +// persistShutdownGrace: a Store wedged in an uninterruptible fsync would otherwise +// pin the goroutine forever and, since Run defers Close which calls stop, prevent the +// process from exiting (and prevent a --require-state fatal from reaching the +// errgroup). The wedged goroutine is then abandoned (the OS reaps it at exit). 
+func (p *epochPersister) stop() { + p.stopOnce.Do(func() { close(p.stopCh) }) + select { + case <-p.doneCh: + case <-time.After(persistShutdownGrace): + slog.Warn("control: epoch-state persister did not stop within grace; abandoning a stuck store write") + } +} diff --git a/control/epochstate_test.go b/control/epochstate_test.go new file mode 100644 index 0000000..dff4182 --- /dev/null +++ b/control/epochstate_test.go @@ -0,0 +1,391 @@ +package control + +import ( + "crypto/hkdf" + "crypto/hmac" + "crypto/sha256" + "crypto/x509" + "encoding/binary" + "errors" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSPIAllocatorSeedFloor(t *testing.T) { + a := NewSPIAllocator(Initiator) + // Fresh allocator starts at counter 1. + spi, err := a.Allocate(0) + require.NoError(t, err) + require.Equal(t, uint32(1), spi&spiCounterMask) + + // Seeding above the current position jumps the next allocation past the floor. + a.SeedFloor(0, 1000) + spi, err = a.Allocate(0) + require.NoError(t, err) + require.Equal(t, uint32(1001), spi&spiCounterMask) + + // Seeding at or below the current position is a no-op (monotonic). + a.SeedFloor(0, 5) + spi, err = a.Allocate(0) + require.NoError(t, err) + require.Equal(t, uint32(1002), spi&spiCounterMask) + + // SeedFloor masks off the role/index bits: a full initiator SPI seeds by counter. + a.SeedFloor(0, 2000) + spi, err = a.Allocate(0) + require.NoError(t, err) + require.Equal(t, uint32(2001), spi&spiCounterMask) + + // Out-of-range index is ignored, not a panic. + a.SeedFloor(99, 1<<20) +} + +func TestSPIAllocatorExhaustion(t *testing.T) { + a := NewSPIAllocator(Initiator) + a.SeedFloor(0, spiCounterMask-1) + // The terminal counter (spiCounterMask) is still allocatable. + spi, err := a.Allocate(0) + require.NoError(t, err) + require.Equal(t, spiCounterMask, spi&spiCounterMask) + // The next allocation exhausts the space with the sentinel error. 
+ _, err = a.Allocate(0) + require.ErrorIs(t, err, ErrSPIExhausted) + // Exhaustion is sticky. + _, err = a.Allocate(0) + require.ErrorIs(t, err, ErrSPIExhausted) +} + +func TestSeedWithMargin(t *testing.T) { + require.Equal(t, uint32(1000+epochSeedMargin), seedWithMargin(1000)) + require.Equal(t, uint32(epochSeedMargin), seedWithMargin(0)) + // Near the ceiling, clamp to spiCounterMask-1 so the seeded allocator can still + // hand out the terminal counter before exhausting. + require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask)) + require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask-1)) + require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask-2)) +} + +func newTestStore(t *testing.T) (*FileEpochStore, *Identity, *Identity) { + t.Helper() + local, err := GenerateIdentity() + require.NoError(t, err) + peer, err := GenerateIdentity() + require.NoError(t, err) + path := filepath.Join(t.TempDir(), "epoch.state") + s, err := NewFileEpochStore(path, local, peer.PublicKey()) + require.NoError(t, err) + return s, local, peer +} + +func TestFileEpochStoreRoundTrip(t *testing.T) { + s, _, _ := newTestStore(t) + + // Absent file => first run (ok=false, nil error). + hw, ok, err := s.Load() + require.NoError(t, err) + require.False(t, ok) + require.Zero(t, hw) + + require.NoError(t, s.Store(42)) + hw, ok, err = s.Load() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, uint32(42), hw) + + // Overwrite (atomic replace) with a higher value. 
+ require.NoError(t, s.Store(99)) + hw, ok, err = s.Load() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, uint32(99), hw) +} + +func TestFileEpochStoreMACStableAcrossInstances(t *testing.T) { + local, err := GenerateIdentity() + require.NoError(t, err) + peer, err := GenerateIdentity() + require.NoError(t, err) + path := filepath.Join(t.TempDir(), "epoch.state") + + s1, err := NewFileEpochStore(path, local, peer.PublicKey()) + require.NoError(t, err) + require.NoError(t, s1.Store(7)) + + // A second store built from the SAME identity+peer (i.e. a restart) must verify + // the MAC and load the value — the MAC key derivation is stable. + s2, err := NewFileEpochStore(path, local, peer.PublicKey()) + require.NoError(t, err) + hw, ok, err := s2.Load() + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, uint32(7), hw) +} + +func TestFileEpochStoreRejectsWrongIdentity(t *testing.T) { + local, err := GenerateIdentity() + require.NoError(t, err) + other, err := GenerateIdentity() + require.NoError(t, err) + peer, err := GenerateIdentity() + require.NoError(t, err) + path := filepath.Join(t.TempDir(), "epoch.state") + + s, err := NewFileEpochStore(path, local, peer.PublicKey()) + require.NoError(t, err) + require.NoError(t, s.Store(5)) + + // A different local identity derives a different MAC key → verification fails + // (and it is an error, NOT a silent first-run). 
+ bad, err := NewFileEpochStore(path, other, peer.PublicKey()) + require.NoError(t, err) + _, ok, err := bad.Load() + require.Error(t, err) + require.False(t, ok) +} + +func TestFileEpochStoreRejectsWrongPeer(t *testing.T) { + local, err := GenerateIdentity() + require.NoError(t, err) + peer, err := GenerateIdentity() + require.NoError(t, err) + otherPeer, err := GenerateIdentity() + require.NoError(t, err) + path := filepath.Join(t.TempDir(), "epoch.state") + + s, err := NewFileEpochStore(path, local, peer.PublicKey()) + require.NoError(t, err) + require.NoError(t, s.Store(5)) + + // Same identity, different peer → pin mismatch → error. + bad, err := NewFileEpochStore(path, local, otherPeer.PublicKey()) + require.NoError(t, err) + _, ok, err := bad.Load() + require.Error(t, err) + require.False(t, ok) +} + +func TestFileEpochStoreRejectsCorruption(t *testing.T) { + // Flipping a byte in any region (magic, version, pin, high-water, mac) must be + // detected as an error, never silently accepted or treated as first-run. + // Cover every region's first byte (incl. the reserved flags byte and the + // region boundaries offPin/offMAC) to prove the whole prefix is MAC-protected. 
+ for _, off := range []int{0, offVersion, offFlags, offPin, offHighWater, offMAC, offMAC + 1, epochStateLen - 1} { + s, _, _ := newTestStore(t) + require.NoError(t, s.Store(123)) + buf, err := os.ReadFile(s.path) + require.NoError(t, err) + buf[off] ^= 0xff + require.NoError(t, os.WriteFile(s.path, buf, 0o600)) + _, ok, err := s.Load() + require.Error(t, err, "corruption at offset %d must be rejected", off) + require.False(t, ok) + } +} + +func TestFileEpochStoreRejectsBadLength(t *testing.T) { + for _, n := range []int{0, 1, epochStateLen - 1, epochStateLen + 1} { + s, _, _ := newTestStore(t) + require.NoError(t, os.WriteFile(s.path, make([]byte, n), 0o600)) + _, ok, err := s.Load() + require.Error(t, err, "length %d must be a clean error, not a panic or first-run", n) + require.False(t, ok) + } +} + +func TestFileEpochStoreRejectsOutOfRangeHighWater(t *testing.T) { + s, _, _ := newTestStore(t) + // A validly-MAC'd record whose high-water exceeds the counter space must be + // rejected (it cannot have come from a real allocator). + buf := s.marshal(spiCounterMask + 1) + require.NoError(t, os.WriteFile(s.path, buf, 0o600)) + _, ok, err := s.Load() + require.Error(t, err) + require.False(t, ok) +} + +func TestFileEpochStoreUnreadableIsError(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("root bypasses file permissions") + } + s, _, _ := newTestStore(t) + require.NoError(t, s.Store(11)) + require.NoError(t, os.Chmod(s.path, 0o000)) + t.Cleanup(func() { _ = os.Chmod(s.path, 0o600) }) + // A present-but-unreadable file is a real error (so --require-state fails closed), + // NOT a first-run. + _, ok, err := s.Load() + require.Error(t, err) + require.False(t, ok) +} + +// TestFileEpochStoreLayout pins the on-disk byte layout (field order, endianness, MAC +// scope and key derivation) via an independent recomputation, so an accidental change +// to any of them is caught. 
+func TestFileEpochStoreLayout(t *testing.T) { + local, err := GenerateIdentity() + require.NoError(t, err) + peer, err := GenerateIdentity() + require.NoError(t, err) + s, err := NewFileEpochStore(filepath.Join(t.TempDir(), "epoch.state"), local, peer.PublicKey()) + require.NoError(t, err) + + const hw = uint32(0x01020304) + got := s.marshal(hw) + require.Len(t, got, epochStateLen) + + var scalar [32]byte + local.priv.D.FillBytes(scalar[:]) + macKey, err := hkdf.Key(sha256.New, scalar[:], nil, stateMACInfo, 32) + require.NoError(t, err) + lDER, err := x509.MarshalPKIXPublicKey(local.PublicKey()) + require.NoError(t, err) + pDER, err := x509.MarshalPKIXPublicKey(peer.PublicKey()) + require.NoError(t, err) + ph := sha256.New() + ph.Write(lDER) + ph.Write(pDER) + + want := make([]byte, epochStateLen) + copy(want[0:4], "ICXE") + want[4] = 1 + want[5] = 0 + copy(want[6:38], ph.Sum(nil)) + binary.BigEndian.PutUint32(want[38:42], hw) + m := hmac.New(sha256.New, macKey) + m.Write(want[:42]) + copy(want[42:], m.Sum(nil)) + + require.Equal(t, want, got) +} + +// fakeEpochStore is an in-memory EpochStore for the persister/Tunnel tests. It can be +// reused across a simulated process restart (the persisted value survives). 
+type fakeEpochStore struct { + mu sync.Mutex + high uint32 + has bool + loadErr error + storeErr error + stores int + loads int +} + +func (f *fakeEpochStore) Load() (uint32, bool, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.loads++ + if f.loadErr != nil { + return 0, false, f.loadErr + } + return f.high, f.has, nil +} + +func (f *fakeEpochStore) counts() (loads, stores int) { + f.mu.Lock() + defer f.mu.Unlock() + return f.loads, f.stores +} + +func (f *fakeEpochStore) Store(v uint32) error { + f.mu.Lock() + defer f.mu.Unlock() + f.stores++ + if f.storeErr != nil { + return f.storeErr + } + f.high, f.has = v, true + return nil +} + +func (f *fakeEpochStore) set(v uint32) { + f.mu.Lock() + defer f.mu.Unlock() + f.high, f.has = v, true +} + +func (f *fakeEpochStore) loaded() (uint32, bool) { + f.mu.Lock() + defer f.mu.Unlock() + return f.high, f.has +} + +func TestEpochPersisterCoalesceAndFlush(t *testing.T) { + fs := &fakeEpochStore{} + p := newEpochPersister(fs, 0, time.Now().UnixNano()) + defer p.stop() + + for _, v := range []uint32{1, 2, 3, 7, 5} { + p.request(v) + } + // The persister catches up to the highest requested value; lower/stale values are + // coalesced or ignored (Store is monotonic in the persister). + require.Eventually(t, func() bool { + hw, ok := fs.loaded() + return ok && hw == 7 + }, 2*time.Second, 5*time.Millisecond) + require.Zero(t, p.failures.Load()) +} + +func TestEpochPersisterFatalUnderRequireState(t *testing.T) { + fs := &fakeEpochStore{storeErr: errors.New("disk full")} + p := newEpochPersister(fs, 0, time.Now().UnixNano()) + defer p.stop() + + // The persister attempts each distinct requested value once (it does not retry a + // failed value), so consecutive failures climb one per new request — exactly as a + // failing disk accrues one failure per rekey. Drive several, waiting for each to + // register so they are not coalesced. 
+ for v := int64(1); v <= 5; v++ { + p.request(uint32(v)) + require.Eventually(t, func() bool { + return p.failures.Load() >= v + }, 2*time.Second, 5*time.Millisecond) + } + + // Without require-state, a failing store is best-effort (never fatal). + require.NoError(t, p.fatal(false, 5, time.Now(), 3, time.Hour)) + + // With require-state, failures past the threshold are fatal. + require.ErrorIs(t, p.fatal(true, 5, time.Now(), 3, time.Hour), errEpochPersistStalled) +} + +// TestEpochPersisterRequestKeepsMax exercises request()'s keep-max coalescing in +// isolation — with no goroutine draining, a lower request must not displace a queued +// higher value (flush's own monotonic guard would otherwise mask a broken request()). +func TestEpochPersisterRequestKeepsMax(t *testing.T) { + p := &epochPersister{reqCh: make(chan uint32, 1)} // no run() goroutine + p.request(7) + p.request(5) // lower + p.request(3) // lower + select { + case v := <-p.reqCh: + require.Equal(t, uint32(7), v, "the higher queued value must survive a lower request") + default: + t.Fatal("expected a coalesced value in the mailbox") + } +} + +func TestEpochPersisterFatalOnStall(t *testing.T) { + fs := &fakeEpochStore{} + // Seed lastOK far in the past and leave a value un-persisted (high < target). + p := newEpochPersister(fs, 0, time.Now().Add(-time.Hour).UnixNano()) + defer p.stop() + // target (5) is ahead of what the store holds (0) and lastOK is stale → stalled. + require.ErrorIs(t, p.fatal(true, 5, time.Now(), 100, time.Minute), errEpochPersistStalled) + // No un-persisted work → not stalled even with a stale lastOK. + require.NoError(t, p.fatal(true, 0, time.Now(), 100, time.Minute)) + + // A successful store clears the stall: high catches up to the target and lastOK is + // refreshed, so the tripwire un-latches. 
+ p.request(5) + require.Eventually(t, func() bool { + hw, ok := fs.loaded() + return ok && hw == 5 + }, 2*time.Second, 5*time.Millisecond) + require.NoError(t, p.fatal(true, 5, time.Now(), 100, time.Minute), "stall must clear once the store catches up") +} diff --git a/control/sa.go b/control/sa.go index a512175..4206718 100644 --- a/control/sa.go +++ b/control/sa.go @@ -105,6 +105,13 @@ func MakeSPI(masterKeyIndex int, role Role, counter uint32) (uint32, error) { return uint32(masterKeyIndex)<<31 | uint32(role)<= spiCounterMask { + return 0, fmt.Errorf("%w (master key %d)", ErrSPIExhausted, masterKeyIndex) + } a.next[masterKeyIndex]++ - c := a.next[masterKeyIndex] - if c > spiCounterMask { - return 0, fmt.Errorf("control: SPI space exhausted for master key %d; rotate", masterKeyIndex) + return MakeSPI(masterKeyIndex, a.role, a.next[masterKeyIndex]) +} + +// SeedFloor raises the allocator's counter for masterKeyIndex so the next +// Allocate returns a counter strictly greater than floor's counter (the low 30 +// bits of floor). It is monotonic — a floor at or below the current position is a +// no-op — and never lowers the counter, so it is safe to call on every new +// session. The control plane uses this to carry a per-direction epoch high-water +// across reconnects/restarts (see control/epochstate.go): seeding the initiator's +// allocator above the survivor's retained data-plane epoch keeps the shared epoch +// strictly increasing across a session boundary, so the survivor's monotonicity +// guard accepts the new SA instead of rejecting a counter that reset to 1. +// +// It takes the same lock as Allocate; an out-of-range index is ignored. 
+func (a *SPIAllocator) SeedFloor(masterKeyIndex int, floor uint32) { + if masterKeyIndex < 0 || masterKeyIndex >= numMasterKeys { + return + } + c := floor & spiCounterMask + a.mu.Lock() + defer a.mu.Unlock() + if c > a.next[masterKeyIndex] { + a.next[masterKeyIndex] = c } - return MakeSPI(masterKeyIndex, a.role, c) } diff --git a/control/transport.go b/control/transport.go index e4cb1e6..c455dc6 100644 --- a/control/transport.go +++ b/control/transport.go @@ -145,6 +145,13 @@ func newSession(ctx context.Context, conn *quic.Conn, role Role) (*Session, erro // Role reports whether this peer is the initiator or responder. func (s *Session) Role() Role { return s.role } +// SeedRxFloor raises this session's RX SPI allocator so the next negotiated SPI +// (and hence, for the initiator, the shared data-plane epoch) is strictly greater +// than floor. It must be called before the session's first NegotiateSAs. The +// Tunnel uses it to carry the epoch high-water across a reconnect/restart so the +// data-plane epoch keeps increasing rather than resetting to 1. +func (s *Session) SeedRxFloor(floor uint32) { s.rxAlloc.SeedFloor(activeMasterKeyIndex, floor) } + // MasterKeys returns the PSP master keys derived from this session. func (s *Session) MasterKeys() *MasterKeys { return s.masterKeys } From bf7a422ed000d000c1e1b6f56f69139195d57ccf Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Mon, 1 Jun 2026 17:33:53 -0700 Subject: [PATCH 11/20] [cli] wire --state-file/--require-state durable epoch state (APO-648) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the control plane's durable epoch high-water: --state-file enables a FileEpochStore (built role-agnostically; the Tunnel consults it only when this node is the elected initiator, so set it on both peers), and --require-state fails closed on a corrupt/unreadable/un-writable store instead of degrading to a fresh start. --require-state requires --state-file. 
Rewrite the README "Restart / reconnect" notes to describe the two recovery layers (in-memory always-on; durable with --state-file), the both-peers requirement, and the integrity-tripwire scope (the MAC is not rollback- or deletion-resistant, which is acceptable because per-session ephemeral keys mean a rolled-back high-water cannot cause nonce reuse — only a transient outage). --- cli/README.md | 45 +++++++++++++++++++++++++++++++++++---------- cli/main.go | 27 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/cli/README.md b/cli/README.md index 7e6d881..2da7213 100644 --- a/cli/README.md +++ b/cli/README.md @@ -77,6 +77,13 @@ Relevant flags: - `--rekey-interval DUR` — SA rotation period (default `2m`). - `--require-fips` — refuse to start unless the Go FIPS 140-3 module is active (build/run with `GODEBUG=fips140=on`). +- `--state-file PATH` — persist the epoch high-water so a one-sided restart recovers + seamlessly (see "Restart / reconnect"). Set it on **both** peers. Without it, a + one-sided *initiator* restart recovers only after both peers cycle. +- `--require-state` — fail closed if the state file is corrupt/unreadable or becomes + persistently un-writable, instead of degrading to a fresh start. Requires + `--state-file`. It is an integrity tripwire against accidental corruption, not a + defense against an attacker who can rewrite or delete the file. ### Operational notes @@ -87,16 +94,34 @@ ends close together, and run under a supervisor (systemd `Restart=always`, a con restart policy) so a larger startup skew self-heals on restart. Once established, the control plane reconnects on its own indefinitely. -**Restart / reconnect.** Control-plane keys are ephemeral, so a *clean* restart is safe: a -new session derives fresh keys, and a reset transmit counter cannot reuse a nonce under a -fresh key. 
But this build does not persist the per-session epoch high-water mark across a -restart, so a **one-sided** restart does not promptly recover: the survivor keeps the high -epoch from the old session, the restarted peer comes back at a low epoch, and the survivor's -monotonicity guard rejects it — breaking traffic in **both** directions. The link forwards on -the survivor's existing keys until they expire and only re-keys once the new session's epoch -counter climbs back above the survivor's retained high-water mark (many rekey intervals). In -practice, **cycle both peers** to recover immediately. Durable epoch state for seamless -one-sided restart is planned follow-up work. +**Restart / reconnect.** Control-plane keys are ephemeral, so a *clean* restart is always +crypto-safe: a new session derives fresh keys, and a reset transmit counter cannot reuse a +nonce under a fresh key. The remaining question is *availability* after a restart, which the +control plane handles in two layers: + +- **Always on (no flag):** the data-plane epoch counter resets to 1 on every (re)connect, but + the surviving peer's monotonicity guard only accepts a strictly increasing epoch. ICX + carries the epoch high-water forward in memory and seeds each new session above it, so a + **transient reconnect** (a network blip) and a **responder restart** recover immediately — + the surviving *initiator* keeps the high-water and the restarted responder comes up fresh. + +- **With `--state-file` (recommended):** the high-water is also persisted durably (fsync + + atomic rename, integrity-protected by a MAC keyed from the identity), so an **initiator + restart** recovers too — the restarted initiator reloads the high-water and resumes above + the survivor's retained epoch. 
The dialer/listener role is auto-elected from the keys, so + set `--state-file` on **both** peers; only the elected initiator's file is load-bearing, and + the start-up log shows whether durable state is active for this node's role. + +Without `--state-file`, a one-sided *initiator* restart still has the old caveat: the survivor +forwards on its existing keys until they expire and only re-keys once the new counter climbs +past the retained high-water — **cycle both peers** to recover immediately. + +`--require-state` makes a corrupt/unreadable state file (or persistently failing writes) fail +closed instead of silently starting fresh. It is an integrity tripwire against accidental +corruption only: because keys are per-session ephemeral, even a rolled-back or deleted +high-water cannot cause nonce reuse — only a transient outage — so rollback and deletion +resistance (which would need a hardware monotonic counter) are out of scope. Use one state +file per tunnel; do not share it between processes. ## Static keys (legacy) diff --git a/cli/main.go b/cli/main.go index 9836c0b..2e853a6 100644 --- a/cli/main.go +++ b/cli/main.go @@ -101,6 +101,14 @@ func main() { Name: "require-fips", Usage: "Refuse to start unless the Go FIPS 140-3 module is active (GODEBUG=fips140=on)", }, + &cli.StringFlag{ + Name: "state-file", + Usage: "Path to a durable epoch-state file (control plane). Persists the epoch high-water so a one-sided restart recovers seamlessly. Set on BOTH peers (the dialer/listener role is auto-elected). Without it, a one-sided initiator restart recovers only after both peers cycle", + }, + &cli.BoolFlag{ + Name: "require-state", + Usage: "Fail closed if durable epoch state is corrupt/unreadable or persistently un-writable, instead of degrading to a fresh start. Requires --state-file. 
Integrity tripwire only — not rollback/deletion resistant", + }, &cli.IntFlag{ Name: "port", Aliases: []string{"p"}, @@ -547,6 +555,23 @@ func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni return nil, nil, err } + // Durable epoch state (optional). Built role-agnostically; the Tunnel consults it + // only when this node is the elected initiator (the responder's high-water is not + // load-bearing), which is why it must be configured on both peers. + stateFile := c.String("state-file") + requireState := c.Bool("require-state") + if requireState && stateFile == "" { + return nil, nil, errors.New("--require-state requires --state-file") + } + var epochStore control.EpochStore + if stateFile != "" { + store, err := control.NewFileEpochStore(stateFile, ident, peerPub) + if err != nil { + return nil, nil, err + } + epochStore = store + } + ctrlNet := "udp4" if peerUDPAddr.IP.To4() == nil { ctrlNet = "udp6" @@ -584,6 +609,8 @@ func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni Conn: pconn, PeerAddr: peerControlAddr, RekeyInterval: rekeyIvl, + EpochStore: epochStore, + RequireState: requireState, }, installer) if err != nil { _ = pconn.Close() From dc50124d83d038d7da25655c7bb9e74a5f8b7d31 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 2 Jun 2026 01:16:52 -0700 Subject: [PATCH 12/20] [control] require a fresh ECDHE handshake per session (APO-644) Disable TLS 1.3 session resumption (SessionTicketsDisabled) and fail closed in newSession on a resumed session or 0-RTT (DidResume / Used0RTT). The data plane's AES-GCM nonce uniqueness rests on every control session deriving fresh master keys, so that a per-direction SPI which resets or regresses after a reconnect is always paired with a never-used key. Forbidding resumption is the foundation that lets the install seam accept a reset SPI without persisted state. 
--- control/tls.go | 8 ++++++++ control/transport.go | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/control/tls.go b/control/tls.go index c20a14a..d350ba4 100644 --- a/control/tls.go +++ b/control/tls.go @@ -63,6 +63,14 @@ func baseTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, erro NextProtos: []string{ALPN}, CurvePreferences: []tls.CurveID{tls.CurveP256, tls.CurveP384}, VerifyConnection: pinVerifier(peerPub), + // Disable TLS 1.3 session resumption (and therefore 0-RTT). Every (re)connect + // MUST be a full ECDHE handshake so each session derives FRESH master keys: that + // freshness is the data plane's nonce-uniqueness foundation (the per-direction + // install guard accepts a reset/regressed SPI precisely because its key is fresh + // — see handler.UpdateVirtualNetworkSAs). A resumed session could reuse keying + // material and, paired with a reset SPI, repeat a (key, nonce) pair. The server + // also never issues tickets; newSession additionally asserts !DidResume/!0-RTT. + SessionTicketsDisabled: true, }, nil } diff --git a/control/transport.go b/control/transport.go index c455dc6..7b48279 100644 --- a/control/transport.go +++ b/control/transport.go @@ -118,12 +118,28 @@ func newSession(ctx context.Context, conn *quic.Conn, role Role) (*Session, erro // NextProtos don't overlap (both sides advertise only ALPN), but enforcing the // invariant in code keeps it true if NextProtos is ever widened, and makes the // guarantee auditable rather than implied. - tlsState := conn.ConnectionState().TLS + state := conn.ConnectionState() + tlsState := state.TLS if tlsState.NegotiatedProtocol != ALPN { _ = conn.CloseWithError(appErrNormal, "alpn mismatch") return nil, fmt.Errorf("control: unexpected ALPN %q, want %q", tlsState.NegotiatedProtocol, ALPN) } + // Enforce a FRESH ECDHE handshake: refuse a resumed session or 0-RTT. 
The data + // plane's nonce-uniqueness guarantee rests on every session deriving fresh master + // keys (so a reset/regressed SPI is always paired with a fresh key — see + // handler.UpdateVirtualNetworkSAs). Resumption is already disabled in the TLS config + // (SessionTicketsDisabled), so this is a fail-closed backstop against a silent + // regression rather than an expected path. + if tlsState.DidResume { + _ = conn.CloseWithError(appErrNormal, "session resumption forbidden") + return nil, errors.New("control: TLS session was resumed; a fresh ECDHE handshake is required for data-plane nonce safety") + } + if state.Used0RTT { + _ = conn.CloseWithError(appErrNormal, "0-RTT forbidden") + return nil, errors.New("control: connection used 0-RTT; a fresh ECDHE handshake is required for data-plane nonce safety") + } + root, err := ExportRootSecret(tlsState) if err != nil { _ = conn.CloseWithError(appErrNormal, "exporter failure") From ca8982c01f6162caf97a189410e343c4449976f4 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 2 Jun 2026 01:17:46 -0700 Subject: [PATCH 13/20] [handler] add per-direction SPI install seam with key-aware anti-reset guard (APO-644) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UpdateVirtualNetworkSAs installs the two PSP simplex SAs under their own role-partitioned SPIs (rxSPI we decrypt under, txSPI we encrypt to), replacing the single shared epoch. The AES-GCM nonce is txSPI‖counter with the counter reset to zero per generation, so nonce uniqueness across rekeys, reconnects and restarts rests on each generation pairing that from-zero counter with a fresh per-session key. Because the control plane now guarantees fresh keys per session, this path does NOT enforce SPI monotonicity: a reconnect resets the allocator to a low SPI and that reset SPI must be re-accepted under its fresh key. 
The only fail-closed guards are non-zero SPIs, distinct rx/tx keys, and a TX anti-reset check that rejects re-installing the CURRENTLY-live transmit SA — same SPI AND same key. The key comparison (new transmitCipher.key) is load-bearing: on a transient reconnect the allocator can hand back a transmit SPI that collides with the still-live one, but under a fresh key, which is safe; comparing the SPI alone would spuriously reject that recovery. The legacy static-key seam (UpdateVirtualNetworkKeys) keeps STRICT epoch monotonicity, since static pre-shared keys carry no per-session freshness. --- cp_wire_test.go | 87 ++++++++++------- export_test.go | 13 +-- handler.go | 251 ++++++++++++++++++++++++++++++++---------------- handler_test.go | 64 ++++++++++++ 4 files changed, 292 insertions(+), 123 deletions(-) diff --git a/cp_wire_test.go b/cp_wire_test.go index 0105b96..bea61ad 100644 --- a/cp_wire_test.go +++ b/cp_wire_test.go @@ -14,11 +14,11 @@ import ( "github.com/apoxy-dev/icx/control" ) -// This file proves the Phase 4 control-plane → data-plane bridge end to end: the -// shared-epoch chosen by control.SharedEpoch lets two independently-keyed handlers -// exchange Geneve traffic, and the naive alternative (each peer using its own Tx SPI -// as the epoch) provably drops every frame. The handler is cross-platform, so this -// runs without the AF_XDP forwarder. +// This file proves the control-plane → data-plane bridge end to end under per-direction +// SPIs: each peer installs two simplex SAs (its own receive SPI, the peer's receive SPI), +// and two independently-keyed handlers exchange Geneve traffic in both directions, each +// decrypting under its own receive SPI. The handler is cross-platform, so this runs +// without the AF_XDP forwarder. // negotiateLoopback brings up an initiator and a responder control session over // loopback UDP and returns each peer's negotiated directional SAs. 
@@ -87,39 +87,42 @@ func newPeerHandler(t *testing.T, vni uint, local, remote tcpip.Address) *icx.Ha return h } -func installSAs(t *testing.T, h *icx.Handler, vni uint, epoch uint32, sas *control.DirectionalSAs) { +// installDirectional installs a peer's negotiated directional SAs into its handler via +// the real guarded per-direction seam the production installer calls: rxSPI is our own +// receive SPI, txSPI is the peer's receive SPI (what we transmit to). +func installDirectional(t *testing.T, h *icx.Handler, vni uint, sas *control.DirectionalSAs) { t.Helper() require.Len(t, sas.Rx.Key, 16) require.Len(t, sas.Tx.Key, 16) var rx, tx [16]byte copy(rx[:], sas.Rx.Key) copy(tx[:], sas.Tx.Key) - // Use the real guarded seam the production installer calls. - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, epoch, rx, tx, time.Now().Add(time.Hour))) + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, sas.Rx.SPI, sas.Tx.SPI, rx, tx, time.Now().Add(time.Hour))) } -func TestControlPlaneSharedEpochGeneveRoundTrip(t *testing.T) { +func TestControlPlanePerDirectionGeneveRoundTrip(t *testing.T) { iSAs, rSAs := negotiateLoopback(t) - eI, err := control.SharedEpoch(iSAs) - require.NoError(t, err) - eR, err := control.SharedEpoch(rSAs) - require.NoError(t, err) - require.Equal(t, eI, eR, "both peers must derive the identical shared epoch") + // Per-direction SPIs: each peer's transmit SPI is the other's receive SPI, and the + // two directions are distinct (role-partitioned), so each direction owns its own + // nonce space — there is no shared epoch. 
+ require.NotEqual(t, iSAs.Rx.SPI, iSAs.Tx.SPI, "the two directions must use distinct SPIs") + require.Equal(t, iSAs.Tx.SPI, rSAs.Rx.SPI, "initiator tx SPI must equal responder rx SPI") + require.Equal(t, iSAs.Rx.SPI, rSAs.Tx.SPI, "initiator rx SPI must equal responder tx SPI") const vni = 0x424344 addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()) hI := newPeerHandler(t, vni, addrA, addrB) hR := newPeerHandler(t, vni, addrB, addrA) - installSAs(t, hI, vni, eI, iSAs) - installSAs(t, hR, vni, eR, rSAs) + installDirectional(t, hI, vni, iSAs) + installDirectional(t, hR, vni, rSAs) ip := makeIPv4UDPPacket() phy := make([]byte, 1500) out := make([]byte, 1500) - // initiator -> responder + // initiator -> responder: hR decrypts under its own receive SPI (== hI's tx SPI). n, loop := hI.VirtToPhy(ip, phy) require.NotZero(t, n) require.False(t, loop) @@ -127,7 +130,7 @@ func TestControlPlaneSharedEpochGeneveRoundTrip(t *testing.T) { require.NotZero(t, m, "responder must decrypt initiator traffic") require.Equal(t, ip, out[:m]) - // responder -> initiator + // responder -> initiator: hI decrypts under its own receive SPI (== hR's tx SPI). n, loop = hR.VirtToPhy(ip, phy) require.NotZero(t, n) require.False(t, loop) @@ -135,18 +138,26 @@ func TestControlPlaneSharedEpochGeneveRoundTrip(t *testing.T) { require.NotZero(t, m, "initiator must decrypt responder traffic") require.Equal(t, ip, out[:m]) + // No key-miss or SPI-mismatch drops on either side: each direction's frame is bound + // to its own receive SPI and decrypts cleanly. 
vnR, ok := hR.GetVirtualNetwork(vni) require.True(t, ok) require.Zero(t, vnR.Stats.RXDropsNoKey.Load()) require.Zero(t, vnR.Stats.RXDropsSPIMismatch.Load()) require.Equal(t, uint64(1), vnR.Stats.RXPackets.Load()) + vnI, ok := hI.GetVirtualNetwork(vni) + require.True(t, ok) + require.Zero(t, vnI.Stats.RXDropsNoKey.Load()) + require.Zero(t, vnI.Stats.RXDropsSPIMismatch.Load()) + require.Equal(t, uint64(1), vnI.Stats.RXPackets.Load()) } -// TestInstallResetsTxCounterPerEpoch pins the nonce-uniqueness invariant the durable- -// epoch seeding (control/epochstate.go) depends on: each new epoch install starts a -// FRESH transmit counter, so the AES-GCM nonce (epoch‖counter) never repeats even as -// epochs climb monotonically across rekeys/restarts. A refactor that carried the -// counter across installs would reuse a (key, nonce) pair and trip this test. +// TestInstallResetsTxCounterPerEpoch pins the nonce-uniqueness invariant the control +// plane relies on: each new epoch install starts a FRESH transmit counter. Because every +// session derives a fresh master key (fresh ECDHE per reconnect), pairing a from-zero +// counter with each generation's key keeps the AES-GCM nonce (epoch‖counter) unique even +// when an SPI is reused or regresses after a restart. A refactor that carried the counter +// across installs would reuse a (key, nonce) pair and trip this test. 
func TestInstallResetsTxCounterPerEpoch(t *testing.T) { const vni = 0x334455 addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) @@ -187,22 +198,32 @@ func TestInstallResetsTxCounterPerEpoch(t *testing.T) { require.Equal(t, uint64(1), c, "first frame under the new epoch counts from 1 again") } -func TestControlPlaneNaiveTxSPIEpochDropsTraffic(t *testing.T) { +// TestSharedEpochCollapseMismatchDropsTraffic shows WHY per-direction SPIs are needed: +// if the two directions are collapsed onto a single epoch but each peer picks its OWN +// receive SPI for it (a naive shared-epoch bridge), the epochs disagree — they are +// role-partitioned — so the sender transmits under an SPI the receiver never installed +// and every frame misses the receiver's rxCiphers. The production path avoids this by +// installing the genuine per-direction SPIs (TestControlPlanePerDirectionGeneveRoundTrip). +func TestSharedEpochCollapseMismatchDropsTraffic(t *testing.T) { iSAs, rSAs := negotiateLoopback(t) - - // The naive bridge — each peer installs under its OWN Tx SPI — gives the two - // peers different epochs (the SPIs are role-partitioned), so the receiver's - // rxCiphers lookup misses and every frame drops. This is exactly why SharedEpoch - // is required; assert the failure mode explicitly. - require.NotEqual(t, iSAs.Tx.SPI, rSAs.Tx.SPI) + require.NotEqual(t, iSAs.Rx.SPI, rSAs.Rx.SPI, "the two receive SPIs are role-partitioned and distinct") const vni = 0x515253 addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()) addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()) hI := newPeerHandler(t, vni, addrA, addrB) hR := newPeerHandler(t, vni, addrB, addrA) - installSAs(t, hI, vni, iSAs.Tx.SPI, iSAs) - installSAs(t, hR, vni, rSAs.Tx.SPI, rSAs) + + // Collapse both directions onto each peer's OWN receive SPI via the legacy shared- + // epoch shim. hI then transmits under iSAs.Rx.SPI, which hR (installed under + // rSAs.Rx.SPI) does not have. 
+ var iRx, iTx, rRx, rTx [16]byte + copy(iRx[:], iSAs.Rx.Key) + copy(iTx[:], iSAs.Tx.Key) + copy(rRx[:], rSAs.Rx.Key) + copy(rTx[:], rSAs.Tx.Key) + require.NoError(t, hI.UpdateVirtualNetworkKeys(vni, iSAs.Rx.SPI, iRx, iTx, time.Now().Add(time.Hour))) + require.NoError(t, hR.UpdateVirtualNetworkKeys(vni, rSAs.Rx.SPI, rRx, rTx, time.Now().Add(time.Hour))) ip := makeIPv4UDPPacket() phy := make([]byte, 1500) @@ -211,7 +232,7 @@ func TestControlPlaneNaiveTxSPIEpochDropsTraffic(t *testing.T) { require.NotZero(t, n) require.False(t, loop) m := hR.PhyToVirt(phy[:n], out) - require.Zero(t, m, "naive per-direction Tx.SPI epoch must miss the receiver's rxCiphers and drop") + require.Zero(t, m, "a shared-epoch collapse onto disagreeing epochs misses the receiver's rxCiphers and drops") vnR, ok := hR.GetVirtualNetwork(vni) require.True(t, ok) diff --git a/export_test.go b/export_test.go index 50e6442..8f828b3 100644 --- a/export_test.go +++ b/export_test.go @@ -5,22 +5,23 @@ import ( "time" ) -// InstallKeysForTest installs RX/TX ciphers under epoch without the production -// monotonicity and distinct-key guards enforced by UpdateVirtualNetworkKeys. +// InstallKeysForTest installs RX/TX ciphers under a single shared epoch (rxSPI == +// txSPI == epoch) without the production guards: the distinct-key and TX anti-reset +// checks of UpdateVirtualNetworkSAs and the strict monotonicity check of UpdateVirtualNetworkKeys. // // It exists only for in-process loopback tests that encrypt and decrypt on a // single handler with one shared key (the byte-equivalence, round-trip, fuzz and // benchmark harnesses). Real peers always derive distinct per-direction keys and -// strictly increasing SPIs, so the guarded UpdateVirtualNetworkKeys deliberately -// rejects that shape — hence this unguarded test seam. The file name ends in -// _test.go, so it is compiled only under `go test` and never ships in the +// role-partitioned (not necessarily increasing) SPIs, so the guarded +// UpdateVirtualNetworkSAs rejects the shared-key shape — hence this unguarded test seam.
The file name ends +// in _test.go, so it is compiled only under `go test` and never ships in the // production binary or public API. func (h *Handler) InstallKeysForTest(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { return fmt.Errorf("VNI %d not found", vni) } - return h.installKeys(value.(*VirtualNetwork), epoch, rxKey, txKey, expiresAt) + return h.installKeys(value.(*VirtualNetwork), epoch, epoch, rxKey, txKey, expiresAt) } // TxCounterForTest returns the active SA's current TX nonce counter for the VNI (and diff --git a/handler.go b/handler.go index dad66fe..27e7b5e 100644 --- a/handler.go +++ b/handler.go @@ -84,7 +84,11 @@ type receiveCipher struct { // Transmit cipher state. type transmitCipher struct { cipher.AEAD - epoch uint32 + epoch uint32 + // key is the transmit key, retained so the TX anti-reset guard can distinguish a + // genuine double-install of the live SA (same SPI AND same key) from a fresh-session + // install that merely reused the SPI value under a new key (see UpdateVirtualNetworkSAs). + key [16]byte counter atomic.Uint64 } @@ -101,6 +105,14 @@ type VirtualNetwork struct { // Internal state (not exposed) rxCiphers sync.Map txCipher atomic.Pointer[transmitCipher] + // rxEpoch is the currently-installed receive SPI (0 = none). Under per-direction + // SPIs the receive and transmit SPIs differ, so the previous RX cipher can no longer + // be found via txCipher.epoch; rxEpoch anchors the prior receive SA so installKeys + // can grace-clamp it. It is NOT a monotonicity guard and need NOT be monotone — it + // simply tracks the most recently installed receive SPI and may regress when a fresh + // session resets the allocator (the receive side emits no nonce, so a reused receive + // SPI is harmless: its replay filter is rebuilt with the fresh key). + rxEpoch atomic.Uint32 } // Clock provides time to the handler. Tests can inject a fake clock. 
@@ -352,27 +364,102 @@ func (h *Handler) UpdateVirtualNetworkRoutes(vni uint, allowedRoutes []Route) er return nil } -// UpdateVirtualNetworkKeys sets/rotates the encryption keys for a virtual -// network. It must be called at least once every 24 hours or after -// replay.RekeyAfterMessages messages. +// UpdateVirtualNetworkSAs installs/rotates a virtual network's pair of simplex +// security associations (PSP model). It must be called at least once every 24 +// hours or after replay.RekeyAfterMessages messages. // -// epoch is the 32-bit SPI that selects this security association: it is carried -// in the Geneve key-epoch option and bound into the high 4 bytes of the AES-GCM -// nonce (nonce = epoch‖counter). Under the current shared-epoch model the same -// epoch is used for both simplex directions, so the SPI prefix does NOT separate -// them — the distinct rx/tx keys do (see the guards below). The SPI binding's -// value is RX tamper-rejection/auditability and forward-compatibility with -// per-direction SPIs. +// rxSPI and txSPI are the per-direction 32-bit SPIs that select the receive and +// transmit SAs. Each is carried in the Geneve key-epoch option and bound into the +// high 4 bytes of its direction's AES-GCM nonce (nonce = SPI‖counter): +// - rxSPI is OUR receive SPI — the one we allocated and the peer encrypts to. We +// store the RX cipher under it and look inbound frames up by it. Inbound frames +// carry rxSPI in their key-epoch option (== the sender's txSPI). +// - txSPI is the PEER's receive SPI — the one we encrypt to. We stamp it into the +// key-epoch option and nonce[:4] of every outbound frame. // -// Three fail-closed guards require a non-zero epoch, a strictly increasing -// epoch, and rxKey != txKey. IMPORTANT: the monotonicity guard compares against -// in-memory state, so it holds only within a single process lifetime — it cannot -// detect an epoch reused across a restart. 
Restart safety therefore depends on -// the caller never reusing an (epoch, key) pair: with ephemeral per-session keys -// (the control plane, Phase 4) a restart yields fresh keys and is safe; with -// static persisted keys it is NOT safe absent durable epoch/counter state (the -// residual APO-644 case). Callers must also serialize installs per VNI; the -// guard→install sequence is not internally locked. +// The two SPIs are distinct (the control plane partitions the SPI space by role, +// see control/sa.go), so each direction has its own nonce space. +// +// This entry point is for the CONTROL PLANE, where every SA generation carries a +// FRESH per-session key (each QUIC reconnect is a fresh ECDHE handshake — no 0-RTT, +// no session resumption, enforced in control/transport.go). That freshness is what +// guarantees the nonce-uniqueness invariant — no (key, nonce=SPI‖counter) pair ever +// repeats — across rekeys, reconnects and restarts: +// - within a session the receive-SPI allocator is monotonic, so a given SPI value +// is handed out once and its reset-to-zero counter is always a fresh nonce space; +// - across sessions the master keys are fresh, so even a reused SPI value derives a +// different key. SPIs may therefore reset to 1 on a reconnect and be re-accepted +// here at a LOWER value than before — which is exactly what makes a one-sided +// restart recover seamlessly with no persisted state. +// +// Three fail-closed guards apply: non-zero SPIs, distinct rx/tx keys, and a TX +// anti-reset check that rejects re-installing the CURRENTLY-live transmit SA — same SPI +// AND same key (the only in-process action that would reset a live counter under an +// unchanged key — a defensive backstop against a double-install/retry). A txSPI that +// merely reuses the live SPI value under a FRESH key (the transient-reconnect case) is +// accepted, as is any lower-or-higher txSPI; safety rests on the fresh-key guarantee +// above, not on monotonicity. 
+// Callers must serialize installs per VNI; the guard→install sequence is not +// internally locked (the control plane is single-threaded per Tunnel). Static +// pre-shared keys, which have NO per-session freshness, must use the strictly-guarded +// UpdateVirtualNetworkKeys instead. +func (h *Handler) UpdateVirtualNetworkSAs(vni uint, rxSPI, txSPI uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { + value, ok := h.networkByID.Load(vni) + if !ok { + return fmt.Errorf("VNI %d not found", vni) + } + vnet := value.(*VirtualNetwork) + + // Reserved-SPI guard: SPI 0 is reserved. Rejecting it keeps the data plane's + // accepted SPI space aligned with the control plane, which never emits an SPI + // whose low 31 bits are zero (control/sa.go), and refuses to write the all-zero + // nonce prefix that predated the SPI binding. + if rxSPI == 0 || txSPI == 0 { + return errors.New("rx and tx SPIs must be non-zero") + } + + // TX anti-reset guard: reject re-installing the SA that is currently live for + // transmit — same SPI AND same key. That pair is the only in-process action that would + // reset the TX counter to zero under a key already used at that SPI (a GCM nonce-reuse + // hazard): a defensive backstop against an accidental double-install/retry of the + // identical generation. The key comparison is load-bearing, not cosmetic: on a transient + // reconnect the receive-SPI allocator resets to a low value, so the new transmit SPI can + // COLLIDE with the still-live one — but it arrives under a FRESH master key (every session + // is a fresh ECDHE handshake; resumption and 0-RTT are disabled and asserted in + // control/transport.go), so its from-zero counter is a fresh nonce space and the install + // is safe. Comparing the SPI alone would spuriously reject that legitimate recovery. A + // different SPI is likewise always accepted. 
There is deliberately no RX monotonicity + // guard — the receive side never emits a nonce, so a reused receive SPI is harmless (its + // per-SA replay filter is rebuilt with the fresh key); rxEpoch is tracked only to + // grace-clamp the previous receive cipher (see installKeys). + if cur := vnet.txCipher.Load(); cur != nil && txSPI == cur.epoch && txKey == cur.key { + return fmt.Errorf("tx SA (SPI %d) is already live; refusing to reset its counter", txSPI) + } + + // Distinct-key guard. Under per-direction SPIs the role bit already separates the + // two directions' nonce spaces, so this is belt-and-suspenders. Real peers always + // derive distinct per-direction keys (control.DeriveSA over role-partitioned SPIs), + // so this never rejects a legitimate install. + if rxKey == txKey { + return errors.New("rx and tx keys must differ: each direction requires its own key") + } + + return h.installKeys(vnet, rxSPI, txSPI, rxKey, txKey, expiresAt) +} + +// UpdateVirtualNetworkKeys is the legacy install seam for STATIC pre-shared keys +// (the --key-file/INI path): it installs a single epoch (SPI) for BOTH simplex +// directions, separated only by the distinct rx/tx keys. +// +// Unlike the control plane, static keys carry NO per-session freshness — the same key +// is reused across reloads and process restarts — so this path keeps the STRICT +// monotonicity guard: the epoch must strictly increase within the process. That stops +// an operator from reinstalling an older-or-equal epoch with a reused key (which would +// reset the counter under an already-used (epoch, key) and reuse a nonce). It does NOT +// (and cannot) prevent a cross-RESTART reuse — a restart resets the in-memory counter +// to zero under the persisted key (the residual APO-644 hazard); the control plane +// (fresh per-session keys) is the fix, which is why static keying is being retired. +// Callers must serialize installs per VNI (the static path does so under a mutex). 
func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { @@ -380,58 +467,48 @@ func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey } vnet := value.(*VirtualNetwork) - // Reserved-SPI guard: epoch 0 is reserved. Rejecting it keeps the data - // plane's accepted SPI space aligned with the control plane, which never - // emits an SPI whose low 31 bits are zero (control/sa.go), and refuses to - // write the all-zero nonce prefix that predated the SPI binding. if epoch == 0 { return errors.New("epoch (SPI) must be non-zero") } - - // Monotonicity guard: within this process the epoch (SPI) must strictly - // increase. Reinstalling under a live or older SPI would reset that SA's TX - // counter and replay window while its key — and thus its nonce space — has - // already been used (a GCM nonce-reuse hazard). This covers in-process - // re-install/rotation only; cross-restart safety is discussed in the doc above. + // Strict monotonicity (static keys have no per-session key freshness to fall back on). if cur := vnet.txCipher.Load(); cur != nil && epoch <= cur.epoch { return fmt.Errorf("epoch must be monotonically increasing: new %d <= current %d", epoch, cur.epoch) } - - // Distinct-key guard: both simplex directions share the epoch, so the SPI in - // the nonce is identical inbound and outbound. The only thing separating the - // two directions' (key, nonce) spaces is then the key itself; equal rx/tx - // keys would collide nonces under one key — catastrophic for AES-GCM. Real - // peers always derive distinct per-direction keys (control.DeriveSA over - // role-partitioned SPIs). 
if rxKey == txKey { return errors.New("rx and tx keys must differ: each direction requires its own key") } - return h.installKeys(vnet, epoch, rxKey, txKey, expiresAt) + return h.installKeys(vnet, epoch, epoch, rxKey, txKey, expiresAt) } -// installKeys builds and installs the RX/TX ciphers for epoch, applies the 30s -// grace period to the previous RX key, and sweeps expired RX keys. It is the -// unguarded mechanism behind UpdateVirtualNetworkKeys; the monotonicity and -// distinct-key guards live in that caller. -func (h *Handler) installKeys(vnet *VirtualNetwork, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { - // Set grace period (30s) on the previous RX key, if it exists - if txCipher := vnet.txCipher.Load(); txCipher != nil { - prevEpoch := txCipher.epoch - if prevEpoch != epoch { - if prevCipherAny, ok := vnet.rxCiphers.Load(prevEpoch); ok { - if prevCipher, ok := prevCipherAny.(*receiveCipher); ok { - graceExpiry := h.clock.Now().Add(keyGracePeriod) - // Clamp the expiry to now+gracePeriod now that we have rotated. - if prevCipher.expiresAt.After(graceExpiry) { - prevCipher.expiresAt = graceExpiry - } +// installKeys builds and installs the RX/TX ciphers for a generation, applies the 30s +// grace period to the previous RX key, and sweeps expired RX keys. It is the unguarded +// mechanism behind UpdateVirtualNetworkSAs; the SPI/key guards live in that caller. +func (h *Handler) installKeys(vnet *VirtualNetwork, rxSPI, txSPI uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { + // Clamp the previous RX key to a 30s grace window. The previous receive SA is + // keyed by the previous receive SPI (vnet.rxEpoch) — NOT by txCipher.epoch, which + // under per-direction SPIs is the previous TRANSMIT SPI (the peer's receive SPI) + // and would point at the wrong slot. The grace lets the survivor keep decrypting + // in-flight frames under the old key across a make-before-break rotation. 
Across a + // reconnect the previous and new receive SPIs differ (the new session's allocator + // reset to a low value while the old SPI was higher), so they occupy distinct + // rxCiphers slots and both stay live through the grace window. The rare exception — + // a fresh allocator climbing back to a still-graced old SPI within 30s — simply + // overwrites that slot with the fresh key; only late frames under the old key at + // that exact SPI are lost, which is acceptable post-reconnect. + if prevRxSPI := vnet.rxEpoch.Load(); prevRxSPI != 0 && prevRxSPI != rxSPI { + if prevCipherAny, ok := vnet.rxCiphers.Load(prevRxSPI); ok { + if prevCipher, ok := prevCipherAny.(*receiveCipher); ok { + graceExpiry := h.clock.Now().Add(keyGracePeriod) + if prevCipher.expiresAt.After(graceExpiry) { + prevCipher.expiresAt = graceExpiry } } } } - // Delete expired keys (to free key material from memory) + // Delete expired keys (to free key material from memory). This sweeps rxCiphers + // only; it does not touch vnet.rxEpoch (which is overwritten on the next install). now := h.clock.Now() vnet.rxCiphers.Range(func(key, value any) bool { cipher := value.(*receiveCipher) @@ -459,20 +536,27 @@ func (h *Handler) installKeys(vnet *VirtualNetwork, epoch uint32, rxKey, txKey [ return fmt.Errorf("failed to create TX GCM: %w", err) } - vnet.rxCiphers.Store(epoch, &receiveCipher{ + // Install RX before TX (make-before-break): store the receive cipher and record the + // currently-installed receive SPI (rxEpoch) first, so we can decrypt the peer's + // new-generation frames before we start emitting our own under the new transmit SPI. + vnet.rxCiphers.Store(rxSPI, &receiveCipher{ AEAD: rxCipher, expiresAt: expiresAt, }) - - // A fresh transmitCipher resets the TX counter to zero for the new epoch. 
This is - // load-bearing for nonce uniqueness: the AES-GCM nonce is epoch‖counter, so each - // epoch MUST begin its own counter at zero — the control plane's durable-epoch - // seeding makes epochs climb monotonically across rekeys/restarts, and the - // per-epoch counter reset is what keeps (key, nonce) pairs from ever repeating. - // A refactor that carried the counter across installs would reintroduce reuse. + vnet.rxEpoch.Store(rxSPI) + + // A fresh transmitCipher resets the TX counter to zero for the new transmit SPI. + // This is load-bearing for nonce uniqueness: the AES-GCM nonce is txSPI‖counter, so + // each transmit SPI MUST begin its own counter at zero. Safety across rekeys, reconnects + // and restarts rests on each generation pairing that from-zero counter with a FRESH + // per-session key (fresh ECDHE; no resumption/0-RTT), so even a reused or regressed SPI + // value derives a different key and the (key, nonce) pair never repeats. A refactor that + // carried the counter across installs would reintroduce reuse. The key is retained so the + // TX anti-reset guard can reject a literal double-install of this same live SA. vnet.txCipher.Store(&transmitCipher{ AEAD: txCipher, - epoch: epoch, + epoch: txSPI, + key: txKey, }) return nil @@ -569,12 +653,13 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } - // Verify the SPI bound into the nonce matches the epoch that selected this - // SA (nonce = SPI‖counter). A conformant sender always sets nonce[:4] to the - // key epoch; a mismatch is a malformed or tampered frame. GCM would also - // reject it at Open (the nonce and the header both feed the tag), but the - // explicit check makes the binding auditable and gives a precise drop reason. - // (APO-644) + // Verify the SPI bound into the nonce matches the receive SPI that selected this + // SA (nonce = SPI‖counter). 
Under per-direction SPIs the inbound key-epoch option + // carries the sender's transmit SPI, which is exactly our receive SPI; a conformant + // sender always sets nonce[:4] to that same value, so a mismatch is a malformed or + // tampered frame. GCM would also reject it at Open (the nonce and the header both + // feed the tag), but the explicit check makes the binding auditable and gives a + // precise drop reason. (APO-644) if spi := binary.BigEndian.Uint32(nonce[:4]); spi != epoch { slog.Debug("Dropping frame: nonce SPI does not match key epoch", slog.Uint64("epoch", uint64(epoch)), slog.Uint64("nonceSPI", uint64(spi))) @@ -876,13 +961,12 @@ func (h *Handler) VirtToPhy(virtFrame, phyFrame []byte) (int, bool) { return 0, false } - // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 - // bytes. Under the shared-epoch model this prefix is identical for both - // directions, so it does not separate them (the distinct rx/tx keys do); its - // value here is letting RX reject a tampered/mismatched SPI and forward-compat - // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. - // TODO(phase4): carry the control plane's distinct tx/rx SPIs, which a single - // shared epoch cannot represent. Both halves must be written before Seal. + // nonce = txSPI‖counter: bind this direction's transmit SPI (the peer's receive + // SPI) into the high 4 bytes. Under per-direction SPIs this prefix differs from the + // receive direction's, so the SPI itself separates the two directions' nonce spaces + // (on top of the distinct rx/tx keys); the receiver reconstructs the same SPI from + // its key-epoch option and rejects any frame whose nonce[:4] does not match. The low + // 8 bytes are the per-SA monotonic counter. Both halves must be written before Seal. 
nonce := hdr.Options[1].Value[:12] binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) @@ -1003,13 +1087,12 @@ func (h *Handler) ToPhy(phyFrame []byte) int { // Fill options: epoch + nonce/counter binary.BigEndian.PutUint32(hdr.Options[0].Value[:4], txCipher.epoch) - // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 - // bytes. Under the shared-epoch model this prefix is identical for both - // directions, so it does not separate them (the distinct rx/tx keys do); its - // value here is letting RX reject a tampered/mismatched SPI and forward-compat - // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. - // TODO(phase4): carry the control plane's distinct tx/rx SPIs, which a single - // shared epoch cannot represent. Both halves must be written before Seal. + // nonce = txSPI‖counter: bind this direction's transmit SPI (the peer's receive + // SPI) into the high 4 bytes. Under per-direction SPIs this prefix differs from the + // receive direction's, so the SPI itself separates the two directions' nonce spaces + // (on top of the distinct rx/tx keys); the receiver reconstructs the same SPI from + // its key-epoch option and rejects any frame whose nonce[:4] does not match. The low + // 8 bytes are the per-SA monotonic counter. Both halves must be written before Seal. nonce := hdr.Options[1].Value[:12] binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) diff --git a/handler_test.go b/handler_test.go index ed0290f..9e88f93 100644 --- a/handler_test.go +++ b/handler_test.go @@ -350,6 +350,70 @@ func TestUpdateVirtualNetworkKeysGuards(t *testing.T) { "lower epoch must be rejected (monotonicity)") } +// TestUpdateVirtualNetworkSAsGuards exercises the control-plane install seam directly. 
+// Because every control-plane generation carries a FRESH per-session key, this path does +// NOT enforce SPI monotonicity: a reconnect resets the allocator to a low SPI and that +// reset SPI must be re-accepted under its fresh key. The only fail-closed guards are +// non-zero SPIs, distinct rx/tx keys, and a TX anti-reset check that refuses to re-install +// the CURRENTLY-live transmit SA — same SPI AND same key (the one in-process action that +// would reset a live counter under an unchanged key). A colliding SPI under a FRESH key, +// which is exactly the transient-reconnect case, is accepted. +func TestUpdateVirtualNetworkSAsGuards(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x9a9a + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var k1, k2, k3, k4 [16]byte + copy(k1[:], []byte("aaaaaaaaaaaaaaaa")) + copy(k2[:], []byte("bbbbbbbbbbbbbbbb")) + copy(k3[:], []byte("cccccccccccccccc")) + copy(k4[:], []byte("dddddddddddddddd")) + exp := time.Now().Add(time.Hour) + + // Either direction's SPI being zero (reserved) is rejected. + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 0, 20, k1, k2, exp), "rx SPI 0 must be rejected") + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 0, k1, k2, exp), "tx SPI 0 must be rejected") + + // Equal rx/tx keys are rejected even on the first install. + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k1, exp), "equal rx/tx keys must be rejected") + + // First install with distinct per-direction SPIs and distinct keys succeeds. + // Live transmit SA is now (SPI 20, key k2). 
+ require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k2, exp)) + + // Re-installing the IDENTICAL live transmit SA (same SPI AND same key) is rejected — + // that is the one action that would reset a live counter under its own key. The rx side + // is irrelevant to this guard. + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k2, exp), + "re-installing the identical live tx SA must be rejected") + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 99, 20, k3, k2, exp), + "the live tx SA is keyed by (SPI, key); a different rx SPI does not make it safe") + + // The SAME transmit SPI under a FRESH key is accepted — this is the transient-reconnect + // case (the allocator reset to a colliding SPI value, but the master key is fresh, so the + // from-zero counter is a fresh nonce space). Live tx SA is now (SPI 20, key k4). + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k3, k4, exp), + "a colliding tx SPI under a fresh key is accepted (reconnect recovery)") + + // A reused (non-increasing) RX SPI is accepted — the receive side never emits a nonce, + // so a repeated receive SPI under a fresh key is harmless. Here rx stays at 10 while tx + // advances to 21; live tx SA is now (21, k2). + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 21, k1, k2, exp), + "a reused rx SPI is accepted (no rx monotonicity guard)") + + // A LOWER transmit SPI is accepted — it can only arrive from a fresh session whose key + // is fresh, so the reset counter is a fresh nonce space. This models a peer reconnect + // that reset its allocator: rx and tx both drop back to low values. Live tx SA is now (5, k4). 
+ require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 3, 5, k3, k4, exp), + "a lower tx SPI under a fresh key is accepted (reconnect recovery)") +} + // TestRXRejectsSPINonceMismatch proves the RX side rejects a frame whose nonce // SPI (nonce[:4]) does not match the key epoch it selected, on BOTH the // cross-buffer and in-place decap paths, and that TX binds the SPI into the From 84c94c12c9bf8abd97533f073f1fb17c27d92e0b Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 2 Jun 2026 01:19:35 -0700 Subject: [PATCH 14/20] [control] negotiate per-direction SPIs; drop durable epoch high-water (APO-648) The Tunnel now hands the handler the two genuine per-direction SPIs from NegotiateSAs (rxSPI = our receive SPI, txSPI = the peer's) via the reshaped SAInstaller, instead of collapsing them onto one SharedEpoch. With fresh master keys per session (see the fresh-ECDHE enforcement) a reset/regressed SPI is always paired with a fresh key, so no persisted epoch state is needed for recovery: the handler accepts the reset SPI and a transient reconnect or a one-sided restart of either peer recovers seamlessly with zero on-disk state. Removes the whole receive-SPI high-water layer that the old strict monotonicity guard forced: SharedEpoch/roleBit, the EpochStore/persister and its fatal tripwire, SeedFloor/SeedRxFloor, and the in-memory/durable high-water tracking (control/epochstate.go and its test). Reconnect recovery is now proven by TestTunnelReconnects / TestTunnelReconnectGuardNeverRejects, whose guardInstaller mirrors the handler's same-SPI-AND-same-key anti-reset guard. 
--- control/cp.go | 301 ++++++------------------- control/cp_test.go | 451 +++++++++---------------------------- control/epochstate.go | 451 ------------------------------------- control/epochstate_test.go | 391 -------------------------------- control/sa.go | 31 +-- control/transport.go | 7 - 6 files changed, 170 insertions(+), 1462 deletions(-) delete mode 100644 control/epochstate.go delete mode 100644 control/epochstate_test.go diff --git a/control/cp.go b/control/cp.go index af35048..9645de2 100644 --- a/control/cp.go +++ b/control/cp.go @@ -16,22 +16,23 @@ import ( // (control/transport.go) and feeds the negotiated SAs into the data plane via an // SAInstaller, so the CLI stays thin and the wiring is unit-testable off Linux. // -// Data-plane epoch model (this build): the handler carries a SINGLE 32-bit epoch -// per security association for both simplex directions (handler.go), while the -// control plane allocates a DISTINCT, role-partitioned SPI per direction. We bridge -// the two with SharedEpoch: both peers install the same scalar epoch (the -// initiator-allocated SPI) but derive DISTINCT per-direction keys from the distinct -// SPIs, so AES-GCM nonce uniqueness rests on the keys differing (the handler's -// rxKey != txKey guard), exactly as documented at handler.go. Carrying the genuine -// per-direction SPI on the wire (true per-direction nonce spaces) is the additive -// UpdateVirtualNetworkSAs follow-up (Option C); it is intentionally out of scope here. +// Data-plane SA model (PSP, per-direction): the handler installs two simplex SAs per +// generation (handler.go: UpdateVirtualNetworkSAs), each selected by its own +// role-partitioned SPI. NegotiateSAs gives each peer a DirectionalSAs{Rx, Tx} where Rx +// is the SPI this peer allocated (our receive SPI) and Tx is the peer's receive SPI +// (what we transmit to). Both peers derive every key locally from the shared master +// keys, so nothing but the SPIs crosses the wire. 
Each direction therefore has its own +// nonce space, separated by the SPI itself (the role bit) on top of the distinct +// per-direction key. // -// Because the per-session SPI allocator resets on every (re)connect, the shared epoch -// would regress to 1 and the survivor's strictly-increasing epoch guard would reject -// it. The Tunnel carries an epoch high-water forward and seeds each new session's -// allocator above it (in memory always; durably via an EpochStore on the initiator) -// so reconnects and one-sided restarts keep the epoch monotonic — see -// control/epochstate.go. +// No persisted epoch state is needed for recovery. The per-session SPI allocator resets +// on every (re)connect, so a fresh session's SPIs start low again — but that is SAFE +// because every reconnect is a fresh ECDHE handshake (no 0-RTT, no resumption; enforced +// in transport.go) yielding fresh master keys, so a reused SPI value derives a different +// key and the data-plane nonce never repeats. The handler's install guard therefore +// accepts the reset SPI (it rejects only a re-install of the currently-live transmit SA, +// i.e. the same SPI under the same key), which makes a transient reconnect and a one-sided +// restart of either peer recover seamlessly with zero on-disk state. // Mode is the keying mode selected from the CLI flags. type Mode int @@ -102,47 +103,14 @@ func CanonicalInitiator(localPub, peerPub *ecdsa.PublicKey) (bool, error) { return bytes.Compare(l, p) < 0, nil } -// roleBit is the SPI bit that encodes the allocating role (see sa.go): the -// initiator allocates SPIs with this bit clear, the responder with it set. -const roleBit = uint32(1) << spiRoleShift - -// SharedEpoch derives the single data-plane epoch both peers install for a -// negotiated SA generation.
Of the two role-partitioned SPIs the peer holds -// ({Tx, Rx}), exactly one was allocated by the initiator (role bit clear); both -// peers select that one and compute the identical value, because the initiator's -// Rx SPI is the responder's Tx SPI. The epoch is what lands in the Geneve key-epoch -// option and nonce[:4]; the distinct per-direction KEYS still come from the distinct -// SPIs, so the two directions never share a (key, nonce) pair. -// -// This selection is well-defined only while the master-key index (SPI bit31) is 0 -// for both directions, which holds until master-key rotation is introduced; rotation -// is a known incompatibility for the shared-epoch bridge and is the trigger for the -// per-direction-SPI follow-up. -func SharedEpoch(sas *DirectionalSAs) (uint32, error) { - if sas == nil || sas.Tx == nil || sas.Rx == nil { - return 0, errors.New("control: nil SAs") - } - if MasterKeyIndex(sas.Tx.SPI) != 0 || MasterKeyIndex(sas.Rx.SPI) != 0 { - return 0, errors.New("control: SharedEpoch requires master-key index 0 (rotation not yet supported)") - } - txInitiator := sas.Tx.SPI&roleBit == 0 - rxInitiator := sas.Rx.SPI&roleBit == 0 - if txInitiator == rxInitiator { - return 0, fmt.Errorf("control: SAs are not role-partitioned (tx=%#08x rx=%#08x)", sas.Tx.SPI, sas.Rx.SPI) - } - if txInitiator { - return sas.Tx.SPI, nil - } - return sas.Rx.SPI, nil -} - -// SAInstaller installs a negotiated SA generation into the data plane. epoch is the -// shared data-plane epoch (see SharedEpoch); rxKey/txKey are the 16-byte AES-128 keys -// for the receive/transmit directions. The installer owns the key lifetime/expiry and -// is expected to enforce the handler's fail-closed guards (non-zero, strictly -// increasing epoch, rxKey != txKey). A returned error is treated as a rejected -// rotation, not a session failure. -type SAInstaller func(epoch uint32, rxKey, txKey [16]byte) error +// SAInstaller installs a negotiated SA generation into the data plane. 
rxSPI is our +// receive SPI (we decrypt inbound frames under it); txSPI is the peer's receive SPI (we +// encrypt outbound frames to it); rxKey/txKey are the 16-byte AES-128 keys for those +// directions. The installer owns the key lifetime/expiry and is expected to enforce the +// handler's fail-closed guards (non-zero SPIs, rxKey != txKey, and no re-install of the +// currently-live transmit SA). A returned error is treated as a rejected rotation, not a +// session failure. +type SAInstaller func(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error // Default lifecycle timings; overridable on Tunnel for tests. const ( @@ -164,22 +132,9 @@ type Tunnel struct { install SAInstaller initiator bool - // Durable/in-memory epoch high-water (initiator only; see control/epochstate.go). - // epochSeed seeds each new session's RX allocator so the shared epoch keeps - // increasing across a reconnect/restart instead of resetting to 1; installedHigh - // is the latest installed epoch (the persist target / stall reference); store and - // persist add durability for an initiator restart. - store EpochStore - requireState bool - epochSeed uint32 - installedHigh uint32 - persist *epochPersister - // tunables (defaults set by NewTunnel; tests may override) - perExchangeTimeout time.Duration - reconnectBackoff time.Duration - maxStoreFailures int64 - persistStallTimeout time.Duration + perExchangeTimeout time.Duration + reconnectBackoff time.Duration ln *Listener // responder only; persists across reconnects sess *Session @@ -197,19 +152,6 @@ type TunnelConfig struct { PeerAddr net.Addr // RekeyInterval is how often the initiator negotiates a fresh SA generation. RekeyInterval time.Duration - // EpochStore, when non-nil, persists the data-plane epoch high-water so a restart - // of the elected INITIATOR recovers seamlessly. It is consulted only when this - // node is the initiator — the responder's high-water is not load-bearing (the - // shared epoch is always the initiator-allocated SPI).
A responder configured with - // a store leaves it inert. nil disables durable persistence (a transient reconnect - // and a responder restart still recover via the in-memory high-water; only a - // one-sided initiator restart needs the store). - EpochStore EpochStore - // RequireState makes durable epoch state fail closed instead of degrading: a - // corrupt/unreadable state file fails Bringup, and persistently failing/stalled - // stores fail Run. It requires EpochStore. It is an integrity tripwire against - // accidental corruption, NOT an anti-rollback/anti-deletion control. - RequireState bool } // NewTunnel validates the config, elects the canonical role, and returns a Tunnel @@ -227,27 +169,20 @@ func NewTunnel(cfg TunnelConfig, install SAInstaller) (*Tunnel, error) { if cfg.RekeyInterval <= 0 { return nil, errors.New("control: rekey interval must be positive") } - if cfg.RequireState && cfg.EpochStore == nil { - return nil, errors.New("control: RequireState requires an EpochStore") - } initiator, err := CanonicalInitiator(cfg.Local.PublicKey(), cfg.PeerPub) if err != nil { return nil, err } return &Tunnel{ - local: cfg.Local, - peerPub: cfg.PeerPub, - conn: cfg.Conn, - peerAddr: cfg.PeerAddr, - rekeyIvl: cfg.RekeyInterval, - install: install, - initiator: initiator, - store: cfg.EpochStore, - requireState: cfg.RequireState, - perExchangeTimeout: defaultPerExchangeTimeout, - reconnectBackoff: defaultReconnectBackoff, - maxStoreFailures: defaultMaxStoreFailures, - persistStallTimeout: defaultPersistStallTimeout, + local: cfg.Local, + peerPub: cfg.PeerPub, + conn: cfg.Conn, + peerAddr: cfg.PeerAddr, + rekeyIvl: cfg.RekeyInterval, + install: install, + initiator: initiator, + perExchangeTimeout: defaultPerExchangeTimeout, + reconnectBackoff: defaultReconnectBackoff, }, nil } @@ -259,17 +194,6 @@ func (t *Tunnel) Initiator() bool { return t.initiator } // the handshake, negotiation, or install fails, so the caller must not start the data // plane until Bringup 
succeeds. func (t *Tunnel) Bringup(ctx context.Context) (err error) { - if err = t.loadEpochState(); err != nil { - return err - } - // loadEpochState may have started the persister goroutine; reap it if Bringup - // fails so a caller that drops the Tunnel on a Bringup error does not leak it. - defer func() { - if err != nil && t.persist != nil { - t.persist.stop() - t.persist = nil - } - }() if err = t.establish(ctx); err != nil { return fmt.Errorf("control: establish session: %w", err) } @@ -282,49 +206,7 @@ func (t *Tunnel) Bringup(ctx context.Context) (err error) { role = "initiator" } slog.Info("control plane established", slog.String("role", role), - slog.String("peer", t.peerAddr.String()), - slog.Bool("durableEpochState", t.persist != nil)) - return nil -} - -// loadEpochState reads the durable epoch high-water and starts the persister. It is a -// no-op except on the initiator with a configured store — the responder's high-water -// is not load-bearing (see control/epochstate.go), so a responder configured with a -// store leaves it inert (and --require-state does not gate the responder). It runs at -// the start of Bringup so NewTunnel stays I/O-free. -func (t *Tunnel) loadEpochState() error { - if t.store == nil { - return nil - } - if !t.initiator { - slog.Info("control: durable epoch state inactive on this node (responder role; the shared epoch is initiator-driven). Ensure the elected initiator also has a state file") - return nil - } - hw, ok, err := t.store.Load() - if err != nil { - if t.requireState { - return fmt.Errorf("control: durable epoch state is unreadable and --require-state is set: %w", err) - } - slog.Error("control: epoch state unreadable; starting fresh — a one-sided initiator restart will not recover until state is re-persisted. 
Use --require-state to fail closed instead", - slog.Any("error", err)) - hw, ok = 0, false - } - start := uint32(0) - if ok { - // epochSeed/installedHigh track the EXACT high-water; the margin is applied at - // seed time in establish (see seedWithMargin), so it covers both the durable - // gap here and the torn-reconnect lead later. - t.epochSeed = hw - t.installedHigh = hw - start = hw - slog.Info("control: loaded durable epoch high-water", - slog.Uint64("highWater", uint64(hw)), slog.Uint64("seed", uint64(seedWithMargin(hw)))) - if seedWithMargin(hw) >= spiCounterMask-1 { - slog.Warn("control: epoch counter space is nearly exhausted; master-key rotation will be required (the control plane will fail closed when it runs out)", - slog.Uint64("highWater", uint64(hw)), slog.Uint64("ceiling", uint64(spiCounterMask))) - } - } - t.persist = newEpochPersister(t.store, start, time.Now().UnixNano()) + slog.String("peer", t.peerAddr.String())) return nil } @@ -349,18 +231,9 @@ func (t *Tunnel) runInitiator(ctx context.Context) error { ticker := time.NewTicker(t.rekeyIvl) defer ticker.Stop() for { - // A clean shutdown takes priority over the fail-closed tripwire: returning a - // fatal error here on the way out would mis-report a deliberate stop as a - // failure (non-zero exit). Mirror the ctx.Err() guards on the other terminal - // arms below. if ctx.Err() != nil { return nil } - // Fail closed (only under --require-state) if durable persistence has fallen - // far enough behind that a restart could no longer recover. - if err := t.epochPersistFatal(time.Now()); err != nil { - return err - } sessLost := t.sessionDone() select { case <-ctx.Done(): @@ -385,10 +258,8 @@ func (t *Tunnel) runInitiator(ctx context.Context) error { // is the only remedy); reconnecting would just hot-loop. Fail closed. 
return err } - // Epoch regression after a reconnect is now prevented by seeding the - // allocator from the epoch high-water (see installSAs / loadEpochState); - // installSAs still swallows a stray rejection, so any error here is a - // genuine session/transport failure. + // installSAs swallows an install rejection, so any error here is a genuine + // session/transport failure → reconnect (which derives fresh keys). slog.Warn("control: rekey failed, reconnecting", slog.Any("error", err)) if err := t.reestablish(ctx); err != nil { return err @@ -419,7 +290,7 @@ func (t *Tunnel) runResponder(ctx context.Context) error { } // negotiateAndInstall runs one SA exchange on the live session and installs the -// result. installSAs swallows a rotation rejection (returns nil) so it does not look +// result. installSAs swallows an install rejection (returns nil) so it does not look // like a transport failure; a non-nil error here means the wire exchange failed. func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { sas, err := t.sess.NegotiateSAs(ctx, PSPv0) @@ -429,14 +300,13 @@ func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { return t.installSAs(sas) } -// installSAs validates the negotiated SAs fail-closed (PSPv0, 16-byte keys), -// computes the shared epoch, and hands them to the installer. Seeding the allocator -// from the epoch high-water (see recordInstalled / loadEpochState) keeps the epoch -// strictly increasing across reconnects/restarts, so the monotonicity guard should -// accept every generation in normal operation. A rejection is still swallowed as -// defense-in-depth (e.g. with no durable state after a one-sided initiator restart, -// or a responder whose floor is not seeded): the previously installed keys keep -// forwarding and the data plane fails closed on their own expiry. +// installSAs validates the negotiated SAs fail-closed (PSPv0, 16-byte keys) and hands +// the two per-direction SPIs/keys to the installer. 
Every session derives fresh keys +// (fresh ECDHE; see transport.go), so the handler accepts the install even when the +// per-session allocator reset the SPIs to a low value after a reconnect. An install +// rejection is swallowed as defense-in-depth (e.g. the handler refusing to reset its +// currently-live transmit SPI): the previously installed keys keep forwarding and the +// data plane fails closed on their own expiry. func (t *Tunnel) installSAs(sas *DirectionalSAs) error { if sas.Tx.Version != PSPv0 || sas.Rx.Version != PSPv0 { return fmt.Errorf("control: only PSPv0/AES-128 is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) @@ -444,45 +314,22 @@ func (t *Tunnel) installSAs(sas *DirectionalSAs) error { if len(sas.Rx.Key) != 16 || len(sas.Tx.Key) != 16 { return fmt.Errorf("control: expected 16-byte SA keys (rx=%d tx=%d)", len(sas.Rx.Key), len(sas.Tx.Key)) } - epoch, err := SharedEpoch(sas) - if err != nil { - return err - } var rxKey, txKey [16]byte copy(rxKey[:], sas.Rx.Key) copy(txKey[:], sas.Tx.Key) - if err := t.install(epoch, rxKey, txKey); err != nil { - slog.Warn("control: SA install rejected; keeping current keys until they expire (seed the epoch floor / configure --state-file for seamless recovery)", - slog.Uint64("epoch", uint64(epoch)), slog.Any("error", err)) + if err := t.install(sas.Rx.SPI, sas.Tx.SPI, rxKey, txKey); err != nil { + slog.Warn("control: SA install rejected; keeping current keys until they expire", + slog.Uint64("rxSPI", uint64(sas.Rx.SPI)), slog.Uint64("txSPI", uint64(sas.Tx.SPI)), slog.Any("error", err)) return nil } - t.recordInstalled(epoch) - slog.Debug("control: installed SA generation", slog.Uint64("epoch", uint64(epoch))) + slog.Debug("control: installed SA generation", + slog.Uint64("rxSPI", uint64(sas.Rx.SPI)), slog.Uint64("txSPI", uint64(sas.Tx.SPI))) return nil } -// recordInstalled advances the initiator's in-memory epoch high-water after a -// successful install and asks the persister to make 
it durable. It is initiator-only: -// only the initiator's allocator feeds the shared epoch, so only its high-water needs -// to be carried forward. The enqueue never blocks (the persister fsyncs off this -// goroutine). -func (t *Tunnel) recordInstalled(epoch uint32) { - if !t.initiator { - return - } - if epoch > t.epochSeed { - t.epochSeed = epoch - } - if epoch > t.installedHigh { - t.installedHigh = epoch - } - if t.persist != nil { - t.persist.request(t.installedHigh) - } -} - // establish opens a fresh session: the initiator dials, the responder accepts on a -// listener it keeps across reconnects. +// listener it keeps across reconnects. The new session's SPI allocator starts fresh +// (low) — safe because the session's keys are also fresh (see installSAs / transport.go). func (t *Tunnel) establish(ctx context.Context) error { if t.initiator { sess, err := Dial(ctx, t.conn, t.peerAddr, t.local, t.peerPub) @@ -490,17 +337,6 @@ func (t *Tunnel) establish(ctx context.Context) error { return err } t.sess = sess - // Seed THIS session's allocator above the epoch high-water (plus a margin) so - // the shared epoch keeps climbing across the reconnect rather than resetting to - // 1, AND stays strictly above what the survivor retained even if the last - // exchange tore after the responder committed but before we recorded it (see - // seedWithMargin). A fresh start (no high-water) seeds 0 → epoch 1, unchanged. - // Only the initiator is seeded: SharedEpoch always selects the - // initiator-allocated SPI, so the responder's allocator is cosmetic to the wire - // epoch. - if t.epochSeed > 0 { - t.sess.SeedRxFloor(seedWithMargin(t.epochSeed)) - } return nil } if t.ln == nil { @@ -537,16 +373,20 @@ func (t *Tunnel) reestablish(ctx context.Context) error { continue } // Re-key immediately on the new session so traffic resumes without waiting a - // full interval. 
The new session's allocator is seeded from the epoch - // high-water (establish), so the epoch keeps increasing and the install is - // accepted; a transport error drops back to another reconnect attempt. + // full interval. The new session's SPI allocator starts fresh (low) again, but the + // install is still accepted because the new session derives FRESH master keys + // (fresh ECDHE; see transport.go), so a reset/regressed SPI is paired with a fresh + // key and the data-plane nonce never repeats. A transport error drops back to + // another reconnect attempt. exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) err := t.negotiateAndInstall(exCtx) cancel() if err != nil && ctx.Err() == nil { if isFatalCP(err) { - // Exhaustion is terminal — re-seeding the same exhausted floor would - // hot-loop. Surface it so Run returns and fails closed. + // SPI counter-space exhaustion (ErrSPIExhausted) is terminal: it requires + // master-key rotation, which this build does not support, and a fresh + // allocator would just exhaust again. Surface it so Run returns and fails + // closed rather than hot-looping reconnects. return err } slog.Warn("control: post-reconnect negotiation failed", slog.Any("error", err)) @@ -573,28 +413,15 @@ func (t *Tunnel) closeSession() { } } -// epochPersistFatal reports a fatal error if durable persistence has degraded past -// the point of guaranteed recovery (only under --require-state; otherwise nil). -func (t *Tunnel) epochPersistFatal(now time.Time) error { - if t.persist == nil { - return nil - } - return t.persist.fatal(t.requireState, t.installedHigh, now, t.maxStoreFailures, t.persistStallTimeout) -} - // isFatalCP reports whether err is a terminal, non-retryable control-plane error that // must stop Run rather than drive a reconnect. 
func isFatalCP(err error) bool { - return errors.Is(err, ErrSPIExhausted) || errors.Is(err, errEpochPersistStalled) + return errors.Is(err, ErrSPIExhausted) } -// Close releases the session, stops the persister, and (responder) the listener. It is -// idempotent. +// Close releases the session and (responder) the listener. It is idempotent. func (t *Tunnel) Close() error { t.closeSession() - if t.persist != nil { - t.persist.stop() - } if t.ln != nil { err := t.ln.Close() t.ln = nil diff --git a/control/cp_test.go b/control/cp_test.go index b432be1..3e9e8f4 100644 --- a/control/cp_test.go +++ b/control/cp_test.go @@ -68,36 +68,6 @@ func TestCanonicalInitiator(t *testing.T) { require.Error(t, err) } -func TestSharedEpoch(t *testing.T) { - iSPI, err := MakeSPI(0, Initiator, 7) // role bit clear - require.NoError(t, err) - rSPI, err := MakeSPI(0, Responder, 3) // role bit set - require.NoError(t, err) - - // Initiator's view: Tx == peer (responder) Rx == rSPI; Rx == own == iSPI. - eInit, err := SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: rSPI}, Rx: &SA{SPI: iSPI}}) - require.NoError(t, err) - require.Equal(t, iSPI, eInit) - - // Responder's view: Tx == peer (initiator) Rx == iSPI; Rx == own == rSPI. - eResp, err := SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: iSPI}, Rx: &SA{SPI: rSPI}}) - require.NoError(t, err) - require.Equal(t, iSPI, eResp) - - // Both peers therefore install the identical epoch. - require.Equal(t, eInit, eResp) - - // Not role-partitioned (both initiator) → error. - _, err = SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: iSPI}, Rx: &SA{SPI: iSPI}}) - require.Error(t, err) - - // Master-key index != 0 is unsupported by the shared-epoch bridge. - hiSPI, err := MakeSPI(1, Initiator, 1) - require.NoError(t, err) - _, err = SharedEpoch(&DirectionalSAs{Tx: &SA{SPI: rSPI}, Rx: &SA{SPI: hiSPI}}) - require.Error(t, err) -} - // validV0SAs returns a role-partitioned, PSPv0 DirectionalSAs with distinct // 16-byte keys, as NegotiateSAs would produce. 
func validV0SAs() *DirectionalSAs { @@ -116,7 +86,7 @@ func validV0SAs() *DirectionalSAs { } func TestInstallSAsRejectsNonV0(t *testing.T) { - tn := &Tunnel{install: func(uint32, [16]byte, [16]byte) error { + tn := &Tunnel{install: func(uint32, uint32, [16]byte, [16]byte) error { t.Fatal("installer must not be called for a non-PSPv0 SA") return nil }} @@ -128,10 +98,10 @@ func TestInstallSAsRejectsNonV0(t *testing.T) { func TestInstallSAsSwallowsRotationRejection(t *testing.T) { called := false - tn := &Tunnel{install: func(uint32, [16]byte, [16]byte) error { + tn := &Tunnel{install: func(uint32, uint32, [16]byte, [16]byte) error { called = true - // Mimic the handler's monotonicity guard rejecting a regressed epoch. - return errors.New("epoch must be monotonically increasing") + // Mimic the handler's monotonicity guard rejecting a regressed per-direction SPI. + return errors.New("rx SPI must be monotonically increasing") }} // A rejected rotation is logged and swallowed (the data plane keeps its current // keys and fails closed on their own expiry); it must not look like a transport @@ -142,13 +112,9 @@ func TestInstallSAsSwallowsRotationRejection(t *testing.T) { // twoTunnels wires an initiator and a responder Tunnel over loopback UDP, assigning // the canonical roles correctly, with tight timings for tests. +// twoTunnels wires an initiator and a responder Tunnel over loopback UDP with the +// canonical roles assigned and tight test timings. func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration) (initT, respT *Tunnel, cleanup func()) { - return twoTunnelsWithStore(t, instInit, instResp, rekey, nil) -} - -// twoTunnelsWithStore is twoTunnels with an optional durable EpochStore on the -// INITIATOR (the only role for which durable state is load-bearing). 
-func twoTunnelsWithStore(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration, initStore EpochStore) (initT, respT *Tunnel, cleanup func()) { t.Helper() idA, err := GenerateIdentity() require.NoError(t, err) @@ -169,7 +135,7 @@ func twoTunnelsWithStore(t *testing.T, instInit, instResp SAInstaller, rekey tim initT, err = NewTunnel(TunnelConfig{ Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, - PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey, EpochStore: initStore, + PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey, }, instInit) require.NoError(t, err) require.True(t, initT.Initiator()) @@ -195,27 +161,34 @@ func twoTunnelsWithStore(t *testing.T, instInit, instResp SAInstaller, rekey tim return initT, respT, cleanup } -// guardInstaller is an SAInstaller that enforces the handler's strictly-increasing -// epoch guard (handler.go), so tests can detect a regressed/rejected epoch rather -// than the no-op epochRecorder which accepts everything. +// guardInstaller is an SAInstaller that mirrors the handler's relaxed TX anti-reset +// guard (handler.go: UpdateVirtualNetworkSAs) — it rejects only a re-install of the +// currently-live transmit SA, i.e. the same transmit SPI AND the same key — so tests can +// detect a spurious rejection rather than the no-op epochRecorder which accepts everything. +// The key comparison matters: across a reconnect the allocator resets and the transmit SPI +// can collide with the still-live one, but under a fresh key, which is safe and must be +// accepted. Because every control-plane generation carries a fresh key, the guard should +// never reject in normal operation, even across a reconnect that resets SPIs to a low value. 
type guardInstaller struct { mu sync.Mutex - max uint32 - installed []uint32 + curTx uint32 // currently-live transmit SPI (0 = none) + curTxKey [16]byte // currently-live transmit key + installed []uint32 // accepted receive SPIs, in order rejects int } func newGuardInstaller() *guardInstaller { return &guardInstaller{} } -func (g *guardInstaller) install(epoch uint32, _, _ [16]byte) error { +func (g *guardInstaller) install(rxSPI, txSPI uint32, _, txKey [16]byte) error { g.mu.Lock() defer g.mu.Unlock() - if epoch <= g.max { + if g.curTx != 0 && txSPI == g.curTx && txKey == g.curTxKey { g.rejects++ - return errors.New("epoch must be monotonically increasing") + return errors.New("tx SA is already live") } - g.max = epoch - g.installed = append(g.installed, epoch) + g.curTx = txSPI + g.curTxKey = txKey + g.installed = append(g.installed, rxSPI) return nil } @@ -225,32 +198,34 @@ func (g *guardInstaller) snapshot() (installed []uint32, rejects int) { return append([]uint32(nil), g.installed...), g.rejects } -// epochRecorder is a thread-safe SAInstaller that records the epochs it installs. +// installRec captures one per-direction SA generation for assertions: the receive SPI +// (this peer's own data-plane epoch) and transmit SPI (the peer's receive SPI), plus +// both keys. +type installRec struct { + rxSPI, txSPI uint32 + rxKey, txKey [16]byte +} + +// epochRecorder is a thread-safe SAInstaller that records the per-direction generations +// it installs. 
type epochRecorder struct { - mu sync.Mutex - epochs []uint32 - keys map[uint32][2][16]byte // epoch -> {rx, tx} + mu sync.Mutex + recs []installRec } -func newEpochRecorder() *epochRecorder { return &epochRecorder{keys: map[uint32][2][16]byte{}} } +func newEpochRecorder() *epochRecorder { return &epochRecorder{} } -func (r *epochRecorder) install(epoch uint32, rxKey, txKey [16]byte) error { +func (r *epochRecorder) install(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error { r.mu.Lock() defer r.mu.Unlock() - r.epochs = append(r.epochs, epoch) - r.keys[epoch] = [2][16]byte{rxKey, txKey} + r.recs = append(r.recs, installRec{rxSPI: rxSPI, txSPI: txSPI, rxKey: rxKey, txKey: txKey}) return nil } -func (r *epochRecorder) snapshot() ([]uint32, map[uint32][2][16]byte) { +func (r *epochRecorder) snapshot() []installRec { r.mu.Lock() defer r.mu.Unlock() - out := append([]uint32(nil), r.epochs...) - cp := make(map[uint32][2][16]byte, len(r.keys)) - for k, v := range r.keys { - cp[k] = v - } - return out, cp + return append([]installRec(nil), r.recs...) } func TestTunnelBringupAndRekey(t *testing.T) { @@ -268,17 +243,21 @@ func TestTunnelBringupAndRekey(t *testing.T) { require.NoError(t, <-brCh) // Both installed exactly one (matching) generation in bring-up. - ie, ik := initRec.snapshot() - re, rk := respRec.snapshot() - require.Len(t, ie, 1) - require.Len(t, re, 1) - require.Equal(t, ie[0], re[0], "peers must install the same shared epoch") - require.NotZero(t, ie[0]) - // Cross-derivation: initiator TX == responder RX and vice versa. - require.Equal(t, ik[ie[0]][1], rk[re[0]][0], "initiator tx key != responder rx key") - require.Equal(t, ik[ie[0]][0], rk[re[0]][1], "initiator rx key != responder tx key") + ir := initRec.snapshot() + rr := respRec.snapshot() + require.Len(t, ir, 1) + require.Len(t, rr, 1) + // Per-direction SPIs: each peer installs its OWN receive SPI; they are distinct + // (role-partitioned), and each peer's transmit SPI is the other peer's receive SPI. 
+ require.NotZero(t, ir[0].rxSPI) + require.NotEqual(t, ir[0].rxSPI, rr[0].rxSPI, "peers must allocate distinct receive SPIs") + require.Equal(t, ir[0].rxSPI, rr[0].txSPI, "initiator rx SPI must equal responder tx SPI") + require.Equal(t, ir[0].txSPI, rr[0].rxSPI, "initiator tx SPI must equal responder rx SPI") + // Cross-derivation: initiator TX key == responder RX key and vice versa. + require.Equal(t, ir[0].txKey, rr[0].rxKey, "initiator tx key != responder rx key") + require.Equal(t, ir[0].rxKey, rr[0].txKey, "initiator rx key != responder tx key") // Within a peer, the two directions use distinct keys. - require.NotEqual(t, ik[ie[0]][0], ik[ie[0]][1]) + require.NotEqual(t, ir[0].rxKey, ir[0].txKey) // Run both peers and let the initiator drive a few rekeys. runCh := make(chan error, 2) @@ -286,27 +265,27 @@ func TestTunnelBringupAndRekey(t *testing.T) { go func() { runCh <- respT.Run(ctx) }() require.Eventually(t, func() bool { - e, _ := initRec.snapshot() - return len(e) >= 3 + return len(initRec.snapshot()) >= 3 }, 10*time.Second, 20*time.Millisecond, "initiator should rekey several times") cancel() require.NoError(t, <-runCh) require.NoError(t, <-runCh) - // Epochs strictly increase per peer, and the two peers agree generation-by-generation. - ie, _ = initRec.snapshot() - re, _ = respRec.snapshot() - for i := 1; i < len(ie); i++ { - require.Greater(t, ie[i], ie[i-1], "initiator epochs must strictly increase") + // Receive SPIs strictly increase per peer, and the two peers agree generation-by- + // generation (the initiator's receive SPI for gen i is the responder's transmit SPI). 
+ ir = initRec.snapshot() + rr = respRec.snapshot() + for i := 1; i < len(ir); i++ { + require.Greater(t, ir[i].rxSPI, ir[i-1].rxSPI, "initiator receive SPIs must strictly increase") } - n := len(ie) - if len(re) < n { - n = len(re) + n := len(ir) + if len(rr) < n { + n = len(rr) } require.GreaterOrEqual(t, n, 2) for i := 0; i < n; i++ { - require.Equal(t, ie[i], re[i], "peers disagree on epoch for generation %d", i) + require.Equal(t, ir[i].rxSPI, rr[i].txSPI, "peers disagree on SPI for generation %d", i) } } @@ -332,7 +311,7 @@ func TestTunnelBringupFailsClosedOnPinMismatch(t *testing.T) { require.NoError(t, err) defer initConn.Close() - mustNotInstall := func(uint32, [16]byte, [16]byte) error { + mustNotInstall := func(uint32, uint32, [16]byte, [16]byte) error { t.Fatal("keys must never be installed on a pin failure") return nil } @@ -383,12 +362,10 @@ func TestTunnelReconnects(t *testing.T) { go func() { runCh <- initT.Run(ctx) }() go func() { runCh <- respT.Run(ctx) }() - // Both peers must reconnect and resume installing matching epochs (well past the - // single bring-up generation), proving the reconnect path self-heals. + // Both peers must reconnect and resume installing generations (well past the single + // bring-up generation), proving the reconnect path self-heals. require.Eventually(t, func() bool { - ie, _ := initRec.snapshot() - re, _ := respRec.snapshot() - return len(ie) >= 3 && len(re) >= 3 + return len(initRec.snapshot()) >= 3 && len(respRec.snapshot()) >= 3 }, 18*time.Second, 25*time.Millisecond, "peers must self-heal and resume rekeying after a session loss") cancel() @@ -397,36 +374,37 @@ func TestTunnelReconnects(t *testing.T) { // Agreement invariant: the initiator installs a generation only after reading the // responder's offer, which the responder writes only after it has committed to - // installing — so every epoch the initiator installed must also have been - // installed by the responder. 
(The reverse can differ by one: a negotiation torn - // by the forced loss after the responder committed but before the initiator - // finished leaves the responder with an extra generation.) - ie, _ := initRec.snapshot() - _, rk := respRec.snapshot() - for _, e := range ie { - require.Contains(t, rk, e, "responder never installed initiator epoch %d", e) + // installing — so the responder must have transmitted under (i.e. installed as its + // tx SPI) every receive SPI the initiator installed. (The reverse can differ by one: + // a negotiation torn by the forced loss after the responder committed but before the + // initiator finished leaves the responder with an extra generation.) + ir := initRec.snapshot() + respTxSPIs := map[uint32]bool{} + for _, rec := range respRec.snapshot() { + respTxSPIs[rec.txSPI] = true + } + for _, rec := range ir { + require.True(t, respTxSPIs[rec.rxSPI], "responder never transmitted under initiator receive SPI %d", rec.rxSPI) } - // Monotonicity invariant (Phase 5): seeding the new session's allocator from the - // in-memory epoch high-water means the shared epoch no longer resets to 1 across - // the reconnect — the initiator's installed epochs are now globally strictly - // increasing, which is what lets the survivor's monotonicity guard accept the - // post-reconnect generation. - for i := 1; i < len(ie); i++ { - require.Greater(t, ie[i], ie[i-1], "initiator epochs must be globally monotonic across the reconnect") + // The receive SPIs are NOT globally monotonic across the reconnect: the per-session + // allocator resets, so the post-reconnect generations start over at a low value. That + // is safe (and accepted by the handler) because the reconnect derives fresh keys — + // recovery rests on fresh keys, not on a carried-forward high-water. Just assert every + // installed receive SPI is a valid non-zero selector. 
+ for _, rec := range ir { + require.NotZero(t, rec.rxSPI) } } -// TestTunnelReconnectGuardNeverRejects is the recovery regression test the design -// review demanded: with installers that ENFORCE the handler's strictly-increasing -// epoch guard, a forced session loss must self-heal, keep installing generation after -// generation, AND never make either guard reject a regressed epoch. Without the -// epoch-floor seeding the post-reconnect epoch would reset to 1 and the guard would -// reject every generation (installs would stall at the single bring-up generation); -// without the seed MARGIN, a torn-after-responder-install exchange would re-offer an -// already-installed epoch and the responder's guard would reject it. The seed margin -// (>= the worst-case one-generation responder lead) is what makes zero rejections -// hold; the deterministic proof of that margin is TestReconnectSeedCoversTornLead. +// TestTunnelReconnectGuardNeverRejects is the recovery regression test: with installers +// that ENFORCE the handler's relaxed TX anti-reset guard (reject only a re-install of +// the currently-live transmit SPI), a forced session loss must self-heal, keep installing +// generation after generation, AND never make either guard reject. The per-session +// allocator resets the SPIs to a low value after the reconnect, but each generation +// carries a fresh key, so the new (lower) transmit SPI is never equal to the survivor's +// currently-live one and the guard accepts it — recovery with zero persisted state and +// zero rejections. 
func TestTunnelReconnectGuardNeverRejects(t *testing.T) { initGuard, respGuard := newGuardInstaller(), newGuardInstaller() initT, respT, cleanup := twoTunnels(t, initGuard.install, respGuard.install, 100*time.Millisecond) @@ -447,7 +425,8 @@ func TestTunnelReconnectGuardNeverRejects(t *testing.T) { go func() { runCh <- respT.Run(ctx) }() // Progress well past the single bring-up generation on BOTH peers proves the guard - // keeps accepting because seeding keeps the epoch climbing across the reconnect. + // keeps accepting because each fresh-keyed generation's transmit SPI differs from the + // currently-live one, even after the reconnect resets the allocator. require.Eventually(t, func() bool { ig, _ := initGuard.snapshot() rg, _ := respGuard.snapshot() @@ -460,235 +439,9 @@ func TestTunnelReconnectGuardNeverRejects(t *testing.T) { ig, iRej := initGuard.snapshot() _, rRej := respGuard.snapshot() - require.Zero(t, iRej, "initiator guard must never reject (seeding keeps epochs monotonic)") - require.Zero(t, rRej, "responder guard must never reject (the seed margin covers a torn-exchange lead)") - for i := 1; i < len(ig); i++ { - require.Greater(t, ig[i], ig[i-1], "accepted epochs are strictly increasing") - } -} - -// TestReconnectSeedCoversTornLead is the deterministic guard for the in-memory -// reconnect seed margin: it simulates the torn-exchange race — the responder committed -// epoch E but the initiator failed to record it, so the initiator's high-water lags by -// one — and proves the next session's seed still produces an epoch strictly greater -// than E, so the responder's monotonicity guard accepts it (no black-hole). 
-func TestReconnectSeedCoversTornLead(t *testing.T) { - const responderMax = uint32(50) // responder installed up to 50 - initHighWater := responderMax - 1 // initiator recorded only 49 (torn after 50) - - a := NewSPIAllocator(Initiator) - a.SeedFloor(activeMasterKeyIndex, seedWithMargin(initHighWater)) - next, err := a.Allocate(activeMasterKeyIndex) - require.NoError(t, err) - require.Greater(t, next&spiCounterMask, responderMax, - "the seeded epoch must exceed the responder's retained max despite the one-generation lag") -} - -// TestTunnelSeedsFromDurableHighWater proves the initiator-restart recovery -// mechanism deterministically: a fresh initiator whose store already holds a high -// high-water (as a prior process would have left) seeds its FIRST epoch above it, so -// a survivor that retained that high-water still accepts the new SA. It also confirms -// the value is re-persisted. -func TestTunnelSeedsFromDurableHighWater(t *testing.T) { - const prior = uint32(1000) - store := &fakeEpochStore{} - store.set(prior) - - initRec, respRec := newEpochRecorder(), newEpochRecorder() - initT, respT, cleanup := twoTunnelsWithStore(t, initRec.install, respRec.install, 100*time.Millisecond, store) - defer cleanup() - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - brCh := make(chan error, 1) - go func() { brCh <- respT.Bringup(ctx) }() - require.NoError(t, initT.Bringup(ctx)) - require.NoError(t, <-brCh) - - ie, _ := initRec.snapshot() - re, _ := respRec.snapshot() - require.Len(t, ie, 1) - // First epoch is seeded strictly above the durable high-water (+margin), so it - // exceeds anything a survivor retained from the pre-restart session. - require.Equal(t, seedWithMargin(prior)+1, ie[0]) - require.Greater(t, ie[0], prior) - require.Equal(t, ie[0], re[0], "both peers install the seeded epoch") - - // The new high-water is persisted durably for the next restart. 
- require.Eventually(t, func() bool { - hw, ok := store.loaded() - return ok && hw == ie[0] - }, 2*time.Second, 5*time.Millisecond) -} - -// TestTunnelRequireStateFailsBringupOnCorruptState asserts the fail-closed Load -// policy: under RequireState a corrupt/unreadable store fails Bringup (on the -// initiator), while the default policy starts fresh. -func TestTunnelRequireStateFailsBringupOnCorruptState(t *testing.T) { - idA, err := GenerateIdentity() - require.NoError(t, err) - idB, err := GenerateIdentity() - require.NoError(t, err) - aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) - require.NoError(t, err) - initID, respID := idA, idB - if !aInit { - initID, respID = idB, idA - } - - conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer conn.Close() - - loadErr := errors.New("state corrupt") - tn, err := NewTunnel(TunnelConfig{ - Local: initID, PeerPub: respID.PublicKey(), Conn: conn, - PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, - EpochStore: &fakeEpochStore{loadErr: loadErr}, RequireState: true, - }, func(uint32, [16]byte, [16]byte) error { return nil }) - require.NoError(t, err) - require.True(t, tn.Initiator()) - defer tn.Close() - - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - // Bringup must fail closed at loadEpochState, before any session is established. 
- err = tn.Bringup(ctx) - require.Error(t, err) - require.ErrorIs(t, err, loadErr) - require.Nil(t, tn.sess) -} - -func TestNewTunnelRequireStateNeedsStore(t *testing.T) { - idA, err := GenerateIdentity() - require.NoError(t, err) - idB, err := GenerateIdentity() - require.NoError(t, err) - conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer conn.Close() - _, err = NewTunnel(TunnelConfig{ - Local: idA, PeerPub: idB.PublicKey(), Conn: conn, - PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, RequireState: true, - }, func(uint32, [16]byte, [16]byte) error { return nil }) - require.Error(t, err, "RequireState without an EpochStore must be rejected") -} - -// TestTunnelDurableRoundTripAcrossRestart is the true end-to-end durable-recovery -// proof: a real initiator persists its climbing high-water to a shared store, then a -// SECOND initiator instance on the SAME store (a simulated restart) reloads exactly -// what the first persisted and will seed strictly above it. -func TestTunnelDurableRoundTripAcrossRestart(t *testing.T) { - store := &fakeEpochStore{} - idA, err := GenerateIdentity() - require.NoError(t, err) - idB, err := GenerateIdentity() - require.NoError(t, err) - aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) - require.NoError(t, err) - initID, respID := idA, idB - if !aInit { - initID, respID = idB, idA + require.Zero(t, iRej, "initiator guard must never reject") + require.Zero(t, rRej, "responder guard must never reject") + for _, rxSPI := range ig { + require.NotZero(t, rxSPI) } - - // Round 1: run a real pair until the initiator has persisted a couple generations. 
- func() { - respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer respConn.Close() - initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer initConn.Close() - - t1, err := NewTunnel(TunnelConfig{ - Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, - PeerAddr: respConn.LocalAddr(), RekeyInterval: 100 * time.Millisecond, EpochStore: store, - }, newEpochRecorder().install) - require.NoError(t, err) - require.True(t, t1.Initiator()) - r, err := NewTunnel(TunnelConfig{ - Local: respID, PeerPub: initID.PublicKey(), Conn: respConn, - PeerAddr: initConn.LocalAddr(), RekeyInterval: 100 * time.Millisecond, - }, newEpochRecorder().install) - require.NoError(t, err) - defer t1.Close() - defer r.Close() - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - brCh := make(chan error, 1) - go func() { brCh <- r.Bringup(ctx) }() - require.NoError(t, t1.Bringup(ctx)) - require.NoError(t, <-brCh) - - runCh := make(chan error, 2) - go func() { runCh <- t1.Run(ctx) }() - go func() { runCh <- r.Run(ctx) }() - require.Eventually(t, func() bool { - hw, ok := store.loaded() - return ok && hw >= 2 - }, 10*time.Second, 10*time.Millisecond, "initiator must persist a few generations") - cancel() - require.NoError(t, <-runCh) - require.NoError(t, <-runCh) - }() - - persisted, ok := store.loaded() - require.True(t, ok) - require.GreaterOrEqual(t, persisted, uint32(2)) - - // Round 2: a fresh initiator (same identity + store) reloads what round 1 wrote. 
- conn2, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer conn2.Close() - t2, err := NewTunnel(TunnelConfig{ - Local: initID, PeerPub: respID.PublicKey(), Conn: conn2, - PeerAddr: conn2.LocalAddr(), RekeyInterval: time.Second, EpochStore: store, - }, func(uint32, [16]byte, [16]byte) error { return nil }) - require.NoError(t, err) - require.True(t, t2.Initiator()) - defer t2.Close() - - require.NoError(t, t2.loadEpochState()) - require.Equal(t, persisted, t2.installedHigh, "second instance loads exactly what the first persisted") - require.Greater(t, seedWithMargin(t2.epochSeed), persisted, "and seeds strictly above it") -} - -// TestResponderIgnoresStore pins the initiator-only invariant: a responder configured -// with a store (and even --require-state) must never consult or write it, must not be -// gated by a load error, and must not start a persister. -func TestResponderIgnoresStore(t *testing.T) { - idA, err := GenerateIdentity() - require.NoError(t, err) - idB, err := GenerateIdentity() - require.NoError(t, err) - aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) - require.NoError(t, err) - initID, respID := idA, idB - if !aInit { - initID, respID = idB, idA - } - - conn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) - require.NoError(t, err) - defer conn.Close() - - store := &fakeEpochStore{loadErr: errors.New("responder must never read this")} - respT, err := NewTunnel(TunnelConfig{ - Local: respID, PeerPub: initID.PublicKey(), Conn: conn, - PeerAddr: conn.LocalAddr(), RekeyInterval: time.Second, - EpochStore: store, RequireState: true, - }, func(uint32, [16]byte, [16]byte) error { return nil }) - require.NoError(t, err) - require.False(t, respT.Initiator()) - defer respT.Close() - - // On the responder, loadEpochState is a no-op: it must not Load (so the load error - // and RequireState do not gate it) and must not start a persister. 
- require.NoError(t, respT.loadEpochState()) - require.Nil(t, respT.persist) - loads, stores := store.counts() - require.Zero(t, loads, "responder must never Load its store") - require.Zero(t, stores, "responder must never Store") } diff --git a/control/epochstate.go b/control/epochstate.go deleted file mode 100644 index d30800d..0000000 --- a/control/epochstate.go +++ /dev/null @@ -1,451 +0,0 @@ -package control - -import ( - "bytes" - "crypto/ecdsa" - "crypto/hkdf" - "crypto/hmac" - "crypto/sha256" - "crypto/x509" - "encoding/binary" - "errors" - "fmt" - "io/fs" - "log/slog" - "os" - "path/filepath" - "sync" - "sync/atomic" - "time" -) - -// This file implements the durable epoch high-water that lets a one-sided restart -// of the control-plane INITIATOR recover seamlessly (Phase 5 item #1). -// -// Why only the initiator: the shared data-plane epoch is the initiator-allocated -// SPI counter (see SharedEpoch). On every (re)connect the per-session SPI allocator -// resets to 1, so the survivor's strictly-increasing epoch guard (handler.go) would -// reject the regressed epoch. The Tunnel fixes this by seeding the new session's -// allocator above an epoch high-water it carries forward: in memory (covers a -// transient reconnect and a responder restart, where the surviving initiator never -// lost the value) and, when an EpochStore is configured, on durable storage (covers -// an initiator restart, where the value must outlive the process). The responder's -// high-water is not load-bearing, so durable state is consulted only on the -// initiator. -// -// Safety note: the epoch is a data-plane SELECTOR, not a nonce. Reusing an epoch -// VALUE across sessions is harmless because every QUIC session derives fresh master -// keys from a fresh ECDHE exporter (no 0-RTT), so the AES-GCM (key, nonce) pair never -// repeats. The persisted high-water therefore only needs integrity, not rollback -// resistance — see EpochStore. 
- -// EpochStore persists the data-plane epoch high-water across process restarts. -type EpochStore interface { - // Load returns the persisted high-water. ok is false with a nil error ONLY when - // no state has been written yet (first run); every other condition — unreadable - // file, truncation, bad magic/version, identity-pair pin mismatch, MAC failure, - // out-of-range value — returns a non-nil error so a fail-closed caller - // (--require-state) can refuse to start rather than silently resetting to zero. - Load() (highWater uint32, ok bool, err error) - // Store durably writes highWater (fsync of the file, atomic rename into place, - // fsync of the directory). It may block on disk I/O; callers persist off the - // hot path (see epochPersister). - Store(highWater uint32) error -} - -// On-disk format (fixed 74 bytes). The MAC covers bytes [0:42); the mac occupies -// [42:74). A future non-zero flags value must be byte-identical on Store and Load. -const ( - epochStateMagic = "ICXE" - epochStateVersion = 1 - epochStateLen = 4 + 1 + 1 + 32 + 4 + 32 // magic|version|flags|pin|highWater|mac - - offVersion = 4 - offFlags = 5 - offPin = 6 - offHighWater = 38 - offMAC = 42 // == end of MAC-covered prefix - - // stateMACInfo domain-separates the epoch-state MAC key from any other use of - // the identity key. - stateMACInfo = "icx epoch-state hmac v1" -) - -// FileEpochStore is a file-backed EpochStore. The record is integrity-protected by -// an HMAC keyed from the local identity and bound (via the pin) to the exact -// (local, peer) identity pair, so a file cannot be silently swapped between tunnels -// or nodes. One file per (local, peer) tunnel; it must NOT be shared between -// processes (concurrent writers). -// -// What the MAC defends: accidental corruption/bit-rot (rejected on Load) and forgery -// of a chosen high-water by anyone without the identity key (HMAC). 
What it does NOT -// defend: rollback-replay of an older, validly-signed file, or deletion (an absent -// file is indistinguishable from a genuine first run). Those need a hardware -// monotonic counter and are out of scope; --require-state is an integrity tripwire -// against corruption, not an anti-rollback control. This is acceptable because, per -// the safety note above, even a rolled-back or absent high-water cannot cause AES-GCM -// nonce reuse on the data plane (keys are per-session ephemeral) — the only -// consequence is a transient one-sided-restart-style outage. -type FileEpochStore struct { - path string - macKey []byte - pin [32]byte -} - -// NewFileEpochStore builds a file-backed store at path, keyed from local and bound to -// the (local, peer) identity pair. -func NewFileEpochStore(path string, local *Identity, peerPub *ecdsa.PublicKey) (*FileEpochStore, error) { - if path == "" { - return nil, errors.New("control: epoch state path is empty") - } - if local == nil || peerPub == nil { - return nil, errors.New("control: epoch state requires local identity and peer key") - } - // Validate the parent directory up front so a missing/unwritable state directory - // fails fast at construction. Without this, os.ReadFile on a path whose parent is - // missing reports fs.ErrNotExist, which Load would (correctly) treat as a genuine - // first run — masking the misconfiguration so even --require-state starts happily - // and only fails closed several rekeys later when the first Store cannot create a - // temp file. 
- dir := filepath.Dir(path) - info, err := os.Stat(dir) - if err != nil { - return nil, fmt.Errorf("control: epoch state directory %q: %w", dir, err) - } - if !info.IsDir() { - return nil, fmt.Errorf("control: epoch state directory %q is not a directory", dir) - } - macKey, err := stateMACKey(local) - if err != nil { - return nil, err - } - pin, err := identityPairPin(local.PublicKey(), peerPub) - if err != nil { - return nil, err - } - // Best-effort sweep of temp files orphaned by a crash between CreateTemp and rename - // in a prior run (safe at construction: no Store is in flight, and the store is - // not shared between processes). - if matches, gerr := filepath.Glob(filepath.Join(dir, ".icx-epoch-*.tmp")); gerr == nil { - for _, m := range matches { - _ = os.Remove(m) - } - } - return &FileEpochStore{path: path, macKey: macKey, pin: pin}, nil -} - -// stateMACKey derives the HMAC key from the identity. The IKM is the fixed-width -// (32-byte, left-zero-padded) P-256 private scalar — a mathematical value that is -// stable across Go releases — NOT the PKCS#8 DER, whose byte layout is an -// implementation detail a toolchain upgrade could perturb and thereby silently -// invalidate every previously-written MAC. Deriving the MAC key from the identity -// (rather than a separate secret) intentionally overloads one key across roles; the -// HKDF info string domain-separates it, and the consequence is that the epoch-state -// MAC lifetime equals the identity-key lifetime — rotating the identity resets the -// durable epoch state. 
-func stateMACKey(local *Identity) ([]byte, error) { - var scalar [32]byte - local.priv.D.FillBytes(scalar[:]) - key, err := hkdf.Key(sha256.New, scalar[:], nil, stateMACInfo, 32) - if err != nil { - return nil, fmt.Errorf("control: derive epoch-state MAC key: %w", err) - } - return key, nil -} - -// identityPairPin binds a state file to the exact (local, peer) pair so it cannot be -// confused with another tunnel's file even under the same identity. -func identityPairPin(local, peer *ecdsa.PublicKey) ([32]byte, error) { - var pin [32]byte - l, err := x509.MarshalPKIXPublicKey(local) - if err != nil { - return pin, fmt.Errorf("control: marshal local key: %w", err) - } - p, err := x509.MarshalPKIXPublicKey(peer) - if err != nil { - return pin, fmt.Errorf("control: marshal peer key: %w", err) - } - h := sha256.New() - h.Write(l) - h.Write(p) - copy(pin[:], h.Sum(nil)) - return pin, nil -} - -func (s *FileEpochStore) marshal(highWater uint32) []byte { - buf := make([]byte, epochStateLen) - copy(buf[0:offVersion], epochStateMagic) - buf[offVersion] = epochStateVersion - buf[offFlags] = 0 // reserved - copy(buf[offPin:offHighWater], s.pin[:]) - binary.BigEndian.PutUint32(buf[offHighWater:offMAC], highWater) - mac := hmac.New(sha256.New, s.macKey) - mac.Write(buf[:offMAC]) - copy(buf[offMAC:], mac.Sum(nil)) - return buf -} - -// Load reads and verifies the state file. See EpochStore.Load for the ok/err contract. -func (s *FileEpochStore) Load() (uint32, bool, error) { - buf, err := os.ReadFile(s.path) - if err != nil { - // fs.ErrNotExist is the ONLY signal that maps to "first run"; every other - // open/read error (EACCES, EIO, ENOTDIR, dangling symlink, ...) is a real - // failure so --require-state fails closed instead of starting fresh. 
- if errors.Is(err, fs.ErrNotExist) { - return 0, false, nil - } - return 0, false, fmt.Errorf("control: read epoch state %q: %w", s.path, err) - } - // Length is validated strictly before any field slicing so a truncated or - // zero-length file returns a clean error rather than panicking. - if len(buf) != epochStateLen { - return 0, false, fmt.Errorf("control: epoch state %q is %d bytes, want %d (corrupt/truncated)", s.path, len(buf), epochStateLen) - } - if string(buf[0:offVersion]) != epochStateMagic { - return 0, false, fmt.Errorf("control: epoch state %q has bad magic", s.path) - } - if buf[offVersion] != epochStateVersion { - return 0, false, fmt.Errorf("control: epoch state %q has unsupported version %d", s.path, buf[offVersion]) - } - if !bytes.Equal(buf[offPin:offHighWater], s.pin[:]) { - return 0, false, fmt.Errorf("control: epoch state %q identity-pair pin mismatch (wrong peer or identity key)", s.path) - } - mac := hmac.New(sha256.New, s.macKey) - mac.Write(buf[:offMAC]) - if !hmac.Equal(buf[offMAC:], mac.Sum(nil)) { - return 0, false, fmt.Errorf("control: epoch state %q MAC verification failed (corrupt or tampered)", s.path) - } - hw := binary.BigEndian.Uint32(buf[offHighWater:offMAC]) - if hw > spiCounterMask { - return 0, false, fmt.Errorf("control: epoch state %q high-water %d exceeds max %d", s.path, hw, spiCounterMask) - } - return hw, true, nil -} - -// Store atomically and durably writes highWater. It writes a uniquely-named temp file -// in the target directory (so overlapping writers cannot collide on a shared temp -// name), fsyncs it, renames it over the target (atomic replace), then fsyncs the -// directory so the rename — the durable commit point — survives a crash. 
-func (s *FileEpochStore) Store(highWater uint32) (err error) { - buf := s.marshal(highWater) - dir := filepath.Dir(s.path) - tmp, err := os.CreateTemp(dir, ".icx-epoch-*.tmp") - if err != nil { - return fmt.Errorf("control: create temp epoch state in %q: %w", dir, err) - } - tmpName := tmp.Name() - committed := false - defer func() { - if !committed { - _ = tmp.Close() - _ = os.Remove(tmpName) - } - }() - if _, err = tmp.Write(buf); err != nil { - return fmt.Errorf("control: write epoch state: %w", err) - } - if err = tmp.Sync(); err != nil { - return fmt.Errorf("control: fsync epoch state: %w", err) - } - if err = tmp.Close(); err != nil { - return fmt.Errorf("control: close epoch state: %w", err) - } - if err = os.Rename(tmpName, s.path); err != nil { - return fmt.Errorf("control: rename epoch state into place: %w", err) - } - committed = true - if derr := fsyncDir(dir); derr != nil { - // The rename succeeded, so the new high-water IS in the file; only the - // durability of the directory ENTRY across a power loss is unconfirmed. On - // filesystems where directory fsync is unsupported (some overlay/network FS) - // this fails every time. Treat it as success for bookkeeping — returning an - // error here would wrongly count a durable write as a failure and could fail a - // healthy node closed under --require-state — but warn so a genuinely failing - // device is visible. - slog.Warn("control: epoch state written but directory fsync failed; value is durable, crash-durability of the rename is unconfirmed", - slog.Any("error", derr)) - } - return nil -} - -func fsyncDir(dir string) error { - d, err := os.Open(dir) - if err != nil { - return err - } - defer func() { _ = d.Close() }() - return d.Sync() -} - -// Tunables for the durable-epoch machinery. Defaults live on the Tunnel so tests can -// override them; the constants below are the shared, non-overridable parameters. 
-const ( - // epochSeedMargin is how far above the carried-forward high-water the initiator - // seeds each new session's allocator (see seedWithMargin). The worst case it must - // cover is the survivor holding ~2 generations beyond the initiator's known - // high-water (durable-restart persistence lag) and 1 generation beyond it on a - // torn-exchange reconnect, so a margin >= 2 is sufficient; 8 is slack against any - // future rekey pipelining. It is applied per session, so each reconnect/restart - // spends up to margin epochs — negligible against the 2^30 counter space (a - // reconnect every second for years before it matters). - epochSeedMargin = 8 - - // persistShutdownGrace bounds how long stop() waits for the persister goroutine to - // drain before abandoning it, so a wedged Store (uninterruptible fsync on a dying - // disk) cannot pin shutdown — and the --require-state fail-closed error can still - // reach the errgroup. - persistShutdownGrace = 3 * time.Second - - defaultMaxStoreFailures = 5 - defaultPersistStallTimeout = 60 * time.Second -) - -// errEpochPersistStalled is returned from Run (initiator, --require-state only) when -// durable persistence has fallen far enough behind that seamless restart recovery is -// no longer guaranteed. It is fatal/non-retryable: the operator asked to fail closed. -var errEpochPersistStalled = errors.New("control: durable epoch persistence is failing") - -// seedWithMargin computes the allocator seed floor for a carried-forward high-water: -// hw + epochSeedMargin, clamped to spiCounterMask-1. 
It is applied at SEED time (every -// new session on the initiator), so the margin covers BOTH gaps that can leave the -// seed at or below what the surviving peer already retained: -// -// - the durable persistence gap on an initiator restart (the on-disk value lags the -// survivor by up to ~2 generations); and -// - the torn-exchange lead on an in-memory reconnect: a session can tear after the -// responder committed epoch E but before the initiator recorded it (recordInstalled -// runs only on a successful install), leaving the initiator's high-water one behind -// the responder's. Without a margin the initiator would re-offer E and the -// responder's strictly-increasing guard would reject it — a one-generation -// data-plane black-hole. The margin (>= 2) seeds strictly above E, so the guard -// accepts. -// -// The clamp is one below the ceiling so the seeded allocator can still hand out the -// terminal counter spiCounterMask before exhausting (clamping to spiCounterMask itself -// would make the very first Allocate fail). Reaching the clamp is the exhaustion -// warning threshold. -func seedWithMargin(hw uint32) uint32 { - const ceil = spiCounterMask - 1 - if hw >= ceil || hw+epochSeedMargin > ceil { - return ceil - } - return hw + epochSeedMargin -} - -// epochPersister owns the EpochStore and writes to it from a single dedicated -// goroutine, so an fsync on a degraded disk never blocks the Tunnel's run loop (which -// also drives reconnect and rekey). Requests are coalesced through a one-slot mailbox -// — only the latest high-water matters — and the in-memory high-water remains the -// source of truth, so a late or dropped write merely widens the rollback gap (which -// the seed margin absorbs) rather than corrupting anything. 
-type epochPersister struct { - store EpochStore - reqCh chan uint32 - stopCh chan struct{} - doneCh chan struct{} - stopOnce sync.Once - - high atomic.Uint32 // last value successfully stored - failures atomic.Int64 // consecutive Store failures - lastOK atomic.Int64 // unix nanos of the last successful store (or start) -} - -func newEpochPersister(store EpochStore, startHigh uint32, nowNanos int64) *epochPersister { - p := &epochPersister{ - store: store, - reqCh: make(chan uint32, 1), - stopCh: make(chan struct{}), - doneCh: make(chan struct{}), - } - p.high.Store(startHigh) - p.lastOK.Store(nowNanos) - go p.run() - return p -} - -func (p *epochPersister) run() { - defer close(p.doneCh) - for { - select { - case <-p.stopCh: - // Best-effort final flush of any queued value on clean shutdown. - select { - case v := <-p.reqCh: - p.flush(v) - default: - } - return - case v := <-p.reqCh: - p.flush(v) - } - } -} - -func (p *epochPersister) flush(v uint32) { - if v <= p.high.Load() { - return // already durable (coalesced no-op) - } - if err := p.store.Store(v); err != nil { - n := p.failures.Add(1) - slog.Error("control: failed to persist epoch high-water; one-sided-restart recovery is degrading (set --require-state to fail closed)", - slog.Uint64("highWater", uint64(v)), slog.Int64("consecutiveFailures", n), slog.Any("error", err)) - return - } - p.failures.Store(0) - p.high.Store(v) - p.lastOK.Store(time.Now().UnixNano()) -} - -// request enqueues v as the latest high-water to persist, coalescing with any value -// still queued by keeping the LARGER of the two — so a request can never drop a higher -// high-water (requests are monotonic in normal use, but keeping the max is robust -// regardless). It never blocks (single producer: the Tunnel's run goroutine). 
-func (p *epochPersister) request(v uint32) { - for { - select { - case p.reqCh <- v: - return - case old := <-p.reqCh: - if old > v { - v = old - } - } - } -} - -// fatal reports whether durable persistence has degraded past the point of guaranteed -// recovery, but only under requireState (otherwise persistence is best-effort). -// target is the latest in-memory high-water the caller wants durable. It trips on -// SUSTAINED failure (>= maxFailures consecutive Store errors) or a HUNG store (un- -// persisted work with no progress for longer than stall). Intermittent slowness does -// not trip it, and need not: each success coalesces to and stores the latest -// high-water, so recovery never falls materially behind. -func (p *epochPersister) fatal(requireState bool, target uint32, now time.Time, maxFailures int64, stall time.Duration) error { - if !requireState { - return nil - } - if f := p.failures.Load(); f >= maxFailures { - return fmt.Errorf("%w: %d consecutive store failures", errEpochPersistStalled, f) - } - // Also catch a silently hung store (no error, no progress) once there is - // un-persisted work that has not advanced for too long. - if target > p.high.Load() && now.Sub(time.Unix(0, p.lastOK.Load())) > stall { - return fmt.Errorf("%w: durable high-water stalled > %s behind the live epoch", errEpochPersistStalled, stall) - } - return nil -} - -// stop signals the persister to exit and waits for it, but only up to -// persistShutdownGrace: a Store wedged in an uninterruptible fsync would otherwise -// pin the goroutine forever and, since Run defers Close which calls stop, prevent the -// process from exiting (and prevent a --require-state fatal from reaching the -// errgroup). The wedged goroutine is then abandoned (the OS reaps it at exit). 
-func (p *epochPersister) stop() { - p.stopOnce.Do(func() { close(p.stopCh) }) - select { - case <-p.doneCh: - case <-time.After(persistShutdownGrace): - slog.Warn("control: epoch-state persister did not stop within grace; abandoning a stuck store write") - } -} diff --git a/control/epochstate_test.go b/control/epochstate_test.go deleted file mode 100644 index dff4182..0000000 --- a/control/epochstate_test.go +++ /dev/null @@ -1,391 +0,0 @@ -package control - -import ( - "crypto/hkdf" - "crypto/hmac" - "crypto/sha256" - "crypto/x509" - "encoding/binary" - "errors" - "os" - "path/filepath" - "sync" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestSPIAllocatorSeedFloor(t *testing.T) { - a := NewSPIAllocator(Initiator) - // Fresh allocator starts at counter 1. - spi, err := a.Allocate(0) - require.NoError(t, err) - require.Equal(t, uint32(1), spi&spiCounterMask) - - // Seeding above the current position jumps the next allocation past the floor. - a.SeedFloor(0, 1000) - spi, err = a.Allocate(0) - require.NoError(t, err) - require.Equal(t, uint32(1001), spi&spiCounterMask) - - // Seeding at or below the current position is a no-op (monotonic). - a.SeedFloor(0, 5) - spi, err = a.Allocate(0) - require.NoError(t, err) - require.Equal(t, uint32(1002), spi&spiCounterMask) - - // SeedFloor masks off the role/index bits: a full initiator SPI seeds by counter. - a.SeedFloor(0, 2000) - spi, err = a.Allocate(0) - require.NoError(t, err) - require.Equal(t, uint32(2001), spi&spiCounterMask) - - // Out-of-range index is ignored, not a panic. - a.SeedFloor(99, 1<<20) -} - -func TestSPIAllocatorExhaustion(t *testing.T) { - a := NewSPIAllocator(Initiator) - a.SeedFloor(0, spiCounterMask-1) - // The terminal counter (spiCounterMask) is still allocatable. - spi, err := a.Allocate(0) - require.NoError(t, err) - require.Equal(t, spiCounterMask, spi&spiCounterMask) - // The next allocation exhausts the space with the sentinel error. 
- _, err = a.Allocate(0) - require.ErrorIs(t, err, ErrSPIExhausted) - // Exhaustion is sticky. - _, err = a.Allocate(0) - require.ErrorIs(t, err, ErrSPIExhausted) -} - -func TestSeedWithMargin(t *testing.T) { - require.Equal(t, uint32(1000+epochSeedMargin), seedWithMargin(1000)) - require.Equal(t, uint32(epochSeedMargin), seedWithMargin(0)) - // Near the ceiling, clamp to spiCounterMask-1 so the seeded allocator can still - // hand out the terminal counter before exhausting. - require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask)) - require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask-1)) - require.Equal(t, spiCounterMask-1, seedWithMargin(spiCounterMask-2)) -} - -func newTestStore(t *testing.T) (*FileEpochStore, *Identity, *Identity) { - t.Helper() - local, err := GenerateIdentity() - require.NoError(t, err) - peer, err := GenerateIdentity() - require.NoError(t, err) - path := filepath.Join(t.TempDir(), "epoch.state") - s, err := NewFileEpochStore(path, local, peer.PublicKey()) - require.NoError(t, err) - return s, local, peer -} - -func TestFileEpochStoreRoundTrip(t *testing.T) { - s, _, _ := newTestStore(t) - - // Absent file => first run (ok=false, nil error). - hw, ok, err := s.Load() - require.NoError(t, err) - require.False(t, ok) - require.Zero(t, hw) - - require.NoError(t, s.Store(42)) - hw, ok, err = s.Load() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, uint32(42), hw) - - // Overwrite (atomic replace) with a higher value. 
- require.NoError(t, s.Store(99)) - hw, ok, err = s.Load() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, uint32(99), hw) -} - -func TestFileEpochStoreMACStableAcrossInstances(t *testing.T) { - local, err := GenerateIdentity() - require.NoError(t, err) - peer, err := GenerateIdentity() - require.NoError(t, err) - path := filepath.Join(t.TempDir(), "epoch.state") - - s1, err := NewFileEpochStore(path, local, peer.PublicKey()) - require.NoError(t, err) - require.NoError(t, s1.Store(7)) - - // A second store built from the SAME identity+peer (i.e. a restart) must verify - // the MAC and load the value — the MAC key derivation is stable. - s2, err := NewFileEpochStore(path, local, peer.PublicKey()) - require.NoError(t, err) - hw, ok, err := s2.Load() - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, uint32(7), hw) -} - -func TestFileEpochStoreRejectsWrongIdentity(t *testing.T) { - local, err := GenerateIdentity() - require.NoError(t, err) - other, err := GenerateIdentity() - require.NoError(t, err) - peer, err := GenerateIdentity() - require.NoError(t, err) - path := filepath.Join(t.TempDir(), "epoch.state") - - s, err := NewFileEpochStore(path, local, peer.PublicKey()) - require.NoError(t, err) - require.NoError(t, s.Store(5)) - - // A different local identity derives a different MAC key → verification fails - // (and it is an error, NOT a silent first-run). 
- bad, err := NewFileEpochStore(path, other, peer.PublicKey()) - require.NoError(t, err) - _, ok, err := bad.Load() - require.Error(t, err) - require.False(t, ok) -} - -func TestFileEpochStoreRejectsWrongPeer(t *testing.T) { - local, err := GenerateIdentity() - require.NoError(t, err) - peer, err := GenerateIdentity() - require.NoError(t, err) - otherPeer, err := GenerateIdentity() - require.NoError(t, err) - path := filepath.Join(t.TempDir(), "epoch.state") - - s, err := NewFileEpochStore(path, local, peer.PublicKey()) - require.NoError(t, err) - require.NoError(t, s.Store(5)) - - // Same identity, different peer → pin mismatch → error. - bad, err := NewFileEpochStore(path, local, otherPeer.PublicKey()) - require.NoError(t, err) - _, ok, err := bad.Load() - require.Error(t, err) - require.False(t, ok) -} - -func TestFileEpochStoreRejectsCorruption(t *testing.T) { - // Flipping a byte in any region (magic, version, pin, high-water, mac) must be - // detected as an error, never silently accepted or treated as first-run. - // Cover every region's first byte (incl. the reserved flags byte and the - // region boundaries offPin/offMAC) to prove the whole prefix is MAC-protected. 
- for _, off := range []int{0, offVersion, offFlags, offPin, offHighWater, offMAC, offMAC + 1, epochStateLen - 1} { - s, _, _ := newTestStore(t) - require.NoError(t, s.Store(123)) - buf, err := os.ReadFile(s.path) - require.NoError(t, err) - buf[off] ^= 0xff - require.NoError(t, os.WriteFile(s.path, buf, 0o600)) - _, ok, err := s.Load() - require.Error(t, err, "corruption at offset %d must be rejected", off) - require.False(t, ok) - } -} - -func TestFileEpochStoreRejectsBadLength(t *testing.T) { - for _, n := range []int{0, 1, epochStateLen - 1, epochStateLen + 1} { - s, _, _ := newTestStore(t) - require.NoError(t, os.WriteFile(s.path, make([]byte, n), 0o600)) - _, ok, err := s.Load() - require.Error(t, err, "length %d must be a clean error, not a panic or first-run", n) - require.False(t, ok) - } -} - -func TestFileEpochStoreRejectsOutOfRangeHighWater(t *testing.T) { - s, _, _ := newTestStore(t) - // A validly-MAC'd record whose high-water exceeds the counter space must be - // rejected (it cannot have come from a real allocator). - buf := s.marshal(spiCounterMask + 1) - require.NoError(t, os.WriteFile(s.path, buf, 0o600)) - _, ok, err := s.Load() - require.Error(t, err) - require.False(t, ok) -} - -func TestFileEpochStoreUnreadableIsError(t *testing.T) { - if os.Geteuid() == 0 { - t.Skip("root bypasses file permissions") - } - s, _, _ := newTestStore(t) - require.NoError(t, s.Store(11)) - require.NoError(t, os.Chmod(s.path, 0o000)) - t.Cleanup(func() { _ = os.Chmod(s.path, 0o600) }) - // A present-but-unreadable file is a real error (so --require-state fails closed), - // NOT a first-run. - _, ok, err := s.Load() - require.Error(t, err) - require.False(t, ok) -} - -// TestFileEpochStoreLayout pins the on-disk byte layout (field order, endianness, MAC -// scope and key derivation) via an independent recomputation, so an accidental change -// to any of them is caught. 
-func TestFileEpochStoreLayout(t *testing.T) { - local, err := GenerateIdentity() - require.NoError(t, err) - peer, err := GenerateIdentity() - require.NoError(t, err) - s, err := NewFileEpochStore(filepath.Join(t.TempDir(), "epoch.state"), local, peer.PublicKey()) - require.NoError(t, err) - - const hw = uint32(0x01020304) - got := s.marshal(hw) - require.Len(t, got, epochStateLen) - - var scalar [32]byte - local.priv.D.FillBytes(scalar[:]) - macKey, err := hkdf.Key(sha256.New, scalar[:], nil, stateMACInfo, 32) - require.NoError(t, err) - lDER, err := x509.MarshalPKIXPublicKey(local.PublicKey()) - require.NoError(t, err) - pDER, err := x509.MarshalPKIXPublicKey(peer.PublicKey()) - require.NoError(t, err) - ph := sha256.New() - ph.Write(lDER) - ph.Write(pDER) - - want := make([]byte, epochStateLen) - copy(want[0:4], "ICXE") - want[4] = 1 - want[5] = 0 - copy(want[6:38], ph.Sum(nil)) - binary.BigEndian.PutUint32(want[38:42], hw) - m := hmac.New(sha256.New, macKey) - m.Write(want[:42]) - copy(want[42:], m.Sum(nil)) - - require.Equal(t, want, got) -} - -// fakeEpochStore is an in-memory EpochStore for the persister/Tunnel tests. It can be -// reused across a simulated process restart (the persisted value survives). 
-type fakeEpochStore struct { - mu sync.Mutex - high uint32 - has bool - loadErr error - storeErr error - stores int - loads int -} - -func (f *fakeEpochStore) Load() (uint32, bool, error) { - f.mu.Lock() - defer f.mu.Unlock() - f.loads++ - if f.loadErr != nil { - return 0, false, f.loadErr - } - return f.high, f.has, nil -} - -func (f *fakeEpochStore) counts() (loads, stores int) { - f.mu.Lock() - defer f.mu.Unlock() - return f.loads, f.stores -} - -func (f *fakeEpochStore) Store(v uint32) error { - f.mu.Lock() - defer f.mu.Unlock() - f.stores++ - if f.storeErr != nil { - return f.storeErr - } - f.high, f.has = v, true - return nil -} - -func (f *fakeEpochStore) set(v uint32) { - f.mu.Lock() - defer f.mu.Unlock() - f.high, f.has = v, true -} - -func (f *fakeEpochStore) loaded() (uint32, bool) { - f.mu.Lock() - defer f.mu.Unlock() - return f.high, f.has -} - -func TestEpochPersisterCoalesceAndFlush(t *testing.T) { - fs := &fakeEpochStore{} - p := newEpochPersister(fs, 0, time.Now().UnixNano()) - defer p.stop() - - for _, v := range []uint32{1, 2, 3, 7, 5} { - p.request(v) - } - // The persister catches up to the highest requested value; lower/stale values are - // coalesced or ignored (Store is monotonic in the persister). - require.Eventually(t, func() bool { - hw, ok := fs.loaded() - return ok && hw == 7 - }, 2*time.Second, 5*time.Millisecond) - require.Zero(t, p.failures.Load()) -} - -func TestEpochPersisterFatalUnderRequireState(t *testing.T) { - fs := &fakeEpochStore{storeErr: errors.New("disk full")} - p := newEpochPersister(fs, 0, time.Now().UnixNano()) - defer p.stop() - - // The persister attempts each distinct requested value once (it does not retry a - // failed value), so consecutive failures climb one per new request — exactly as a - // failing disk accrues one failure per rekey. Drive several, waiting for each to - // register so they are not coalesced. 
- for v := int64(1); v <= 5; v++ { - p.request(uint32(v)) - require.Eventually(t, func() bool { - return p.failures.Load() >= v - }, 2*time.Second, 5*time.Millisecond) - } - - // Without require-state, a failing store is best-effort (never fatal). - require.NoError(t, p.fatal(false, 5, time.Now(), 3, time.Hour)) - - // With require-state, failures past the threshold are fatal. - require.ErrorIs(t, p.fatal(true, 5, time.Now(), 3, time.Hour), errEpochPersistStalled) -} - -// TestEpochPersisterRequestKeepsMax exercises request()'s keep-max coalescing in -// isolation — with no goroutine draining, a lower request must not displace a queued -// higher value (flush's own monotonic guard would otherwise mask a broken request()). -func TestEpochPersisterRequestKeepsMax(t *testing.T) { - p := &epochPersister{reqCh: make(chan uint32, 1)} // no run() goroutine - p.request(7) - p.request(5) // lower - p.request(3) // lower - select { - case v := <-p.reqCh: - require.Equal(t, uint32(7), v, "the higher queued value must survive a lower request") - default: - t.Fatal("expected a coalesced value in the mailbox") - } -} - -func TestEpochPersisterFatalOnStall(t *testing.T) { - fs := &fakeEpochStore{} - // Seed lastOK far in the past and leave a value un-persisted (high < target). - p := newEpochPersister(fs, 0, time.Now().Add(-time.Hour).UnixNano()) - defer p.stop() - // target (5) is ahead of what the store holds (0) and lastOK is stale → stalled. - require.ErrorIs(t, p.fatal(true, 5, time.Now(), 100, time.Minute), errEpochPersistStalled) - // No un-persisted work → not stalled even with a stale lastOK. - require.NoError(t, p.fatal(true, 0, time.Now(), 100, time.Minute)) - - // A successful store clears the stall: high catches up to the target and lastOK is - // refreshed, so the tripwire un-latches. 
- p.request(5) - require.Eventually(t, func() bool { - hw, ok := fs.loaded() - return ok && hw == 5 - }, 2*time.Second, 5*time.Millisecond) - require.NoError(t, p.fatal(true, 5, time.Now(), 100, time.Minute), "stall must clear once the store catches up") -} diff --git a/control/sa.go b/control/sa.go index 4206718..e08f0f5 100644 --- a/control/sa.go +++ b/control/sa.go @@ -106,10 +106,10 @@ func MakeSPI(masterKeyIndex int, role Role, counter uint32) (uint32, error) { } // ErrSPIExhausted is returned by Allocate when the 2^30 counter space for a -// master-key index is used up. It is a TERMINAL condition for the shared-epoch -// bridge: the only remedy is master-key rotation, which SharedEpoch does not yet -// support (it requires master-key index 0). Callers treat it as a non-retryable, -// fail-closed error rather than looping a reconnect. +// master-key index is used up. It is a TERMINAL condition: the only remedy is +// master-key rotation, which this build does not yet support (the active master-key +// index is fixed at 0). Callers treat it as a non-retryable, fail-closed error rather +// than looping a reconnect. var ErrSPIExhausted = errors.New("control: SPI counter space exhausted; master-key rotation required") // SPIAllocator hands out monotonically increasing, collision-free SPIs for one @@ -140,26 +140,3 @@ func (a *SPIAllocator) Allocate(masterKeyIndex int) (uint32, error) { a.next[masterKeyIndex]++ return MakeSPI(masterKeyIndex, a.role, a.next[masterKeyIndex]) } - -// SeedFloor raises the allocator's counter for masterKeyIndex so the next -// Allocate returns a counter strictly greater than floor's counter (the low 30 -// bits of floor). It is monotonic — a floor at or below the current position is a -// no-op — and never lowers the counter, so it is safe to call on every new -// session. 
The control plane uses this to carry a per-direction epoch high-water -// across reconnects/restarts (see control/epochstate.go): seeding the initiator's -// allocator above the survivor's retained data-plane epoch keeps the shared epoch -// strictly increasing across a session boundary, so the survivor's monotonicity -// guard accepts the new SA instead of rejecting a counter that reset to 1. -// -// It takes the same lock as Allocate; an out-of-range index is ignored. -func (a *SPIAllocator) SeedFloor(masterKeyIndex int, floor uint32) { - if masterKeyIndex < 0 || masterKeyIndex >= numMasterKeys { - return - } - c := floor & spiCounterMask - a.mu.Lock() - defer a.mu.Unlock() - if c > a.next[masterKeyIndex] { - a.next[masterKeyIndex] = c - } -} diff --git a/control/transport.go b/control/transport.go index 7b48279..14c1b4e 100644 --- a/control/transport.go +++ b/control/transport.go @@ -161,13 +161,6 @@ func newSession(ctx context.Context, conn *quic.Conn, role Role) (*Session, erro // Role reports whether this peer is the initiator or responder. func (s *Session) Role() Role { return s.role } -// SeedRxFloor raises this session's RX SPI allocator so the next negotiated SPI -// (and hence, for the initiator, the shared data-plane epoch) is strictly greater -// than floor. It must be called before the session's first NegotiateSAs. The -// Tunnel uses it to carry the epoch high-water across a reconnect/restart so the -// data-plane epoch keeps increasing rather than resetting to 1. -func (s *Session) SeedRxFloor(floor uint32) { s.rxAlloc.SeedFloor(activeMasterKeyIndex, floor) } - // MasterKeys returns the PSP master keys derived from this session. 
func (s *Session) MasterKeys() *MasterKeys { return s.masterKeys } From e158045f6bde940197cbe3f0189a433f2a697786 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 2 Jun 2026 01:20:02 -0700 Subject: [PATCH 15/20] [cli] drive per-direction SAs; drop --state-file/--require-state (APO-648) The control-plane installer now forwards both per-direction SPIs to UpdateVirtualNetworkSAs. Removes the --state-file/--require-state flags and the durable-epoch wiring (EpochStore/RequireState on the Tunnel): with fresh per-session keys there is no high-water to persist, so a restart recovers with no on-disk state. --- cli/main.go | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/cli/main.go b/cli/main.go index 2e853a6..06a3fac 100644 --- a/cli/main.go +++ b/cli/main.go @@ -101,14 +101,6 @@ func main() { Name: "require-fips", Usage: "Refuse to start unless the Go FIPS 140-3 module is active (GODEBUG=fips140=on)", }, - &cli.StringFlag{ - Name: "state-file", - Usage: "Path to a durable epoch-state file (control plane). Persists the epoch high-water so a one-sided restart recovers seamlessly. Set on BOTH peers (the dialer/listener role is auto-elected). Without it, a one-sided initiator restart recovers only after both peers cycle", - }, - &cli.BoolFlag{ - Name: "require-state", - Usage: "Fail closed if durable epoch state is corrupt/unreadable or persistently un-writable, instead of degrading to a fresh start. Requires --state-file. Integrity tripwire only — not rollback/deletion resistant", - }, &cli.IntFlag{ Name: "port", Aliases: []string{"p"}, @@ -555,23 +547,6 @@ func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni return nil, nil, err } - // Durable epoch state (optional). Built role-agnostically; the Tunnel consults it - // only when this node is the elected initiator (the responder's high-water is not - // load-bearing), which is why it must be configured on both peers. 
- stateFile := c.String("state-file") - requireState := c.Bool("require-state") - if requireState && stateFile == "" { - return nil, nil, errors.New("--require-state requires --state-file") - } - var epochStore control.EpochStore - if stateFile != "" { - store, err := control.NewFileEpochStore(stateFile, ident, peerPub) - if err != nil { - return nil, nil, err - } - epochStore = store - } - ctrlNet := "udp4" if peerUDPAddr.IP.To4() == nil { ctrlNet = "udp6" @@ -599,8 +574,8 @@ func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni keyLifetime = 24 * time.Hour } - installer := func(epoch uint32, rxKey, txKey [16]byte) error { - return h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, time.Now().Add(keyLifetime)) + installer := func(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error { + return h.UpdateVirtualNetworkSAs(vni, rxSPI, txSPI, rxKey, txKey, time.Now().Add(keyLifetime)) } tun, err := control.NewTunnel(control.TunnelConfig{ @@ -609,8 +584,6 @@ func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni Conn: pconn, PeerAddr: peerControlAddr, RekeyInterval: rekeyIvl, - EpochStore: epochStore, - RequireState: requireState, }, installer) if err != nil { _ = pconn.Close() From a488852ad4c6d9672e4256ed72c671547adcede5 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Tue, 2 Jun 2026 01:20:07 -0700 Subject: [PATCH 16/20] [cli] document fresh-key restart recovery; drop persisted-state docs (APO-648) Remove the --state-file/--require-state flag documentation and rewrite the "Restart / reconnect" section: recovery is now seamless and symmetric with no persisted state to manage, because every (re)connect is a fresh ECDHE handshake whose fresh keys make a reset per-direction SPI safe. 
--- cli/README.md | 55 +++++++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/cli/README.md b/cli/README.md index 2da7213..da048b3 100644 --- a/cli/README.md +++ b/cli/README.md @@ -77,13 +77,6 @@ Relevant flags: - `--rekey-interval DUR` — SA rotation period (default `2m`). - `--require-fips` — refuse to start unless the Go FIPS 140-3 module is active (build/run with `GODEBUG=fips140=on`). -- `--state-file PATH` — persist the epoch high-water so a one-sided restart recovers - seamlessly (see "Restart / reconnect"). Set it on **both** peers. Without it, a - one-sided *initiator* restart recovers only after both peers cycle. -- `--require-state` — fail closed if the state file is corrupt/unreadable or becomes - persistently un-writable, instead of degrading to a fresh start. Requires - `--state-file`. It is an integrity tripwire against accidental corruption, not a - defense against an attacker who can rewrite or delete the file. ### Operational notes @@ -94,34 +87,26 @@ ends close together, and run under a supervisor (systemd `Restart=always`, a con restart policy) so a larger startup skew self-heals on restart. Once established, the control plane reconnects on its own indefinitely. -**Restart / reconnect.** Control-plane keys are ephemeral, so a *clean* restart is always -crypto-safe: a new session derives fresh keys, and a reset transmit counter cannot reuse a -nonce under a fresh key. The remaining question is *availability* after a restart, which the -control plane handles in two layers: - -- **Always on (no flag):** the data-plane epoch counter resets to 1 on every (re)connect, but - the surviving peer's monotonicity guard only accepts a strictly increasing epoch. 
ICX - carries the epoch high-water forward in memory and seeds each new session above it, so a - **transient reconnect** (a network blip) and a **responder restart** recover immediately — - the surviving *initiator* keeps the high-water and the restarted responder comes up fresh. - -- **With `--state-file` (recommended):** the high-water is also persisted durably (fsync + - atomic rename, integrity-protected by a MAC keyed from the identity), so an **initiator - restart** recovers too — the restarted initiator reloads the high-water and resumes above - the survivor's retained epoch. The dialer/listener role is auto-elected from the keys, so - set `--state-file` on **both** peers; only the elected initiator's file is load-bearing, and - the start-up log shows whether durable state is active for this node's role. - -Without `--state-file`, a one-sided *initiator* restart still has the old caveat: the survivor -forwards on its existing keys until they expire and only re-keys once the new counter climbs -past the retained high-water — **cycle both peers** to recover immediately. - -`--require-state` makes a corrupt/unreadable state file (or persistently failing writes) fail -closed instead of silently starting fresh. It is an integrity tripwire against accidental -corruption only: because keys are per-session ephemeral, even a rolled-back or deleted -high-water cannot cause nonce reuse — only a transient outage — so rollback and deletion -resistance (which would need a hardware monotonic counter) are out of scope. Use one state -file per tunnel; do not share it between processes. +**Restart / reconnect.** Control-plane keys are ephemeral, so any restart or reconnect is +both crypto-safe and seamless, with **no persisted state to manage**. + +Each direction is a simplex SA with its own SPI: the receiver allocates it, the sender +encrypts to it (`nonce = SPI‖counter`). 
The receive-SPI allocator resets to 1 on every +(re)connect, but because each session is a fresh ECDHE handshake (no 0-RTT, no session +resumption — both are disabled and asserted fail-closed), every generation also derives a +**fresh master key**. A reset or regressed SPI is therefore always paired with a key that has +never been used, so its from-zero counter is a fresh nonce space and no AES-GCM nonce can +repeat. The data-plane install seam accepts the reset SPI for exactly this reason; the only +thing it refuses is re-installing the *currently-live* transmit SPI (which would reset a live +counter under an unchanged key). + +This makes every recovery path seamless and symmetric: + +- **Transient reconnect** (a network blip, both processes survive) — the next session derives + fresh keys and both directions resume immediately. +- **One-sided restart** (either peer) — the restarted peer comes back with a fresh allocator + and a fresh handshake; the survivor accepts the reset SPI under its fresh key and traffic + resumes immediately. There is no high-water to carry forward and no peer to cycle. ## Static keys (legacy) From 4de880a5dcb232cdd4600e3d1559fd1ac9d74ba2 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Sun, 14 Jun 2026 10:07:25 -0700 Subject: [PATCH 17/20] [cli] retire static-INI keying; require the control plane (APO-644) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the legacy static-key mode: the --key-file flag, runStaticKeying, the SIGHUP INI reload loop, loadKeysFromINI, and the gopkg.in/ini.v1 dependency. The QUIC/mTLS control plane is now the only keying path — run requires both --identity-key and --peer-key, fail-closed, and always drives the rekey loop under the errgroup. This closes the last AES-GCM nonce-reuse path (the residual APO-644 hazard): the static path re-read the INI at epoch 1 on restart and reset the TX counter under an unchanged persisted key. 
With the control plane as the sole installer, every session derives fresh per-session ECDHE keys, so a reset counter is never paired with a reused key. --- cli/README.md | 69 ++--------------- cli/go.mod | 1 - cli/go.sum | 2 - cli/main.go | 211 +++++++------------------------------------------- 4 files changed, 33 insertions(+), 250 deletions(-) diff --git a/cli/README.md b/cli/README.md index da048b3..3f23010 100644 --- a/cli/README.md +++ b/cli/README.md @@ -1,17 +1,11 @@ # InterCloud eXpress (ICX) - CLI -ICX encrypts tunnel traffic with AES-128-GCM. Keys can be established two ways: +ICX encrypts tunnel traffic with AES-128-GCM. Keys are established by a **QUIC/mTLS +control plane**: a channel that authenticates the peers, negotiates fresh, +forward-secret, per-session keys, and rotates them automatically — so the tunnel is safe +across restarts with no persisted key state. -- **Control plane (recommended):** a QUIC/mTLS channel negotiates fresh, - forward-secret, per-session keys and rotates them automatically. This is the only - mode that is safe across restarts. -- **Static keys (legacy):** a pair of pre-shared keys loaded from an INI file and - rotated by hand via `SIGHUP`. Retained for compatibility; see the caveats below. - -The two modes are mutually exclusive and fail closed: configure exactly one. ICX never -silently falls back from the control plane to static keys. - -## Control plane (recommended) +## Control plane Each node has a long-term **identity key** (ECDSA P-256). Peers authenticate each other WireGuard-style by pinning the expected public key — there is no CA. The control channel @@ -108,56 +102,3 @@ This makes every recovery path seamless and symmetric: and a fresh handshake; the survivor accepts the reset SPI under its fresh key and traffic resumes immediately. There is no high-water to carry forward and no peer to cycle. -## Static keys (legacy) - -> Prefer the control plane. 
Static keys provide **no forward secrecy** and are **not safe -> across restarts**: a restart re-reads the INI starting again at epoch 1 with the TX -> counter reset to 0, so **do not restart against an unchanged key file** — rotate to -> fresh keys (below), otherwise the AES-GCM nonce sequence is reused under the same key. - -ICX enforces two invariants when keys are installed and refuses the key otherwise: `rx` -and `tx` **must differ** (each direction needs its own key), and the key epoch must -**strictly increase** within a running process. - -### 1) Generate two one-time keys - -```bash -# Key used for A → B traffic -K_AB=$(openssl rand -hex 16) -# Key used for B → A traffic -K_BA=$(openssl rand -hex 16) -``` - -### 2) Create an INI file on each host - -Each host reads keys from an INI file at `--key-file`. The required format is: - -```ini -[keys] -rx=<32 hex chars> # the key this host expects to RECEIVE with -tx=<32 hex chars> # the key this host will TRANSMIT with -# Optional expiry (defaults to 24h if omitted): -# - as a Go duration (e.g. 24h, 90m) -# - or an RFC3339 timestamp (e.g. 2025-10-16T12:34:56Z) -expires=24h -``` - -For Host A `rx=${K_BA}`, `tx=${K_AB}`; for Host B `rx=${K_AB}`, `tx=${K_BA}`. - -### 3) Start ICX on both hosts - -```bash -icx -i --key-file=/path/to/icx.ini : -``` - -### 4) Key rotation (SIGHUP) - -Update the same INI file with new rx/tx values, then send `SIGHUP`: - -```bash -pkill -HUP icx -``` - -ICX reloads the INI, bumps the epoch, and applies the new keys. If the reloaded keys are -identical to the current ones, the reload is refused (epoch unchanged). `SIGHUP` reload is -only active in static mode. 
diff --git a/cli/go.mod b/cli/go.mod index 9b3486c..6511a5a 100644 --- a/cli/go.mod +++ b/cli/go.mod @@ -10,7 +10,6 @@ require ( github.com/urfave/cli/v2 v2.27.7 github.com/vishvananda/netlink v1.3.1 golang.org/x/sync v0.16.0 - gopkg.in/ini.v1 v1.67.0 gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 ) diff --git a/cli/go.sum b/cli/go.sum index 2c4a788..58c6940 100644 --- a/cli/go.sum +++ b/cli/go.sum @@ -66,8 +66,6 @@ golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= -gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 h1:BEymU11L8DZSC4GNK48JYIR8EcHs+gFxtg9YfYlp68c= diff --git a/cli/main.go b/cli/main.go index 06a3fac..0274350 100644 --- a/cli/main.go +++ b/cli/main.go @@ -6,7 +6,6 @@ import ( "context" "crypto/ecdsa" "crypto/fips140" - "encoding/hex" "errors" "fmt" "io" @@ -18,7 +17,6 @@ import ( "os/signal" "runtime/pprof" "strings" - "sync" "syscall" "time" @@ -37,8 +35,6 @@ import ( "github.com/apoxy-dev/icx/permissions" "github.com/apoxy-dev/icx/queues" "github.com/apoxy-dev/icx/veth" - - ini "gopkg.in/ini.v1" ) func main() { @@ -56,10 +52,10 @@ func main() { Usage: "Set the logging level (debug, info, warn, error, fatal, panic)", Value: "info", }, - // NOTE: --interface and --key-file are intentionally NOT marked Required. 
- // urfave/cli enforces required root flags before dispatching to a - // subcommand, which would make `icx genkey`/`icx pubkey` unrunnable. They - // are validated inside run() instead. + // NOTE: --interface is intentionally NOT marked Required. urfave/cli + // enforces required root flags before dispatching to a subcommand, which + // would make `icx genkey`/`icx pubkey` unrunnable. It is validated inside + // run() instead. &cli.StringFlag{ Name: "interface", Aliases: []string{"i"}, @@ -71,10 +67,6 @@ func main() { Usage: "Virtual Network Identifier (VNI) for the tunnel (24-bit value)", Value: 1, }, - &cli.StringFlag{ - Name: "key-file", - Usage: "Path to INI file with static keys (rx, tx, optional expires). Legacy/static mode; prefer the control plane (--identity-key/--peer-key)", - }, &cli.StringFlag{ Name: "identity-key", Usage: "Path to this node's identity private key (PKCS#8 PEM from `icx genkey`). Enables the control plane", @@ -253,14 +245,10 @@ func run(c *cli.Context) error { } slog.SetLogLoggerLevel(level) - // Resolve the keying mode (static INI vs control plane) up front, fail-closed: - // exactly one must be configured, and there is no silent fallback between them. - keyFile := c.String("key-file") - identityKey := c.String("identity-key") - peerKey := c.String("peer-key") - mode, err := control.SelectMode(keyFile != "", identityKey != "", peerKey != "") - if err != nil { - return err + // The control plane is the only keying mode: require both halves of the pinned + // identity pair up front, fail-closed. There is deliberately no static/INI fallback. 
+ if c.String("identity-key") == "" || c.String("peer-key") == "" { + return errors.New("control-plane keying requires both --identity-key and --peer-key (generate them with `icx genkey` / `icx pubkey`)") } if c.String("interface") == "" { @@ -409,24 +397,15 @@ func run(c *cli.Context) error { return fmt.Errorf("failed to add virtual network: %w", err) } - // Keying: install the data-plane keys (and arrange ongoing rotation) according to - // the selected mode. tun is non-nil only in control-plane mode. - var tun *control.Tunnel - switch mode { - case control.ModeStatic: - if err := runStaticKeying(ctx, c, h, vni); err != nil { - return err - } - case control.ModeControlPlane: - var ctrlConn *net.UDPConn - tun, ctrlConn, err = startControlPlane(ctx, c, h, vni, peerUDPAddr) - if err != nil { - return err - } - defer func() { _ = tun.Close() }() - // The Tunnel only borrows the control socket; own its lifetime here. - defer func() { _ = ctrlConn.Close() }() + // Establish the QUIC/mTLS control plane: install the initial SAs and drive ongoing + // rekeys. This is the only keying path. + tun, ctrlConn, err := startControlPlane(ctx, c, h, vni, peerUDPAddr) + if err != nil { + return err } + defer func() { _ = tun.Close() }() + // The Tunnel only borrows the control socket; own its lifetime here. + defer func() { _ = ctrlConn.Close() }() fwd, err := forwarder.NewForwarder( h, @@ -439,93 +418,22 @@ func run(c *cli.Context) error { return fmt.Errorf("failed to create forwarder: %w", err) } - // In control-plane mode, run the rekey loop and the forwarder under one errgroup - // sharing a cancel: a signal (ctx) or a forwarder error stops both. The control - // plane reconnects indefinitely on its own (Tunnel.Run does not return on CP - // failures), so it does not abort the forwarder; if it cannot re-establish, the - // data plane fails closed when the installed keys expire (see key lifetime below). 
- if tun != nil { - g, gctx := errgroup.WithContext(ctx) - g.Go(func() error { return tun.Run(gctx) }) - g.Go(func() error { - if err := fwd.Start(gctx); err != nil && !errors.Is(err, context.Canceled) { - return fmt.Errorf("forwarder: %w", err) - } - return nil - }) - if err := g.Wait(); err != nil && !errors.Is(err, context.Canceled) { - return err + // Run the rekey loop and the forwarder under one errgroup sharing a cancel: a signal + // (ctx) or a forwarder error stops both. The control plane reconnects indefinitely on + // its own (Tunnel.Run does not return on CP failures), so it does not abort the + // forwarder; if it cannot re-establish, the data plane fails closed when the installed + // keys expire (see key lifetime below). + g, gctx := errgroup.WithContext(ctx) + g.Go(func() error { return tun.Run(gctx) }) + g.Go(func() error { + if err := fwd.Start(gctx); err != nil && !errors.Is(err, context.Canceled) { + return fmt.Errorf("forwarder: %w", err) } return nil - } - - if err := fwd.Start(ctx); err != nil && !errors.Is(err, context.Canceled) { - return fmt.Errorf("failed to start forwarder: %w", err) - } - - return nil -} - -// runStaticKeying installs the static INI keys and starts the SIGHUP reload loop. -// This is the legacy path; it is gated to ModeStatic so a control-plane deployment -// never has a competing installer touching the same VNI. -func runStaticKeying(ctx context.Context, c *cli.Context, h *icx.Handler, vni uint) error { - keyFile := c.String("key-file") - epoch := uint32(1) // initial epoch - rxKey, txKey, expiresAt, err := loadKeysFromINI(keyFile) - if err != nil { + }) + if err := g.Wait(); err != nil && !errors.Is(err, context.Canceled) { return err } - if err := h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, expiresAt); err != nil { - return fmt.Errorf("failed to update virtual network key: %w", err) - } - slog.Warn("using static INI keys (legacy mode); prefer the control plane (--identity-key/--peer-key). 
" + - "A restart re-reads the INI at epoch 1 with the TX counter reset, so do not restart against an unchanged key file") - - var keyMu sync.Mutex - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGHUP) - - go func() { - for { - select { - case <-ctx.Done(): - return - case <-sigCh: - newRX, newTX, newExpires, loadErr := loadKeysFromINI(keyFile) - if loadErr != nil { - slog.Error("Key reload failed", slog.Any("error", loadErr)) - continue - } - - keyMu.Lock() - // Refuse reload if keys are identical to current ones. - if rxKey == newRX && txKey == newTX { - slog.Warn("Refusing key reload: rx/tx unchanged; keeping current epoch", - slog.Uint64("epoch", uint64(epoch)), - ) - keyMu.Unlock() - continue - } - - // Keys changed: commit and bump epoch. - rxKey = newRX - txKey = newTX - expiresAt = newExpires - epoch++ - - if err := h.UpdateVirtualNetworkKeys(vni, epoch, rxKey, txKey, expiresAt); err != nil { - slog.Error("Failed to apply reloaded keys", slog.Any("error", err)) - } else { - slog.Info("Reloaded keys", - slog.Uint64("epoch", uint64(epoch)), - slog.Time("expiresAt", expiresAt), - ) - } - keyMu.Unlock() - } - } - }() return nil } @@ -622,69 +530,6 @@ func readPeerKey(val string) (*ecdsa.PublicKey, error) { return control.ParsePublicKey(s) } -// loadKeysFromINI loads rx/tx (hex, 16-byte each) and optional expires from an INI file. -// -// Required layout: -// -// [keys] -// rx=0123... (32 hex chars -> 16 bytes) -// tx=abcd... (32 hex chars -> 16 bytes) -// expires=24h // OR RFC3339 like 2025-10-16T12:34:56Z -// -// "expires" may be either a Go duration (e.g., "24h", "90m") or RFC3339 timestamp. -// If omitted, a default of 24h from now is applied. 
-func loadKeysFromINI(path string) (rxKey, txKey [16]byte, expiresAt time.Time, err error) { - cfg, err := ini.Load(path) - if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("open INI: %w", err) - } - - sec, err := cfg.GetSection("keys") - if err != nil { - return rxKey, txKey, time.Time{}, errors.New("INI must contain a [keys] section") - } - - get := func(k string) string { return strings.TrimSpace(sec.Key(k).String()) } - - rxHex := get("rx") - txHex := get("tx") - expStr := get("expires") - - if rxHex == "" || txHex == "" { - return rxKey, txKey, time.Time{}, errors.New("INI [keys] missing required keys: rx and tx") - } - if len(rxHex) != 32 || len(txHex) != 32 { - return rxKey, txKey, time.Time{}, errors.New("INI [keys] rx/tx must be 32 hex characters (16 bytes)") - } - - rxBytes, err := hex.DecodeString(rxHex) - if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("decode INI [keys].rx: %w", err) - } - txBytes, err := hex.DecodeString(txHex) - if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("decode INI [keys].tx: %w", err) - } - copy(rxKey[:], rxBytes) - copy(txKey[:], txBytes) - - // Expiry handling: optional; default to 24h if absent. - if expStr != "" { - // Prefer duration, fall back to RFC3339. - if d, derr := time.ParseDuration(expStr); derr == nil { - expiresAt = time.Now().Add(d) - } else if t, terr := time.Parse(time.RFC3339, expStr); terr == nil { - expiresAt = t - } else { - return rxKey, txKey, time.Time{}, fmt.Errorf("expires must be duration (e.g. 
24h) or RFC3339 timestamp: %q", expStr) - } - } else { - expiresAt = time.Now().Add(24 * time.Hour) - } - - return rxKey, txKey, expiresAt, nil -} - func selectSourceAddr(link netlink.Link, dstAddr *tcpip.FullAddress) (*tcpip.FullAddress, error) { var network string ip := net.IP(dstAddr.Addr.AsSlice()) From f79c7922f41d82f779a46ce90306e9ec847c9903 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Sun, 14 Jun 2026 10:07:57 -0700 Subject: [PATCH 18/20] [handler] reframe UpdateVirtualNetworkKeys as a general single-epoch seam With static-INI keying gone, UpdateVirtualNetworkKeys has no production caller and is no longer "the static --key-file path". Re-document it as the simple single-epoch manual- keying seam used by tests and embedders (the control plane installs genuine per-direction SAs via UpdateVirtualNetworkSAs). The strict epoch-monotonicity and distinct-key guards are unchanged; the doc now states the caller's cross-restart responsibility for a manually-supplied key that survives restarts. --- cp_wire_test.go | 6 +++--- handler.go | 35 +++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/cp_wire_test.go b/cp_wire_test.go index bea61ad..607656d 100644 --- a/cp_wire_test.go +++ b/cp_wire_test.go @@ -214,9 +214,9 @@ func TestSharedEpochCollapseMismatchDropsTraffic(t *testing.T) { hI := newPeerHandler(t, vni, addrA, addrB) hR := newPeerHandler(t, vni, addrB, addrA) - // Collapse both directions onto each peer's OWN receive SPI via the legacy shared- - // epoch shim. hI then transmits under iSAs.Rx.SPI, which hR (installed under - // rSAs.Rx.SPI) does not have. + // Collapse both directions onto each peer's OWN receive SPI via the single-epoch + // UpdateVirtualNetworkKeys seam. hI then transmits under iSAs.Rx.SPI, which hR + // (installed under rSAs.Rx.SPI) does not have. 
var iRx, iTx, rRx, rTx [16]byte copy(iRx[:], iSAs.Rx.Key) copy(iTx[:], iSAs.Tx.Key) diff --git a/handler.go b/handler.go index 27e7b5e..de2b75c 100644 --- a/handler.go +++ b/handler.go @@ -400,9 +400,9 @@ func (h *Handler) UpdateVirtualNetworkRoutes(vni uint, allowedRoutes []Route) er // accepted, as is any lower-or-higher txSPI; safety rests on the fresh-key guarantee // above, not on monotonicity. // Callers must serialize installs per VNI; the guard→install sequence is not -// internally locked (the control plane is single-threaded per Tunnel). Static -// pre-shared keys, which have NO per-session freshness, must use the strictly-guarded -// UpdateVirtualNetworkKeys instead. +// internally locked (the control plane is single-threaded per Tunnel). Manually-keyed +// SAs that lack per-session key freshness should use the strictly-guarded single-epoch +// UpdateVirtualNetworkKeys seam instead. func (h *Handler) UpdateVirtualNetworkSAs(vni uint, rxSPI, txSPI uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { @@ -447,19 +447,21 @@ func (h *Handler) UpdateVirtualNetworkSAs(vni uint, rxSPI, txSPI uint32, rxKey, return h.installKeys(vnet, rxSPI, txSPI, rxKey, txKey, expiresAt) } -// UpdateVirtualNetworkKeys is the legacy install seam for STATIC pre-shared keys -// (the --key-file/INI path): it installs a single epoch (SPI) for BOTH simplex -// directions, separated only by the distinct rx/tx keys. +// UpdateVirtualNetworkKeys installs a single epoch (SPI) for BOTH simplex directions, +// separated only by the distinct rx/tx keys. It is the simple manual-keying seam — used +// by tests and by embedders that drive their own keying rather than the QUIC control +// plane (which installs genuine per-direction SAs via UpdateVirtualNetworkSAs). 
// -// Unlike the control plane, static keys carry NO per-session freshness — the same key -// is reused across reloads and process restarts — so this path keeps the STRICT -// monotonicity guard: the epoch must strictly increase within the process. That stops -// an operator from reinstalling an older-or-equal epoch with a reused key (which would -// reset the counter under an already-used (epoch, key) and reuse a nonce). It does NOT -// (and cannot) prevent a cross-RESTART reuse — a restart resets the in-memory counter -// to zero under the persisted key (the residual APO-644 hazard); the control plane -// (fresh per-session keys) is the fix, which is why static keying is being retired. -// Callers must serialize installs per VNI (the static path does so under a mutex). +// It enforces a STRICT monotonicity guard: the epoch must strictly increase within the +// process. That stops a caller from reinstalling an older-or-equal epoch with a reused +// key, which would reset the GCM counter under an already-used (epoch, key) and repeat a +// nonce. The guard cannot see across process restarts, so a caller that supplies a key +// which SURVIVES restarts (e.g. one read from disk) MUST advance the epoch past the last +// value used in any prior run — otherwise the from-zero counter reuses nonces under the +// persisted key. The control plane sidesteps this entirely by deriving a fresh key per +// session; manual callers own the invariant. +// +// Callers must serialize installs per VNI. func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { @@ -470,7 +472,8 @@ func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey if epoch == 0 { return errors.New("epoch (SPI) must be non-zero") } - // Strict monotonicity (static keys have no per-session key freshness to fall back on). 
+ // Strict monotonicity: a manual-keyed caller has no per-session key freshness to fall + // back on, so the epoch must strictly increase or a reset counter could reuse a nonce. if cur := vnet.txCipher.Load(); cur != nil && epoch <= cur.epoch { return fmt.Errorf("epoch must be monotonically increasing: new %d <= current %d", epoch, cur.epoch) } From f0ec409b17dc0568dc5e09b7158d22daa8f5483d Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Sun, 14 Jun 2026 10:08:03 -0700 Subject: [PATCH 19/20] [control] remove the keying-mode machinery (Mode/SelectMode) The control plane is the only keying mode now, so the Mode enum, its String method, and SelectMode (which chose between static-INI and control-plane keying) are dead. Remove them along with TestSelectMode; the CLI validates the required identity/peer keys directly. --- control/cp.go | 45 --------------------------------------------- control/cp_test.go | 28 ---------------------------- 2 files changed, 73 deletions(-) diff --git a/control/cp.go b/control/cp.go index 9645de2..d549e0b 100644 --- a/control/cp.go +++ b/control/cp.go @@ -34,51 +34,6 @@ import ( // SPI), which makes a transient reconnect and a one-sided restart of either peer recover // seamlessly with zero on-disk state. -// Mode is the keying mode selected from the CLI flags. -type Mode int - -const ( - ModeNone Mode = iota - // ModeStatic is the legacy static pre-shared keys loaded from an INI file. - ModeStatic - // ModeControlPlane is the QUIC/mTLS control plane with ephemeral, forward-secret, - // per-session keys. - ModeControlPlane -) - -func (m Mode) String() string { - switch m { - case ModeStatic: - return "static" - case ModeControlPlane: - return "control-plane" - default: - return "none" - } -} - -// SelectMode resolves the keying mode from which flags are set, fail-closed. -// Exactly one mode must be configured: either static keys (--key-file) OR the -// control plane (--identity-key AND --peer-key). 
Any other combination — both, the -// control plane half-configured, or nothing — is an error. There is deliberately no -// silent fallback from the control plane to static keys. -func SelectMode(hasKeyFile, hasIdentity, hasPeer bool) (Mode, error) { - cp := hasIdentity || hasPeer - switch { - case hasKeyFile && cp: - return ModeNone, errors.New("conflicting keying modes: set --key-file (static) OR --identity-key/--peer-key (control plane), not both") - case cp: - if !hasIdentity || !hasPeer { - return ModeNone, errors.New("control-plane mode requires both --identity-key and --peer-key") - } - return ModeControlPlane, nil - case hasKeyFile: - return ModeStatic, nil - default: - return ModeNone, errors.New("no keying configured: set --key-file (static) or --identity-key and --peer-key (control plane)") - } -} - // CanonicalInitiator reports whether the local node is the control-plane initiator // — the peer that dials. The role is elected deterministically from the two pinned // identities so both ends agree with zero configuration (WireGuard-style): the node diff --git a/control/cp_test.go b/control/cp_test.go index 3e9e8f4..9cbede8 100644 --- a/control/cp_test.go +++ b/control/cp_test.go @@ -13,34 +13,6 @@ import ( "github.com/stretchr/testify/require" ) -func TestSelectMode(t *testing.T) { - cases := []struct { - name string - keyFile, identity, peer bool - want Mode - wantErr bool - }{ - {"static", true, false, false, ModeStatic, false}, - {"control-plane", false, true, true, ModeControlPlane, false}, - {"cp half (identity only)", false, true, false, ModeNone, true}, - {"cp half (peer only)", false, false, true, ModeNone, true}, - {"both modes", true, true, true, ModeNone, true}, - {"static + identity", true, true, false, ModeNone, true}, - {"nothing", false, false, false, ModeNone, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - got, err := SelectMode(tc.keyFile, tc.identity, tc.peer) - if tc.wantErr { - require.Error(t, err) - 
return - } - require.NoError(t, err) - require.Equal(t, tc.want, got) - }) - } -} - func TestCanonicalInitiator(t *testing.T) { a, err := GenerateIdentity() require.NoError(t, err) From 839a2c1c5f47006007de63fa9192f89c2dde2a8e Mon Sep 17 00:00:00 2001 From: Dmitry Ilyevsky Date: Sun, 14 Jun 2026 18:06:11 -0700 Subject: [PATCH 20/20] =?UTF-8?q?[control]=20rename=20PSPVersion=20?= =?UTF-8?q?=E2=86=92=20ICXVersion=20(cipher-suite=20selector)=20(APO-758)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PSPVersion / PSPv0 / PSPv1 named a PSP wire attribute icx does not emit — icx borrows PSP's key schedule but runs no PSP packet format (the data plane is Geneve + AF_XDP). Rename the local AEAD cipher-suite selector to ICXVersion, with AESGCM128 / AESGCM256 for the two suites, and the saOffer field to Version (matching SA.Version). The genuine PSP key-schedule primitives (DeriveSAKey, MasterKeys, MasterKeyIndex, the SPI bit-layout, the CMAC PRF) keep the PSP name — they implement the spec and the name is a correct pointer to it. Pure Go-identifier rename, no byte changes: the on-wire codepoints (0/1), the KDF label bytes ("Pv0\0"/"Pv1\0"), and the masterKeyInfo domain string are untouched, and the PSP-spec KDF vector test (TestDeriveSAKey_PSPSpec) still passes. 
--- _examples/keyexchange/main.go | 20 ++++++++--------- control/cp.go | 8 +++---- control/cp_test.go | 10 ++++----- control/kdf.go | 41 ++++++++++++++++++----------------- control/kdf_test.go | 14 ++++++------ control/protocol.go | 14 ++++++------ control/sa.go | 6 ++--- control/sa_test.go | 14 ++++++------ control/transport.go | 14 ++++++------ control/transport_test.go | 6 ++--- cp_wire_test.go | 4 ++-- 11 files changed, 76 insertions(+), 75 deletions(-) diff --git a/_examples/keyexchange/main.go b/_examples/keyexchange/main.go index 2abfdad..8a69177 100644 --- a/_examples/keyexchange/main.go +++ b/_examples/keyexchange/main.go @@ -27,12 +27,12 @@ import ( ) func main() { - pspV1 := flag.Bool("v1", false, "use PSP v1 (AES-256-GCM) instead of v0 (AES-128-GCM)") + useV1 := flag.Bool("v1", false, "use the AES-GCM-256 cipher suite instead of AES-GCM-128") flag.Parse() - version := control.PSPv0 - if *pspV1 { - version = control.PSPv1 + version := control.AESGCM128 + if *useV1 { + version = control.AESGCM256 } if err := run(version); err != nil { @@ -40,7 +40,7 @@ func main() { } } -func run(version control.PSPVersion) error { +func run(version control.ICXVersion) error { ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) defer cancel() @@ -123,7 +123,7 @@ func run(version control.PSPVersion) error { // 6. Report and verify. 
fmt.Printf("master keys agree: %v\n", initSess.MasterKeys() != nil && r.sess.MasterKeys() != nil) - fmt.Printf("SAs (PSP %s):\n", pspName(version)) + fmt.Printf("SAs (%s):\n", suiteName(version)) fmt.Printf(" initiator: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", initSAs.Tx.SPI, fp(initSAs.Tx.Key), initSAs.Rx.SPI, fp(initSAs.Rx.Key)) fmt.Printf(" responder: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", @@ -160,15 +160,15 @@ func equal(a, b []byte) bool { return true } -func expectedKeyLen(v control.PSPVersion) int { - if v == control.PSPv1 { +func expectedKeyLen(v control.ICXVersion) int { + if v == control.AESGCM256 { return 32 } return 16 } -func pspName(v control.PSPVersion) string { - if v == control.PSPv1 { +func suiteName(v control.ICXVersion) string { + if v == control.AESGCM256 { return "v1/AES-256-GCM" } return "v0/AES-128-GCM" diff --git a/control/cp.go b/control/cp.go index d549e0b..7b50473 100644 --- a/control/cp.go +++ b/control/cp.go @@ -248,14 +248,14 @@ func (t *Tunnel) runResponder(ctx context.Context) error { // result. installSAs swallows an install rejection (returns nil) so it does not look // like a transport failure; a non-nil error here means the wire exchange failed. func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { - sas, err := t.sess.NegotiateSAs(ctx, PSPv0) + sas, err := t.sess.NegotiateSAs(ctx, AESGCM128) if err != nil { return err } return t.installSAs(sas) } -// installSAs validates the negotiated SAs fail-closed (PSPv0, 16-byte keys) and hands +// installSAs validates the negotiated SAs fail-closed (AES-GCM-128, 16-byte keys) and hands // the two per-direction SPIs/keys to the installer. Every session derives fresh keys // (fresh ECDHE; see transport.go), so the handler accepts the install even when the // per-session allocator reset the SPIs to a low value after a reconnect. 
An install @@ -263,8 +263,8 @@ func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { // currently-live transmit SPI): the previously installed keys keep forwarding and the // data plane fails closed on their own expiry. func (t *Tunnel) installSAs(sas *DirectionalSAs) error { - if sas.Tx.Version != PSPv0 || sas.Rx.Version != PSPv0 { - return fmt.Errorf("control: only PSPv0/AES-128 is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) + if sas.Tx.Version != AESGCM128 || sas.Rx.Version != AESGCM128 { + return fmt.Errorf("control: only the AES-GCM-128 cipher suite is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) } if len(sas.Rx.Key) != 16 || len(sas.Tx.Key) != 16 { return fmt.Errorf("control: expected 16-byte SA keys (rx=%d tx=%d)", len(sas.Rx.Key), len(sas.Tx.Key)) diff --git a/control/cp_test.go b/control/cp_test.go index 9cbede8..a55722e 100644 --- a/control/cp_test.go +++ b/control/cp_test.go @@ -40,7 +40,7 @@ func TestCanonicalInitiator(t *testing.T) { require.Error(t, err) } -// validV0SAs returns a role-partitioned, PSPv0 DirectionalSAs with distinct +// validV0SAs returns a role-partitioned, AESGCM128 DirectionalSAs with distinct // 16-byte keys, as NegotiateSAs would produce. 
func validV0SAs() *DirectionalSAs { iSPI, _ := MakeSPI(0, Initiator, 1) @@ -52,18 +52,18 @@ func validV0SAs() *DirectionalSAs { tx[i] = byte(i + 100) } return &DirectionalSAs{ - Tx: &SA{SPI: rSPI, Key: tx, Version: PSPv0}, - Rx: &SA{SPI: iSPI, Key: rx, Version: PSPv0}, + Tx: &SA{SPI: rSPI, Key: tx, Version: AESGCM128}, + Rx: &SA{SPI: iSPI, Key: rx, Version: AESGCM128}, } } func TestInstallSAsRejectsNonV0(t *testing.T) { tn := &Tunnel{install: func(uint32, uint32, [16]byte, [16]byte) error { - t.Fatal("installer must not be called for a non-PSPv0 SA") + t.Fatal("installer must not be called for a non-AES-GCM-128 SA") return nil }} sas := validV0SAs() - sas.Tx.Version = PSPv1 + sas.Tx.Version = AESGCM256 sas.Tx.Key = make([]byte, 32) require.Error(t, tn.installSAs(sas)) } diff --git a/control/kdf.go b/control/kdf.go index 3ddbde0..1909058 100644 --- a/control/kdf.go +++ b/control/kdf.go @@ -5,17 +5,18 @@ import ( "fmt" ) -// PSPVersion is a PSP encryption-mode codepoint. It selects both the AEAD -// (AES-GCM-128 vs AES-GCM-256) and, via the KDF label, the size of the derived -// security-association key. -type PSPVersion uint8 +// ICXVersion is an AEAD cipher-suite codepoint for an SA. It selects both the +// AEAD (AES-GCM-128 vs AES-GCM-256) and, via the KDF label, the size of the +// derived security-association key. It is a local cipher selector, not a +// wire-format version (that is ProtocolVersion in protocol.go). +type ICXVersion uint8 const ( - // PSPv0 is AES-GCM-128: a 16-byte SA key. Required by every PSP - // implementation; the ICX default (zero churn to the [16]byte data plane). - PSPv0 PSPVersion = 0 - // PSPv1 is AES-GCM-256: a 32-byte SA key. The CNSA / 256-bit path. - PSPv1 PSPVersion = 1 + // AESGCM128 selects AES-GCM-128: a 16-byte SA key. The ICX default (zero + // churn to the [16]byte data plane). + AESGCM128 ICXVersion = 0 + // AESGCM256 selects AES-GCM-256: a 32-byte SA key. The CNSA / 256-bit path. 
+ AESGCM256 ICXVersion = 1 ) // MasterKeyLen is the required length of a PSP master key (256 bits). PSP @@ -26,19 +27,19 @@ const MasterKeyLen = 32 // (0x50 0x76 0x30 0x00) for v0, "Pv1\0" for v1. The trailing NUL also serves as // the SP 800-108 label/context separator. Per the spec, the version number may // be OR'd into the third byte of the base label. -func (v PSPVersion) label() [4]byte { +func (v ICXVersion) label() [4]byte { return [4]byte{0x50, 0x76, 0x30 | byte(v), 0x00} } -// valid reports whether v is a supported PSP version. Callers must reject +// valid reports whether v is a supported cipher suite. Callers must reject // unsupported versions before deriving keys (fail-closed), so keyLen/label are // never asked to map an unknown version. -func (v PSPVersion) valid() bool { return v == PSPv0 || v == PSPv1 } +func (v ICXVersion) valid() bool { return v == AESGCM128 || v == AESGCM256 } // keyLen returns the derived SA key length in bytes for the version. Only valid // versions reach here (guarded by DeriveSAKey); v0 = 16, v1 = 32. -func (v PSPVersion) keyLen() int { - if v == PSPv1 { +func (v ICXVersion) keyLen() int { + if v == AESGCM256 { return 32 } return 16 @@ -57,12 +58,12 @@ func (v PSPVersion) keyLen() int { // The caller is responsible for selecting which master key to pass based on the // SPI's most-significant bit (the PSP master-key selector); the SPI is fed into // the KDF context verbatim, MSB included, so the derivation is bound to it. 
-func DeriveSAKey(masterKey []byte, spi uint32, v PSPVersion) ([]byte, error) { +func DeriveSAKey(masterKey []byte, spi uint32, v ICXVersion) ([]byte, error) { if len(masterKey) != MasterKeyLen { return nil, fmt.Errorf("control: master key must be %d bytes, got %d", MasterKeyLen, len(masterKey)) } if !v.valid() { - return nil, fmt.Errorf("control: unsupported PSP version %d", v) + return nil, fmt.Errorf("control: unsupported cipher suite %d", v) } keyLen := v.keyLen() @@ -73,10 +74,10 @@ func DeriveSAKey(masterKey []byte, spi uint32, v PSPVersion) ([]byte, error) { out := make([]byte, 0, blocks*16) for i := 1; i <= blocks; i++ { var in [16]byte - binary.BigEndian.PutUint32(in[0:4], uint32(i)) // counter - copy(in[4:8], label[:]) // label - binary.BigEndian.PutUint32(in[8:12], spi) // context = SPI - binary.BigEndian.PutUint32(in[12:16], bitLen) // length (bits) + binary.BigEndian.PutUint32(in[0:4], uint32(i)) // counter + copy(in[4:8], label[:]) // label + binary.BigEndian.PutUint32(in[8:12], spi) // context = SPI + binary.BigEndian.PutUint32(in[12:16], bitLen) // length (bits) mac, err := aesCMAC(masterKey, in[:]) if err != nil { return nil, err diff --git a/control/kdf_test.go b/control/kdf_test.go index af534e0..ed35ce2 100644 --- a/control/kdf_test.go +++ b/control/kdf_test.go @@ -60,20 +60,20 @@ func TestDeriveSAKey_PSPSpec(t *testing.T) { name string master []byte spi uint32 - version PSPVersion + version ICXVersion want string }{ { - name: "v0_spi_12345678_mk0", master: k0, spi: 0x12345678, version: PSPv0, + name: "v0_spi_12345678_mk0", master: k0, spi: 0x12345678, version: AESGCM128, want: "96c22dc799198090b74b70ae468e4e30", }, { // MSB set -> master key 1 selected by the caller. 
- name: "v0_spi_9A345678_mk1", master: k1, spi: 0x9A345678, version: PSPv0, + name: "v0_spi_9A345678_mk1", master: k1, spi: 0x9A345678, version: AESGCM128, want: "3946da2554eae46ad1ef77a64372edc4", }, { - name: "v1_spi_12345678_mk0", master: k0, spi: 0x12345678, version: PSPv1, + name: "v1_spi_12345678_mk0", master: k0, spi: 0x12345678, version: AESGCM256, want: "2b7d72074e42ca334487f2990e3f8c4037e436f38283449b76463e9b7fb2e3de", }, } @@ -94,14 +94,14 @@ func TestDeriveSAKey_PSPSpec(t *testing.T) { } func TestDeriveSAKey_BadMasterKeyLen(t *testing.T) { - if _, err := DeriveSAKey(make([]byte, 16), 1, PSPv0); err == nil { + if _, err := DeriveSAKey(make([]byte, 16), 1, AESGCM128); err == nil { t.Fatal("expected error for 16-byte master key, got nil") } } func TestDeriveSAKey_UnsupportedVersionFailsClosed(t *testing.T) { mk := make([]byte, MasterKeyLen) - if _, err := DeriveSAKey(mk, 1, PSPVersion(7)); err == nil { - t.Fatal("expected error for unsupported PSP version, got nil (must fail closed, not default to 16 bytes)") + if _, err := DeriveSAKey(mk, 1, ICXVersion(7)); err == nil { + t.Fatal("expected error for unsupported cipher suite, got nil (must fail closed, not default to 16 bytes)") } } diff --git a/control/protocol.go b/control/protocol.go index 082e550..427ffa3 100644 --- a/control/protocol.go +++ b/control/protocol.go @@ -30,20 +30,20 @@ const ( ) // saOffer announces the SPI on which the sender will RECEIVE data-plane traffic -// for the given PSP version. The peer derives the key for this SPI and uses it +// for the given cipher suite. The peer derives the key for this SPI and uses it // as its TX key; the sender uses it as its RX key. 
type saOffer struct { - PSPVersion PSPVersion - RxSPI uint32 + Version ICXVersion + RxSPI uint32 } -const saOfferLen = 1 + 1 + 1 + 4 // protoVer + type + pspVer + rxSPI +const saOfferLen = 1 + 1 + 1 + 4 // protoVer + type + suite + rxSPI func (o saOffer) marshal() []byte { b := make([]byte, saOfferLen) b[0] = ProtocolVersion b[1] = byte(msgSAOffer) - b[2] = byte(o.PSPVersion) + b[2] = byte(o.Version) binary.BigEndian.PutUint32(b[3:], o.RxSPI) return b } @@ -59,8 +59,8 @@ func parseSAOffer(b []byte) (saOffer, error) { return saOffer{}, fmt.Errorf("control: expected SA offer, got message type %d", b[1]) } return saOffer{ - PSPVersion: PSPVersion(b[2]), - RxSPI: binary.BigEndian.Uint32(b[3:7]), + Version: ICXVersion(b[2]), + RxSPI: binary.BigEndian.Uint32(b[3:7]), }, nil } diff --git a/control/sa.go b/control/sa.go index e08f0f5..27e73b7 100644 --- a/control/sa.go +++ b/control/sa.go @@ -48,15 +48,15 @@ func DeriveMasterKeys(rootSecret []byte) (*MasterKeys, error) { func MasterKeyIndex(spi uint32) int { return int(spi >> 31) } // SA is a unidirectional PSP security association: an SPI, the derived AES-GCM -// key, and the PSP version (which fixes the key length / cipher). +// key, and the cipher suite (which fixes the key length / cipher). type SA struct { SPI uint32 Key []byte - Version PSPVersion + Version ICXVersion } // DeriveSA derives the SA key for spi using the master key its MSB selects. 
-func (m *MasterKeys) DeriveSA(spi uint32, v PSPVersion) (*SA, error) { +func (m *MasterKeys) DeriveSA(spi uint32, v ICXVersion) (*SA, error) { if spi&spiLowMask == 0 { return nil, errors.New("control: SPI low 31 bits must be non-zero (zero is reserved)") } diff --git a/control/sa_test.go b/control/sa_test.go index d5cb64f..2bf92ab 100644 --- a/control/sa_test.go +++ b/control/sa_test.go @@ -44,20 +44,20 @@ func TestDeriveSAMatchesKDFAndSelectsMasterKey(t *testing.T) { spi0, _ := MakeSPI(0, Initiator, 7) spi1, _ := MakeSPI(1, Initiator, 7) - sa0, err := mk.DeriveSA(spi0, PSPv0) + sa0, err := mk.DeriveSA(spi0, AESGCM128) if err != nil { t.Fatal(err) } - want0, _ := DeriveSAKey(mk.keys[0][:], spi0, PSPv0) + want0, _ := DeriveSAKey(mk.keys[0][:], spi0, AESGCM128) if !bytes.Equal(sa0.Key, want0) { t.Fatal("DeriveSA(MSB=0) did not use master key 0") } - sa1, err := mk.DeriveSA(spi1, PSPv0) + sa1, err := mk.DeriveSA(spi1, AESGCM128) if err != nil { t.Fatal(err) } - want1, _ := DeriveSAKey(mk.keys[1][:], spi1, PSPv0) + want1, _ := DeriveSAKey(mk.keys[1][:], spi1, AESGCM128) if !bytes.Equal(sa1.Key, want1) { t.Fatal("DeriveSA(MSB=1) did not use master key 1") } @@ -89,8 +89,8 @@ func TestDirectionsNeverCollide(t *testing.T) { } seen[is], seen[rs] = true, true - txSA, _ := mk.DeriveSA(is, PSPv0) - rxSA, _ := mk.DeriveSA(rs, PSPv0) + txSA, _ := mk.DeriveSA(is, AESGCM128) + rxSA, _ := mk.DeriveSA(rs, AESGCM128) if bytes.Equal(txSA.Key, rxSA.Key) { t.Fatal("tx and rx SA keys collided") } @@ -112,7 +112,7 @@ func TestMakeSPIValidation(t *testing.T) { func TestDeriveSARejectsReservedSPI(t *testing.T) { mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x01}, RootSecretLen)) // SPI whose low 31 bits are zero (only the master-key bit set) is reserved. 
- if _, err := mk.DeriveSA(uint32(1)<<31, PSPv0); err == nil { + if _, err := mk.DeriveSA(uint32(1)<<31, AESGCM128); err == nil { t.Fatal("expected error for reserved SPI (zero low 31 bits)") } } diff --git a/control/transport.go b/control/transport.go index 14c1b4e..438e23a 100644 --- a/control/transport.go +++ b/control/transport.go @@ -185,7 +185,7 @@ type DirectionalSAs struct { } // NegotiateSAs runs the SA-setup exchange over a fresh QUIC stream and returns -// the tx/rx SAs for PSP version v. Each peer allocates and announces its own RX +// the tx/rx SAs for cipher suite v. Each peer allocates and announces its own RX // SPI; both then derive every key locally from the shared master keys. The // initiator writes first, the responder replies, so there is no deadlock. // @@ -201,15 +201,15 @@ type DirectionalSAs struct { // sequentially, or have both peers issue the same number of concurrent calls // (≤ MaxIncomingStreams); a surplus initiator call blocks until a matching // responder call or the ctx deadline. -func (s *Session) NegotiateSAs(ctx context.Context, v PSPVersion) (*DirectionalSAs, error) { +func (s *Session) NegotiateSAs(ctx context.Context, v ICXVersion) (*DirectionalSAs, error) { if !v.valid() { - return nil, fmt.Errorf("control: unsupported PSP version %d", v) + return nil, fmt.Errorf("control: unsupported cipher suite %d", v) } myRxSPI, err := s.rxAlloc.Allocate(activeMasterKeyIndex) if err != nil { return nil, err } - offer := saOffer{PSPVersion: v, RxSPI: myRxSPI} + offer := saOffer{Version: v, RxSPI: myRxSPI} var stream *quic.Stream if s.role == Initiator { @@ -248,9 +248,9 @@ func (s *Session) NegotiateSAs(ctx context.Context, v PSPVersion) (*DirectionalS // deriveDirectional derives the tx/rx SAs and enforces the txKey != rxKey // invariant (the role-partitioned SPI space guarantees distinct SPIs, but we // assert on the derived keys as a belt-and-suspenders check). 
-func (s *Session) deriveDirectional(v PSPVersion, myRxSPI uint32, peer saOffer) (*DirectionalSAs, error) { - if peer.PSPVersion != v { - return nil, fmt.Errorf("control: PSP version mismatch: local %d, peer %d", v, peer.PSPVersion) +func (s *Session) deriveDirectional(v ICXVersion, myRxSPI uint32, peer saOffer) (*DirectionalSAs, error) { + if peer.Version != v { + return nil, fmt.Errorf("control: cipher suite mismatch: local %d, peer %d", v, peer.Version) } rx, err := s.masterKeys.DeriveSA(myRxSPI, v) if err != nil { diff --git a/control/transport_test.go b/control/transport_test.go index 38cabb0..ef1adaa 100644 --- a/control/transport_test.go +++ b/control/transport_test.go @@ -99,10 +99,10 @@ func TestControlSessionHandshakeAndSANegotiation(t *testing.T) { } rCh := make(chan res, 1) go func() { - sas, err := responder.NegotiateSAs(ctx, PSPv0) + sas, err := responder.NegotiateSAs(ctx, AESGCM128) rCh <- res{sas, err} }() - iSAs, err := initiator.NegotiateSAs(ctx, PSPv0) + iSAs, err := initiator.NegotiateSAs(ctx, AESGCM128) if err != nil { t.Fatalf("initiator NegotiateSAs: %v", err) } @@ -164,7 +164,7 @@ func TestControlSessionRejectsUnpinnedPeer(t *testing.T) { return // rejected at dial — also acceptable } defer sess.Close() - if _, err := sess.NegotiateSAs(ctx, PSPv0); err == nil { + if _, err := sess.NegotiateSAs(ctx, AESGCM128); err == nil { t.Fatal("SA negotiation succeeded against a responder that pinned a different key") } } diff --git a/cp_wire_test.go b/cp_wire_test.go index 607656d..7eaf94c 100644 --- a/cp_wire_test.go +++ b/cp_wire_test.go @@ -64,10 +64,10 @@ func negotiateLoopback(t *testing.T) (iSAs, rSAs *control.DirectionalSAs) { } negCh := make(chan nres, 1) go func() { - sas, err := rSess.NegotiateSAs(ctx, control.PSPv0) + sas, err := rSess.NegotiateSAs(ctx, control.AESGCM128) negCh <- nres{sas, err} }() - iSAs, err = iSess.NegotiateSAs(ctx, control.PSPv0) + iSAs, err = iSess.NegotiateSAs(ctx, control.AESGCM128) require.NoError(t, err) neg := 
<-negCh require.NoError(t, neg.err)