diff --git a/_examples/keyexchange/main.go b/_examples/keyexchange/main.go new file mode 100644 index 0000000..8a69177 --- /dev/null +++ b/_examples/keyexchange/main.go @@ -0,0 +1,182 @@ +// Command keyexchange is a runnable demonstration of the ICX control plane: +// two peers establish a forward-secret, mutually-authenticated QUIC/mTLS +// session over loopback, derive PSP master keys from the TLS exporter, and +// negotiate per-direction Security Associations whose AES-GCM keys feed the +// Geneve/AF_XDP data plane. +// +// It runs both peers in one process and self-verifies the result, so it doubles +// as living documentation and a smoke test. Build under GODEBUG=fips140=on to +// confirm the whole exchange uses only FIPS-approved primitives: +// +// GODEBUG=fips140=on go run ./_examples/keyexchange +// +// This example tracks the control-plane API as it evolves; keep it building. +package main + +import ( + "context" + "crypto/sha256" + "crypto/tls" + "flag" + "fmt" + "log" + "net" + "time" + + "github.com/apoxy-dev/icx/control" +) + +func main() { + useV1 := flag.Bool("v1", false, "use the AES-GCM-256 cipher suite instead of AES-GCM-128") + flag.Parse() + + version := control.AESGCM128 + if *useV1 { + version = control.AESGCM256 + } + + if err := run(version); err != nil { + log.Fatalf("keyexchange demo failed: %v", err) + } +} + +func run(version control.ICXVersion) error { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + // 1. Long-term identities. In production each side holds its own private key + // and is configured with the peer's public key (--peer-key), WireGuard + // style. Here we mint both. 
+ initiatorID, err := control.GenerateIdentity() + if err != nil { + return err + } + responderID, err := control.GenerateIdentity() + if err != nil { + return err + } + iFP, _ := initiatorID.Fingerprint() + rFP, _ := responderID.Fingerprint() + fmt.Printf("identities:\n initiator %s\n responder %s\n", iFP, rFP) + + // 2. Loopback UDP sockets (the control-plane port; AF_XDP owns the data port). + srvConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + return err + } + defer srvConn.Close() + cliConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + if err != nil { + return err + } + defer cliConn.Close() + + // 3. Responder listens (and pins the initiator's key). + ln, err := control.Listen(srvConn, responderID, initiatorID.PublicKey()) + if err != nil { + return err + } + defer ln.Close() + + type negResult struct { + sess *control.Session + sas *control.DirectionalSAs + err error + } + respCh := make(chan negResult, 1) + go func() { + sess, err := ln.Accept(ctx) + if err != nil { + respCh <- negResult{err: err} + return + } + sas, err := sess.NegotiateSAs(ctx, version) + respCh <- negResult{sess: sess, sas: sas, err: err} + }() + + // 4. Initiator dials (and pins the responder's key) — this is the TLS 1.3 + // handshake: mutual auth + ephemeral ECDHE (forward secrecy). + initSess, err := control.Dial(ctx, cliConn, ln.Addr(), initiatorID, responderID.PublicKey()) + if err != nil { + return fmt.Errorf("dial: %w", err) + } + defer initSess.Close() + + st := initSess.TLSState() + fmt.Printf("handshake: TLS %s, cipher %s, ALPN %q\n", + tlsVersionName(st.Version), tls.CipherSuiteName(st.CipherSuite), st.NegotiatedProtocol) + + // 5. Negotiate SAs (initiator side). 
+ initSAs, err := initSess.NegotiateSAs(ctx, version) + if err != nil { + return fmt.Errorf("initiator NegotiateSAs: %w", err) + } + + r := <-respCh + if r.sess != nil { + defer r.sess.Close() + } + if r.err != nil { + return fmt.Errorf("responder side: %w", r.err) + } + respSAs := r.sas + + // 6. Report and verify. + fmt.Printf("master keys agree: %v\n", initSess.MasterKeys() != nil && r.sess.MasterKeys() != nil) + fmt.Printf("SAs (%s):\n", suiteName(version)) + fmt.Printf(" initiator: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", + initSAs.Tx.SPI, fp(initSAs.Tx.Key), initSAs.Rx.SPI, fp(initSAs.Rx.Key)) + fmt.Printf(" responder: tx spi=%#08x key=%s | rx spi=%#08x key=%s\n", + respSAs.Tx.SPI, fp(respSAs.Tx.Key), respSAs.Rx.SPI, fp(respSAs.Rx.Key)) + + if !equal(initSAs.Tx.Key, respSAs.Rx.Key) || !equal(initSAs.Rx.Key, respSAs.Tx.Key) { + return fmt.Errorf("VERIFY FAILED: tx/rx keys do not cross-match between peers") + } + if equal(initSAs.Tx.Key, initSAs.Rx.Key) { + return fmt.Errorf("VERIFY FAILED: initiator tx and rx keys collided") + } + if len(initSAs.Tx.Key) != expectedKeyLen(version) { + return fmt.Errorf("VERIFY FAILED: key length %d, want %d", len(initSAs.Tx.Key), expectedKeyLen(version)) + } + + fmt.Println("VERIFY OK: cross-matched, tx≠rx, FIPS-suite handshake, keys never crossed the wire") + return nil +} + +func fp(key []byte) string { + sum := sha256.Sum256(key) + return fmt.Sprintf("%x", sum[:6]) +} + +func equal(a, b []byte) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func expectedKeyLen(v control.ICXVersion) int { + if v == control.AESGCM256 { + return 32 + } + return 16 +} + +func suiteName(v control.ICXVersion) string { + if v == control.AESGCM256 { + return "v1/AES-256-GCM" + } + return "v0/AES-128-GCM" +} + +func tlsVersionName(v uint16) string { + if v == tls.VersionTLS13 { + return "1.3" + } + return fmt.Sprintf("%#x", v) +} diff --git a/cli/README.md 
b/cli/README.md index 5a69e54..3f23010 100644 --- a/cli/README.md +++ b/cli/README.md @@ -1,82 +1,104 @@ # InterCloud eXpress (ICX) - CLI -## Usage +ICX encrypts tunnel traffic with AES-128-GCM. Keys are established by a **QUIC/mTLS +control plane**: a channel that authenticates the peers, negotiates fresh, +forward-secret, per-session keys, and rotates them automatically — so the tunnel is safe +across restarts with no persisted key state. -ICX uses a pair of **ephemeral, per-session** symmetric keys for encrypting traffic. -**Do not reuse keys** across sessions (to avoid nonce reuse risks). +## Control plane -In production, use a secure key exchange mechanism (e.g., IKEv2) to -generate and distribute keys. +Each node has a long-term **identity key** (ECDSA P-256). Peers authenticate each other +WireGuard-style by pinning the expected public key — there is no CA. The control channel +runs on its own UDP port (`--control-port`, default `6082`), separate from the Geneve +data port (`--port`, default `6081`); the XDP filter only redirects the data port to +AF_XDP, so the control port rides the normal kernel stack. -### 1) Generate two one-time keys +### 1) Generate an identity on each host ```bash -# Key used for A → B traffic -K_AB=$(openssl rand -hex 16) -# Key used for B → A traffic -K_BA=$(openssl rand -hex 16) -``` - -### 2) Create an INI file on each host - -Each host reads keys from an INI file at --key-file. The required format is: +# Host A +icx genkey --identity-key /etc/icx/identity.pem +# prints Host A's public key (base64) to stderr -```ini -[keys] -rx=<32 hex chars> # the key this host expects to RECEIVE with -tx=<32 hex chars> # the key this host will TRANSMIT with -# Optional expiry (defaults to 24h if omitted): -# - as a Go duration (e.g. 24h, 90m) -# - or an RFC3339 timestamp (e.g. 
2025-10-16T12:34:56Z) -expires=24h +# Host B +icx genkey --identity-key /etc/icx/identity.pem ``` -For Host A: +`genkey` refuses to overwrite an existing key file (pass `--force` to override). Recover +a public key at any time: -```ini -[keys] -rx=${K_BA} -tx=${K_AB} -expires=24h +```bash +icx pubkey --identity-key /etc/icx/identity.pem ``` -For Host B: +### 2) Exchange public keys -```ini -[keys] -rx=${K_AB} -tx=${K_BA} -expires=24h -``` +Distribute each host's public key to the other out of band. The value is what you pass +as the peer's `--peer-key` (it accepts the base64 string directly or a path to a file +containing it). ### 3) Start ICX on both hosts -```bash -icx -i --key-file=/path/to/icx.ini : -``` - -#### Examples: +Both hosts run the same command shape; the dialer/listener roles are elected +deterministically from the two public keys, so no extra configuration is needed. Use the +**same `--control-port`** on both ends (or set `--peer-control-port` when the peer's port differs). ```bash -# Host A -icx -i eth0 --key-file=/etc/icx/keys.ini 203.0.113.2:6081 - -# Host B -icx -i eth0 --key-file=/etc/icx/keys.ini 198.51.100.7:6081 +# Host A (peer is B's data address) +icx -i eth0 \ + --identity-key /etc/icx/identity.pem \ + --peer-key '<host-B-public-key>' \ + 198.51.100.7:6081 + +# Host B (peer is A's data address) +icx -i eth0 \ + --identity-key /etc/icx/identity.pem \ + --peer-key '<host-A-public-key>' \ + 203.0.113.2:6081 ``` -This creates an icx0 interface on both hosts, which you can use to securely -send and receive traffic over the ICX tunnel. - -### 4) Key rotation (SIGHUP) - -To rotate keys, update the same INI file with new rx/tx values, then send -SIGHUP to the running process: - -```bash -pkill -HUP icx -# or: kill -HUP -``` +ICX establishes the control plane (fail-closed: if the handshake or first negotiation +fails, the tunnel does not come up), installs the negotiated keys, and renegotiates a +fresh security association every `--rekey-interval` (default `2m`). 
Rotation is +make-before-break: the previous receive key is honored for a 30s grace period. + +Relevant flags: + +- `--identity-key PATH` — this node's identity private key. +- `--peer-key STR|PATH` — the peer's pinned public key. +- `--control-port PORT` — control-plane UDP port (default `6082`; must match on both ends). +- `--peer-control-port PORT` — peer's control port if it differs (defaults to `--control-port`). +- `--rekey-interval DUR` — SA rotation period (default `2m`). +- `--require-fips` — refuse to start unless the Go FIPS 140-3 module is active + (build/run with `GODEBUG=fips140=on`). + +### Operational notes + +**Startup ordering.** The peers elect dialer/listener roles from their keys; the dialer +retries the QUIC handshake only for the handshake window (~10s). If the listener is not up +within that window the dialer's process exits (fail-closed — no tunnel comes up). Start both +ends close together, and run under a supervisor (systemd `Restart=always`, a container +restart policy) so a larger startup skew self-heals on restart. Once established, the control +plane reconnects on its own indefinitely. + +**Restart / reconnect.** Control-plane keys are ephemeral, so any restart or reconnect is +both crypto-safe and seamless, with **no persisted state to manage**. + +Each direction is a simplex SA with its own SPI: the receiver allocates it, the sender +encrypts to it (`nonce = SPI‖counter`). The receive-SPI allocator resets to 1 on every +(re)connect, but because each session is a fresh ECDHE handshake (no 0-RTT, no session +resumption — both are disabled and asserted fail-closed), every generation also derives a +**fresh master key**. A reset or regressed SPI is therefore always paired with a key that has +never been used, so its from-zero counter is a fresh nonce space and no AES-GCM nonce can +repeat. 
The data-plane install seam accepts the reset SPI for exactly this reason; the only +thing it refuses is re-installing the *currently-live* transmit SPI (which would reset a live +counter under an unchanged key). + +This makes every recovery path seamless and symmetric: + +- **Transient reconnect** (a network blip, both processes survive) — the next session derives + fresh keys and both directions resume immediately. +- **One-sided restart** (either peer) — the restarted peer comes back with a fresh allocator + and a fresh handshake; the survivor accepts the reset SPI under its fresh key and traffic + resumes immediately. There is no high-water to carry forward and no peer to cycle. -ICX will reload the INI, bump the epoch, and apply the new keys. If the -reloaded keys are identical to the current ones, the reload is refused (epoch unchanged). \ No newline at end of file diff --git a/cli/go.mod b/cli/go.mod index 8af5765..6511a5a 100644 --- a/cli/go.mod +++ b/cli/go.mod @@ -9,7 +9,7 @@ require ( github.com/google/gopacket v1.1.19 github.com/urfave/cli/v2 v2.27.7 github.com/vishvananda/netlink v1.3.1 - gopkg.in/ini.v1 v1.67.0 + golang.org/x/sync v0.16.0 gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 ) @@ -19,12 +19,13 @@ require ( github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/google/btree v1.1.2 // indirect github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 // indirect + github.com/quic-go/quic-go v0.59.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/safchain/ethtool v0.6.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sys v0.35.0 // indirect golang.org/x/time v0.7.0 // indirect ) diff --git a/cli/go.sum 
b/cli/go.sum index 9b021fd..58c6940 100644 --- a/cli/go.sum +++ b/cli/go.sum @@ -22,14 +22,16 @@ github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 h1:C8IqpV7kfAyZD github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9/go.mod h1:dDLiSjNqdp8VjphLdGTx19OeAUsHOzhtc1FFJqpzWMU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/quic-go/quic-go v0.59.1 h1:0Gmua0HW1Tv7ANR7hUYwRyD0MG5OJfgvYSZasGZzBic= +github.com/quic-go/quic-go v0.59.1/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/safchain/ethtool v0.6.1 h1:mhRnXE1H8fV8TTXh/HdqE4tXtb57r//BQh5pPYMuM5k= github.com/safchain/ethtool v0.6.1/go.mod h1:JzoNbG8xeg/BeVeVoMCtCb3UPWoppZZbFpA+1WFh+M0= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= @@ -38,30 +40,32 @@ github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zd github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= 
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= +go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= +go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= -gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 h1:BEymU11L8DZSC4GNK48JYIR8EcHs+gFxtg9YfYlp68c= diff --git a/cli/main.go b/cli/main.go index a445ee9..0274350 100644 --- a/cli/main.go +++ b/cli/main.go @@ -4,9 +4,11 @@ package main import ( "context" - "encoding/hex" + "crypto/ecdsa" + "crypto/fips140" "errors" "fmt" + "io" "log/slog" "math" "net" @@ -15,7 +17,6 @@ import ( "os/signal" "runtime/pprof" "strings" - "sync" "syscall" "time" @@ -23,22 +24,27 @@ import ( "github.com/google/gopacket/pcapgo" "github.com/urfave/cli/v2" "github.com/vishvananda/netlink" + "golang.org/x/sync/errgroup" 
"gvisor.dev/gvisor/pkg/tcpip" "github.com/apoxy-dev/icx" + "github.com/apoxy-dev/icx/control" "github.com/apoxy-dev/icx/filter" "github.com/apoxy-dev/icx/forwarder" "github.com/apoxy-dev/icx/mac" "github.com/apoxy-dev/icx/permissions" "github.com/apoxy-dev/icx/queues" "github.com/apoxy-dev/icx/veth" - - ini "gopkg.in/ini.v1" ) func main() { app := &cli.App{ - Name: "icx", + Name: "icx", + Usage: "InterCloud eXpress — an AF_XDP Geneve L3 tunnel", + Commands: []*cli.Command{ + genkeyCommand(), + pubkeyCommand(), + }, Flags: []cli.Flag{ &cli.StringFlag{ Name: "log-level", @@ -46,11 +52,14 @@ func main() { Usage: "Set the logging level (debug, info, warn, error, fatal, panic)", Value: "info", }, + // NOTE: --interface is intentionally NOT marked Required. urfave/cli + // enforces required root flags before dispatching to a subcommand, which + // would make `icx genkey`/`icx pubkey` unrunnable. It is validated inside + // run() instead. &cli.StringFlag{ - Name: "interface", - Aliases: []string{"i"}, - Usage: "Physical network interface to use", - Required: true, + Name: "interface", + Aliases: []string{"i"}, + Usage: "Physical network interface to use (required for the tunnel)", }, &cli.UintFlag{ Name: "vni", @@ -59,9 +68,30 @@ func main() { Value: 1, }, &cli.StringFlag{ - Name: "key-file", - Usage: "Path to INI file containing keys (rx, tx, optional expires)", - Required: true, + Name: "identity-key", + Usage: "Path to this node's identity private key (PKCS#8 PEM from `icx genkey`). Enables the control plane", + }, + &cli.StringFlag{ + Name: "peer-key", + Usage: "Peer's identity public key (base64 SPKI from `icx pubkey`, or a path to a file containing it). 
Enables the control plane", + }, + &cli.IntFlag{ + Name: "control-port", + Usage: "UDP port for the QUIC/mTLS control plane (must match on both peers)", + Value: 6082, + }, + &cli.IntFlag{ + Name: "peer-control-port", + Usage: "Peer's control-plane UDP port (defaults to --control-port)", + }, + &cli.DurationFlag{ + Name: "rekey-interval", + Usage: "How often the control plane negotiates a fresh security association", + Value: 2 * time.Minute, + }, + &cli.BoolFlag{ + Name: "require-fips", + Usage: "Refuse to start unless the Go FIPS 140-3 module is active (GODEBUG=fips140=on)", }, &cli.IntFlag{ Name: "port", @@ -105,6 +135,101 @@ func main() { if err := app.Run(os.Args); err != nil { slog.Error("Error running app", slog.Any("error", err)) + os.Exit(1) + } +} + +// genkeyCommand generates a fresh ECDSA P-256 identity private key. +func genkeyCommand() *cli.Command { + return &cli.Command{ + Name: "genkey", + Usage: "Generate a new ECDSA P-256 identity private key (PKCS#8 PEM)", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "identity-key", + Aliases: []string{"o"}, + Usage: "Path to write the private key (default: stdout)", + }, + &cli.BoolFlag{ + Name: "force", + Usage: "Overwrite an existing key file", + }, + }, + Action: func(c *cli.Context) error { + id, err := control.GenerateIdentity() + if err != nil { + return err + } + pemBytes, err := id.MarshalPrivatePEM() + if err != nil { + return err + } + + path := c.String("identity-key") + if path == "" { + _, err = os.Stdout.Write(pemBytes) + return err + } + + // Refuse to clobber an existing private key unless explicitly forced. 
+ flags := os.O_CREATE | os.O_EXCL | os.O_WRONLY + if c.Bool("force") { + flags = os.O_CREATE | os.O_TRUNC | os.O_WRONLY + } + f, err := os.OpenFile(path, flags, 0o600) + if err != nil { + return fmt.Errorf("create identity key %q (use --force to overwrite): %w", path, err) + } + defer func() { _ = f.Close() }() + if _, err := f.Write(pemBytes); err != nil { + return err + } + pub, err := id.PublicKeyString() + if err != nil { + return err + } + fmt.Fprintf(os.Stderr, "wrote identity key to %s\npublic key (share as the peer's --peer-key):\n%s\n", path, pub) + return nil + }, + } +} + +// pubkeyCommand derives the base64(SPKI) public key from an identity private key. +func pubkeyCommand() *cli.Command { + return &cli.Command{ + Name: "pubkey", + Usage: "Print the public key (base64 SPKI) for an identity private key (from --identity-key or stdin)", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "identity-key", + Aliases: []string{"i"}, + Usage: "Path to the identity private key (default: read PEM from stdin)", + }, + }, + Action: func(c *cli.Context) error { + var ( + data []byte + err error + ) + if path := c.String("identity-key"); path != "" { + data, err = os.ReadFile(path) + } else { + data, err = io.ReadAll(os.Stdin) + } + if err != nil { + return err + } + id, err := control.LoadIdentityPEM(data) + if err != nil { + return err + } + pub, err := id.PublicKeyString() + if err != nil { + return err + } + fmt.Println(pub) + return nil + }, } } @@ -120,6 +245,19 @@ func run(c *cli.Context) error { } slog.SetLogLoggerLevel(level) + // The control plane is the only keying mode: require both halves of the pinned + // identity pair up front, fail-closed. There is deliberately no static/INI fallback. 
+ if c.String("identity-key") == "" || c.String("peer-key") == "" { + return errors.New("control-plane keying requires both --identity-key and --peer-key (generate them with `icx genkey` / `icx pubkey`)") + } + + if c.String("interface") == "" { + return errors.New("--interface is required") + } + if c.Bool("require-fips") && !fips140.Enabled() { + return errors.New("--require-fips set but the Go FIPS 140-3 module is not active; build and run with GODEBUG=fips140=on") + } + if cpuProfilePath := c.String("cpu-profile"); cpuProfilePath != "" { f, err := os.Create(cpuProfilePath) if err != nil { @@ -160,7 +298,9 @@ func run(c *cli.Context) error { } } - ctx, cancel := signal.NotifyContext(c.Context, os.Interrupt, os.Kill) + // SIGINT for interactive use, SIGTERM for systemd/container stop. (os.Kill / + // SIGKILL cannot be caught, so registering it would be a no-op.) + ctx, cancel := signal.NotifyContext(c.Context, syscall.SIGINT, syscall.SIGTERM) defer cancel() isNetAdmin, err := permissions.IsNetAdmin() @@ -252,71 +392,20 @@ func run(c *cli.Context) error { {Src: netip.MustParsePrefix("::/0"), Dst: netip.MustParsePrefix("::/0")}, } - if err := h.AddVirtualNetwork(c.Uint("vni"), peerAddr, allRoutes); err != nil { + vni := c.Uint("vni") + if err := h.AddVirtualNetwork(vni, peerAddr, allRoutes); err != nil { return fmt.Errorf("failed to add virtual network: %w", err) } - var ( - epoch uint32 = 1 // initial epoch - rxKey, txKey [16]byte - expiresAt time.Time - ) - - keyFile := c.String("key-file") - rxKey, txKey, expiresAt, err = loadKeysFromINI(keyFile) + // Establish the QUIC/mTLS control plane: install the initial SAs and drive ongoing + // rekeys. This is the only keying path. 
+ tun, ctrlConn, err := startControlPlane(ctx, c, h, vni, peerUDPAddr) if err != nil { return err } - - if err := h.UpdateVirtualNetworkKeys(c.Uint("vni"), epoch, rxKey, txKey, expiresAt); err != nil { - return fmt.Errorf("failed to update virtual network key: %w", err) - } - - var keyMu sync.Mutex - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGHUP) - - // SIGHUP reload handler. - go func() { - for { - select { - case <-ctx.Done(): - return - case <-sigCh: - newRX, newTX, newExpires, loadErr := loadKeysFromINI(keyFile) - if loadErr != nil { - slog.Error("Key reload failed", slog.Any("error", loadErr)) - continue - } - - keyMu.Lock() - // Refuse reload if keys are identical to current ones. - if rxKey == newRX && txKey == newTX { - slog.Warn("Refusing key reload: rx/tx unchanged; keeping current epoch", - slog.Uint64("epoch", uint64(epoch)), - ) - keyMu.Unlock() - continue - } - - // Keys changed: commit and bump epoch. - rxKey = newRX - txKey = newTX - expiresAt = newExpires - epoch++ - - if err := h.UpdateVirtualNetworkKeys(c.Uint("vni"), epoch, rxKey, txKey, expiresAt); err != nil { - slog.Error("Failed to apply reloaded keys", slog.Any("error", err)) - } else { - slog.Info("Reloaded keys", - slog.Uint64("epoch", uint64(epoch)), - slog.Time("expiresAt", expiresAt), - ) - } - keyMu.Unlock() - } - } - }() + defer func() { _ = tun.Close() }() + // The Tunnel only borrows the control socket; own its lifetime here. + defer func() { _ = ctrlConn.Close() }() fwd, err := forwarder.NewForwarder( h, @@ -329,74 +418,116 @@ func run(c *cli.Context) error { return fmt.Errorf("failed to create forwarder: %w", err) } - if err := fwd.Start(ctx); err != nil && !errors.Is(err, context.Canceled) { - return fmt.Errorf("failed to start forwarder: %w", err) + // Run the rekey loop and the forwarder under one errgroup sharing a cancel: a signal + // (ctx) or a forwarder error stops both. 
The control plane reconnects indefinitely on + // its own (Tunnel.Run does not return on CP failures), so it does not abort the + // forwarder; if it cannot re-establish, the data plane fails closed when the installed + // keys expire (see key lifetime below). + g, gctx := errgroup.WithContext(ctx) + g.Go(func() error { return tun.Run(gctx) }) + g.Go(func() error { + if err := fwd.Start(gctx); err != nil && !errors.Is(err, context.Canceled) { + return fmt.Errorf("forwarder: %w", err) + } + return nil + }) + if err := g.Wait(); err != nil && !errors.Is(err, context.Canceled) { + return err } - return nil } -// loadKeysFromINI loads rx/tx (hex, 16-byte each) and optional expires from an INI file. -// -// Required layout: -// -// [keys] -// rx=0123... (32 hex chars -> 16 bytes) -// tx=abcd... (32 hex chars -> 16 bytes) -// expires=24h // OR RFC3339 like 2025-10-16T12:34:56Z -// -// "expires" may be either a Go duration (e.g., "24h", "90m") or RFC3339 timestamp. -// If omitted, a default of 24h from now is applied. -func loadKeysFromINI(path string) (rxKey, txKey [16]byte, expiresAt time.Time, err error) { - cfg, err := ini.Load(path) - if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("open INI: %w", err) +// startControlPlane builds the QUIC/mTLS control-plane tunnel and performs the +// initial, fail-closed SA negotiation and install. The returned Tunnel's Run drives +// ongoing rekeys; the caller is responsible for running it and closing the Tunnel. 
+func startControlPlane(ctx context.Context, c *cli.Context, h *icx.Handler, vni uint, peerUDPAddr *net.UDPAddr) (*control.Tunnel, *net.UDPConn, error) { + controlPort := c.Int("control-port") + if controlPort == c.Int("port") { + return nil, nil, errors.New("--control-port must differ from the Geneve data port (--port); the XDP filter redirects the data port to AF_XDP and would blackhole the control plane") } - sec, err := cfg.GetSection("keys") + ident, err := loadIdentity(c.String("identity-key")) + if err != nil { + return nil, nil, err + } + peerPub, err := readPeerKey(c.String("peer-key")) if err != nil { - return rxKey, txKey, time.Time{}, errors.New("INI must contain a [keys] section") + return nil, nil, err } - get := func(k string) string { return strings.TrimSpace(sec.Key(k).String()) } + ctrlNet := "udp4" + if peerUDPAddr.IP.To4() == nil { + ctrlNet = "udp6" + } + pconn, err := net.ListenUDP(ctrlNet, &net.UDPAddr{Port: controlPort}) + if err != nil { + return nil, nil, fmt.Errorf("bind control socket on port %d: %w", controlPort, err) + } - rxHex := get("rx") - txHex := get("tx") - expStr := get("expires") + peerControlPort := c.Int("peer-control-port") + if peerControlPort == 0 { + peerControlPort = controlPort + } + peerControlAddr := &net.UDPAddr{IP: peerUDPAddr.IP, Port: peerControlPort} - if rxHex == "" || txHex == "" { - return rxKey, txKey, time.Time{}, errors.New("INI [keys] missing required keys: rx and tx") + rekeyIvl := c.Duration("rekey-interval") + // Bound the installed-key lifetime: long enough that a few missed rekeys do not + // drop traffic, short enough to fail closed if rotation stops, capped at the + // handler's recommended 24h rekey ceiling. 
+ keyLifetime := 4 * rekeyIvl + if keyLifetime < time.Hour { + keyLifetime = time.Hour } - if len(rxHex) != 32 || len(txHex) != 32 { - return rxKey, txKey, time.Time{}, errors.New("INI [keys] rx/tx must be 32 hex characters (16 bytes)") + if keyLifetime > 24*time.Hour { + keyLifetime = 24 * time.Hour + } + + installer := func(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error { + return h.UpdateVirtualNetworkSAs(vni, rxSPI, txSPI, rxKey, txKey, time.Now().Add(keyLifetime)) } - rxBytes, err := hex.DecodeString(rxHex) + tun, err := control.NewTunnel(control.TunnelConfig{ + Local: ident, + PeerPub: peerPub, + Conn: pconn, + PeerAddr: peerControlAddr, + RekeyInterval: rekeyIvl, + }, installer) if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("decode INI [keys].rx: %w", err) + _ = pconn.Close() + return nil, nil, err } - txBytes, err := hex.DecodeString(txHex) + + slog.Info("establishing control plane", + slog.String("peer-control", peerControlAddr.String()), + slog.Bool("initiator", tun.Initiator()), + slog.Duration("rekey-interval", rekeyIvl), + ) + if err := tun.Bringup(ctx); err != nil { + _ = tun.Close() + _ = pconn.Close() + return nil, nil, fmt.Errorf("control plane bring-up failed: %w", err) + } + return tun, pconn, nil +} + +// loadIdentity reads and parses this node's identity private key from a PEM file. +func loadIdentity(path string) (*control.Identity, error) { + data, err := os.ReadFile(path) if err != nil { - return rxKey, txKey, time.Time{}, fmt.Errorf("decode INI [keys].tx: %w", err) - } - copy(rxKey[:], rxBytes) - copy(txKey[:], txBytes) - - // Expiry handling: optional; default to 24h if absent. - if expStr != "" { - // Prefer duration, fall back to RFC3339. - if d, derr := time.ParseDuration(expStr); derr == nil { - expiresAt = time.Now().Add(d) - } else if t, terr := time.Parse(time.RFC3339, expStr); terr == nil { - expiresAt = t - } else { - return rxKey, txKey, time.Time{}, fmt.Errorf("expires must be duration (e.g. 
24h) or RFC3339 timestamp: %q", expStr) - } - } else { - expiresAt = time.Now().Add(24 * time.Hour) + return nil, fmt.Errorf("read identity key %q: %w", path, err) } + return control.LoadIdentityPEM(data) +} - return rxKey, txKey, expiresAt, nil +// readPeerKey resolves a --peer-key value, which may be the base64(SPKI) string +// directly or a path to a file containing it. +func readPeerKey(val string) (*ecdsa.PublicKey, error) { + s := strings.TrimSpace(val) + if data, err := os.ReadFile(val); err == nil { + s = strings.TrimSpace(string(data)) + } + return control.ParsePublicKey(s) } func selectSourceAddr(link netlink.Link, dstAddr *tcpip.FullAddress) (*tcpip.FullAddress, error) { diff --git a/control/cmac.go b/control/cmac.go new file mode 100644 index 0000000..a4702ea --- /dev/null +++ b/control/cmac.go @@ -0,0 +1,93 @@ +// Package control implements ICX's key-establishment control plane (a QUIC/mTLS +// channel) and the PSP-model key derivation that turns an authenticated, +// forward-secret session into per-Security-Association AEAD keys for the +// existing Geneve/AF_XDP data plane. +// +// This file implements AES-CMAC (NIST SP 800-38B / RFC 4493), the +// pseudorandom function underlying the PSP SP 800-108 key-derivation function +// (see kdf.go). CMAC is built directly on the FIPS-validated crypto/aes block +// cipher so the whole derivation stays inside the Go FIPS 140-3 module. +package control + +import ( + "crypto/aes" + "crypto/cipher" +) + +// cmacRb is the GF(2^128) reduction constant for a 128-bit block (RFC 4493 §2.3). +const cmacRb = 0x87 + +// aesCMAC computes the AES-CMAC of msg under key k. k must be a valid AES key +// (16, 24, or 32 bytes); the PSP KDF always uses a 32-byte (AES-256) master +// key. The tag is always 16 bytes (the AES block size). 
+func aesCMAC(k, msg []byte) ([]byte, error) { + block, err := aes.NewCipher(k) + if err != nil { + return nil, err + } + return cmacWithBlock(block, msg), nil +} + +// cmacWithBlock computes AES-CMAC using a pre-constructed block cipher. +func cmacWithBlock(block cipher.Block, msg []byte) []byte { + const bs = aes.BlockSize // 16 + + // Subkey generation (RFC 4493 §2.3): L = AES_K(0^128); K1 = dbl(L); + // K2 = dbl(K1). + l := make([]byte, bs) + block.Encrypt(l, l) + k1 := dbl(l) + k2 := dbl(k1) + + // Determine the number of blocks and whether the final block is complete. + n := (len(msg) + bs - 1) / bs + lastComplete := n != 0 && len(msg)%bs == 0 + if n == 0 { + n = 1 // empty message uses a single (padded) block + } + + // Final block: XOR with K1 if the last block is complete, else pad with + // 10* and XOR with K2. + last := make([]byte, bs) + if lastComplete { + xorInto(last, msg[(n-1)*bs:], k1) + } else { + rem := msg[(n-1)*bs:] + copy(last, rem) + last[len(rem)] = 0x80 + xorInto(last, last, k2) + } + + // CBC-MAC chain over all but the last block, then the final block. + x := make([]byte, bs) + y := make([]byte, bs) + for i := 0; i < n-1; i++ { + xorInto(y, x, msg[i*bs:(i+1)*bs]) + block.Encrypt(x, y) + } + xorInto(y, x, last) + block.Encrypt(x, y) + return x +} + +// dbl performs the GF(2^128) left-shift-and-reduce used in CMAC subkey +// generation: out = (in << 1), XOR'd with Rb if the high bit of in was set. +func dbl(in []byte) []byte { + out := make([]byte, len(in)) + var carry byte + for i := len(in) - 1; i >= 0; i-- { + out[i] = in[i]<<1 | carry + carry = in[i] >> 7 + } + if carry != 0 { + out[len(out)-1] ^= cmacRb + } + return out +} + +// xorInto writes a XOR b into dst. dst, a, and b must be the same length. 
+func xorInto(dst, a, b []byte) { + for i := range dst { + dst[i] = a[i] ^ b[i] + } +} diff --git a/control/cp.go b/control/cp.go new file mode 100644 index 0000000..7b50473 --- /dev/null +++ b/control/cp.go @@ -0,0 +1,398 @@ +package control + +import ( + "bytes" + "context" + "crypto/ecdsa" + "crypto/x509" + "errors" + "fmt" + "log/slog" + "net" + "time" +) + +// This file is the control-plane orchestrator: it drives the QUIC/mTLS session +// (control/transport.go) and feeds the negotiated SAs into the data plane via an +// SAInstaller, so the CLI stays thin and the wiring is unit-testable off Linux. +// +// Data-plane SA model (PSP, per-direction): the handler installs two simplex SAs per +// generation (handler.go: UpdateVirtualNetworkSAs), each selected by its own +// role-partitioned SPI. NegotiateSAs gives each peer a DirectionalSAs{Rx, Tx} where Rx +// is the SPI this peer allocated (our receive SPI) and Tx is the peer's receive SPI +// (what we transmit to). Both peers derive every key locally from the shared master +// keys, so nothing but the SPIs crosses the wire. Each direction therefore has its own +// nonce space, separated by the SPI itself (the role bit) on top of the distinct +// per-direction key. +// +// No persisted epoch state is needed for recovery. The per-session SPI allocator resets +// on every (re)connect, so a fresh session's SPIs start low again — but that is SAFE +// because every reconnect is a fresh ECDHE handshake (no 0-RTT, no resumption; enforced +// in transport.go) yielding fresh master keys, so a reused SPI value derives a different +// key and the data-plane nonce never repeats. The handler's install guard therefore +// accepts the reset SPI (it rejects only a re-install of the currently-live transmit +// SPI), which makes a transient reconnect and a one-sided restart of either peer recover +// seamlessly with zero on-disk state. 
+ +// CanonicalInitiator reports whether the local node is the control-plane initiator +// — the peer that dials. The role is elected deterministically from the two pinned +// identities so both ends agree with zero configuration (WireGuard-style): the node +// whose SubjectPublicKeyInfo DER sorts lower is the initiator, the other listens. +// Identical keys are rejected — a node must not tunnel to itself, and equal keys +// would make both ends pick the same role (double-dial / double-listen deadlock). +func CanonicalInitiator(localPub, peerPub *ecdsa.PublicKey) (bool, error) { + if localPub == nil || peerPub == nil { + return false, errors.New("control: nil identity key") + } + if PublicKeyEqual(localPub, peerPub) { + return false, errors.New("control: local and peer identity keys are identical; peers must have distinct keys") + } + l, err := x509.MarshalPKIXPublicKey(localPub) + if err != nil { + return false, fmt.Errorf("control: marshal local key: %w", err) + } + p, err := x509.MarshalPKIXPublicKey(peerPub) + if err != nil { + return false, fmt.Errorf("control: marshal peer key: %w", err) + } + return bytes.Compare(l, p) < 0, nil +} + +// SAInstaller installs a negotiated SA generation into the data plane. rxSPI is our +// receive SPI (we decrypt inbound frames under it); txSPI is the peer's receive SPI (we +// encrypt outbound frames to it); rxKey/txKey are the 16-byte AES-128 keys for those +// directions. The installer owns the key lifetime/expiry and is expected to enforce the +// handler's fail-closed guards (non-zero, strictly increasing per-direction SPIs, +// rxKey != txKey). A returned error is treated as a rejected rotation, not a session +// failure. +type SAInstaller func(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error + +// Default lifecycle timings; overridable on Tunnel for tests. 
+const ( + defaultPerExchangeTimeout = 10 * time.Second + defaultReconnectBackoff = 5 * time.Second +) + +// Tunnel runs the control-plane lifecycle for one peer: it establishes the QUIC/mTLS +// session, performs the initial SA negotiation and install (fail-closed) in Bringup, +// and then keeps the SAs fresh in Run — the initiator drives rekeys on a timer, the +// responder serves them from an accept loop. A Tunnel is not safe for concurrent use; +// Bringup then Run are called once each, in that order. +type Tunnel struct { + local *Identity + peerPub *ecdsa.PublicKey + conn net.PacketConn + peerAddr net.Addr + rekeyIvl time.Duration + install SAInstaller + initiator bool + + // tunables (defaults set by NewTunnel; tests may override) + perExchangeTimeout time.Duration + reconnectBackoff time.Duration + + ln *Listener // responder only; persists across reconnects + sess *Session +} + +// TunnelConfig is the immutable configuration for a Tunnel. +type TunnelConfig struct { + // Local is this node's long-term identity (its private key). + Local *Identity + // PeerPub is the pinned public key of the single expected peer. + PeerPub *ecdsa.PublicKey + // Conn is the bound control-plane UDP socket (separate from the Geneve data port). + Conn net.PacketConn + // PeerAddr is the peer's control-plane address (peer IP + control port). + PeerAddr net.Addr + // RekeyInterval is how often the initiator negotiates a fresh SA generation. + RekeyInterval time.Duration +} + +// NewTunnel validates the config, elects the canonical role, and returns a Tunnel +// ready for Bringup. It does no I/O. 
+func NewTunnel(cfg TunnelConfig, install SAInstaller) (*Tunnel, error) { + if cfg.Local == nil || cfg.PeerPub == nil { + return nil, errors.New("control: tunnel requires local identity and peer key") + } + if cfg.Conn == nil || cfg.PeerAddr == nil { + return nil, errors.New("control: tunnel requires a control socket and peer address") + } + if install == nil { + return nil, errors.New("control: tunnel requires an SA installer") + } + if cfg.RekeyInterval <= 0 { + return nil, errors.New("control: rekey interval must be positive") + } + initiator, err := CanonicalInitiator(cfg.Local.PublicKey(), cfg.PeerPub) + if err != nil { + return nil, err + } + return &Tunnel{ + local: cfg.Local, + peerPub: cfg.PeerPub, + conn: cfg.Conn, + peerAddr: cfg.PeerAddr, + rekeyIvl: cfg.RekeyInterval, + install: install, + initiator: initiator, + perExchangeTimeout: defaultPerExchangeTimeout, + reconnectBackoff: defaultReconnectBackoff, + }, nil +} + +// Initiator reports the elected role (true = this node dials). +func (t *Tunnel) Initiator() bool { return t.initiator } + +// Bringup establishes the session and performs the first SA negotiation and install. +// It is synchronous and FAIL-CLOSED: it returns an error (and installs nothing) if +// the handshake, negotiation, or install fails, so the caller must not start the data +// plane until Bringup succeeds. +func (t *Tunnel) Bringup(ctx context.Context) (err error) { + if err = t.establish(ctx); err != nil { + return fmt.Errorf("control: establish session: %w", err) + } + if err = t.negotiateAndInstall(ctx); err != nil { + t.closeSession() + return fmt.Errorf("control: initial SA negotiation: %w", err) + } + role := "responder" + if t.initiator { + role = "initiator" + } + slog.Info("control plane established", slog.String("role", role), + slog.String("peer", t.peerAddr.String())) + return nil +} + +// Run keeps the SAs fresh until ctx is cancelled. 
The initiator rekeys on its timer +// (and reacts promptly to session loss via the QUIC connection context); the +// responder serves rekeys from a blocking accept loop. A failed negotiation is +// session-fatal: the session is torn down and re-established (fresh, aligned +// allocators) rather than retried on a dead session. Control-plane failures are NOT +// returned: they drive reconnect-with-backoff indefinitely, so Run effectively +// returns only when ctx is cancelled (clean shutdown). If the control plane cannot +// re-establish, the data plane fails closed when the installed keys expire — Run does +// not proactively tear it down. Bringup must have succeeded first. +func (t *Tunnel) Run(ctx context.Context) error { + defer t.Close() + if t.initiator { + return t.runInitiator(ctx) + } + return t.runResponder(ctx) +} + +func (t *Tunnel) runInitiator(ctx context.Context) error { + ticker := time.NewTicker(t.rekeyIvl) + defer ticker.Stop() + for { + if ctx.Err() != nil { + return nil + } + sessLost := t.sessionDone() + select { + case <-ctx.Done(): + return nil + case <-sessLost: + slog.Warn("control: session lost, reconnecting") + if err := t.reestablish(ctx); err != nil { + return err + } + case <-ticker.C: + exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) + err := t.negotiateAndInstall(exCtx) + cancel() + if err == nil { + continue + } + if ctx.Err() != nil { + return nil + } + if isFatalCP(err) { + // SPI-space exhaustion is terminal (master-key rotation, unsupported, + // is the only remedy); reconnecting would just hot-loop. Fail closed. + return err + } + // installSAs swallows an install rejection, so any error here is a genuine + // session/transport failure → reconnect (which derives fresh keys). 
+ slog.Warn("control: rekey failed, reconnecting", slog.Any("error", err)) + if err := t.reestablish(ctx); err != nil { + return err + } + } + } +} + +func (t *Tunnel) runResponder(ctx context.Context) error { + for { + if ctx.Err() != nil { + return nil + } + // The accept loop blocks in NegotiateSAs' AcceptStream until the initiator + // drives the next rekey; the long-lived ctx (no per-exchange deadline) lets it + // wait across the whole interval, and QUIC's MaxIdleTimeout bounds a half-open + // exchange. Errors are session-fatal → reconnect. + if err := t.negotiateAndInstall(ctx); err != nil { + if ctx.Err() != nil { + return nil + } + slog.Warn("control: SA negotiation failed, reconnecting", slog.Any("error", err)) + if err := t.reestablish(ctx); err != nil { + return err + } + } + } +} + +// negotiateAndInstall runs one SA exchange on the live session and installs the +// result. installSAs swallows an install rejection (returns nil) so it does not look +// like a transport failure; a non-nil error here means the wire exchange failed. +func (t *Tunnel) negotiateAndInstall(ctx context.Context) error { + sas, err := t.sess.NegotiateSAs(ctx, AESGCM128) + if err != nil { + return err + } + return t.installSAs(sas) +} + +// installSAs validates the negotiated SAs fail-closed (AES-GCM-128, 16-byte keys) and hands +// the two per-direction SPIs/keys to the installer. Every session derives fresh keys +// (fresh ECDHE; see transport.go), so the handler accepts the install even when the +// per-session allocator reset the SPIs to a low value after a reconnect. An install +// rejection is swallowed as defense-in-depth (e.g. the handler refusing to reset its +// currently-live transmit SPI): the previously installed keys keep forwarding and the +// data plane fails closed on their own expiry. 
+func (t *Tunnel) installSAs(sas *DirectionalSAs) error { + if sas.Tx.Version != AESGCM128 || sas.Rx.Version != AESGCM128 { + return fmt.Errorf("control: only the AES-GCM-128 cipher suite is supported in this build (tx=%d rx=%d)", sas.Tx.Version, sas.Rx.Version) + } + if len(sas.Rx.Key) != 16 || len(sas.Tx.Key) != 16 { + return fmt.Errorf("control: expected 16-byte SA keys (rx=%d tx=%d)", len(sas.Rx.Key), len(sas.Tx.Key)) + } + var rxKey, txKey [16]byte + copy(rxKey[:], sas.Rx.Key) + copy(txKey[:], sas.Tx.Key) + if err := t.install(sas.Rx.SPI, sas.Tx.SPI, rxKey, txKey); err != nil { + slog.Warn("control: SA install rejected; keeping current keys until they expire", + slog.Uint64("rxSPI", uint64(sas.Rx.SPI)), slog.Uint64("txSPI", uint64(sas.Tx.SPI)), slog.Any("error", err)) + return nil + } + slog.Debug("control: installed SA generation", + slog.Uint64("rxSPI", uint64(sas.Rx.SPI)), slog.Uint64("txSPI", uint64(sas.Tx.SPI))) + return nil +} + +// establish opens a fresh session: the initiator dials, the responder accepts on a +// listener it keeps across reconnects. The new session's SPI allocator starts fresh +// (low) — safe because the session's keys are also fresh (see installSAs / transport.go). +func (t *Tunnel) establish(ctx context.Context) error { + if t.initiator { + sess, err := Dial(ctx, t.conn, t.peerAddr, t.local, t.peerPub) + if err != nil { + return err + } + t.sess = sess + return nil + } + if t.ln == nil { + ln, err := Listen(t.conn, t.local, t.peerPub) + if err != nil { + return err + } + t.ln = ln + } + sess, err := t.ln.Accept(ctx) + if err != nil { + return err + } + t.sess = sess + return nil +} + +// reestablish tears down the dead session and re-establishes one, backing off +// between attempts so a persistent failure does not hot-loop. It returns an error +// only if ctx is cancelled while waiting. 
+func (t *Tunnel) reestablish(ctx context.Context) error { + t.closeSession() + for attempt := 0; ; attempt++ { + // Try immediately on the first attempt; back off only between retries so a + // transient loss recovers without an added backoff of latency. + if attempt > 0 && !sleepCtx(ctx, t.reconnectBackoff) { + return ctx.Err() + } + if err := t.establish(ctx); err != nil { + if ctx.Err() != nil { + return ctx.Err() + } + slog.Warn("control: reconnect attempt failed", slog.Any("error", err)) + continue + } + // Re-key immediately on the new session so traffic resumes without waiting a + // full interval. The new session's SPI allocator starts fresh (low) again, but the + // install is still accepted because the new session derives FRESH master keys + // (fresh ECDHE; see transport.go), so a reset/regressed SPI is paired with a fresh + // key and the data-plane nonce never repeats. A transport error drops back to + // another reconnect attempt. + exCtx, cancel := context.WithTimeout(ctx, t.perExchangeTimeout) + err := t.negotiateAndInstall(exCtx) + cancel() + if err != nil && ctx.Err() == nil { + if isFatalCP(err) { + // SPI counter-space exhaustion (ErrSPIExhausted) is terminal: it requires + // master-key rotation, which this build does not support, and a fresh + // allocator would just exhaust again. Surface it so Run returns and fails + // closed rather than hot-looping reconnects. + return err + } + slog.Warn("control: post-reconnect negotiation failed", slog.Any("error", err)) + t.closeSession() + continue + } + return ctx.Err() + } +} + +// sessionDone returns the current session's done channel, or nil (which blocks +// forever in a select) if there is no live session. 
+func (t *Tunnel) sessionDone() <-chan struct{} { + if t.sess == nil { + return nil + } + return t.sess.Context().Done() +} + +func (t *Tunnel) closeSession() { + if t.sess != nil { + _ = t.sess.Close() + t.sess = nil + } +} + +// isFatalCP reports whether err is a terminal, non-retryable control-plane error that +// must stop Run rather than drive a reconnect. +func isFatalCP(err error) bool { + return errors.Is(err, ErrSPIExhausted) +} + +// Close releases the session and (responder) the listener. It is idempotent. +func (t *Tunnel) Close() error { + t.closeSession() + if t.ln != nil { + err := t.ln.Close() + t.ln = nil + return err + } + return nil +} + +// sleepCtx waits for d or until ctx is done. It reports false if ctx was cancelled. +func sleepCtx(ctx context.Context, d time.Duration) bool { + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-ctx.Done(): + return false + case <-timer.C: + return true + } +} diff --git a/control/cp_test.go b/control/cp_test.go new file mode 100644 index 0000000..a55722e --- /dev/null +++ b/control/cp_test.go @@ -0,0 +1,419 @@ +package control + +import ( + "bytes" + "context" + "crypto/x509" + "errors" + "net" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestCanonicalInitiator(t *testing.T) { + a, err := GenerateIdentity() + require.NoError(t, err) + b, err := GenerateIdentity() + require.NoError(t, err) + + aInit, err := CanonicalInitiator(a.PublicKey(), b.PublicKey()) + require.NoError(t, err) + bInit, err := CanonicalInitiator(b.PublicKey(), a.PublicKey()) + require.NoError(t, err) + // Exactly one side is the initiator, and both compute it consistently. + require.NotEqual(t, aInit, bInit) + + // Pin the rule, not just its antisymmetry: the node whose SPKI DER sorts lower is + // the initiator. (A flipped comparison would still pass the NotEqual check above.) 
+ aDER, err := x509.MarshalPKIXPublicKey(a.PublicKey()) + require.NoError(t, err) + bDER, err := x509.MarshalPKIXPublicKey(b.PublicKey()) + require.NoError(t, err) + require.Equal(t, bytes.Compare(aDER, bDER) < 0, aInit, "the lower SPKI must be the initiator") + + _, err = CanonicalInitiator(a.PublicKey(), a.PublicKey()) + require.Error(t, err, "identical keys must be rejected") + _, err = CanonicalInitiator(nil, b.PublicKey()) + require.Error(t, err) +} + +// validV0SAs returns a role-partitioned, AESGCM128 DirectionalSAs with distinct +// 16-byte keys, as NegotiateSAs would produce. +func validV0SAs() *DirectionalSAs { + iSPI, _ := MakeSPI(0, Initiator, 1) + rSPI, _ := MakeSPI(0, Responder, 1) + rx := make([]byte, 16) + tx := make([]byte, 16) + for i := range rx { + rx[i] = byte(i) + tx[i] = byte(i + 100) + } + return &DirectionalSAs{ + Tx: &SA{SPI: rSPI, Key: tx, Version: AESGCM128}, + Rx: &SA{SPI: iSPI, Key: rx, Version: AESGCM128}, + } +} + +func TestInstallSAsRejectsNonV0(t *testing.T) { + tn := &Tunnel{install: func(uint32, uint32, [16]byte, [16]byte) error { + t.Fatal("installer must not be called for a non-AES-GCM-128 SA") + return nil + }} + sas := validV0SAs() + sas.Tx.Version = AESGCM256 + sas.Tx.Key = make([]byte, 32) + require.Error(t, tn.installSAs(sas)) +} + +func TestInstallSAsSwallowsRotationRejection(t *testing.T) { + called := false + tn := &Tunnel{install: func(uint32, uint32, [16]byte, [16]byte) error { + called = true + // Mimic the handler's monotonicity guard rejecting a regressed per-direction SPI. + return errors.New("rx SPI must be monotonically increasing") + }} + // A rejected rotation is logged and swallowed (the data plane keeps its current + // keys and fails closed on their own expiry); it must not look like a transport + // error to the run loop. 
+ require.NoError(t, tn.installSAs(validV0SAs())) + require.True(t, called) +} + +// twoTunnels wires an initiator and a responder Tunnel over loopback UDP, assigning +// the canonical roles correctly, with tight timings for tests. +// twoTunnels wires an initiator and a responder Tunnel over loopback UDP with the +// canonical roles assigned and tight test timings. +func twoTunnels(t *testing.T, instInit, instResp SAInstaller, rekey time.Duration) (initT, respT *Tunnel, cleanup func()) { + t.Helper() + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + + initT, err = NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, + PeerAddr: respConn.LocalAddr(), RekeyInterval: rekey, + }, instInit) + require.NoError(t, err) + require.True(t, initT.Initiator()) + + respT, err = NewTunnel(TunnelConfig{ + Local: respID, PeerPub: initID.PublicKey(), Conn: respConn, + PeerAddr: initConn.LocalAddr(), RekeyInterval: rekey, + }, instResp) + require.NoError(t, err) + require.False(t, respT.Initiator()) + + for _, tn := range []*Tunnel{initT, respT} { + tn.perExchangeTimeout = 2 * time.Second + tn.reconnectBackoff = 50 * time.Millisecond + } + + cleanup = func() { + _ = initT.Close() + _ = respT.Close() + _ = initConn.Close() + _ = respConn.Close() + } + return initT, respT, cleanup +} + +// guardInstaller is an SAInstaller that mirrors the handler's relaxed TX anti-reset +// guard (handler.go: UpdateVirtualNetworkSAs) — it rejects only a re-install of the +// currently-live transmit SA, i.e. 
the same transmit SPI AND the same key — so tests can +// detect a spurious rejection rather than the no-op epochRecorder which accepts everything. +// The key comparison matters: across a reconnect the allocator resets and the transmit SPI +// can collide with the still-live one, but under a fresh key, which is safe and must be +// accepted. Because every control-plane generation carries a fresh key, the guard should +// never reject in normal operation, even across a reconnect that resets SPIs to a low value. +type guardInstaller struct { + mu sync.Mutex + curTx uint32 // currently-live transmit SPI (0 = none) + curTxKey [16]byte // currently-live transmit key + installed []uint32 // accepted receive SPIs, in order + rejects int +} + +func newGuardInstaller() *guardInstaller { return &guardInstaller{} } + +func (g *guardInstaller) install(rxSPI, txSPI uint32, _, txKey [16]byte) error { + g.mu.Lock() + defer g.mu.Unlock() + if g.curTx != 0 && txSPI == g.curTx && txKey == g.curTxKey { + g.rejects++ + return errors.New("tx SA is already live") + } + g.curTx = txSPI + g.curTxKey = txKey + g.installed = append(g.installed, rxSPI) + return nil +} + +func (g *guardInstaller) snapshot() (installed []uint32, rejects int) { + g.mu.Lock() + defer g.mu.Unlock() + return append([]uint32(nil), g.installed...), g.rejects +} + +// installRec captures one per-direction SA generation for assertions: the receive SPI +// (this peer's own data-plane epoch) and transmit SPI (the peer's receive SPI), plus +// both keys. +type installRec struct { + rxSPI, txSPI uint32 + rxKey, txKey [16]byte +} + +// epochRecorder is a thread-safe SAInstaller that records the per-direction generations +// it installs. 
+type epochRecorder struct { + mu sync.Mutex + recs []installRec +} + +func newEpochRecorder() *epochRecorder { return &epochRecorder{} } + +func (r *epochRecorder) install(rxSPI, txSPI uint32, rxKey, txKey [16]byte) error { + r.mu.Lock() + defer r.mu.Unlock() + r.recs = append(r.recs, installRec{rxSPI: rxSPI, txSPI: txSPI, rxKey: rxKey, txKey: txKey}) + return nil +} + +func (r *epochRecorder) snapshot() []installRec { + r.mu.Lock() + defer r.mu.Unlock() + return append([]installRec(nil), r.recs...) +} + +func TestTunnelBringupAndRekey(t *testing.T) { + initRec, respRec := newEpochRecorder(), newEpochRecorder() + initT, respT, cleanup := twoTunnels(t, initRec.install, respRec.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + // Bring both peers up; the responder must be accepting before the initiator dials. + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + // Both installed exactly one (matching) generation in bring-up. + ir := initRec.snapshot() + rr := respRec.snapshot() + require.Len(t, ir, 1) + require.Len(t, rr, 1) + // Per-direction SPIs: each peer installs its OWN receive SPI; they are distinct + // (role-partitioned), and each peer's transmit SPI is the other peer's receive SPI. + require.NotZero(t, ir[0].rxSPI) + require.NotEqual(t, ir[0].rxSPI, rr[0].rxSPI, "peers must allocate distinct receive SPIs") + require.Equal(t, ir[0].rxSPI, rr[0].txSPI, "initiator rx SPI must equal responder tx SPI") + require.Equal(t, ir[0].txSPI, rr[0].rxSPI, "initiator tx SPI must equal responder rx SPI") + // Cross-derivation: initiator TX key == responder RX key and vice versa. 
+ require.Equal(t, ir[0].txKey, rr[0].rxKey, "initiator tx key != responder rx key") + require.Equal(t, ir[0].rxKey, rr[0].txKey, "initiator rx key != responder tx key") + // Within a peer, the two directions use distinct keys. + require.NotEqual(t, ir[0].rxKey, ir[0].txKey) + + // Run both peers and let the initiator drive a few rekeys. + runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + require.Eventually(t, func() bool { + return len(initRec.snapshot()) >= 3 + }, 10*time.Second, 20*time.Millisecond, "initiator should rekey several times") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + // Receive SPIs strictly increase per peer, and the two peers agree generation-by- + // generation (the initiator's receive SPI for gen i is the responder's transmit SPI). + ir = initRec.snapshot() + rr = respRec.snapshot() + for i := 1; i < len(ir); i++ { + require.Greater(t, ir[i].rxSPI, ir[i-1].rxSPI, "initiator receive SPIs must strictly increase") + } + n := len(ir) + if len(rr) < n { + n = len(rr) + } + require.GreaterOrEqual(t, n, 2) + for i := 0; i < n; i++ { + require.Equal(t, ir[i].rxSPI, rr[i].txSPI, "peers disagree on SPI for generation %d", i) + } +} + +func TestTunnelBringupFailsClosedOnPinMismatch(t *testing.T) { + idA, err := GenerateIdentity() + require.NoError(t, err) + idB, err := GenerateIdentity() + require.NoError(t, err) + imposter, err := GenerateIdentity() + require.NoError(t, err) + + aInit, err := CanonicalInitiator(idA.PublicKey(), idB.PublicKey()) + require.NoError(t, err) + initID, respID := idA, idB + if !aInit { + initID, respID = idB, idA + } + + respConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer respConn.Close() + initConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback}) + require.NoError(t, err) + defer initConn.Close() + + mustNotInstall := func(uint32, uint32, [16]byte, 
[16]byte) error { + t.Fatal("keys must never be installed on a pin failure") + return nil + } + + // The initiator pins the real responder, but the responder pins an imposter, so + // the SA-setup round-trip (the mutual key-confirmation) must fail closed. + initT, err := NewTunnel(TunnelConfig{ + Local: initID, PeerPub: respID.PublicKey(), Conn: initConn, + PeerAddr: respConn.LocalAddr(), RekeyInterval: time.Second, + }, mustNotInstall) + require.NoError(t, err) + respT, err := NewTunnel(TunnelConfig{ + Local: respID, PeerPub: imposter.PublicKey(), Conn: respConn, + PeerAddr: initConn.LocalAddr(), RekeyInterval: time.Second, + }, mustNotInstall) + require.NoError(t, err) + defer initT.Close() + defer respT.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second) + defer cancel() + go func() { _ = respT.Bringup(ctx) }() + require.Error(t, initT.Bringup(ctx), "bring-up must fail when the peer does not pin us") +} + +// TestTunnelReconnects exercises the reestablish state machine: a session loss +// mid-run must tear the dead session down and re-establish a fresh one on both ends, +// resuming rotation, rather than wedging or hot-looping. +func TestTunnelReconnects(t *testing.T) { + initRec, respRec := newEpochRecorder(), newEpochRecorder() + initT, respT, cleanup := twoTunnels(t, initRec.install, respRec.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) + defer cancel() + + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + // Force a session loss before the run loops start: closing the initiator's session + // tears down both ends' QUIC connections, so the initiator detects loss via the + // connection context and the responder's next accept errors out. 
+ require.NoError(t, initT.sess.Close()) + + runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + // Both peers must reconnect and resume installing generations (well past the single + // bring-up generation), proving the reconnect path self-heals. + require.Eventually(t, func() bool { + return len(initRec.snapshot()) >= 3 && len(respRec.snapshot()) >= 3 + }, 18*time.Second, 25*time.Millisecond, "peers must self-heal and resume rekeying after a session loss") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + // Agreement invariant: the initiator installs a generation only after reading the + // responder's offer, which the responder writes only after it has committed to + // installing — so the responder must have transmitted under (i.e. installed as its + // tx SPI) every receive SPI the initiator installed. (The reverse can differ by one: + // a negotiation torn by the forced loss after the responder committed but before the + // initiator finished leaves the responder with an extra generation.) + ir := initRec.snapshot() + respTxSPIs := map[uint32]bool{} + for _, rec := range respRec.snapshot() { + respTxSPIs[rec.txSPI] = true + } + for _, rec := range ir { + require.True(t, respTxSPIs[rec.rxSPI], "responder never transmitted under initiator receive SPI %d", rec.rxSPI) + } + + // The receive SPIs are NOT globally monotonic across the reconnect: the per-session + // allocator resets, so the post-reconnect generations start over at a low value. That + // is safe (and accepted by the handler) because the reconnect derives fresh keys — + // recovery rests on fresh keys, not on a carried-forward high-water. Just assert every + // installed receive SPI is a valid non-zero selector. 
+ for _, rec := range ir { + require.NotZero(t, rec.rxSPI) + } +} + +// TestTunnelReconnectGuardNeverRejects is the recovery regression test: with installers +// that ENFORCE the handler's relaxed TX anti-reset guard (reject only a re-install of +// the currently-live transmit SPI), a forced session loss must self-heal, keep installing +// generation after generation, AND never make either guard reject. The per-session +// allocator resets the SPIs to a low value after the reconnect, but each generation +// carries a fresh key, so the new (lower) transmit SPI is never equal to the survivor's +// currently-live one and the guard accepts it — recovery with zero persisted state and +// zero rejections. +func TestTunnelReconnectGuardNeverRejects(t *testing.T) { + initGuard, respGuard := newGuardInstaller(), newGuardInstaller() + initT, respT, cleanup := twoTunnels(t, initGuard.install, respGuard.install, 100*time.Millisecond) + defer cleanup() + + ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) + defer cancel() + + brCh := make(chan error, 1) + go func() { brCh <- respT.Bringup(ctx) }() + require.NoError(t, initT.Bringup(ctx)) + require.NoError(t, <-brCh) + + require.NoError(t, initT.sess.Close()) // force a session loss + + runCh := make(chan error, 2) + go func() { runCh <- initT.Run(ctx) }() + go func() { runCh <- respT.Run(ctx) }() + + // Progress well past the single bring-up generation on BOTH peers proves the guard + // keeps accepting because each fresh-keyed generation's transmit SPI differs from the + // currently-live one, even after the reconnect resets the allocator. 
+ require.Eventually(t, func() bool { + ig, _ := initGuard.snapshot() + rg, _ := respGuard.snapshot() + return len(ig) >= 5 && len(rg) >= 5 + }, 18*time.Second, 25*time.Millisecond, "peers must self-heal and keep installing under the guard") + + cancel() + require.NoError(t, <-runCh) + require.NoError(t, <-runCh) + + ig, iRej := initGuard.snapshot() + _, rRej := respGuard.snapshot() + require.Zero(t, iRej, "initiator guard must never reject") + require.Zero(t, rRej, "responder guard must never reject") + for _, rxSPI := range ig { + require.NotZero(t, rxSPI) + } +} diff --git a/control/identity.go b/control/identity.go new file mode 100644 index 0000000..93f80ce --- /dev/null +++ b/control/identity.go @@ -0,0 +1,160 @@ +package control + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/base64" + "encoding/pem" + "fmt" + "math/big" + "time" +) + +// Identity is a node's long-term signing key used to mutually authenticate the +// QUIC/mTLS control channel. It is an ECDSA P-256 key: a FIPS 186-approved +// signature algorithm in the Go FIPS 140-3 module, and a curve TLS 1.3 will use +// in FIPS mode. Peers authenticate each other WireGuard-style — by pinning the +// expected public key — rather than via a CA, so identities are self-signed. +// +// Note this signing key is distinct from the ephemeral ECDHE that TLS performs +// for forward secrecy; the identity only proves "who", the handshake provides +// the fresh per-session secret. +type Identity struct { + priv *ecdsa.PrivateKey +} + +// identityCertValidity is how long the self-signed identity certificate is +// nominally valid. Pinning ignores CA chains and (with the custom verifier) +// time validity, but a sane window keeps stricter stacks happy. +const identityCertValidity = 100 * 365 * 24 * time.Hour + +// GenerateIdentity creates a fresh ECDSA P-256 identity using crypto/rand. 
+func GenerateIdentity() (*Identity, error) {
+	priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
+	if err != nil {
+		return nil, fmt.Errorf("control: generate identity: %w", err)
+	}
+	return &Identity{priv: priv}, nil
+}
+
+// MarshalPrivatePEM encodes the identity private key as an unencrypted PKCS#8
+// PEM block of type "PRIVATE KEY", suitable for writing to a 0600 key file.
+func (id *Identity) MarshalPrivatePEM() ([]byte, error) {
+	der, err := x509.MarshalPKCS8PrivateKey(id.priv)
+	if err != nil {
+		return nil, fmt.Errorf("control: marshal private key: %w", err)
+	}
+	return pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: der}), nil
+}
+
+// LoadIdentityPEM parses a PKCS#8 PEM private key produced by MarshalPrivatePEM.
+// Only the first PEM block is decoded (its type and any trailing data are not checked); any key that is not ECDSA P-256 is rejected.
+func LoadIdentityPEM(pemBytes []byte) (*Identity, error) {
+	block, _ := pem.Decode(pemBytes)
+	if block == nil {
+		return nil, fmt.Errorf("control: no PEM block in identity key")
+	}
+	key, err := x509.ParsePKCS8PrivateKey(block.Bytes)
+	if err != nil {
+		return nil, fmt.Errorf("control: parse identity key: %w", err)
+	}
+	priv, ok := key.(*ecdsa.PrivateKey)
+	if !ok || priv.Curve != elliptic.P256() {
+		return nil, fmt.Errorf("control: identity key must be ECDSA P-256")
+	}
+	return &Identity{priv: priv}, nil
+}
+
+// PublicKey returns a pointer to the identity's own public key (not a copy), so callers should treat it as read-only.
+func (id *Identity) PublicKey() *ecdsa.PublicKey {
+	return &id.priv.PublicKey
+}
+
+// PublicKeyString returns the padded standard base64(SPKI DER) encoding of the
+// public key, as produced by MarshalPublicKey. This is the value distributed to
+// peers and supplied via --peer-key (analogous to a WireGuard public key).
+func (id *Identity) PublicKeyString() (string, error) {
+	return MarshalPublicKey(&id.priv.PublicKey)
+}
+
+// Fingerprint returns a short, stable identifier for the public key: the
+// unpadded standard base64 of SHA-256(SPKI DER). Used as the certificate subject and in logs.
+func (id *Identity) Fingerprint() (string, error) {
+	der, err := x509.MarshalPKIXPublicKey(&id.priv.PublicKey)
+	if err != nil {
+		return "", err
+	}
+	sum := sha256.Sum256(der)
+	return base64.RawStdEncoding.EncodeToString(sum[:]), nil
+}
+
+// MarshalPublicKey encodes a public key as padded standard base64 of its SPKI (PKIX) DER; ParsePublicKey is the inverse.
+func MarshalPublicKey(pub *ecdsa.PublicKey) (string, error) {
+	der, err := x509.MarshalPKIXPublicKey(pub)
+	if err != nil {
+		return "", fmt.Errorf("control: marshal public key: %w", err)
+	}
+	return base64.StdEncoding.EncodeToString(der), nil
+}
+
+// ParsePublicKey decodes a base64(SPKI DER) public key (the --peer-key value)
+// and verifies it is ECDSA P-256; any other key type or curve is an error.
+func ParsePublicKey(s string) (*ecdsa.PublicKey, error) {
+	der, err := base64.StdEncoding.DecodeString(s)
+	if err != nil {
+		return nil, fmt.Errorf("control: decode peer key: %w", err)
+	}
+	pub, err := x509.ParsePKIXPublicKey(der)
+	if err != nil {
+		return nil, fmt.Errorf("control: parse peer key: %w", err)
+	}
+	ec, ok := pub.(*ecdsa.PublicKey)
+	if !ok || ec.Curve != elliptic.P256() {
+		return nil, fmt.Errorf("control: peer key must be ECDSA P-256")
+	}
+	return ec, nil
+}
+
+// TLSCertificate builds a self-signed leaf certificate for this identity, for
+// use as the local end of the mTLS handshake. Authentication is by key pinning,
+// not by chain validation, so the certificate is its own issuer. Every call mints a new certificate.
+func (id *Identity) TLSCertificate() (tls.Certificate, error) {
+	fp, err := id.Fingerprint()
+	if err != nil {
+		return tls.Certificate{}, err
+	}
+	// A fixed serial is fine: the cert is never chained or revoked, only pinned. The validity window opens one hour in the past.
+	now := time.Now()
+	tmpl := &x509.Certificate{
+		SerialNumber:          big.NewInt(1),
+		Subject:               pkix.Name{CommonName: "icx:" + fp},
+		NotBefore:             now.Add(-time.Hour),
+		NotAfter:              now.Add(identityCertValidity),
+		KeyUsage:              x509.KeyUsageDigitalSignature,
+		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth},
+		BasicConstraintsValid: true,
+	}
+	der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &id.priv.PublicKey, id.priv)
+	if err != nil {
+		return tls.Certificate{}, fmt.Errorf("control: create self-signed cert: %w", err)
+	}
+	leaf, err := x509.ParseCertificate(der)
+	if err != nil {
+		return tls.Certificate{}, fmt.Errorf("control: parse self-signed cert: %w", err)
+	}
+	return tls.Certificate{
+		Certificate: [][]byte{der},
+		PrivateKey:  id.priv,
+		Leaf:        leaf,
+	}, nil
+}
+
+// PublicKeyEqual reports whether two ECDSA public keys are identical; a nil key never equals anything, including another nil.
+func PublicKeyEqual(a, b *ecdsa.PublicKey) bool {
+	return a != nil && b != nil && a.Equal(b)
+}
diff --git a/control/identity_test.go b/control/identity_test.go
new file mode 100644
index 0000000..baac675
--- /dev/null
+++ b/control/identity_test.go
@@ -0,0 +1,82 @@
+package control
+
+import (
+	"testing"
+)
+
+func TestIdentityPrivatePEMRoundTrip(t *testing.T) {
+	id, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+	pemBytes, err := id.MarshalPrivatePEM()
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := LoadIdentityPEM(pemBytes)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !PublicKeyEqual(id.PublicKey(), got.PublicKey()) {
+		t.Fatal("round-tripped identity public key differs")
+	}
+}
+
+func TestPublicKeyStringRoundTrip(t *testing.T) {
+	id, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+	s, err := id.PublicKeyString()
+	if err != nil {
+		t.Fatal(err)
+	}
+	pub, err := ParsePublicKey(s)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !PublicKeyEqual(id.PublicKey(), pub) {
+		t.Fatal("parsed peer key differs from original")
+	}
+}
+
+func TestParsePublicKeyRejectsGarbage(t *testing.T) {
+	if _, err := ParsePublicKey("not-base64!!"); err == nil {
+		t.Fatal("expected error for non-base64 peer key")
+	}
+	if _, err := ParsePublicKey("aGVsbG8="); err == nil { // base64 of "hello": valid base64, not a key
+		t.Fatal("expected error for non-SPKI peer key")
+	}
+}
+
+func TestTLSCertificatePinsIdentityKey(t *testing.T) {
+	id, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+	cert, err := id.TLSCertificate()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if cert.Leaf == nil {
+		t.Fatal("expected parsed Leaf on tls.Certificate")
+	}
+	// The leaf's public key must equal the identity's, so a pin against the
+	// identity key matches the certificate presented during the handshake.
+	if !id.PublicKey().Equal(cert.Leaf.PublicKey) {
+		t.Fatal("leaf certificate public key does not match identity")
+	}
+}
+
+func TestDistinctIdentitiesDiffer(t *testing.T) {
+	a, _ := GenerateIdentity()
+	b, _ := GenerateIdentity()
+	if PublicKeyEqual(a.PublicKey(), b.PublicKey()) {
+		t.Fatal("two generated identities must not collide")
+	}
+	fa, _ := a.Fingerprint()
+	fb, _ := b.Fingerprint()
+	if fa == fb || fa == "" {
+		t.Fatalf("fingerprints should be distinct and non-empty: %q %q", fa, fb)
+	}
+}
diff --git a/control/kdf.go b/control/kdf.go
new file mode 100644
index 0000000..1909058
--- /dev/null
+++ b/control/kdf.go
@@ -0,0 +1,88 @@
+package control
+
+import (
+	"encoding/binary"
+	"fmt"
+)
+
+// ICXVersion is an AEAD cipher-suite codepoint for an SA. It selects both the
+// AEAD (AES-GCM-128 vs AES-GCM-256) and, via the KDF label, the size of the
+// derived security-association key. It is a local cipher selector, not a
+// wire-format version (that is ProtocolVersion in protocol.go).
+type ICXVersion uint8
+
+const (
+	// AESGCM128 selects AES-GCM-128: a 16-byte SA key. The ICX default (zero
+	// churn to the [16]byte data plane).
+	AESGCM128 ICXVersion = 0
+	// AESGCM256 selects AES-GCM-256: a 32-byte SA key. The CNSA / 256-bit path.
+	AESGCM256 ICXVersion = 1
+)
+
+// MasterKeyLen is the required length of a PSP master key (256 bits). PSP
+// master keys are always AES-256 keys regardless of the SA key size.
+const MasterKeyLen = 32
+
+// label returns the 4-byte SP 800-108 label for the version: "Pv0\0"
+// (0x50 0x76 0x30 0x00) for v0, "Pv1\0" for v1. The trailing NUL also serves as
+// the SP 800-108 label/context separator. Per the spec, the version number may
+// be OR'd into the third byte of the base label.
+func (v ICXVersion) label() [4]byte {
+	return [4]byte{0x50, 0x76, 0x30 | byte(v), 0x00}
+}
+
+// valid reports whether v is a supported cipher suite. Callers must reject
+// unsupported versions before deriving keys (fail-closed), so keyLen/label are
+// never asked to map an unknown version.
+func (v ICXVersion) valid() bool { return v == AESGCM128 || v == AESGCM256 }
+
+// keyLen returns the derived SA key length in bytes for the version: 32 for
+// AESGCM256 and 16 for every other value, so callers must check valid() first (DeriveSAKey does).
+func (v ICXVersion) keyLen() int {
+	if v == AESGCM256 {
+		return 32
+	}
+	return 16
+}
+
+// DeriveSAKey derives a PSP security-association key from a 256-bit master key
+// and a 32-bit SPI, exactly per the PSP Architecture Specification: a NIST
+// SP 800-108 counter-mode KDF whose PRF is AES-CMAC (see cmac.go). Each PRF
+// input block is the 16-byte concatenation
+//
+//	counter(4) || label(4) || context=SPI(4) || length-in-bits(4)
+//
+// all in network byte order. A 128-bit key needs one block (counter=1); a
+// 256-bit key needs two (counter=1, counter=2) concatenated.
+//
+// The caller is responsible for selecting which master key to pass based on the
+// SPI's most-significant bit (the PSP master-key selector); the SPI is fed into
+// the KDF context verbatim, MSB included, so the derivation is bound to it. It errors on a wrong-length master key or an unsupported v.
+func DeriveSAKey(masterKey []byte, spi uint32, v ICXVersion) ([]byte, error) {
+	if len(masterKey) != MasterKeyLen {
+		return nil, fmt.Errorf("control: master key must be %d bytes, got %d", MasterKeyLen, len(masterKey))
+	}
+	if !v.valid() {
+		return nil, fmt.Errorf("control: unsupported cipher suite %d", v)
+	}
+
+	keyLen := v.keyLen()
+	bitLen := uint32(keyLen * 8)
+	label := v.label()
+	blocks := (keyLen + 15) / 16
+
+	out := make([]byte, 0, blocks*16)
+	for i := 1; i <= blocks; i++ {
+		var in [16]byte
+		binary.BigEndian.PutUint32(in[0:4], uint32(i)) // counter: 1-based block index
+		copy(in[4:8], label[:])                        // label: the per-suite 4-byte tag
+		binary.BigEndian.PutUint32(in[8:12], spi)      // context = SPI
+		binary.BigEndian.PutUint32(in[12:16], bitLen)  // length of the whole derived key, in bits
+		mac, err := aesCMAC(masterKey, in[:])
+		if err != nil {
+			return nil, err
+		}
+		out = append(out, mac...)
+	}
+	return out[:keyLen], nil
+}
diff --git a/control/kdf_test.go b/control/kdf_test.go
new file mode 100644
index 0000000..ed35ce2
--- /dev/null
+++ b/control/kdf_test.go
@@ -0,0 +1,107 @@
+package control
+
+import (
+	"bytes"
+	"encoding/hex"
+	"strings"
+	"testing"
+)
+
+func unhex(t *testing.T, s string) []byte {
+	t.Helper()
+	b, err := hex.DecodeString(strings.ReplaceAll(s, " ", ""))
+	if err != nil {
+		t.Fatalf("bad hex %q: %v", s, err)
+	}
+	return b
+}
+
+// TestAESCMAC_RFC4493 validates the AES-CMAC PRF against the canonical RFC 4493
+// (= NIST SP 800-38B) test vectors: empty, one full block, a partial final
+// block, and a multi-block message.
+func TestAESCMAC_RFC4493(t *testing.T) {
+	key := unhex(t, "2b7e151628aed2a6abf7158809cf4f3c")
+	const (
+		b1      = "6bc1bee22e409f96e93d7e117393172a"
+		b2      = "ae2d8a571e03ac9c9eb76fac45af8e51"
+		b3      = "30c81c46a35ce411e5fbc1191a0a52ef"
+		b4      = "f69f2445df4f9b17ad2b417be66c3710"
+		b3part8 = "30c81c46a35ce411" // first 8 bytes of b3; b1+b2+b3part8 is the 40-byte message
+	)
+	cases := []struct {
+		name, msg, want string
+	}{
+		{"len0", "", "bb1d6929e95937287fa37d129b756746"},
+		{"len16", b1, "070a16b46b4d4144f79bdd9dd04a287c"},
+		{"len40", b1 + b2 + b3part8, "dfa66747de9ae63030ca32611497c827"},
+		{"len64", b1 + b2 + b3 + b4, "51f0bebf7e3b9d92fc49741779363cfe"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got, err := aesCMAC(key, unhex(t, c.msg))
+			if err != nil {
+				t.Fatal(err)
+			}
+			if want := unhex(t, c.want); !bytes.Equal(got, want) {
+				t.Fatalf("CMAC mismatch\n got %x\nwant %x", got, want)
+			}
+		})
+	}
+}
+
+// TestDeriveSAKey_PSPSpec validates the SP 800-108 KDF against the worked
+// examples in the PSP Architecture Specification ("Examples of key
+// derivation", p.7), and checks each derived key has the suite's key length.
+func TestDeriveSAKey_PSPSpec(t *testing.T) {
+	k0 := unhex(t, "34448a064292601b11a0978f56a2d34cf3fc35ede1a6bc04f8db3e5243a2b0ca")
+	k1 := unhex(t, "563952565d3a78ae773ec1b779f2f2d99f4a7f53a6fbb9b07d5b71f39364d739")
+
+	cases := []struct {
+		name    string
+		master  []byte
+		spi     uint32
+		version ICXVersion
+		want    string
+	}{
+		{
+			name: "v0_spi_12345678_mk0", master: k0, spi: 0x12345678, version: AESGCM128,
+			want: "96c22dc799198090b74b70ae468e4e30",
+		},
+		{
+			// MSB set -> master key 1 selected by the caller (DeriveSAKey itself does not pick the key).
+			name: "v0_spi_9A345678_mk1", master: k1, spi: 0x9A345678, version: AESGCM128,
+			want: "3946da2554eae46ad1ef77a64372edc4",
+		},
+		{
+			name: "v1_spi_12345678_mk0", master: k0, spi: 0x12345678, version: AESGCM256,
+			want: "2b7d72074e42ca334487f2990e3f8c4037e436f38283449b76463e9b7fb2e3de",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got, err := DeriveSAKey(c.master, c.spi, c.version)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if want := unhex(t, c.want); !bytes.Equal(got, want) {
+				t.Fatalf("SA key mismatch\n got %x\nwant %x", got, want)
+			}
+			if len(got) != c.version.keyLen() {
+				t.Fatalf("key length = %d, want %d", len(got), c.version.keyLen())
+			}
+		})
+	}
+}
+
+func TestDeriveSAKey_BadMasterKeyLen(t *testing.T) {
+	if _, err := DeriveSAKey(make([]byte, 16), 1, AESGCM128); err == nil {
+		t.Fatal("expected error for 16-byte master key, got nil")
+	}
+}
+
+func TestDeriveSAKey_UnsupportedVersionFailsClosed(t *testing.T) {
+	mk := make([]byte, MasterKeyLen)
+	if _, err := DeriveSAKey(mk, 1, ICXVersion(7)); err == nil {
+		t.Fatal("expected error for unsupported cipher suite, got nil (must fail closed, not default to 16 bytes)")
+	}
+}
diff --git a/control/protocol.go b/control/protocol.go
new file mode 100644
index 0000000..427ffa3
--- /dev/null
+++ b/control/protocol.go
@@ -0,0 +1,104 @@
+package control
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+// The control-plane SA-setup protocol runs over a bidirectional QUIC stream
+// after the mTLS handshake. Because both peers derive the identical PSP master
+// keys from the TLS exporter, no key material is ever exchanged — peers only
+// announce the SPI on which each will RECEIVE, and derive every key locally.
+//
+// Frame = uint16 big-endian length prefix + payload. Messages are small and
+// fixed today; the leading protocol-version + type bytes leave room to grow
+// (lifetimes, capabilities, rekey signalling) without breaking the framing.
+
+const (
+	// ProtocolVersion is the control-plane wire-protocol version.
+	ProtocolVersion = 1
+	// maxFrameLen bounds a single control frame (these are tiny; the cap just
+	// stops a peer from forcing a large allocation). Both writeFrame and readFrame enforce it.
+	maxFrameLen = 4096
+)
+
+type msgType uint8
+
+const (
+	msgSAOffer msgType = 1
+)
+
+// saOffer announces the SPI on which the sender will RECEIVE data-plane traffic
+// for the given cipher suite. The peer derives the key for this SPI and uses it
+// as its TX key; the sender uses it as its RX key. On the wire: protocol version, message type, suite, big-endian RxSPI.
+type saOffer struct {
+	Version ICXVersion
+	RxSPI   uint32
+}
+
+const saOfferLen = 1 + 1 + 1 + 4 // protoVer + type + suite + rxSPI; parseSAOffer accepts exactly this many bytes
+
+func (o saOffer) marshal() []byte {
+	b := make([]byte, saOfferLen)
+	b[0] = ProtocolVersion
+	b[1] = byte(msgSAOffer)
+	b[2] = byte(o.Version)
+	binary.BigEndian.PutUint32(b[3:], o.RxSPI)
+	return b
+}
+
+func parseSAOffer(b []byte) (saOffer, error) {
+	if len(b) != saOfferLen {
+		return saOffer{}, fmt.Errorf("control: SA offer wrong size %d, want %d", len(b), saOfferLen)
+	}
+	if b[0] != ProtocolVersion {
+		return saOffer{}, fmt.Errorf("control: unsupported protocol version %d", b[0])
+	}
+	if msgType(b[1]) != msgSAOffer {
+		return saOffer{}, fmt.Errorf("control: expected SA offer, got message type %d", b[1])
+	}
+	return saOffer{
+		Version: ICXVersion(b[2]),
+		RxSPI:   binary.BigEndian.Uint32(b[3:7]),
+	}, nil
+}
+
+// writeFrame writes a length-prefixed control frame: a 2-byte big-endian length, then the payload, as two separate Write calls.
+func writeFrame(w io.Writer, payload []byte) error {
+	if len(payload) > maxFrameLen {
+		return fmt.Errorf("control: frame too large (%d)", len(payload))
+	}
+	var hdr [2]byte
+	binary.BigEndian.PutUint16(hdr[:], uint16(len(payload)))
+	if _, err := w.Write(hdr[:]); err != nil {
+		return err
+	}
+	_, err := w.Write(payload)
+	return err
+}
+
+// readFrame reads a single length-prefixed control frame, rejecting a declared length above maxFrameLen before allocating.
+func readFrame(r io.Reader) ([]byte, error) {
+	var hdr [2]byte
+	if _, err := io.ReadFull(r, hdr[:]); err != nil {
+		return nil, err
+	}
+	n := binary.BigEndian.Uint16(hdr[:])
+	if int(n) > maxFrameLen {
+		return nil, fmt.Errorf("control: frame too large (%d)", n)
+	}
+	buf := make([]byte, n)
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return nil, err
+	}
+	return buf, nil
+}
+
+func readSAOffer(r io.Reader) (saOffer, error) {
+	b, err := readFrame(r)
+	if err != nil {
+		return saOffer{}, err
+	}
+	return parseSAOffer(b)
+}
diff --git a/control/sa.go b/control/sa.go
new file mode 100644
index 0000000..27e73b7
--- /dev/null
+++ b/control/sa.go
@@ -0,0 +1,142 @@
+package control
+
+import (
+	"crypto/hkdf"
+	"crypto/sha256"
+	"errors"
+	"fmt"
+	"sync"
+)
+
+// numMasterKeys is the PSP master-key count: one active, one retained for
+// in-flight SAs during rotation (the MSB of the SPI selects between them).
+const numMasterKeys = 2
+
+// MasterKeys holds the two 256-bit PSP master keys. They are seeded from the
+// forward-secret TLS exporter (see ExportRootSecret) and live only in RAM; they
+// are never persisted, so a recorded session cannot be decrypted once they are
+// dropped — this is where the forward secrecy reaches the data plane.
+type MasterKeys struct {
+	keys [numMasterKeys][MasterKeyLen]byte
+}
+
+// masterKeyInfo domain-separates the master-key derivation from any other use
+// of the root secret. It is passed as the HKDF info string in DeriveMasterKeys.
+const masterKeyInfo = "icx psp master keys v1"
+
+// DeriveMasterKeys expands the TLS-exported root secret into the two PSP master
+// keys via HKDF-SHA-256 (NIST SP 800-56C), using a nil salt and masterKeyInfo as info. Both peers feed the identical
+// root secret and therefore derive the identical master keys, so each can compute any SA key locally from its SPI —
+// no key material ever crosses the wire. A root secret shorter than RootSecretLen bytes is rejected.
+func DeriveMasterKeys(rootSecret []byte) (*MasterKeys, error) {
+	if len(rootSecret) < RootSecretLen {
+		return nil, fmt.Errorf("control: root secret must be >= %d bytes, got %d", RootSecretLen, len(rootSecret))
+	}
+	okm, err := hkdf.Key(sha256.New, rootSecret, nil, masterKeyInfo, numMasterKeys*MasterKeyLen)
+	if err != nil {
+		return nil, fmt.Errorf("control: derive master keys: %w", err)
+	}
+	mk := &MasterKeys{}
+	for i := range mk.keys {
+		copy(mk.keys[i][:], okm[i*MasterKeyLen:(i+1)*MasterKeyLen])
+	}
+	return mk, nil
+}
+
+// MasterKeyIndex returns which master key (0 or 1) an SPI selects: per PSP, the
+// most-significant bit (bit 31) of the SPI.
+func MasterKeyIndex(spi uint32) int { return int(spi >> 31) }
+
+// SA is a unidirectional PSP security association: an SPI, the derived AES-GCM
+// key, and the cipher suite (which fixes the key length / cipher).
+type SA struct {
+	SPI     uint32
+	Key     []byte
+	Version ICXVersion
+}
+
+// DeriveSA derives the SA key for spi using the master key its MSB selects; it rejects an SPI whose low 31 bits are zero and returns any DeriveSAKey error unchanged.
+func (m *MasterKeys) DeriveSA(spi uint32, v ICXVersion) (*SA, error) {
+	if spi&spiLowMask == 0 {
+		return nil, errors.New("control: SPI low 31 bits must be non-zero (zero is reserved)")
+	}
+	key, err := DeriveSAKey(m.keys[MasterKeyIndex(spi)][:], spi, v)
+	if err != nil {
+		return nil, err
+	}
+	return &SA{SPI: spi, Key: key, Version: v}, nil
+}
+
+// Role identifies which peer allocated an SPI. The two directions MUST use
+// distinct SPIs, otherwise both directions would derive the same key
+// (txKey == rxKey). Partitioning the SPI space by role guarantees distinctness
+// even though both peers allocate independently from the shared master keys.
+type Role uint8 + +const ( + Initiator Role = iota // canonical lower static key + Responder +) + +// SPI bit layout (PSP keeps the SPI opaque except for the MSB master-key +// selector; we additionally reserve one bit to partition by allocating role): +// +// bit31 master-key index (PSP) +// bit30 allocating role (0=initiator, 1=responder) +// bits[29:0] per-(index,role) counter, 1..2^30-1 (0 reserved) +const ( + spiRoleShift = 30 + spiCounterMask = (uint32(1) << spiRoleShift) - 1 // low 30 bits + spiLowMask = uint32(0x7fffffff) // low 31 bits (PSP: must be non-zero) +) + +// MakeSPI composes an SPI from the active master-key index, the allocating role +// and a per-(index,role) counter. +func MakeSPI(masterKeyIndex int, role Role, counter uint32) (uint32, error) { + if masterKeyIndex < 0 || masterKeyIndex >= numMasterKeys { + return 0, fmt.Errorf("control: master key index must be 0..%d", numMasterKeys-1) + } + if role > Responder { + return 0, fmt.Errorf("control: invalid role %d", role) + } + if counter == 0 || counter > spiCounterMask { + return 0, fmt.Errorf("control: SPI counter out of range (1..%d)", spiCounterMask) + } + return uint32(masterKeyIndex)<<31 | uint32(role)<= numMasterKeys { + return 0, fmt.Errorf("control: master key index must be 0..%d", numMasterKeys-1) + } + a.mu.Lock() + defer a.mu.Unlock() + // Check before incrementing so an exhausted counter stays pinned at the + // ceiling (spiCounterMask is the last usable value) rather than wrapping. 
+ if a.next[masterKeyIndex] >= spiCounterMask { + return 0, fmt.Errorf("%w (master key %d)", ErrSPIExhausted, masterKeyIndex) + } + a.next[masterKeyIndex]++ + return MakeSPI(masterKeyIndex, a.role, a.next[masterKeyIndex]) +} diff --git a/control/sa_test.go b/control/sa_test.go new file mode 100644 index 0000000..2bf92ab --- /dev/null +++ b/control/sa_test.go @@ -0,0 +1,118 @@ +package control + +import ( + "bytes" + "testing" +) + +func TestDeriveMasterKeysDeterministic(t *testing.T) { + root := bytes.Repeat([]byte{0xA5}, RootSecretLen) + a, err := DeriveMasterKeys(root) + if err != nil { + t.Fatal(err) + } + b, err := DeriveMasterKeys(root) + if err != nil { + t.Fatal(err) + } + if a.keys != b.keys { + t.Fatal("master keys not deterministic for the same root secret") + } + if a.keys[0] == a.keys[1] { + t.Fatal("the two master keys must differ") + } +} + +func TestDeriveMasterKeysSessionUnique(t *testing.T) { + a, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x01}, RootSecretLen)) + b, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x02}, RootSecretLen)) + if a.keys == b.keys { + t.Fatal("different root secrets must yield different master keys (per-session FS)") + } +} + +func TestDeriveMasterKeysRejectsShortRoot(t *testing.T) { + if _, err := DeriveMasterKeys(make([]byte, RootSecretLen-1)); err == nil { + t.Fatal("expected error for short root secret") + } +} + +func TestDeriveSAMatchesKDFAndSelectsMasterKey(t *testing.T) { + mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x5a}, RootSecretLen)) + + // MSB clear -> master key 0; MSB set -> master key 1. 
+ spi0, _ := MakeSPI(0, Initiator, 7) + spi1, _ := MakeSPI(1, Initiator, 7) + + sa0, err := mk.DeriveSA(spi0, AESGCM128) + if err != nil { + t.Fatal(err) + } + want0, _ := DeriveSAKey(mk.keys[0][:], spi0, AESGCM128) + if !bytes.Equal(sa0.Key, want0) { + t.Fatal("DeriveSA(MSB=0) did not use master key 0") + } + + sa1, err := mk.DeriveSA(spi1, AESGCM128) + if err != nil { + t.Fatal(err) + } + want1, _ := DeriveSAKey(mk.keys[1][:], spi1, AESGCM128) + if !bytes.Equal(sa1.Key, want1) { + t.Fatal("DeriveSA(MSB=1) did not use master key 1") + } +} + +// TestDirectionsNeverCollide is the txKey != rxKey guarantee: the two peers +// allocate RX SPIs independently, but the role bit keeps them in disjoint +// subspaces, so the same counter yields different SPIs and thus different keys. +func TestDirectionsNeverCollide(t *testing.T) { + mk, _ := DeriveMasterKeys(bytes.Repeat([]byte{0x33}, RootSecretLen)) + initAlloc := NewSPIAllocator(Initiator) + respAlloc := NewSPIAllocator(Responder) + + seen := map[uint32]bool{} + for i := 0; i < 1000; i++ { + is, err := initAlloc.Allocate(0) + if err != nil { + t.Fatal(err) + } + rs, err := respAlloc.Allocate(0) + if err != nil { + t.Fatal(err) + } + if is == rs { + t.Fatalf("initiator and responder allocated the same SPI %#x", is) + } + if seen[is] || seen[rs] { + t.Fatalf("SPI reuse detected at i=%d", i) + } + seen[is], seen[rs] = true, true + + txSA, _ := mk.DeriveSA(is, AESGCM128) + rxSA, _ := mk.DeriveSA(rs, AESGCM128) + if bytes.Equal(txSA.Key, rxSA.Key) { + t.Fatal("tx and rx SA keys collided") + } + } +} + +func TestMakeSPIValidation(t *testing.T) { + if _, err := MakeSPI(0, Initiator, 0); err == nil { + t.Fatal("counter 0 must be rejected") + } + if _, err := MakeSPI(2, Initiator, 1); err == nil { + t.Fatal("master key index 2 must be rejected") + } + if _, err := MakeSPI(0, Initiator, spiCounterMask+1); err == nil { + t.Fatal("counter overflow must be rejected") + } +} + +func TestDeriveSARejectsReservedSPI(t *testing.T) { + mk, 
_ := DeriveMasterKeys(bytes.Repeat([]byte{0x01}, RootSecretLen)) + // SPI whose low 31 bits are zero (only the master-key bit set) is reserved. + if _, err := mk.DeriveSA(uint32(1)<<31, AESGCM128); err == nil { + t.Fatal("expected error for reserved SPI (zero low 31 bits)") + } +} diff --git a/control/tls.go b/control/tls.go new file mode 100644 index 0000000..d350ba4 --- /dev/null +++ b/control/tls.go @@ -0,0 +1,119 @@ +package control + +import ( + "crypto/ecdsa" + "crypto/tls" + "errors" + "fmt" +) + +// ALPN is the application-layer protocol name negotiated on the ICX control +// channel. A mismatch (e.g. a stray TLS client) fails the handshake. +const ALPN = "icx-ctrl/1" + +// exporterLabel is the RFC 5705 / RFC 8446 §7.5 exporter label used to derive +// the data-plane master-key seed from the completed TLS 1.3 handshake. Changing +// it is a breaking protocol change. +const exporterLabel = "EXPORTER-icx-master-v1" + +// exporterContext domain-separates the master-key seed from any other exporter +// use on the same connection. +var exporterContext = []byte("icx control plane master seed v1") + +// RootSecretLen is the length of the exported master-key seed (256-bit). +const RootSecretLen = 32 + +// pinVerifier returns a tls.Config.VerifyConnection callback that authenticates +// the peer WireGuard-style: the leaf certificate's public key must equal the +// pinned peer identity key. Chain/CA/hostname validation is intentionally not +// used (the certificates are self-signed); pinning is the whole trust model. 
+func pinVerifier(peerPub *ecdsa.PublicKey) func(tls.ConnectionState) error {
+	return func(cs tls.ConnectionState) error {
+		if len(cs.PeerCertificates) == 0 {
+			return errors.New("control: peer presented no certificate")
+		}
+		leafPub, ok := cs.PeerCertificates[0].PublicKey.(*ecdsa.PublicKey)
+		if !ok {
+			return errors.New("control: peer certificate key is not ECDSA")
+		}
+		if !leafPub.Equal(peerPub) {
+			return errors.New("control: peer key pin mismatch")
+		}
+		return nil
+	}
+}
+
+// baseTLSConfig builds the shared TLS 1.3 configuration: our self-signed
+// identity certificate, the pinned-peer verifier, TLS 1.3 only, the ICX ALPN,
+// and FIPS-approved curves (P-256, P-384). In a fips140=on build the module further restricts
+// the suite to AES-GCM + SHA-2 and disables X25519/ChaCha automatically, so the
+// whole handshake stays inside the validated boundary. It errors if local or peerPub is nil.
+func baseTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) {
+	if local == nil || peerPub == nil {
+		return nil, errors.New("control: local identity and peer key are required")
+	}
+	cert, err := local.TLSCertificate()
+	if err != nil {
+		return nil, err
+	}
+	return &tls.Config{
+		Certificates:     []tls.Certificate{cert},
+		MinVersion:       tls.VersionTLS13,
+		MaxVersion:       tls.VersionTLS13,
+		NextProtos:       []string{ALPN},
+		CurvePreferences: []tls.CurveID{tls.CurveP256, tls.CurveP384},
+		VerifyConnection: pinVerifier(peerPub),
+		// Disable TLS 1.3 session resumption (and therefore 0-RTT). Every (re)connect
+		// MUST be a full ECDHE handshake so each session derives FRESH master keys: that
+		// freshness is the data plane's nonce-uniqueness foundation (the per-direction
+		// install guard accepts a reset/regressed SPI precisely because its key is fresh
+		// — see handler.UpdateVirtualNetworkSAs). A resumed session could reuse keying
+		// material and, paired with a reset SPI, repeat a (key, nonce) pair. The server
+		// also never issues tickets; newSession additionally asserts !DidResume/!0-RTT.
+		SessionTicketsDisabled: true,
+	}, nil
+}
+
+// ServerTLSConfig builds the responder side of the control-plane mTLS: it
+// requires (and pins) a client certificate.
+func ServerTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) {
+	cfg, err := baseTLSConfig(local, peerPub)
+	if err != nil {
+		return nil, err
+	}
+	// Accept any presented client cert at the chain layer; pinVerifier (run via
+	// VerifyConnection) is what actually authenticates it.
+	cfg.ClientAuth = tls.RequireAnyClientCert
+	return cfg, nil
+}
+
+// ClientTLSConfig builds the initiator side of the control-plane mTLS; ServerName is the fixed string "icx", and pinning, not the name, authenticates the peer.
+func ClientTLSConfig(local *Identity, peerPub *ecdsa.PublicKey) (*tls.Config, error) {
+	cfg, err := baseTLSConfig(local, peerPub)
+	if err != nil {
+		return nil, err
+	}
+	// InsecureSkipVerify disables ONLY the default CA-chain/hostname checks, which
+	// are meaningless for a self-signed, pinned peer. It does NOT disable peer
+	// authentication: VerifyConnection (pinVerifier) still runs and fully
+	// authenticates the peer by its pinned public key. Without this flag the
+	// handshake would fail on the absent CA chain before pinning could run.
+	cfg.InsecureSkipVerify = true
+	cfg.ServerName = "icx"
+	return cfg, nil
+}
+
+// ExportRootSecret derives the 32-byte data-plane master-key seed from a
+// completed TLS 1.3 handshake via the RFC 8446 exporter. Both peers compute the
+// identical value; it is the forward-secret root the PSP master keys are seeded
+// from (see DeriveMasterKeys in sa.go). It must only be called after the handshake completes, and it refuses any state that is not TLS 1.3.
+func ExportRootSecret(cs tls.ConnectionState) ([]byte, error) {
+	// A zero or pre-1.3 state has no usable exporter; refuse it up front.
+	if cs.Version != tls.VersionTLS13 {
+		return nil, fmt.Errorf("control: refusing to export from TLS version %#x (want 1.3)", cs.Version)
+	}
+	secret, err := cs.ExportKeyingMaterial(exporterLabel, exporterContext, RootSecretLen)
+	if err != nil {
+		return nil, fmt.Errorf("control: export keying material: %w", err)
+	}
+	return secret, nil
+}
diff --git a/control/tls_test.go b/control/tls_test.go
new file mode 100644
index 0000000..267534a
--- /dev/null
+++ b/control/tls_test.go
@@ -0,0 +1,138 @@
+package control
+
+import (
+	"bytes"
+	"crypto/tls"
+	"net"
+	"strings"
+	"testing"
+	"time"
+)
+
+// tlsHandshakeResult carries one side's post-handshake outcome.
+type tlsHandshakeResult struct {
+	state tls.ConnectionState
+	err   error
+}
+
+// doHandshake runs a TLS 1.3 mTLS handshake between a client and server config
+// over an in-memory pipe and returns both sides' results.
+func doHandshake(t *testing.T, clientCfg, serverCfg *tls.Config) (client, server tlsHandshakeResult) {
+	t.Helper()
+	c, s := net.Pipe()
+	defer c.Close()
+	defer s.Close()
+
+	srvCh := make(chan tlsHandshakeResult, 1)
+	go func() {
+		conn := tls.Server(s, serverCfg)
+		_ = conn.SetDeadline(time.Now().Add(2 * time.Second))
+		err := conn.Handshake()
+		srvCh <- tlsHandshakeResult{state: conn.ConnectionState(), err: err}
+	}()
+
+	conn := tls.Client(c, clientCfg)
+	_ = conn.SetDeadline(time.Now().Add(2 * time.Second))
+	clientErr := conn.Handshake()
+	client = tlsHandshakeResult{state: conn.ConnectionState(), err: clientErr}
+	server = <-srvCh
+	return client, server
+}
+
+func mustConfigs(t *testing.T) (clientCfg, serverCfg *tls.Config, a, b *Identity) {
+	t.Helper()
+	a, err := GenerateIdentity() // client/initiator
+	if err != nil {
+		t.Fatal(err)
+	}
+	b, err = GenerateIdentity() // server/responder
+	if err != nil {
+		t.Fatal(err)
+	}
+	clientCfg, err = ClientTLSConfig(a, b.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	serverCfg, err = ServerTLSConfig(b, a.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	return clientCfg, serverCfg, a, b
+}
+
+func TestMTLSHandshakeSucceedsAndExportsSharedSecret(t *testing.T) {
+	clientCfg, serverCfg, _, _ := mustConfigs(t)
+
+	client, server := doHandshake(t, clientCfg, serverCfg)
+	if client.err != nil {
+		t.Fatalf("client handshake failed: %v", client.err)
+	}
+	if server.err != nil {
+		t.Fatalf("server handshake failed: %v", server.err)
+	}
+
+	if client.state.Version != tls.VersionTLS13 {
+		t.Fatalf("negotiated TLS version %#x, want 1.3", client.state.Version)
+	}
+	if client.state.NegotiatedProtocol != ALPN {
+		t.Fatalf("ALPN = %q, want %q", client.state.NegotiatedProtocol, ALPN)
+	}
+
+	cs, err := ExportRootSecret(client.state)
+	if err != nil {
+		t.Fatal(err)
+	}
+	ss, err := ExportRootSecret(server.state)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(cs) != RootSecretLen {
+		t.Fatalf("root secret len = %d, want %d", len(cs), RootSecretLen)
+	}
+	if !bytes.Equal(cs, ss) {
+		t.Fatalf("exported root secrets differ:\n client %x\n server %x", cs, ss)
+	}
+}
+
+func TestMTLSWrongClientPinRejected(t *testing.T) {
+	// Server pins a DIFFERENT key than the client actually holds.
+	client, _ := GenerateIdentity()
+	server, _ := GenerateIdentity()
+	imposter, _ := GenerateIdentity()
+
+	clientCfg, _ := ClientTLSConfig(client, server.PublicKey())
+	// Server expects `imposter`, but the client authenticates as `client`.
+	serverCfg, _ := ServerTLSConfig(server, imposter.PublicKey())
+
+	// Only the server's verdict is asserted: a TLS 1.3 client finishes its own
+	// handshake before the server has verified the client certificate, so the
+	// client side may legitimately report success (or a mere pipe timeout) here.
+	_, s := doHandshake(t, clientCfg, serverCfg)
+	if s.err == nil {
+		t.Fatal("server accepted a client whose key it did not pin")
+	}
+}
+
+func TestMTLSWrongServerPinRejected(t *testing.T) {
+	client, _ := GenerateIdentity()
+	server, _ := GenerateIdentity()
+	imposter, _ := GenerateIdentity()
+
+	// Client expects `imposter`, but the server authenticates as `server`.
+	clientCfg, _ := ClientTLSConfig(client, imposter.PublicKey())
+	serverCfg, _ := ServerTLSConfig(server, client.PublicKey())
+
+	c, _ := doHandshake(t, clientCfg, serverCfg)
+	if c.err == nil {
+		t.Fatal("client accepted a server whose key it did not pin")
+	}
+	if !strings.Contains(c.err.Error(), "pin mismatch") { // must be the pin check, not a timeout
+		t.Fatalf("expected a pin-mismatch error, got: %v", c.err)
+	}
+}
+
+func TestExportRootSecretRejectsZeroState(t *testing.T) {
+	if _, err := ExportRootSecret(tls.ConnectionState{}); err == nil {
+		t.Fatal("expected error exporting from a non-1.3 (zero) connection state")
+	}
+}
diff --git a/control/transport.go b/control/transport.go
new file mode 100644
index 0000000..438e23a
--- /dev/null
+++ b/control/transport.go
@@ -0,0 +1,267 @@
+package control
+
+import (
+	"bytes"
+	"context"
+	"crypto/ecdsa"
+	"crypto/tls"
+	"errors"
+	"fmt"
+	"net"
+	"time"
+
+	"github.com/quic-go/quic-go"
+)
+
+// appErrNormal is the QUIC application close code used for a clean shutdown.
+const appErrNormal quic.ApplicationErrorCode = 0
+
+// activeMasterKeyIndex is the master key used for new SAs in this first
+// generation. Master-key rotation (PSP's double-rotation) is layered on later;
+// for now both peers always use index 0.
+const activeMasterKeyIndex = 0
+
+// defaultQUICConfig is the control-plane QUIC configuration. Notably it does NOT
+// enable 0-RTT, so every (re)handshake is a full ECDHE exchange — fresh keys
+// per session, i.e. forward secrecy by construction.
+func defaultQUICConfig() *quic.Config {
+	return &quic.Config{
+		HandshakeIdleTimeout: 10 * time.Second,
+		MaxIdleTimeout:       30 * time.Second,
+		KeepAlivePeriod:      10 * time.Second,
+		MaxIncomingStreams:   4,
+	}
+}
+
+// Session is an established control-plane connection: an authenticated,
+// forward-secret QUIC/mTLS channel plus the PSP master keys derived from its
+// TLS exporter. From a Session, peers negotiate the per-direction SAs whose
+// keys feed the Geneve/AF_XDP data plane.
+type Session struct {
+	conn       *quic.Conn    // underlying QUIC connection
+	role       Role          // Initiator (Dial) or Responder (Accept)
+	masterKeys *MasterKeys   // derived once from the TLS exporter in newSession
+	rxAlloc    *SPIAllocator // allocates this peer's own receive SPIs
+}
+
+// Dial establishes the initiator side of a control session to peerAddr over the
+// already-bound UDP socket pconn, authenticating as local and pinning peerPub.
+func Dial(ctx context.Context, pconn net.PacketConn, peerAddr net.Addr, local *Identity, peerPub *ecdsa.PublicKey) (*Session, error) {
+	tlsConf, err := ClientTLSConfig(local, peerPub)
+	if err != nil {
+		return nil, err
+	}
+	conn, err := quic.Dial(ctx, pconn, peerAddr, tlsConf, defaultQUICConfig())
+	if err != nil {
+		return nil, fmt.Errorf("control: dial: %w", err)
+	}
+	return newSession(ctx, conn, Initiator)
+}
+
+// Listener accepts inbound control sessions on a UDP socket. Listen configures
+// its quic.Transport to demand a Retry (source-address validation) from every
+// new peer, on top of QUIC's 3x anti-amplification limit, as flood defense.
+type Listener struct {
+	ln *quic.Listener
+	tr *quic.Transport
+}
+
+// Listen returns a control-plane listener on pconn that authenticates as local
+// and pins peerPub.
+func Listen(pconn net.PacketConn, local *Identity, peerPub *ecdsa.PublicKey) (*Listener, error) {
+	tlsConf, err := ServerTLSConfig(local, peerPub)
+	if err != nil {
+		return nil, err
+	}
+	tr := &quic.Transport{Conn: pconn, VerifySourceAddress: func(net.Addr) bool { return true }}
+	ln, err := tr.Listen(tlsConf, defaultQUICConfig())
+	if err != nil {
+		_ = tr.Close()
+		return nil, fmt.Errorf("control: listen: %w", err)
+	}
+	return &Listener{ln: ln, tr: tr}, nil
+}
+
+// Accept blocks until a peer completes the mTLS handshake, then returns the
+// established session (responder role).
+func (l *Listener) Accept(ctx context.Context) (*Session, error) {
+	conn, err := l.ln.Accept(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return newSession(ctx, conn, Responder)
+}
+
+// Addr returns the local address the listener is bound to.
+func (l *Listener) Addr() net.Addr { return l.ln.Addr() }
+
+// Close tears down the listener and its transport.
+func (l *Listener) Close() error {
+	// Both halves are always closed; the listener's error wins when both fail.
+	lnErr := l.ln.Close()
+	trErr := l.tr.Close()
+	if lnErr != nil {
+		return lnErr
+	}
+	return trErr
+}
+
+// newSession blocks until the QUIC handshake finishes (or ctx ends), checks the
+// negotiated ALPN and handshake freshness, derives the master keys from the TLS
+// exporter, and returns the ready session. On any failure the connection is
+// closed before the error is returned.
+func newSession(ctx context.Context, conn *quic.Conn, role Role) (*Session, error) {
+	// abort closes conn with reason and hands cause back to the caller.
+	abort := func(reason string, cause error) (*Session, error) {
+		_ = conn.CloseWithError(appErrNormal, reason)
+		return nil, cause
+	}
+
+	select {
+	case <-conn.HandshakeComplete():
+	case <-ctx.Done():
+		return abort("handshake cancelled", ctx.Err())
+	}
+
+	connState := conn.ConnectionState()
+	tlsState := connState.TLS
+
+	// Assert the negotiated ALPN explicitly. TLS already fails the handshake when
+	// NextProtos don't overlap (both sides advertise only ALPN), but enforcing the
+	// invariant in code keeps it true if NextProtos is ever widened, and makes the
+	// guarantee auditable rather than implied.
+	if got := tlsState.NegotiatedProtocol; got != ALPN {
+		return abort("alpn mismatch", fmt.Errorf("control: unexpected ALPN %q, want %q", got, ALPN))
+	}
+
+	// Enforce a FRESH ECDHE handshake: refuse a resumed session or 0-RTT. The data
+	// plane's nonce-uniqueness guarantee rests on every session deriving fresh master
+	// keys (so a reset/regressed SPI is always paired with a fresh key — see
+	// handler.UpdateVirtualNetworkSAs). Resumption is already disabled in the TLS config
+	// (SessionTicketsDisabled), so this is a fail-closed backstop against a silent
+	// regression rather than an expected path.
+	if tlsState.DidResume {
+		return abort("session resumption forbidden",
+			errors.New("control: TLS session was resumed; a fresh ECDHE handshake is required for data-plane nonce safety"))
+	}
+	if connState.Used0RTT {
+		return abort("0-RTT forbidden",
+			errors.New("control: connection used 0-RTT; a fresh ECDHE handshake is required for data-plane nonce safety"))
+	}
+
+	rootSecret, err := ExportRootSecret(tlsState)
+	if err != nil {
+		return abort("exporter failure", err)
+	}
+	keys, err := DeriveMasterKeys(rootSecret)
+	if err != nil {
+		return abort("key derivation failure", err)
+	}
+
+	sess := &Session{
+		conn:       conn,
+		role:       role,
+		masterKeys: keys,
+		rxAlloc:    NewSPIAllocator(role),
+	}
+	return sess, nil
+}
+
+// Role reports whether this peer is the initiator or responder.
+func (s *Session) Role() Role { return s.role }
+
+// MasterKeys returns the PSP master keys derived from this session.
+func (s *Session) MasterKeys() *MasterKeys { return s.masterKeys }
+
+// TLSState returns the negotiated TLS connection state (version, cipher suite,
+// peer certificate). Useful for logging and for asserting the FIPS suite.
+func (s *Session) TLSState() tls.ConnectionState {
+	connState := s.conn.ConnectionState()
+	return connState.TLS
+}
+
+// Context returns a context that is cancelled when the underlying QUIC connection
+// closes (peer close, idle timeout, or transport error). RunTunnel selects on it to
+// detect session loss promptly rather than waiting for the next rekey tick.
+func (s *Session) Context() context.Context { return s.conn.Context() }
+
+// Close cleanly shuts the session down.
+func (s *Session) Close() error { return s.conn.CloseWithError(appErrNormal, "") }
+
+// DirectionalSAs is a peer's pair of simplex SAs for one session generation:
+// Tx is what we encrypt outbound with (the peer's RX SPI), Rx is what we
+// decrypt inbound with (our own RX SPI).
+type DirectionalSAs struct {
+	Tx *SA // outbound SA, derived from the SPI the peer announced
+	Rx *SA // inbound SA, derived from the SPI this peer allocated
+}
+
+// NegotiateSAs runs the SA-setup exchange over a fresh QUIC stream and returns
+// the tx/rx SAs for cipher suite v. Each peer allocates and announces its own RX
+// SPI; both then derive every key locally from the shared master keys. The
+// initiator writes first, the responder replies, so there is no deadlock.
+//
+// This round-trip is also the mutual key-confirmation: in TLS 1.3 mutual auth
+// the initiator's handshake completes before the responder verifies the
+// initiator's certificate, so a successful Dial does NOT prove the peer accepted
+// us. A peer that fails to pin us tears the connection down, which makes this
+// exchange fail. Callers MUST therefore treat a successful NegotiateSAs — not a
+// successful Dial/Accept — as the precondition for installing keys (fail-closed).
+//
+// Concurrency: NOT safe for unmatched concurrent calls on one Session. It pairs
+// one initiator OpenStreamSync with one responder AcceptStream, so call it
+// sequentially, or have both peers issue the same number of concurrent calls
+// (≤ MaxIncomingStreams); a surplus initiator call blocks until a matching
+// responder call or the ctx deadline.
+func (s *Session) NegotiateSAs(ctx context.Context, v ICXVersion) (sas *DirectionalSAs, err error) {
+	// saSetupAborted is the stream error code sent when the exchange fails.
+	const saSetupAborted quic.StreamErrorCode = 1
+
+	if !v.valid() {
+		return nil, fmt.Errorf("control: unsupported cipher suite %d", v)
+	}
+	myRxSPI, err := s.rxAlloc.Allocate(activeMasterKeyIndex)
+	if err != nil {
+		return nil, err
+	}
+	offer := saOffer{Version: v, RxSPI: myRxSPI}
+
+	var stream *quic.Stream
+	if s.role == Initiator {
+		stream, err = s.conn.OpenStreamSync(ctx)
+	} else {
+		stream, err = s.conn.AcceptStream(ctx)
+	}
+	if err != nil {
+		return nil, fmt.Errorf("control: open SA-setup stream: %w", err)
+	}
+	// Close only finishes the send direction. On failure, abort BOTH directions so
+	// a half-open stream does not linger and count against the small stream limit
+	// that later (re)negotiations on this session depend on.
+	// NOTE(review): on success the receive side is left to complete on the peer's
+	// FIN; confirm readSAOffer drains the stream, or cancel the read here as well.
+	defer func() {
+		if err == nil {
+			_ = stream.Close()
+			return
+		}
+		stream.CancelWrite(saSetupAborted)
+		stream.CancelRead(saSetupAborted)
+	}()
+	if dl, ok := ctx.Deadline(); ok {
+		_ = stream.SetDeadline(dl)
+	}
+
+	// The initiator speaks first and the responder answers, so the two sides
+	// never block on each other's read.
+	var peer saOffer
+	if s.role == Initiator {
+		if err := writeFrame(stream, offer.marshal()); err != nil {
+			return nil, fmt.Errorf("control: send SA offer: %w", err)
+		}
+		if peer, err = readSAOffer(stream); err != nil {
+			return nil, fmt.Errorf("control: read peer SA offer: %w", err)
+		}
+	} else {
+		if peer, err = readSAOffer(stream); err != nil {
+			return nil, fmt.Errorf("control: read peer SA offer: %w", err)
+		}
+		if err := writeFrame(stream, offer.marshal()); err != nil {
+			return nil, fmt.Errorf("control: send SA offer: %w", err)
+		}
+	}
+
+	return s.deriveDirectional(v, myRxSPI, peer)
+}
+
+// deriveDirectional derives the tx/rx SAs and enforces the txKey != rxKey
+// invariant (the role-partitioned SPI space guarantees distinct SPIs, but we
+// assert on the derived keys as a belt-and-suspenders check).
+func (s *Session) deriveDirectional(v ICXVersion, myRxSPI uint32, peer saOffer) (*DirectionalSAs, error) {
+	if v != peer.Version {
+		return nil, fmt.Errorf("control: cipher suite mismatch: local %d, peer %d", v, peer.Version)
+	}
+
+	// Inbound SA: keyed by the SPI this peer allocated and announced.
+	inbound, err := s.masterKeys.DeriveSA(myRxSPI, v)
+	if err != nil {
+		return nil, fmt.Errorf("control: derive rx SA: %w", err)
+	}
+	// Outbound SA: keyed by the SPI the remote peer announced as its RX SPI.
+	outbound, err := s.masterKeys.DeriveSA(peer.RxSPI, v)
+	if err != nil {
+		return nil, fmt.Errorf("control: derive tx SA: %w", err)
+	}
+
+	if bytes.Equal(outbound.Key, inbound.Key) {
+		return nil, errors.New("control: tx and rx SA keys collided")
+	}
+	return &DirectionalSAs{Tx: outbound, Rx: inbound}, nil
+}
diff --git a/control/transport_test.go b/control/transport_test.go
new file mode 100644
index 0000000..ef1adaa
--- /dev/null
+++ b/control/transport_test.go
@@ -0,0 +1,170 @@
+package control
+
+import (
+	"bytes"
+	"context"
+	"crypto/tls"
+	"net"
+	"testing"
+	"time"
+)
+
+// loopbackPeers wires an initiator and responder over two loopback UDP sockets
+// and returns their established sessions.
+func loopbackPeers(t *testing.T) (initiator, responder *Session, cleanup func()) {
+	t.Helper()
+	idA, err := GenerateIdentity() // initiator
+	if err != nil {
+		t.Fatal(err)
+	}
+	idB, err := GenerateIdentity() // responder
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Sockets and the listener are released through t.Cleanup so that they are
+	// closed on every path, including the t.Fatal exits below. Cleanups run in
+	// reverse order: listener first, then the sockets it was using.
+	srvConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = srvConn.Close() })
+	cliConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = cliConn.Close() })
+
+	ln, err := Listen(srvConn, idB, idA.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = ln.Close() })
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+
+	type res struct {
+		s   *Session
+		err error
+	}
+	respCh := make(chan res, 1)
+	go func() {
+		s, err := ln.Accept(ctx)
+		respCh <- res{s, err}
+	}()
+
+	initiator, err = Dial(ctx, cliConn, ln.ln.Addr(), idA, idB.PublicKey())
+	if err != nil {
+		cancel()
+		t.Fatalf("dial: %v", err)
+	}
+	r := <-respCh
+	if r.err != nil {
+		cancel()
+		_ = initiator.Close()
+		t.Fatalf("accept: %v", r.err)
+	}
+	responder = r.s
+
+	// cleanup ends both sessions; the listener and sockets follow via t.Cleanup.
+	cleanup = func() {
+		cancel()
+		_ = initiator.Close()
+		_ = responder.Close()
+	}
+	return initiator, responder, cleanup
+}
+
+func TestControlSessionHandshakeAndSANegotiation(t *testing.T) {
+	initiator, responder, cleanup := loopbackPeers(t)
+	defer cleanup()
+
+	// The handshake must be TLS 1.3 with an AES-GCM suite (FIPS-approved).
+	st := initiator.TLSState()
+	if st.Version != tls.VersionTLS13 {
+		t.Fatalf("TLS version %#x, want 1.3", st.Version)
+	}
+	switch st.CipherSuite {
+	case tls.TLS_AES_128_GCM_SHA256, tls.TLS_AES_256_GCM_SHA384:
+	default:
+		t.Fatalf("negotiated non-AES-GCM suite %#x", st.CipherSuite)
+	}
+
+	// Both peers must derive identical master keys from the shared exporter.
+	if initiator.MasterKeys().keys != responder.MasterKeys().keys {
+		t.Fatal("peers derived different master keys")
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	type res struct {
+		sas *DirectionalSAs
+		err error
+	}
+	rCh := make(chan res, 1)
+	go func() {
+		sas, err := responder.NegotiateSAs(ctx, AESGCM128)
+		rCh <- res{sas, err}
+	}()
+	iSAs, err := initiator.NegotiateSAs(ctx, AESGCM128)
+	if err != nil {
+		t.Fatalf("initiator NegotiateSAs: %v", err)
+	}
+	r := <-rCh
+	if r.err != nil {
+		t.Fatalf("responder NegotiateSAs: %v", r.err)
+	}
+	rSAs := r.sas
+
+	// Cross-match: what the initiator transmits with == what the responder
+	// receives with, and vice versa. This holds only if both derived the same
+	// master keys and agreed on SPIs.
+	if !bytes.Equal(iSAs.Tx.Key, rSAs.Rx.Key) {
+		t.Fatal("initiator TX key != responder RX key")
+	}
+	if !bytes.Equal(iSAs.Rx.Key, rSAs.Tx.Key) {
+		t.Fatal("initiator RX key != responder TX key")
+	}
+	// Within each peer, tx and rx must differ (no key/SPI collision).
+	if bytes.Equal(iSAs.Tx.Key, iSAs.Rx.Key) {
+		t.Fatal("initiator tx and rx keys collided")
+	}
+	if iSAs.Tx.SPI != rSAs.Rx.SPI || iSAs.Rx.SPI != rSAs.Tx.SPI {
+		t.Fatal("SPIs did not cross-match between peers")
+	}
+	if MasterKeyIndex(iSAs.Tx.SPI) != activeMasterKeyIndex {
+		t.Fatalf("tx SPI selects master key %d, want %d", MasterKeyIndex(iSAs.Tx.SPI), activeMasterKeyIndex)
+	}
+}
+
+func TestControlSessionRejectsUnpinnedPeer(t *testing.T) {
+	// Every fixture error is fatal: this test treats a failed Dial as an
+	// acceptable outcome, so a silently broken identity or socket would
+	// otherwise turn into a pass that exercised nothing.
+	idA, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+	idB, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+	imposter, err := GenerateIdentity()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	srvConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer srvConn.Close()
+	cliConn, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cliConn.Close()
+
+	// Responder pins `imposter`, but the initiator authenticates as idA.
+	ln, err := Listen(srvConn, idB, imposter.PublicKey())
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer ln.Close()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	go func() { _, _ = ln.Accept(ctx) }() // responder outcome is not asserted; only the initiator's view is
+
+	// In TLS 1.3 mutual auth the client's handshake completes before the server
+	// verifies the client certificate, so Dial may return a session even though
+	// the responder will reject us. The security property is that we can never
+	// NEGOTIATE with a peer that does not pin us: the SA-setup round-trip is the
+	// mutual key-confirmation, and it must fail closed.
+	sess, err := Dial(ctx, cliConn, ln.Addr(), idA, idB.PublicKey())
+	if err != nil {
+		return // rejected at dial — also acceptable
+	}
+	defer sess.Close()
+	if _, err := sess.NegotiateSAs(ctx, AESGCM128); err == nil {
+		t.Fatal("SA negotiation succeeded against a responder that pinned a different key")
+	}
+}
diff --git a/cp_wire_test.go b/cp_wire_test.go
new file mode 100644
index 0000000..7eaf94c
--- /dev/null
+++ b/cp_wire_test.go
@@ -0,0 +1,240 @@
+package icx_test
+
+import (
+	"context"
+	"net"
+	"net/netip"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"gvisor.dev/gvisor/pkg/tcpip"
+
+	"github.com/apoxy-dev/icx"
+	"github.com/apoxy-dev/icx/control"
+)
+
+// This file proves the control-plane → data-plane bridge end to end under per-direction
+// SPIs: each peer installs two simplex SAs (its own receive SPI, the peer's receive SPI),
+// and two independently-keyed handlers exchange Geneve traffic in both directions, each
+// decrypting under its own receive SPI. The handler is cross-platform, so this runs
+// without the AF_XDP forwarder.
+
+// negotiateLoopback brings up an initiator and a responder control session over
+// loopback UDP and returns each peer's negotiated directional SAs.
+func negotiateLoopback(t *testing.T) (iSAs, rSAs *control.DirectionalSAs) {
+	t.Helper()
+	idA, err := control.GenerateIdentity() // initiator identity
+	require.NoError(t, err)
+	idB, err := control.GenerateIdentity() // responder identity
+	require.NoError(t, err)
+
+	srv, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	require.NoError(t, err)
+	cli, err := net.ListenUDP("udp", &net.UDPAddr{IP: net.IPv6loopback})
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = srv.Close(); _ = cli.Close() })
+
+	ln, err := control.Listen(srv, idB, idA.PublicKey())
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = ln.Close() })
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	// Accept runs concurrently with Dial; its result comes back over accCh.
+	type sres struct {
+		s   *control.Session
+		err error
+	}
+	accCh := make(chan sres, 1)
+	go func() {
+		s, err := ln.Accept(ctx)
+		accCh <- sres{s, err}
+	}()
+	iSess, err := control.Dial(ctx, cli, ln.Addr(), idA, idB.PublicKey())
+	require.NoError(t, err)
+	acc := <-accCh
+	require.NoError(t, acc.err)
+	rSess := acc.s
+	t.Cleanup(func() { _ = iSess.Close(); _ = rSess.Close() })
+
+	// Both peers must be inside NegotiateSAs at the same time, so the responder
+	// side runs in its own goroutine while the initiator side runs here.
+	type nres struct {
+		sas *control.DirectionalSAs
+		err error
+	}
+	negCh := make(chan nres, 1)
+	go func() {
+		sas, err := rSess.NegotiateSAs(ctx, control.AESGCM128)
+		negCh <- nres{sas, err}
+	}()
+	iSAs, err = iSess.NegotiateSAs(ctx, control.AESGCM128)
+	require.NoError(t, err)
+	neg := <-negCh
+	require.NoError(t, neg.err)
+	return iSAs, neg.sas
+}
+
+// newPeerHandler builds a layer-3 handler bound to local on port 6081 and adds
+// the virtual network vni towards remote with a single 192.168.1.0/24 route.
+func newPeerHandler(t *testing.T, vni uint, local, remote tcpip.Address) *icx.Handler {
+	t.Helper()
+	h, err := icx.NewHandler(
+		icx.WithLocalAddr(&tcpip.FullAddress{Addr: local, Port: 6081}),
+		icx.WithLayer3VirtFrames(),
+	)
+	require.NoError(t, err)
+	prefix := netip.MustParsePrefix("192.168.1.0/24")
+	require.NoError(t, h.AddVirtualNetwork(vni, &tcpip.FullAddress{Addr: remote, Port: 6081},
+		[]icx.Route{{Src: prefix, Dst: prefix}}))
+	return h
+}
+
+// installDirectional installs a peer's negotiated directional SAs into its handler via
+// the real guarded per-direction seam the production installer calls: rxSPI is our own
+// receive SPI, txSPI is the peer's receive SPI (what we transmit to).
+func installDirectional(t *testing.T, h *icx.Handler, vni uint, sas *control.DirectionalSAs) {
+	t.Helper()
+	// The handler takes fixed 16-byte keys, so the negotiated keys must be exactly that long.
+	require.Len(t, sas.Rx.Key, 16)
+	require.Len(t, sas.Tx.Key, 16)
+	var rx, tx [16]byte
+	copy(rx[:], sas.Rx.Key)
+	copy(tx[:], sas.Tx.Key)
+	require.NoError(t, h.UpdateVirtualNetworkSAs(vni, sas.Rx.SPI, sas.Tx.SPI, rx, tx, time.Now().Add(time.Hour)))
+}
+
+func TestControlPlanePerDirectionGeneveRoundTrip(t *testing.T) {
+	iSAs, rSAs := negotiateLoopback(t)
+
+	// Per-direction SPIs: each peer's transmit SPI is the other's receive SPI, and the
+	// two directions are distinct (role-partitioned), so each direction owns its own
+	// nonce space — there is no shared epoch.
+	require.NotEqual(t, iSAs.Rx.SPI, iSAs.Tx.SPI, "the two directions must use distinct SPIs")
+	require.Equal(t, iSAs.Tx.SPI, rSAs.Rx.SPI, "initiator tx SPI must equal responder rx SPI")
+	require.Equal(t, iSAs.Rx.SPI, rSAs.Tx.SPI, "initiator rx SPI must equal responder tx SPI")
+
+	const vni = 0x424344
+	addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4())
+	addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4())
+	hI := newPeerHandler(t, vni, addrA, addrB)
+	hR := newPeerHandler(t, vni, addrB, addrA)
+	installDirectional(t, hI, vni, iSAs)
+	installDirectional(t, hR, vni, rSAs)
+
+	ip := makeIPv4UDPPacket()
+	phy := make([]byte, 1500)
+	out := make([]byte, 1500)
+
+	// initiator -> responder: hR decrypts under its own receive SPI (== hI's tx SPI).
+	n, loop := hI.VirtToPhy(ip, phy)
+	require.NotZero(t, n)
+	require.False(t, loop)
+	m := hR.PhyToVirt(phy[:n], out)
+	require.NotZero(t, m, "responder must decrypt initiator traffic")
+	require.Equal(t, ip, out[:m])
+
+	// responder -> initiator: hI decrypts under its own receive SPI (== hR's tx SPI).
+	n, loop = hR.VirtToPhy(ip, phy)
+	require.NotZero(t, n)
+	require.False(t, loop)
+	m = hI.PhyToVirt(phy[:n], out)
+	require.NotZero(t, m, "initiator must decrypt responder traffic")
+	require.Equal(t, ip, out[:m])
+
+	// No key-miss or SPI-mismatch drops on either side: each direction's frame is bound
+	// to its own receive SPI and decrypts cleanly.
+	vnR, ok := hR.GetVirtualNetwork(vni)
+	require.True(t, ok)
+	require.Zero(t, vnR.Stats.RXDropsNoKey.Load())
+	require.Zero(t, vnR.Stats.RXDropsSPIMismatch.Load())
+	require.Equal(t, uint64(1), vnR.Stats.RXPackets.Load()) // exactly the one frame hI sent
+	vnI, ok := hI.GetVirtualNetwork(vni)
+	require.True(t, ok)
+	require.Zero(t, vnI.Stats.RXDropsNoKey.Load())
+	require.Zero(t, vnI.Stats.RXDropsSPIMismatch.Load())
+	require.Equal(t, uint64(1), vnI.Stats.RXPackets.Load()) // exactly the one frame hR sent
+}
+
+// TestInstallResetsTxCounterPerEpoch pins the nonce-uniqueness invariant the control
+// plane relies on: each new epoch install starts a FRESH transmit counter. Because every
+// session derives a fresh master key (fresh ECDHE per reconnect), pairing a from-zero
+// counter with each generation's key keeps the AES-GCM nonce (epoch‖counter) unique even
+// when an SPI is reused or regresses after a restart. A refactor that carried the counter
+// across installs would reuse a (key, nonce) pair and trip this test.
+func TestInstallResetsTxCounterPerEpoch(t *testing.T) {
+	const vni = 0x334455
+	addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4())
+	addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4())
+	h := newPeerHandler(t, vni, addrA, addrB)
+
+	ip := makeIPv4UDPPacket()
+	phy := make([]byte, 1500)
+
+	// txCounter reads the active SA's TX counter, ignoring the presence flag.
+	txCounter := func() uint64 {
+		current, _ := h.TxCounterForTest(vni)
+		return current
+	}
+	// sendOne encapsulates a single frame and checks it was actually emitted.
+	sendOne := func() {
+		written, looped := h.VirtToPhy(ip, phy)
+		require.NotZero(t, written)
+		require.False(t, looped)
+	}
+
+	// First generation: epoch 100.
+	var rx, tx [16]byte
+	for i := 0; i < len(rx); i++ {
+		rx[i] = byte(i)
+		tx[i] = byte(255 - i)
+	}
+	require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 100, rx, tx, time.Now().Add(time.Hour)))
+	initial, installed := h.TxCounterForTest(vni)
+	require.True(t, installed)
+	require.Zero(t, initial, "fresh install starts at counter 0")
+
+	sendOne()
+	require.Equal(t, uint64(1), txCounter(), "first frame uses counter 1")
+
+	// Second generation: a NEW, higher epoch (as a rekey or a seeded post-restart
+	// generation would install). The counter MUST reset to zero — no carryover.
+	var rx2, tx2 [16]byte
+	for i := 0; i < len(rx2); i++ {
+		rx2[i] = byte(i + 1)
+		tx2[i] = byte(254 - i)
+	}
+	require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 200, rx2, tx2, time.Now().Add(time.Hour)))
+	require.Zero(t, txCounter(), "a new epoch must start a fresh zero counter (no carryover → no nonce reuse)")
+
+	sendOne()
+	require.Equal(t, uint64(1), txCounter(), "first frame under the new epoch counts from 1 again")
+}
+
+// TestSharedEpochCollapseMismatchDropsTraffic shows WHY per-direction SPIs are needed:
+// if the two directions are collapsed onto a single epoch but each peer picks its OWN
+// receive SPI for it (a naive shared-epoch bridge), the epochs disagree — they are
+// role-partitioned — so the sender transmits under an SPI the receiver never installed
+// and every frame misses the receiver's rxCiphers. The production path avoids this by
+// installing the genuine per-direction SPIs (TestControlPlanePerDirectionGeneveRoundTrip).
+func TestSharedEpochCollapseMismatchDropsTraffic(t *testing.T) {
+	iSAs, rSAs := negotiateLoopback(t)
+	require.NotEqual(t, iSAs.Rx.SPI, rSAs.Rx.SPI, "the two receive SPIs are role-partitioned and distinct")
+
+	const vni = 0x515253
+	addrA := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4())
+	addrB := tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4())
+	hI := newPeerHandler(t, vni, addrA, addrB)
+	hR := newPeerHandler(t, vni, addrB, addrA)
+
+	// Collapse both directions onto each peer's OWN receive SPI via the single-epoch
+	// UpdateVirtualNetworkKeys seam. hI then transmits under iSAs.Rx.SPI, which hR
+	// (installed under rSAs.Rx.SPI) does not have.
+	var iRx, iTx, rRx, rTx [16]byte
+	copy(iRx[:], iSAs.Rx.Key)
+	copy(iTx[:], iSAs.Tx.Key)
+	copy(rRx[:], rSAs.Rx.Key)
+	copy(rTx[:], rSAs.Tx.Key)
+	require.NoError(t, hI.UpdateVirtualNetworkKeys(vni, iSAs.Rx.SPI, iRx, iTx, time.Now().Add(time.Hour)))
+	require.NoError(t, hR.UpdateVirtualNetworkKeys(vni, rSAs.Rx.SPI, rRx, rTx, time.Now().Add(time.Hour)))
+
+	ip := makeIPv4UDPPacket()
+	phy := make([]byte, 1500)
+	out := make([]byte, 1500)
+	// Encapsulation on hI still succeeds; the frame is lost on the receiving side.
+	n, loop := hI.VirtToPhy(ip, phy)
+	require.NotZero(t, n)
+	require.False(t, loop)
+	m := hR.PhyToVirt(phy[:n], out)
+	require.Zero(t, m, "a shared-epoch collapse onto disagreeing epochs misses the receiver's rxCiphers and drops")
+
+	// The drop must be accounted as a missing key on the receiver.
+	vnR, ok := hR.GetVirtualNetwork(vni)
+	require.True(t, ok)
+	require.Equal(t, uint64(1), vnR.Stats.RXDropsNoKey.Load())
+}
diff --git a/export_test.go b/export_test.go
new file mode 100644
index 0000000..8f828b3
--- /dev/null
+++ b/export_test.go
@@ -0,0 +1,41 @@
+package icx
+
+import (
+	"fmt"
+	"time"
+)
+
+// InstallKeysForTest installs RX/TX ciphers under a single shared epoch (rxSPI ==
+// txSPI == epoch) without the production monotonicity and distinct-key guards enforced
+// by UpdateVirtualNetworkSAs.
+//
+// It exists only for in-process loopback tests that encrypt and decrypt on a
+// single handler with one shared key (the byte-equivalence, round-trip, fuzz and
+// benchmark harnesses). Real peers always derive distinct per-direction keys and
+// strictly increasing per-direction SPIs, so the guarded UpdateVirtualNetworkSAs
+// deliberately rejects that shape — hence this unguarded test seam. The file name ends
+// in _test.go, so it is compiled only under `go test` and never ships in the
+// production binary or public API.
+func (h *Handler) InstallKeysForTest(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error {
+	value, ok := h.networkByID.Load(vni)
+	if !ok {
+		return fmt.Errorf("VNI %d not found", vni)
+	}
+	// epoch is passed as both SPIs: this is the shared-epoch shape described above.
+	return h.installKeys(value.(*VirtualNetwork), epoch, epoch, rxKey, txKey, expiresAt)
+}
+
+// TxCounterForTest returns the active SA's current TX nonce counter for the VNI (and
+// whether one is installed). It lets a test assert the per-epoch fresh-counter
+// invariant — each new epoch install resets the counter to zero — which is what keeps
+// the AES-GCM nonce (epoch‖counter) unique as epochs climb across rekeys/restarts.
+func (h *Handler) TxCounterForTest(vni uint) (uint64, bool) {
+	value, ok := h.networkByID.Load(vni)
+	if !ok {
+		return 0, false
+	}
+	// A nil txCipher means no TX key has been installed for this VNI yet.
+	tc := value.(*VirtualNetwork).txCipher.Load()
+	if tc == nil {
+		return 0, false
+	}
+	return tc.counter.Load(), true
+}
diff --git a/forwarder/forwarder.go b/forwarder/forwarder.go
index 694a9dd..da39ffc 100644
--- a/forwarder/forwarder.go
+++ b/forwarder/forwarder.go
@@ -10,6 +10,7 @@ import (
 	"net"
 	"os"
 	"runtime"
+	"runtime/debug"
 	"sync"
 	"time"
 
@@ -495,6 +496,32 @@ func (f *Forwarder) processFrames(ctx context.Context, queueID int) error {
 	}
 }
 
+// datapathPanicOnce bounds the recovered-panic log to a single emission so a
+// crafted frame that trips an unguarded path cannot also flood the logs. Each
+// such frame is still dropped and the queue keeps running.
+var datapathPanicOnce sync.Once + +// safeTransform runs an in-place transform but converts a panic into a frame +// drop. The transforms are written to drop malformed frames rather than panic +// (see the length/IsValid guards in handler.go), so this is a last-resort +// backstop: the processFrames loop holds runtime.LockOSThread and has no other +// recovery, so a single panicking packet would otherwise tear down the whole +// queue goroutine (and, via errgroup, the forwarder). It also contains the GCM +// inexact-overlap panic class that the in-place aliasing contract relies on. +func safeTransform(fn inPlaceFn, buf []byte, off, length int) (outOff, outLen int, handled bool) { + defer func() { + if r := recover(); r != nil { + datapathPanicOnce.Do(func() { + slog.Error("recovered panic in datapath transform; dropping frame and continuing", + slog.Any("panic", r), + slog.String("stack", string(debug.Stack()))) + }) + outOff, outLen, handled = 0, 0, false + } + }() + return fn(buf, off, length) +} + // inPlaceFn transforms the packet at buf[off:off+length] in place and returns // the (offset, length) window of the output within buf, plus handled: true when // the handler produced an immediate local reply that must be transmitted back on @@ -561,7 +588,7 @@ func (f *Forwarder) forwardInPlace( continue } - outOff, outLen, handled := fn(buf, off, int(d.Len)) + outOff, outLen, handled := safeTransform(fn, buf, off, int(d.Len)) if outLen <= 0 { free = append(free, d) continue diff --git a/forwarder/forwarder_crypto_test.go b/forwarder/forwarder_crypto_test.go index fe60924..9f96b27 100644 --- a/forwarder/forwarder_crypto_test.go +++ b/forwarder/forwarder_crypto_test.go @@ -30,16 +30,17 @@ import ( // forwarder decapsulates it in place and emits the recovered inner packet on the // virt interface, byte-for-byte. 
// -// Construction (single self-keyed handler, option (b1) from the test plan): -// - One *icx.Handler with a single shared key and one route whose Src and Dst -// both cover the inner addresses, so a frame its two-buffer VirtToPhy encaps -// can be decapped by its in-place PhyToVirtInPlace (decap looks the vnet up -// by VNI and validates the inner source against route.Dst; it does NOT check -// the outer underlay source, so a self-encap/decap loop is valid). -// - The encap is done OFFLINE with VirtToPhy to mint a real encrypted frame; -// the frame is injected via a raw AF_PACKET socket on the phy peer (the same -// XDP-redirect primitive TestForwarderRXHeadroom uses), and the decapped -// inner frame is read with a second raw socket on the virt peer. +// Construction (two peer handlers — the production model, since +// UpdateVirtualNetworkKeys rejects equal rx/tx keys): +// - encapH mints genuinely-encrypted frames OFFLINE with VirtToPhy using +// txKey == abKey; the forwarder's handler h decapsulates them in place with +// rxKey == abKey. Both share one VNI and a route whose Src and Dst cover the +// inner addresses, so encap routing and decap source validation both pass +// (decap looks the vnet up by VNI and validates the inner source against +// route.Dst; it does NOT check the outer underlay source). +// - The minted frame is injected via a raw AF_PACKET socket on the phy peer +// (the same XDP-redirect primitive TestForwarderRXHeadroom uses), and the +// decapped inner frame is read with a second raw socket on the virt peer. func TestForwarderCryptoRoundTrip(t *testing.T) { netAdmin, _ := permissions.IsNetAdmin() if !netAdmin { @@ -78,20 +79,38 @@ func TestForwarderCryptoRoundTrip(t *testing.T) { LinkAddr: tcpip.LinkAddress("\x02\x00\x00\x00\x0a\x02"), } + const vni uint = 0x1234 + prefix := netip.MustParsePrefix("10.99.0.0/24") + routes := []icx.Route{{Src: prefix, Dst: prefix}} + + // Two handlers model the two real peers. 
The A->B direction key (abKey) is + // shared, but each peer's own rx/tx keys differ — UpdateVirtualNetworkKeys + // rejects equal rx/tx keys, since in the shared-epoch nonce layout the key is + // the only thing separating the two directions' nonce spaces. + var abKey, encapRx, hTx [16]byte + copy(abKey[:], []byte("icx-roundtrip-k!")) + copy(encapRx[:], []byte("icx-encap-rxkey!")) + copy(hTx[:], []byte("icx-decap-txkey!")) + expires := time.Now().Add(time.Hour) + + // h: the forwarder's handler. It decapsulates inbound frames with rxKey=abKey. h, err := icx.NewHandler( icx.WithLocalAddr(localUnderlay), icx.WithVirtMAC(virtMAC), ) require.NoError(t, err) + require.NoError(t, h.AddVirtualNetwork(vni, remoteUnderlay, routes)) + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, abKey, hTx, expires)) - const vni uint = 0x1234 - prefix := netip.MustParsePrefix("10.99.0.0/24") - require.NoError(t, h.AddVirtualNetwork(vni, remoteUnderlay, - []icx.Route{{Src: prefix, Dst: prefix}})) - - var key [16]byte - copy(key[:], []byte("icx-roundtrip-k!")) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + // encapH: an offline peer used only to mint genuinely-encrypted frames with + // txKey=abKey (so h can decrypt them); it is never wired to the forwarder. + encapH, err := icx.NewHandler( + icx.WithLocalAddr(localUnderlay), + icx.WithVirtMAC(virtMAC), + ) + require.NoError(t, err) + require.NoError(t, encapH.AddVirtualNetwork(vni, remoteUnderlay, routes)) + require.NoError(t, encapH.UpdateVirtualNetworkKeys(vni, 1, encapRx, abKey, expires)) // Build the forwarder with the REAL handler (not the identity pipe). fwd, err := forwarder.NewForwarder(h, @@ -162,7 +181,7 @@ func TestForwarderCryptoRoundTrip(t *testing.T) { // counter, so every injected frame carries a unique nonce and clears the // replay filter. 
phyBuf := make([]byte, 2048) - n, handled := h.VirtToPhy(virtFrame, phyBuf) + n, handled := encapH.VirtToPhy(virtFrame, phyBuf) require.Greater(t, n, 0, "offline encap produced no frame") require.False(t, handled) enc := phyBuf[:n] diff --git a/go.mod b/go.mod index f890d00..98e708e 100644 --- a/go.mod +++ b/go.mod @@ -7,11 +7,12 @@ require ( github.com/cilium/ebpf v0.18.0 github.com/google/gopacket v1.1.19 github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 + github.com/quic-go/quic-go v0.59.1 github.com/safchain/ethtool v0.6.1 - github.com/stretchr/testify v1.10.0 + github.com/stretchr/testify v1.11.1 github.com/vishvananda/netlink v1.3.1 - golang.org/x/sync v0.15.0 - golang.org/x/sys v0.33.0 + golang.org/x/sync v0.16.0 + golang.org/x/sys v0.35.0 gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 ) @@ -21,7 +22,8 @@ require ( github.com/google/go-cmp v0.7.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect - golang.org/x/net v0.39.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/time v0.7.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 97b31f3..bab901a 100644 --- a/go.sum +++ b/go.sum @@ -20,40 +20,47 @@ github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9 h1:C8IqpV7kfAyZD github.com/phemmer/go-iptrie v0.0.0-20240326174613-ba542f5282c9/go.mod h1:dDLiSjNqdp8VjphLdGTx19OeAUsHOzhtc1FFJqpzWMU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/quic-go/quic-go v0.59.1 h1:0Gmua0HW1Tv7ANR7hUYwRyD0MG5OJfgvYSZasGZzBic= +github.com/quic-go/quic-go v0.59.1/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod 
h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/safchain/ethtool v0.6.1 h1:mhRnXE1H8fV8TTXh/HdqE4tXtb57r//BQh5pPYMuM5k= github.com/safchain/ethtool v0.6.1/go.mod h1:JzoNbG8xeg/BeVeVoMCtCb3UPWoppZZbFpA+1WFh+M0= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= +go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko= +go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net 
v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gvisor.dev/gvisor v0.0.0-20250606001031-fa4c4dd86b43 h1:BEymU11L8DZSC4GNK48JYIR8EcHs+gFxtg9YfYlp68c= diff --git a/handler.go b/handler.go index e3f84cc..de2b75c 100644 --- a/handler.go +++ b/handler.go @@ -4,6 +4,7 @@ import ( "crypto/aes" "crypto/cipher" "encoding/binary" + "errors" "fmt" "log/slog" "net" @@ -45,6 +46,10 @@ type Statistics struct { RXReplayDrops atomic.Uint64 // RXDecryptErrors is the number of received packets that failed decryption. RXDecryptErrors atomic.Uint64 + // RXDropsSPIMismatch is the number of received packets dropped because the + // SPI bound into the AEAD nonce (nonce[:4]) did not match the key epoch the + // frame selected — a malformed or tampered frame (APO-644). + RXDropsSPIMismatch atomic.Uint64 // RXInvalidSrc is the number of received packets with an invalid source address. RXInvalidSrc atomic.Uint64 // TXPackets is the number of transmitted packets. @@ -79,7 +84,11 @@ type receiveCipher struct { // Transmit cipher state. type transmitCipher struct { cipher.AEAD - epoch uint32 + epoch uint32 + // key is the transmit key, retained so the TX anti-reset guard can distinguish a + // genuine double-install of the live SA (same SPI AND same key) from a fresh-session + // install that merely reused the SPI value under a new key (see UpdateVirtualNetworkSAs). + key [16]byte counter atomic.Uint64 } @@ -96,6 +105,14 @@ type VirtualNetwork struct { // Internal state (not exposed) rxCiphers sync.Map txCipher atomic.Pointer[transmitCipher] + // rxEpoch is the currently-installed receive SPI (0 = none). 
Under per-direction + // SPIs the receive and transmit SPIs differ, so the previous RX cipher can no longer + // be found via txCipher.epoch; rxEpoch anchors the prior receive SA so installKeys + // can grace-clamp it. It is NOT a monotonicity guard and need NOT be monotone — it + // simply tracks the most recently installed receive SPI and may regress when a fresh + // session resets the allocator (the receive side emits no nonce, so a reused receive + // SPI is harmless: its replay filter is rebuilt with the fresh key). + rxEpoch atomic.Uint32 } // Clock provides time to the handler. Tests can inject a fake clock. @@ -347,9 +364,104 @@ func (h *Handler) UpdateVirtualNetworkRoutes(vni uint, allowedRoutes []Route) er return nil } -// UpdateVirtualNetworkKeys sets/rotates the encryption keys for a virtual network. -// This must be called atleast once every 24 hours or after `replay.RekeyAfterMessages` -// messages. The epoch must be a monotonically increasing value. +// UpdateVirtualNetworkSAs installs/rotates a virtual network's pair of simplex +// security associations (PSP model). It must be called at least once every 24 +// hours or after replay.RekeyAfterMessages messages. +// +// rxSPI and txSPI are the per-direction 32-bit SPIs that select the receive and +// transmit SAs. Each is carried in the Geneve key-epoch option and bound into the +// high 4 bytes of its direction's AES-GCM nonce (nonce = SPI‖counter): +// - rxSPI is OUR receive SPI — the one we allocated and the peer encrypts to. We +// store the RX cipher under it and look inbound frames up by it. Inbound frames +// carry rxSPI in their key-epoch option (== the sender's txSPI). +// - txSPI is the PEER's receive SPI — the one we encrypt to. We stamp it into the +// key-epoch option and nonce[:4] of every outbound frame. +// +// The two SPIs are distinct (the control plane partitions the SPI space by role, +// see control/sa.go), so each direction has its own nonce space. 
+// +// This entry point is for the CONTROL PLANE, where every SA generation carries a +// FRESH per-session key (each QUIC reconnect is a fresh ECDHE handshake — no 0-RTT, +// no session resumption, enforced in control/transport.go). That freshness is what +// guarantees the nonce-uniqueness invariant — no (key, nonce=SPI‖counter) pair ever +// repeats — across rekeys, reconnects and restarts: +// - within a session the receive-SPI allocator is monotonic, so a given SPI value +// is handed out once and its reset-to-zero counter is always a fresh nonce space; +// - across sessions the master keys are fresh, so even a reused SPI value derives a +// different key. SPIs may therefore reset to 1 on a reconnect and be re-accepted +// here at a LOWER value than before — which is exactly what makes a one-sided +// restart recover seamlessly with no persisted state. +// +// Three fail-closed guards apply: non-zero SPIs, distinct rx/tx keys, and a TX +// anti-reset check that rejects re-installing the CURRENTLY-live transmit SA — same SPI +// AND same key (the only in-process action that would reset a live counter under an +// unchanged key — a defensive backstop against a double-install/retry). A txSPI that +// merely reuses the live SPI value under a FRESH key (the transient-reconnect case) is +// accepted, as is any lower-or-higher txSPI; safety rests on the fresh-key guarantee +// above, not on monotonicity. +// Callers must serialize installs per VNI; the guard→install sequence is not +// internally locked (the control plane is single-threaded per Tunnel). Manually-keyed +// SAs that lack per-session key freshness should use the strictly-guarded single-epoch +// UpdateVirtualNetworkKeys seam instead. 
+func (h *Handler) UpdateVirtualNetworkSAs(vni uint, rxSPI, txSPI uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { + value, ok := h.networkByID.Load(vni) + if !ok { + return fmt.Errorf("VNI %d not found", vni) + } + vnet := value.(*VirtualNetwork) + + // Reserved-SPI guard: SPI 0 is reserved. Rejecting it keeps the data plane's + // accepted SPI space aligned with the control plane, which never emits an SPI + // whose low 31 bits are zero (control/sa.go), and refuses to write the all-zero + // nonce prefix that predated the SPI binding. + if rxSPI == 0 || txSPI == 0 { + return errors.New("rx and tx SPIs must be non-zero") + } + + // TX anti-reset guard: reject re-installing the SA that is currently live for + // transmit — same SPI AND same key. That pair is the only in-process action that would + // reset the TX counter to zero under a key already used at that SPI (a GCM nonce-reuse + // hazard): a defensive backstop against an accidental double-install/retry of the + // identical generation. The key comparison is load-bearing, not cosmetic: on a transient + // reconnect the receive-SPI allocator resets to a low value, so the new transmit SPI can + // COLLIDE with the still-live one — but it arrives under a FRESH master key (every session + // is a fresh ECDHE handshake; resumption and 0-RTT are disabled and asserted in + // control/transport.go), so its from-zero counter is a fresh nonce space and the install + // is safe. Comparing the SPI alone would spuriously reject that legitimate recovery. A + // different SPI is likewise always accepted. There is deliberately no RX monotonicity + // guard — the receive side never emits a nonce, so a reused receive SPI is harmless (its + // per-SA replay filter is rebuilt with the fresh key); rxEpoch is tracked only to + // grace-clamp the previous receive cipher (see installKeys). 
+ if cur := vnet.txCipher.Load(); cur != nil && txSPI == cur.epoch && txKey == cur.key { + return fmt.Errorf("tx SA (SPI %d) is already live; refusing to reset its counter", txSPI) + } + + // Distinct-key guard. Under per-direction SPIs the role bit already separates the + // two directions' nonce spaces, so this is belt-and-suspenders. Real peers always + // derive distinct per-direction keys (control.DeriveSA over role-partitioned SPIs), + // so this never rejects a legitimate install. + if rxKey == txKey { + return errors.New("rx and tx keys must differ: each direction requires its own key") + } + + return h.installKeys(vnet, rxSPI, txSPI, rxKey, txKey, expiresAt) +} + +// UpdateVirtualNetworkKeys installs a single epoch (SPI) for BOTH simplex directions, +// separated only by the distinct rx/tx keys. It is the simple manual-keying seam — used +// by tests and by embedders that drive their own keying rather than the QUIC control +// plane (which installs genuine per-direction SAs via UpdateVirtualNetworkSAs). +// +// It enforces a STRICT monotonicity guard: the epoch must strictly increase within the +// process. That stops a caller from reinstalling an older-or-equal epoch with a reused +// key, which would reset the GCM counter under an already-used (epoch, key) and repeat a +// nonce. The guard cannot see across process restarts, so a caller that supplies a key +// which SURVIVES restarts (e.g. one read from disk) MUST advance the epoch past the last +// value used in any prior run — otherwise the from-zero counter reuses nonces under the +// persisted key. The control plane sidesteps this entirely by deriving a fresh key per +// session; manual callers own the invariant. +// +// Callers must serialize installs per VNI. 
func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { value, ok := h.networkByID.Load(vni) if !ok { @@ -357,23 +469,49 @@ func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey } vnet := value.(*VirtualNetwork) - // Set grace period (30s) on the previous RX key, if it exists - if txCipher := vnet.txCipher.Load(); txCipher != nil { - prevEpoch := txCipher.epoch - if prevEpoch != epoch { - if prevCipherAny, ok := vnet.rxCiphers.Load(prevEpoch); ok { - if prevCipher, ok := prevCipherAny.(*receiveCipher); ok { - graceExpiry := h.clock.Now().Add(keyGracePeriod) - // Clamp the expiry to now+gracePeriod now that we have rotated. - if prevCipher.expiresAt.After(graceExpiry) { - prevCipher.expiresAt = graceExpiry - } + if epoch == 0 { + return errors.New("epoch (SPI) must be non-zero") + } + // Strict monotonicity: a manual-keyed caller has no per-session key freshness to fall + // back on, so the epoch must strictly increase or a reset counter could reuse a nonce. + if cur := vnet.txCipher.Load(); cur != nil && epoch <= cur.epoch { + return fmt.Errorf("epoch must be monotonically increasing: new %d <= current %d", epoch, cur.epoch) + } + if rxKey == txKey { + return errors.New("rx and tx keys must differ: each direction requires its own key") + } + + return h.installKeys(vnet, epoch, epoch, rxKey, txKey, expiresAt) +} + +// installKeys builds and installs the RX/TX ciphers for a generation, applies the 30s +// grace period to the previous RX key, and sweeps expired RX keys. It is the unguarded +// mechanism behind UpdateVirtualNetworkSAs and UpdateVirtualNetworkKeys; the SPI/key guards live in those callers. +func (h *Handler) installKeys(vnet *VirtualNetwork, rxSPI, txSPI uint32, rxKey, txKey [16]byte, expiresAt time.Time) error { + // Clamp the previous RX key to a 30s grace window.
The previous receive SA is + // keyed by the previous receive SPI (vnet.rxEpoch) — NOT by txCipher.epoch, which + // under per-direction SPIs is the previous TRANSMIT SPI (the peer's receive SPI) + // and would point at the wrong slot. The grace lets the survivor keep decrypting + // in-flight frames under the old key across a make-before-break rotation. Across a + // reconnect the previous and new receive SPIs differ (the new session's allocator + // reset to a low value while the old SPI was higher), so they occupy distinct + // rxCiphers slots and both stay live through the grace window. The rare exception — + // a fresh allocator climbing back to a still-graced old SPI within 30s — simply + // overwrites that slot with the fresh key; only late frames under the old key at + // that exact SPI are lost, which is acceptable post-reconnect. + if prevRxSPI := vnet.rxEpoch.Load(); prevRxSPI != 0 && prevRxSPI != rxSPI { + if prevCipherAny, ok := vnet.rxCiphers.Load(prevRxSPI); ok { + if prevCipher, ok := prevCipherAny.(*receiveCipher); ok { + graceExpiry := h.clock.Now().Add(keyGracePeriod) + if prevCipher.expiresAt.After(graceExpiry) { + prevCipher.expiresAt = graceExpiry } } } } - // Delete expired keys (to free key material from memory) + // Delete expired keys (to free key material from memory). This sweeps rxCiphers + // only; it does not touch vnet.rxEpoch (which is overwritten on the next install). now := h.clock.Now() vnet.rxCiphers.Range(func(key, value any) bool { cipher := value.(*receiveCipher) @@ -401,14 +539,27 @@ func (h *Handler) UpdateVirtualNetworkKeys(vni uint, epoch uint32, rxKey, txKey return fmt.Errorf("failed to create TX GCM: %w", err) } - vnet.rxCiphers.Store(epoch, &receiveCipher{ + // Install RX before TX (make-before-break): store the receive cipher and record the + // currently-installed receive SPI (rxEpoch) first, so we can decrypt the peer's + // new-generation frames before we start emitting our own under the new transmit SPI. 
+ vnet.rxCiphers.Store(rxSPI, &receiveCipher{ AEAD: rxCipher, expiresAt: expiresAt, }) - + vnet.rxEpoch.Store(rxSPI) + + // A fresh transmitCipher resets the TX counter to zero for the new transmit SPI. + // This is load-bearing for nonce uniqueness: the AES-GCM nonce is txSPI‖counter, so + // each transmit SPI MUST begin its own counter at zero. Safety across rekeys, reconnects + // and restarts rests on each generation pairing that from-zero counter with a FRESH + // per-session key (fresh ECDHE; no resumption/0-RTT), so even a reused or regressed SPI + // value derives a different key and the (key, nonce) pair never repeats. A refactor that + // carried the counter across installs would reintroduce reuse. The key is retained so the + // TX anti-reset guard can reject a literal double-install of this same live SA. vnet.txCipher.Store(&transmitCipher{ AEAD: txCipher, - epoch: epoch, + epoch: txSPI, + key: txKey, }) return nil @@ -470,7 +621,14 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { if opt.Class == geneve.ClassExperimental { switch opt.Type { case geneve.OptionTypeTxCounter: - nonce = opt.Value[:12] + // Require the declared 12-byte (Length=3) value so nonce[:4] (the + // SPI) and the counter are provably sender-written, not stale pooled + // bytes from a short/malformed option — keeps the SPI-mismatch drop + // attribution honest. A wrong length leaves nonce nil → the + // "Expected TX counter" drop below. + if opt.Length == 3 { + nonce = opt.Value[:12] + } case geneve.OptionTypeKeyEpoch: epoch = binary.BigEndian.Uint32(opt.Value[:4]) } @@ -498,15 +656,22 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } - txCounter := binary.BigEndian.Uint64(nonce[4:]) - - if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { - // Delayed packets can cause some uneccesary noise here. 
- slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) - vnet.Stats.RXReplayDrops.Add(1) + // Verify the SPI bound into the nonce matches the receive SPI that selected this + // SA (nonce = SPI‖counter). Under per-direction SPIs the inbound key-epoch option + // carries the sender's transmit SPI, which is exactly our receive SPI; a conformant + // sender always sets nonce[:4] to that same value, so a mismatch is a malformed or + // tampered frame. GCM would also reject it at Open (the nonce and the header both + // feed the tag), but the explicit check makes the binding auditable and gives a + // precise drop reason. (APO-644) + if spi := binary.BigEndian.Uint32(nonce[:4]); spi != epoch { + slog.Debug("Dropping frame: nonce SPI does not match key epoch", + slog.Uint64("epoch", uint64(epoch)), slog.Uint64("nonceSPI", uint64(spi))) + vnet.Stats.RXDropsSPIMismatch.Add(1) return 0 } + txCounter := binary.BigEndian.Uint64(nonce[4:]) + var ipPacket []byte if h.opts.layer3 { ipPacket = virtFrame[:0] @@ -521,6 +686,18 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } + // Anti-replay AFTER authentication (APO-645/S2): ValidateCounter both checks + // and advances the sliding window, so it must run only on a packet whose tag + // has verified. Running it before Open let an attacker who can spoof the + // outer 4-tuple advance the window with a forged high counter and wedge the + // real peer (whose in-window counters are then rejected as "behind window"). + if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { + // Delayed packets can cause some unnecessary noise here. + slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) + vnet.Stats.RXReplayDrops.Add(1) + return 0 + } + // Is it an authenticated out-of-band message? 
if hdr.ProtocolType == 0 { slog.Debug("Dropping out-of-band message") @@ -531,6 +708,14 @@ func (h *Handler) PhyToVirt(phyFrame, virtFrame []byte) int { return 0 } + // A non-OOB frame whose authenticated payload is empty has no version nibble; + // ipPacket[0] would panic. An authenticated peer can craft one. (APO-647/S4) + if len(ipPacket) == 0 { + slog.Warn("Dropping empty decrypted payload") + vnet.Stats.RXInvalidSrc.Add(1) + return 0 + } + ipVersion := ipPacket[0] >> 4 // Get the source address of the decrypted frame. @@ -779,7 +964,14 @@ func (h *Handler) VirtToPhy(virtFrame, phyFrame []byte) (int, bool) { return 0, false } + // nonce = txSPI‖counter: bind this direction's transmit SPI (the peer's receive + // SPI) into the high 4 bytes. Under per-direction SPIs this prefix differs from the + // receive direction's, so the SPI itself separates the two directions' nonce spaces + // (on top of the distinct rx/tx keys); the receiver reconstructs the same SPI from + // its key-epoch option and rejects any frame whose nonce[:4] does not match. The low + // 8 bytes are the per-SA monotonic counter. Both halves must be written before Seal. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) switch ipVersion { @@ -898,7 +1090,14 @@ func (h *Handler) ToPhy(phyFrame []byte) int { // Fill options: epoch + nonce/counter binary.BigEndian.PutUint32(hdr.Options[0].Value[:4], txCipher.epoch) + // nonce = txSPI‖counter: bind this direction's transmit SPI (the peer's receive + // SPI) into the high 4 bytes. Under per-direction SPIs this prefix differs from the + // receive direction's, so the SPI itself separates the two directions' nonce spaces + // (on top of the distinct rx/tx keys); the receiver reconstructs the same SPI from + // its key-epoch option and rejects any frame whose nonce[:4] does not match. The low + // 8 bytes are the per-SA monotonic counter. 
Both halves must be written before Seal. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) // Place Geneve payload inside outer UDP frame. diff --git a/handler_test.go b/handler_test.go index bbf6b97..9e88f93 100644 --- a/handler_test.go +++ b/handler_test.go @@ -13,6 +13,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" "github.com/apoxy-dev/icx" + "github.com/apoxy-dev/icx/udp" ) func TestHandler(t *testing.T) { @@ -47,7 +48,7 @@ func TestHandler(t *testing.T) { err = h.AddVirtualNetwork(0x12345, peerAddr, []icx.Route{{Src: wildcardPrefix, Dst: wildcardPrefix}}) require.NoError(t, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(t, err) virtFrame := makeIPv4UDPEthernetFrame(virtMAC) @@ -110,7 +111,7 @@ func TestHandler_Layer3(t *testing.T) { err = h.AddVirtualNetwork(0x12345, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(t, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(t, err) ipPacket := makeIPv4UDPPacket() @@ -151,7 +152,7 @@ func TestHandler_Layer3_IPv6(t *testing.T) { // Prefix contains src 2001:db8::1 privatePrefix := netip.MustParsePrefix("2001:db8::/64") require.NoError(t, h.AddVirtualNetwork(0x45678, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x45678, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(0x45678, 1, key, key, time.Now().Add(time.Hour))) ip6 := makeIPv6UDPPacket() phy := make([]byte, 1500) @@ -189,7 +190,7 @@ func TestUpdateVirtualNetworkRoutes(t *testing.T) { privatePrefix := netip.MustParsePrefix("192.168.1.0/24") err = 
h.AddVirtualNetwork(0x23456, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(t, err) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x23456, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(0x23456, 1, key, key, time.Now().Add(time.Hour))) virt := makeIPv4UDPEthernetFrame(tcpip.GetRandMacAddr()) phy := make([]byte, 1500) @@ -249,7 +250,7 @@ func TestKeyRotation(t *testing.T) { require.NoError(t, h.AddVirtualNetwork(vni, peer, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) // Epoch 1 - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k1, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, k1, k1, clk.Now().Add(time.Hour))) ip := makeIPv4UDPPacket() phy := make([]byte, 2000) @@ -267,7 +268,7 @@ func TestKeyRotation(t *testing.T) { epoch1B := append([]byte(nil), phy[:n]...) // Rotate to epoch 2; epoch 1 gets grace - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 2, k2, k2, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 2, k2, k2, clk.Now().Add(time.Hour))) // Within grace: one of the saved epoch-1 frames must decrypt. m := h.PhyToVirt(epoch1A, out) @@ -286,13 +287,13 @@ func TestKeyRotation(t *testing.T) { epoch2A := append([]byte(nil), phy[:n]...) // Rotate to epoch 3 (starts grace for epoch 2) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 3, k3, k3, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 3, k3, k3, clk.Now().Add(time.Hour))) // Let epoch-2 grace expire. clk.Advance(31 * time.Second) // Rotate to epoch 4; expired RX keys should be swept here - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 4, k4, k4, clk.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 4, k4, k4, clk.Now().Add(time.Hour))) // The saved epoch-2 frame should now be rejected (no matching key after cleanup). 
m = h.PhyToVirt(epoch2A, out[:cap(out)]) @@ -307,6 +308,178 @@ func TestKeyRotation(t *testing.T) { require.Equal(t, ip, out[:m]) } +// TestUpdateVirtualNetworkKeysGuards exercises the two fail-closed guards on the +// production key-install seam: the epoch (SPI) must strictly increase, and the +// rx/tx keys must differ. +func TestUpdateVirtualNetworkKeysGuards(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x9999 + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var k1, k2 [16]byte + copy(k1[:], []byte("aaaaaaaaaaaaaaaa")) + copy(k2[:], []byte("bbbbbbbbbbbbbbbb")) + exp := time.Now().Add(time.Hour) + + // Equal rx/tx keys are rejected even on the first install. + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k1, exp), + "equal rx/tx keys must be rejected") + + // epoch 0 (reserved SPI) is rejected. + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 0, k1, k2, exp), + "epoch 0 (reserved SPI) must be rejected") + + // Distinct keys with a fresh, non-zero epoch succeed. + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, k1, k2, exp)) + + // Reinstalling the same epoch is rejected (must strictly increase). + require.Error(t, h.UpdateVirtualNetworkKeys(vni, 1, k2, k1, exp), + "same epoch must be rejected (monotonicity)") + + // A higher epoch with distinct keys succeeds. + require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 3, k2, k1, exp)) + + // A non-zero epoch lower than the current one is rejected (monotonicity). 
+ require.Error(t, h.UpdateVirtualNetworkKeys(vni, 2, k1, k2, exp), + "lower epoch must be rejected (monotonicity)") +} + +// TestUpdateVirtualNetworkSAsGuards exercises the control-plane install seam directly. +// Because every control-plane generation carries a FRESH per-session key, this path does +// NOT enforce SPI monotonicity: a reconnect resets the allocator to a low SPI and that +// reset SPI must be re-accepted under its fresh key. The only fail-closed guards are +// non-zero SPIs, distinct rx/tx keys, and a TX anti-reset check that refuses to re-install +// the CURRENTLY-live transmit SA — same SPI AND same key (the one in-process action that +// would reset a live counter under an unchanged key). A colliding SPI under a FRESH key, +// which is exactly the transient-reconnect case, is accepted. +func TestUpdateVirtualNetworkSAsGuards(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x9a9a + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var k1, k2, k3, k4 [16]byte + copy(k1[:], []byte("aaaaaaaaaaaaaaaa")) + copy(k2[:], []byte("bbbbbbbbbbbbbbbb")) + copy(k3[:], []byte("cccccccccccccccc")) + copy(k4[:], []byte("dddddddddddddddd")) + exp := time.Now().Add(time.Hour) + + // Either direction's SPI being zero (reserved) is rejected. + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 0, 20, k1, k2, exp), "rx SPI 0 must be rejected") + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 0, k1, k2, exp), "tx SPI 0 must be rejected") + + // Equal rx/tx keys are rejected even on the first install. 
+ require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k1, exp), "equal rx/tx keys must be rejected") + + // First install with distinct per-direction SPIs and distinct keys succeeds. + // Live transmit SA is now (SPI 20, key k2). + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k2, exp)) + + // Re-installing the IDENTICAL live transmit SA (same SPI AND same key) is rejected — + // that is the one action that would reset a live counter under its own key. The rx side + // is irrelevant to this guard. + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k1, k2, exp), + "re-installing the identical live tx SA must be rejected") + require.Error(t, h.UpdateVirtualNetworkSAs(vni, 99, 20, k3, k2, exp), + "the live tx SA is keyed by (SPI, key); a different rx SPI does not make it safe") + + // The SAME transmit SPI under a FRESH key is accepted — this is the transient-reconnect + // case (the allocator reset to a colliding SPI value, but the master key is fresh, so the + // from-zero counter is a fresh nonce space). Live tx SA is now (SPI 20, key k4). + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 20, k3, k4, exp), + "a colliding tx SPI under a fresh key is accepted (reconnect recovery)") + + // A reused (non-increasing) RX SPI is accepted — the receive side never emits a nonce, + // so a repeated receive SPI under a fresh key is harmless. Here rx stays at 10 while tx + // advances to 21; live tx SA is now (21, k2). + require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 10, 21, k1, k2, exp), + "a reused rx SPI is accepted (no rx monotonicity guard)") + + // A LOWER transmit SPI is accepted — it can only arrive from a fresh session whose key + // is fresh, so the reset counter is a fresh nonce space. This models a peer reconnect + // that reset its allocator: rx and tx both drop back to low values. Live tx SA is now (5, k4). 
+ require.NoError(t, h.UpdateVirtualNetworkSAs(vni, 3, 5, k3, k4, exp), + "a lower tx SPI under a fresh key is accepted (reconnect recovery)") +} + +// TestRXRejectsSPINonceMismatch proves the RX side rejects a frame whose nonce +// SPI (nonce[:4]) does not match the key epoch it selected, on BOTH the +// cross-buffer and in-place decap paths, and that TX binds the SPI into the +// nonce in the first place (the offset sanity checks below). +func TestRXRejectsSPINonceMismatch(t *testing.T) { + localAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 1).To4()), Port: 1234} + peerAddr := &tcpip.FullAddress{Addr: tcpip.AddrFrom4Slice(net.IPv4(10, 0, 0, 2).To4()), Port: 4321} + + h, err := icx.NewHandler(icx.WithLocalAddr(localAddr), icx.WithLayer3VirtFrames()) + require.NoError(t, err) + + const vni = 0x1234 + prefix := netip.MustParsePrefix("192.168.1.0/24") + require.NoError(t, h.AddVirtualNetwork(vni, peerAddr, []icx.Route{{Src: prefix, Dst: prefix}})) + + var key [16]byte + copy(key[:], []byte("0123456789abcdef")) + // Loopback (single shared key) so this one handler both encaps and decaps; + // the production seam would reject equal rx/tx keys. + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) + + // Mint a real encrypted frame; TX binds epoch 1 into nonce[:4]. + ip := makeIPv4UDPPacket() + phy := make([]byte, 1500) + n, loop := h.VirtToPhy(ip, phy) + require.NotZero(t, n) + require.False(t, loop) + + // The Geneve header sits at the UDP payload offset; its layout is + // base(8) + KeyEpoch option(hdr 4 + value 4) + TxCounter option(hdr 4 + value 12). + // So the key-epoch value and the nonce (= TxCounter value, nonce[:4] = SPI) + // live at these absolute offsets within the IPv4-underlay physical frame. 
+ const geneveBase, optHdr, epochValLen = 8, 4, 4 + keyEpochOff := udp.PayloadOffsetIPv4 + geneveBase + optHdr + nonceOff := keyEpochOff + epochValLen + optHdr + require.Equal(t, uint32(1), binary.BigEndian.Uint32(phy[keyEpochOff:keyEpochOff+4]), + "sanity: key-epoch option should carry epoch 1") + require.Equal(t, uint32(1), binary.BigEndian.Uint32(phy[nonceOff:nonceOff+4]), + "TX must bind the SPI (epoch 1) into nonce[:4]") + + // Tamper nonce[:4] so it no longer matches the key epoch (which stays 1, so + // the RX side still selects the installed cipher and reaches the SPI check). + tampered := append([]byte(nil), phy[:n]...) + tampered[nonceOff] = 0xFF + require.NotEqual(t, uint32(1), binary.BigEndian.Uint32(tampered[nonceOff:nonceOff+4])) + require.Equal(t, uint32(1), binary.BigEndian.Uint32(tampered[keyEpochOff:keyEpochOff+4]), + "key epoch must remain 1 so the SPI check, not the key lookup, rejects the frame") + + vnet, ok := h.GetVirtualNetwork(vni) + require.True(t, ok) + + // Cross-buffer decap drops it and counts an SPI mismatch. + out := make([]byte, 1500) + require.Zero(t, h.PhyToVirt(append([]byte(nil), tampered...), out)) + require.Equal(t, uint64(1), vnet.Stats.RXDropsSPIMismatch.Load()) + + // In-place decap drops it identically (placed at a non-zero offset). 
+ buf := make([]byte, len(tampered)+256) + const off = 64 + copy(buf[off:off+len(tampered)], tampered) + gotOff, gotLen := h.PhyToVirtInPlace(buf, off, len(tampered)) + require.Zero(t, gotLen) + require.Zero(t, gotOff) + require.Equal(t, uint64(2), vnet.Stats.RXDropsSPIMismatch.Load()) +} + func TestARPRequest_Loopback(t *testing.T) { if testing.Verbose() { slog.SetLogLoggerLevel(slog.LevelDebug) @@ -378,7 +551,7 @@ func TestNeighborSolicitation_Loopback(t *testing.T) { privatePrefix := netip.MustParsePrefix("2001:db8::/64") require.NoError(t, h.AddVirtualNetwork(0x56789, peerAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}})) - require.NoError(t, h.UpdateVirtualNetworkKeys(0x56789, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(0x56789, 1, key, key, time.Now().Add(time.Hour))) nsFrame := makeIPv6NeighborSolicitationEthernetFrame() phy := make([]byte, 2000) @@ -450,7 +623,7 @@ func BenchmarkHandler(b *testing.B) { err = h.AddVirtualNetwork(vni, remoteAddr, []icx.Route{{Src: privatePrefix, Dst: privatePrefix}}) require.NoError(b, err) - err = h.UpdateVirtualNetworkKeys(0x12345, 1, key, key, time.Now().Add(time.Hour)) + err = h.InstallKeysForTest(0x12345, 1, key, key, time.Now().Add(time.Hour)) require.NoError(b, err) virtMAC := tcpip.GetRandMacAddr() diff --git a/inplace_bench_test.go b/inplace_bench_test.go index 99c4922..3a8090c 100644 --- a/inplace_bench_test.go +++ b/inplace_bench_test.go @@ -55,8 +55,8 @@ func newBenchEnv(b *testing.B, tc inplaceTestCase) *inplaceEnv { if err := h.AddVirtualNetwork(vni, remoteAddr, routes); err != nil { b.Fatalf("AddVirtualNetwork: %v", err) } - if err := h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour)); err != nil { - b.Fatalf("UpdateVirtualNetworkKeys: %v", err) + if err := h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour)); err != nil { + b.Fatalf("InstallKeysForTest: %v", err) } vnet, ok := h.GetVirtualNetwork(vni) if !ok { diff --git 
a/inplace_transform.go b/inplace_transform.go index ff43c87..02a3d96 100644 --- a/inplace_transform.go +++ b/inplace_transform.go @@ -109,7 +109,14 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { if opt.Class == geneve.ClassExperimental { switch opt.Type { case geneve.OptionTypeTxCounter: - nonce = opt.Value[:12] + // Require the declared 12-byte (Length=3) value so nonce[:4] (the + // SPI) and the counter are provably sender-written, not stale pooled + // bytes from a short/malformed option — keeps the SPI-mismatch drop + // attribution honest. A wrong length leaves nonce nil → the + // "Expected TX counter" drop below. Mirrors PhyToVirt exactly. + if opt.Length == 3 { + nonce = opt.Value[:12] + } case geneve.OptionTypeKeyEpoch: epoch = binary.BigEndian.Uint32(opt.Value[:4]) } @@ -137,15 +144,21 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } - txCounter := binary.BigEndian.Uint64(nonce[4:]) - - if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { - // Delayed packets can cause some uneccesary noise here. - slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) - vnet.Stats.RXReplayDrops.Add(1) + // Verify the SPI bound into the nonce matches the epoch that selected this + // SA (nonce = SPI‖counter). A conformant sender always sets nonce[:4] to the + // key epoch; a mismatch is a malformed or tampered frame. GCM would also + // reject it at Open (the nonce and the header both feed the tag), but the + // explicit check makes the binding auditable and gives a precise drop reason. + // (APO-644). Mirrors PhyToVirt exactly to preserve byte-equivalence. 
+ if spi := binary.BigEndian.Uint32(nonce[:4]); spi != epoch { + slog.Debug("Dropping frame: nonce SPI does not match key epoch", + slog.Uint64("epoch", uint64(epoch)), slog.Uint64("nonceSPI", uint64(spi))) + vnet.Stats.RXDropsSPIMismatch.Add(1) return dropWindowOffset, 0 } + txCounter := binary.BigEndian.Uint64(nonce[4:]) + // In-place decap: the ciphertext (payload[hdrLen:]) lives at ctStart within // buf; we open it onto itself at the SAME start (exact overlap), so the // plaintext is written over the ciphertext region. The AAD is the Geneve @@ -162,6 +175,18 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } + // Anti-replay AFTER authentication (APO-645/S2): ValidateCounter both checks + // and advances the sliding window, so it must run only on a packet whose tag + // has verified. Running it before Open let an attacker who can spoof the + // outer 4-tuple advance the window with a forged high counter and wedge the + // real peer (whose in-window counters are then rejected as "behind window"). + if !rxCipher.replayFilter.ValidateCounter(txCounter, replay.RejectAfterMessages) { + // Delayed packets can cause some unnecessary noise here. + slog.Debug("Replay filter rejected frame", slog.Uint64("txCounter", txCounter)) + vnet.Stats.RXReplayDrops.Add(1) + return dropWindowOffset, 0 + } + // Is it an authenticated out-of-band message? if hdr.ProtocolType == 0 { slog.Debug("Dropping out-of-band message") @@ -172,6 +197,14 @@ func (h *Handler) PhyToVirtInPlace(buf []byte, off, length int) (int, int) { return dropWindowOffset, 0 } + // A non-OOB frame whose authenticated payload is empty has no version nibble; + // ipPacket[0] would panic. An authenticated peer can craft one. (APO-647/S4) + if len(ipPacket) == 0 { + slog.Warn("Dropping empty decrypted payload") + vnet.Stats.RXInvalidSrc.Add(1) + return dropWindowOffset, 0 + } + ipVersion := ipPacket[0] >> 4 // Get the source address of the decrypted frame. 
@@ -458,7 +491,14 @@ func (h *Handler) VirtToPhyInPlace(buf []byte, off, length int) (int, int, bool) return dropWindowOffset, 0, false } + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is to let RX reject a tampered/mismatched SPI and to stay forward-compatible + // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. + // Must match the cross-buffer VirtToPhy/ToPhy nonce layout exactly. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) switch ipVersion { @@ -677,7 +717,14 @@ func (h *Handler) ToPhyInPlace(buf []byte, off int) (int, int) { // Fill options: epoch + nonce/counter binary.BigEndian.PutUint32(hdr.Options[0].Value[:4], txCipher.epoch) + // nonce = epoch‖counter: bind the 32-bit SPI (key epoch) into the high 4 + // bytes. Under the shared-epoch model this prefix is identical for both + // directions, so it does not separate them (the distinct rx/tx keys do); its + // value here is to let RX reject a tampered/mismatched SPI and to stay forward-compatible + // with per-direction SPIs. The low 8 bytes are the per-SA monotonic counter. + // Must match the cross-buffer VirtToPhy/ToPhy nonce layout exactly. nonce := hdr.Options[1].Value[:12] + binary.BigEndian.PutUint32(nonce[:4], txCipher.epoch) binary.BigEndian.PutUint64(nonce[4:], txCipher.counter.Add(1)) // Place Geneve payload inside outer UDP frame. 
diff --git a/inplace_transform_test.go b/inplace_transform_test.go index bb7ae13..955680d 100644 --- a/inplace_transform_test.go +++ b/inplace_transform_test.go @@ -185,10 +185,13 @@ func newInplaceEnv(t *testing.T, tc inplaceTestCase) *inplaceEnv { // Use a single key for both RX and TX so that frames encrypted with the TX // cipher can be decrypted with the RX cipher (the round-trip and decap - // equivalence tests run encap then decap on the same handler). + // equivalence tests run encap then decap on the same handler). This loopback + // shape requires the unguarded InstallKeysForTest seam: the production + // UpdateVirtualNetworkKeys rejects equal rx/tx keys (real peers use distinct + // per-direction keys). key := generateKey(t) require.NoError(t, h.AddVirtualNetwork(vni, remoteAddr, routes)) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) vnet, ok := h.GetVirtualNetwork(vni) require.True(t, ok) @@ -482,7 +485,7 @@ func newInplaceEnvKeepAlive(t *testing.T, tc inplaceTestCase, interval time.Dura }} key := generateKey(t) require.NoError(t, h.AddVirtualNetwork(vni, remoteAddr, routes)) - require.NoError(t, h.UpdateVirtualNetworkKeys(vni, 1, key, key, time.Now().Add(time.Hour))) + require.NoError(t, h.InstallKeysForTest(vni, 1, key, key, time.Now().Add(time.Hour))) vnet, ok := h.GetVirtualNetwork(vni) require.True(t, ok)