From 9077c029300daefd06e291fafa8c2b85b225267b Mon Sep 17 00:00:00 2001 From: findias Date: Wed, 6 May 2026 20:55:53 +0300 Subject: [PATCH] srv_prepare: bump conntrack + SYN backlog for VPN workload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stock Ubuntu kernel derives nf_conntrack_max from RAM size, which yields 8192 on these hosts — adequate for a desktop, dangerously small for a VPN aggregation point handling hundreds of concurrent VLESS+Reality, XHTTP/H2, and probe flows. Symptom on EU production: dmesg fills with "nf_conntrack: table full, dropping packet"; new flows from any source IP not already warm in the table fail their TLS handshake. This is compounded by the default nf_conntrack_tcp_timeout_established=432000 (5 days), which keeps slots occupied long after the client has disconnected. Live cumulative counters on EU before the fix: insert_failed: 29371 drop: 55667 early_drop: 16 Settings added to srv_prepare_bbr_settings (sysctl applied via /etc/sysctl.conf): net.netfilter.nf_conntrack_max: 131072 net.netfilter.nf_conntrack_buckets: 131072 net.netfilter.nf_conntrack_tcp_timeout_established: 3600 net.ipv4.tcp_max_syn_backlog: 4096 Memory cost: ~50 MB on a 1vCPU/1GB box (131072 entries × ~376 B). Acceptable. The same defaults belong on every host in groups['cloud'] and groups['ru'] — srv_prepare runs on all of them. Hosts will pick up the new values on the next role apply; no service restart is needed beyond the sysctl reload that the role already triggers. A live patch has already been applied to vm_my_srv via direct `sysctl -w` plus /etc/sysctl.d/99-vpn-tuning.conf. The latter becomes redundant once this role runs and writes /etc/sysctl.conf with the same values — it is being kept temporarily until the next deploy normalises state. Note: this fix did NOT resolve the separate TLS-handshake-fails-from-vm_my_ru2 issue (3 of 4 v2 Reality SNIs). That bug has a different root cause and will be investigated separately (likely involving tcpdump on EU :443 during a fresh ru2-source handshake). 
Signed-off-by: findias --- roles/srv_prepare/defaults/main.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/roles/srv_prepare/defaults/main.yml b/roles/srv_prepare/defaults/main.yml index 39a588c..8eabf15 100644 --- a/roles/srv_prepare/defaults/main.yml +++ b/roles/srv_prepare/defaults/main.yml @@ -41,8 +41,13 @@ srv_prepare_apps_list_alpine: - openssl - ca-certificates -# Default sysctl settings +# Default sysctl settings — TCP performance + conntrack capacity for VPN workload. +# Default Ubuntu values are dangerously low for a multi-thousand-connection VPN +# box: nf_conntrack_max=8192 fills under load and packets get dropped, while +# nf_conntrack_tcp_timeout_established=432000 (5d) keeps slots occupied for days +# after clients leave. Cause of "TLS handshake fails from new source IP" symptom. srv_prepare_bbr_settings: + # BBR + buffers net.ipv4.tcp_congestion_control: bbr # Enable BBR net.core.default_qdisc: fq # Set default queue discipline to fq net.core.rmem_max: 67108864 # Maximum receive buffer size @@ -50,6 +55,13 @@ srv_prepare_bbr_settings: net.ipv4.tcp_rmem: 4096 87380 67108864 # Minimum, default, and maximum receive buffer sizes net.ipv4.tcp_wmem: 4096 65536 67108864 # Minimum, default, and maximum send buffer sizes net.core.netdev_max_backlog: 250000 # Maximum number of packets queued on the input side - net.core.somaxconn: 65535 # Maximum number of connections that can be queued for acceptance + net.core.somaxconn: 65535 # Upper limit on the listen() accept backlog net.ipv4.tcp_tw_reuse: 1 # Allow reusing TIME-WAIT sockets for new connections - net.ipv4.tcp_fin_timeout: 15 # Reduce the time a socket stays in TIME-WAIT state + net.ipv4.tcp_fin_timeout: 15 # Shorten FIN-WAIT-2 timeout for orphaned sockets + # SYN backlog — absorb bursts of incoming handshakes + net.ipv4.tcp_max_syn_backlog: 4096 + # Conntrack capacity — VPN workload generates lots of short-lived flows + net.netfilter.nf_conntrack_max: 131072 + net.netfilter.nf_conntrack_buckets: 131072 + # 
Expire idle established TCP entries after 1 h — the 5-day default is too generous for short flows + net.netfilter.nf_conntrack_tcp_timeout_established: 3600