From b737ec380606926a282af63617351d882824cae9 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 21 May 2026 20:26:17 -0400 Subject: [PATCH 1/5] Make ESP async sends safe across connection reuse The async httpd send workers looked up their target connection by (server, sockfd). A worker queued for a connection that closed before it ran could fire against a different connection that had since reused the same socket fd, sending a frame to the wrong peer (or a torn-down one). Capture a weak_ptr to the originating connection in the work item and lock() it inside the worker instead; if the connection is gone, drop the send. Also gate any frame queued before the client/hello on client_hello_sent_ so stale pre-handshake traffic can't leak out; goodbye and the hello itself opt in via allow_before_hello, since both legitimately precede the handshake. --- src/connection.cpp | 5 +- src/connection.h | 10 +++- src/connection_manager.cpp | 17 ++++--- src/esp/client_connection.cpp | 3 +- src/esp/client_connection.h | 3 +- src/esp/server_connection.cpp | 86 +++++++++++++++++----------------- src/esp/server_connection.h | 3 +- src/host/client_connection.cpp | 3 +- src/host/client_connection.h | 3 +- src/host/server_connection.cpp | 3 +- src/host/server_connection.h | 3 +- 11 files changed, 78 insertions(+), 61 deletions(-) diff --git a/src/connection.cpp b/src/connection.cpp index 022e10d..4a2497a 100644 --- a/src/connection.cpp +++ b/src/connection.cpp @@ -44,7 +44,10 @@ void SendspinConnection::init_time_filter() { SsErr SendspinConnection::send_goodbye_reason(SendspinGoodbyeReason reason, SendCompleteCallback on_complete) { - return this->send_text_message(format_client_goodbye_message(reason), std::move(on_complete)); + // Goodbye is a control message that may legitimately be sent before the client/hello (e.g., + // when rejecting an excess connection), so it bypasses the pre-hello send gate. + return this->send_text_message(format_client_goodbye_message(reason), std::move(on_complete), + /*allow_before_hello=*/true); } // ============================================================================ diff --git a/src/connection.h b/src/connection.h index c0363de..8a06d5e 100644 --- a/src/connection.h +++ b/src/connection.h @@ -116,10 +116,16 @@ class SendspinConnection : public std::enable_shared_from_thisclient_->build_hello_message(); - SsErr err = conn->send_text_message(hello_message, [conn](bool success) { - if (success) { - conn->set_client_hello_sent(true); - } else { - SS_LOGW(TAG, "Hello message send failed"); - } - }); + SsErr err = conn->send_text_message( + hello_message, + [conn](bool success) { + if (success) { + conn->set_client_hello_sent(true); + } else { + SS_LOGW(TAG, "Hello message send failed"); + } + }, + /*allow_before_hello=*/true); if (err == SsErr::OK) { return true; // Successfully queued diff --git a/src/esp/client_connection.cpp b/src/esp/client_connection.cpp index 3044a44..8a31fd1 100644 --- a/src/esp/client_connection.cpp +++ b/src/esp/client_connection.cpp @@ -130,7 +130,8 @@ bool SendspinClientConnection::is_connected() const { } SsErr SendspinClientConnection::send_text_message(const std::string& message, - SendCompleteCallback cb) { + SendCompleteCallback cb, + bool /*allow_before_hello*/) { if (!this->is_connected()) { if (cb) { cb(false); diff --git a/src/esp/client_connection.h b/src/esp/client_connection.h index 9d7e70f..ef70480 100644 --- a/src/esp/client_connection.h +++ b/src/esp/client_connection.h @@ -86,7 +86,8 @@ class SendspinClientConnection : public SendspinConnection { /// @param msg The message string to send. /// @param cb Callback invoked after send completes. /// @return SsErr::OK if sent successfully, error code otherwise. - SsErr send_text_message(const std::string& message, SendCompleteCallback cb) override; + SsErr send_text_message(const std::string& message, SendCompleteCallback cb, + bool allow_before_hello) override; /// @brief Sends a client/time message, capturing the timestamp just before send /// @return true if the message was sent successfully, false otherwise. diff --git a/src/esp/server_connection.cpp b/src/esp/server_connection.cpp index 69e03a0..3946b3a 100644 --- a/src/esp/server_connection.cpp +++ b/src/esp/server_connection.cpp @@ -31,38 +31,30 @@ static const char* const TAG = "sendspin.server_connection"; // Static helpers // ============================================================================ -/// @brief Structure holding the session identity and payload data for async send operations +/// @brief Holds the originating connection and payload data for an async text send /// -/// Workers look the connection up at run time via `httpd_sess_get_ctx(server, sockfd)`. The -/// session-pinned shared_ptr keeps the conn alive until httpd has drained queued work for the -/// session and called the slot's free_fn, so the worker can either find a valid conn or cleanly -/// no-op if the session is already gone. +/// `conn` is a weak_ptr to the connection that queued the work, not a raw `(server, sockfd)` pair. +/// The worker resolves it with `conn.lock()`: a recycled sockfd can therefore never redirect the +/// frame onto a different connection, and a destroyed connection yields a null lock (a clean +/// no-op) rather than a use-after-free. struct AsyncRespArg { - httpd_handle_t server{nullptr}; - int sockfd{-1}; + std::weak_ptr conn; uint8_t* payload{nullptr}; size_t len{0}; bool has_callback{false}; + /// When true the frame may be sent before client/hello (the hello itself and goodbye); when + /// false the worker drops it unless the hello has already been sent on this connection. + bool allow_before_hello{false}; SendCompleteCallback on_complete; }; -/// @brief Small heap struct used by the time-message worker to identify its session +/// @brief Heap struct used by the time-message worker to identify its originating connection +/// +/// Holds a weak_ptr for the same identity-and-lifetime safety as AsyncRespArg. struct SessionLookup { - httpd_handle_t server; - int sockfd; + std::weak_ptr conn; }; -/// @brief Looks up the session-pinned connection shared_ptr, returning a copy or empty if gone -static std::shared_ptr lookup_session_conn(httpd_handle_t server, - int sockfd) { - auto* slot = - static_cast*>(httpd_sess_get_ctx(server, sockfd)); - if (slot == nullptr) { - return nullptr; - } - return *slot; -} - // ============================================================================ // SendspinConnection interface implementation // ============================================================================ @@ -122,7 +114,8 @@ bool SendspinServerConnection::is_connected() const { } SsErr SendspinServerConnection::send_text_message(const std::string& message, - SendCompleteCallback on_complete) { + SendCompleteCallback on_complete, + bool allow_before_hello) { if (!this->is_connected()) { // No client connected - invoke callback with failure if provided if (on_complete) { @@ -144,8 +137,8 @@ SsErr SendspinServerConnection::send_text_message(const std::string& message, // Use placement new to properly construct the struct with the callback new (resp_arg) AsyncRespArg(); - resp_arg->server = this->server_; - resp_arg->sockfd = this->sockfd_; + resp_arg->conn = std::static_pointer_cast(this->shared_from_this()); + resp_arg->allow_before_hello = allow_before_hello; resp_arg->payload = static_cast(platform_malloc(message.size())); if (resp_arg->payload == nullptr) { SS_LOGE(TAG, "Failed to allocate %zu bytes for message payload", message.size()); @@ -246,20 +239,21 @@ bool SendspinServerConnection::send_time_message() { return false; } - // The worker looks up the conn by sockfd at run time via `httpd_sess_get_ctx`. The session - // slot keeps the conn alive past any ConnectionManager teardown that races the httpd ctrl - // queue. The JSON is built inside the worker so client_transmitted is captured as close to - // the wire send as possible. SessionLookup is POD so a raw malloc/free pair (matching the - // AsyncRespArg allocator convention) is sufficient; no constructor/destructor to invoke. + // The worker resolves the originating connection via the weak_ptr below, so a recycled sockfd + // cannot redirect the frame and a destroyed connection yields a clean no-op. The JSON is built + // inside the worker so client_transmitted is captured as close to the wire send as possible. + // SessionLookup now holds a weak_ptr, so it is constructed with placement new and explicitly + // destroyed before free (matching the AsyncRespArg convention). auto* lookup = static_cast(platform_malloc_internal(sizeof(SessionLookup))); if (lookup == nullptr) { SS_LOGE(TAG, "Failed to allocate SessionLookup for time message"); return false; } - lookup->server = this->server_; - lookup->sockfd = this->sockfd_; + new (lookup) SessionLookup(); + lookup->conn = std::static_pointer_cast(this->shared_from_this()); if (httpd_queue_work(this->server_, async_send_time_text, lookup) != ESP_OK) { SS_LOGE(TAG, "httpd_queue_work failed for time message"); + lookup->~SessionLookup(); platform_free(lookup); return false; } @@ -268,12 +262,14 @@ bool SendspinServerConnection::send_time_message() { void SendspinServerConnection::async_send_time_text(void* arg) { auto* lookup = static_cast(arg); - const httpd_handle_t server = lookup->server; - const int sockfd = lookup->sockfd; + auto conn = lookup->conn.lock(); + lookup->~SessionLookup(); platform_free(lookup); - auto conn = lookup_session_conn(server, sockfd); - if (!conn || !conn->is_connected()) { + // Drop the time frame unless client/hello has already been sent on this exact connection. The + // weak_ptr lock guarantees identity (never a recycled-sockfd peer); the hello gate guarantees + // a stale time frame can never jump ahead of a fresh connection's hello. + if (!conn || !conn->is_connected() || !conn->client_hello_sent_) { return; } @@ -309,15 +305,17 @@ void SendspinServerConnection::async_send_text(void* arg) { ws_pkt.len = resp_arg->len; ws_pkt.type = HTTPD_WS_TYPE_TEXT; - // Look up the conn via the session slot and invoke on_complete only when the conn is still - // alive. If the session has already been torn down the slot is gone and we drop the completion - // callback silently: callers frequently capture the connection by raw pointer (e.g., - // ConnectionManager::send_hello_message), and dereferencing it after teardown would be a UAF. - // Lifetime-safe completion logic (e.g., disconnect()'s weak_ptr-guarded trigger_close) already - // tolerates not firing in this case. The AsyncRespArg destructor below releases any captured - // state held by the std::function. - auto conn = lookup_session_conn(resp_arg->server, resp_arg->sockfd); - if (conn && conn->is_connected()) { + // Resolve the originating connection. weak_ptr.lock() yields the exact conn that queued this + // work (or null if it has since been destroyed), so a recycled sockfd can never redirect the + // frame onto a different connection -- the lifetime-safe replacement for the old raw `this` + // pointer that caused the use-after-free crash. Non-handshake frames are additionally gated on + // client_hello_sent_ so nothing can precede the client/hello. When the conn is gone or the + // frame is gated the completion callback is intentionally not fired; callers that key off + // completion (e.g., the hello in ConnectionManager::send_hello_message) always pass + // allow_before_hello, so their callback still runs. + auto conn = resp_arg->conn.lock(); + if (conn && conn->is_connected() && + (resp_arg->allow_before_hello || conn->client_hello_sent_)) { esp_err_t err = httpd_ws_send_frame_async(conn->server_, conn->sockfd_, &ws_pkt); if (resp_arg->has_callback) { resp_arg->on_complete(err == ESP_OK); diff --git a/src/esp/server_connection.h b/src/esp/server_connection.h index 810f394..1a8e634 100644 --- a/src/esp/server_connection.h +++ b/src/esp/server_connection.h @@ -89,7 +89,8 @@ class SendspinServerConnection : public SendspinConnection { /// @param message The message string to send. /// @param on_complete Callback invoked after send completes. /// @return SsErr::OK if queued successfully, error code otherwise. - SsErr send_text_message(const std::string& message, SendCompleteCallback on_complete) override; + SsErr send_text_message(const std::string& message, SendCompleteCallback on_complete, + bool allow_before_hello) override; /// @brief Sends a client/time message, stamping the timestamp inside the httpd worker /// diff --git a/src/host/client_connection.cpp b/src/host/client_connection.cpp index 7965d9e..89b624d 100644 --- a/src/host/client_connection.cpp +++ b/src/host/client_connection.cpp @@ -96,7 +96,8 @@ void SendspinClientConnection::disconnect(SendspinGoodbyeReason reason, } SsErr SendspinClientConnection::send_text_message(const std::string& message, - SendCompleteCallback cb) { + SendCompleteCallback cb, + bool /*allow_before_hello*/) { if (!this->is_connected()) { if (cb) { cb(false); diff --git a/src/host/client_connection.h b/src/host/client_connection.h index 1aead19..acce691 100644 --- a/src/host/client_connection.h +++ b/src/host/client_connection.h @@ -72,7 +72,8 @@ class SendspinClientConnection : public SendspinConnection { /// @param message The message string to send. /// @param cb Callback invoked after send completes. /// @return SsErr::OK if queued successfully, error code otherwise. - SsErr send_text_message(const std::string& message, SendCompleteCallback cb) override; + SsErr send_text_message(const std::string& message, SendCompleteCallback cb, + bool allow_before_hello) override; /// @brief Sends a client/time message, capturing the timestamp synchronously before send /// @return true if the message was sent successfully, false otherwise. diff --git a/src/host/server_connection.cpp b/src/host/server_connection.cpp index c50d228..c9613c6 100644 --- a/src/host/server_connection.cpp +++ b/src/host/server_connection.cpp @@ -60,7 +60,8 @@ bool SendspinServerConnection::is_connected() const { } SsErr SendspinServerConnection::send_text_message(const std::string& message, - SendCompleteCallback on_complete) { + SendCompleteCallback on_complete, + bool /*allow_before_hello*/) { if (!this->is_connected()) { if (on_complete) { on_complete(false); diff --git a/src/host/server_connection.h b/src/host/server_connection.h index d42e7b5..0254005 100644 --- a/src/host/server_connection.h +++ b/src/host/server_connection.h @@ -74,7 +74,8 @@ class SendspinServerConnection : public SendspinConnection { /// @param message The message string to send. /// @param on_complete Callback invoked after send completes. /// @return SsErr::OK if sent successfully, error code otherwise. - SsErr send_text_message(const std::string& message, SendCompleteCallback on_complete) override; + SsErr send_text_message(const std::string& message, SendCompleteCallback on_complete, + bool allow_before_hello) override; /// @brief Sends a client/time message, capturing the timestamp synchronously before send /// @return true if the message was sent successfully, false otherwise. From 81701c080ed4e25d121a5978ecbaa27eba556113 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 21 May 2026 20:26:40 -0400 Subject: [PATCH 2/5] Track hello retries per connection A single hello_retry_ slot meant that when a second connection arrived mid-handshake (e.g. a handoff candidate), arming its retry overwrote the first connection's pending hello, so the first never got one. Keep one HelloRetryState per managed connection in a vector. loop() walks it, drops entries whose connection is no longer current/pending, and retries the rest independently; on_connection_lost removes an entry by connection pointer. --- src/connection_manager.cpp | 101 ++++++++++++++++++++++++------------- src/connection_manager.h | 5 +- 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/connection_manager.cpp b/src/connection_manager.cpp index a3fb01e..eee2625 100644 --- a/src/connection_manager.cpp +++ b/src/connection_manager.cpp @@ -44,7 +44,7 @@ ConnectionManager::~ConnectionManager() { this->current_connection_.reset(); this->pending_connection_.reset(); } - this->hello_retry_.conn.reset(); + this->hello_retries_.clear(); } // ============================================================================ @@ -225,28 +225,41 @@ void ConnectionManager::loop() { pending_copy->loop(); } - // Check hello retry timer - if (this->hello_retry_.retry_time_us > 0 && - platform_time_us() >= this->hello_retry_.retry_time_us) { - this->hello_retry_.retry_time_us = 0; - + // Check hello retry timers (one entry per managed connection, so a second connection arriving + // mid-handshake cannot clobber the first connection's pending hello). + { std::lock_guard lock(this->conn_ptr_mutex_); - // Verify connection is still valid - if (this->hello_retry_.conn == this->current_connection_ || - this->hello_retry_.conn == this->pending_connection_) { - if (!this->send_hello_message(this->hello_retry_.attempts - 1, - this->hello_retry_.conn.get())) { - // Transient failure - retry with exponential backoff - if (this->hello_retry_.attempts > 1) { - this->hello_retry_.delay_ms *= 2; - this->hello_retry_.attempts--; - this->hello_retry_.retry_time_us = - platform_time_us() + - static_cast(this->hello_retry_.delay_ms) * US_PER_MS; - } + const int64_t now_us = platform_time_us(); + for (auto it = this->hello_retries_.begin(); it != this->hello_retries_.end();) { + HelloRetryState& retry = *it; + + // Drop retries whose connection is no longer managed. + if (retry.conn != this->current_connection_ && + retry.conn != this->pending_connection_) { + it = this->hello_retries_.erase(it); + continue; + } + + if (retry.retry_time_us == 0 || now_us < retry.retry_time_us) { + ++it; + continue; + } + + if (this->send_hello_message(retry.attempts - 1, retry.conn.get())) { + // Sent successfully (or connection no longer valid) - retry is complete. + it = this->hello_retries_.erase(it); + continue; + } + + // Transient failure - retry with exponential backoff until attempts are exhausted. + if (retry.attempts > 1) { + retry.delay_ms *= 2; + retry.attempts--; + retry.retry_time_us = now_us + static_cast(retry.delay_ms) * US_PER_MS; + ++it; + } else { + it = this->hello_retries_.erase(it); } - } else { - this->hello_retry_.conn.reset(); } } } @@ -343,11 +356,37 @@ void ConnectionManager::on_new_connection(std::shared_ptrhello_retry_.conn = conn->shared_from_this(); - this->hello_retry_.delay_ms = HelloRetryState::INITIAL_RETRY_DELAY_MS; - this->hello_retry_.attempts = 3; - this->hello_retry_.retry_time_us = platform_time_us() + HELLO_INITIAL_DELAY_US; + // Arm a per-connection hello retry: initial delay, 3 attempts. Reuse an existing entry for this + // connection if one is somehow already present so a connection never gets two retry timers. + auto conn_sp = conn->shared_from_this(); + const int64_t retry_time_us = platform_time_us() + HELLO_INITIAL_DELAY_US; + + for (auto& retry : this->hello_retries_) { + if (retry.conn == conn_sp) { + retry.delay_ms = HelloRetryState::INITIAL_RETRY_DELAY_MS; + retry.attempts = 3; + retry.retry_time_us = retry_time_us; + return; + } + } + + HelloRetryState retry; + retry.conn = std::move(conn_sp); + retry.delay_ms = HelloRetryState::INITIAL_RETRY_DELAY_MS; + retry.attempts = 3; + retry.retry_time_us = retry_time_us; + this->hello_retries_.push_back(std::move(retry)); +} + +void ConnectionManager::remove_hello_retry(SendspinConnection* conn) { + // Note: caller must hold conn_ptr_mutex_ + for (auto it = this->hello_retries_.begin(); it != this->hello_retries_.end();) { + if (it->conn.get() == conn) { + it = this->hello_retries_.erase(it); + } else { + ++it; + } + } } bool ConnectionManager::send_hello_message(uint8_t remaining_attempts, SendspinConnection* conn) { @@ -403,10 +442,7 @@ void ConnectionManager::on_connection_lost(SendspinConnection* conn) { conn->disable_message_dispatch(); this->client_->time_burst_->reset(); this->client_->cleanup_connection_state(); - if (this->hello_retry_.conn.get() == conn) { - this->hello_retry_.conn.reset(); - this->hello_retry_.retry_time_us = 0; - } + this->remove_hello_retry(conn); this->current_connection_.reset(); if (this->pending_connection_ != nullptr) { @@ -415,10 +451,7 @@ void ConnectionManager::on_connection_lost(SendspinConnection* conn) { } } else if (this->pending_connection_ != nullptr && this->pending_connection_.get() == conn) { SS_LOGD(TAG, "Pending connection lost"); - if (this->hello_retry_.conn.get() == conn) { - this->hello_retry_.conn.reset(); - this->hello_retry_.retry_time_us = 0; - } + this->remove_hello_retry(conn); this->pending_connection_.reset(); } } diff --git a/src/connection_manager.h b/src/connection_manager.h index 5929079..8fe0780 100644 --- a/src/connection_manager.h +++ b/src/connection_manager.h @@ -164,6 +164,9 @@ class ConnectionManager { /// @return True if done (sent or connection invalid), false if the send failed and should /// retry. bool send_hello_message(uint8_t remaining_attempts, SendspinConnection* conn); + /// @brief Removes any pending hello-retry entry associated with the given connection. + /// @param conn The connection whose retry state should be dropped. + void remove_hello_retry(SendspinConnection* conn); // ======================================== // Connection lifecycle @@ -190,7 +193,7 @@ class ConnectionManager { // Struct fields std::mutex conn_mutex_; // Protects deferred lifecycle event queues mutable std::mutex conn_ptr_mutex_; // Protects current_connection_ and pending_connection_ - HelloRetryState hello_retry_; + std::vector hello_retries_; // One entry per connection awaiting its hello std::vector pending_close_events_; std::vector> pending_connected_events_; std::vector> pending_disconnect_events_; From a8eda264ca1c80d4e6e293fa0e2676f8160cf66b Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 21 May 2026 20:45:22 -0400 Subject: [PATCH 3/5] Fix stale docs and tighten retry cleanup after review - internals.md: the "Server connection ownership" and "Graceful Disconnect" sections still described the old (server, sockfd) worker lookup. The send workers now capture a weak_ptr; rewrite those sections and document the pre-hello send gate. Update the async_send_time_text header docstring too. - complete_handoff: remove the displaced connection's hello-retry entry when it leaves the managed set, instead of leaving it for loop()'s lazy prune. - Correct the async_send_text comment and the send_text_message callback doc: the completion callback is skipped when the conn is gone, so allow_before_hello bypasses the gate but not the conn-alive requirement. - Clarify the initiate_hello dedup and remove_hello_retry no-op comments. --- docs/internals.md | 12 +++++++----- src/connection.h | 5 ++++- src/connection_manager.cpp | 12 ++++++++++-- src/esp/server_connection.cpp | 10 ++++++---- src/esp/server_connection.h | 7 ++++--- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/docs/internals.md b/docs/internals.md index e942bf4..fb3d3f4 100644 --- a/docs/internals.md +++ b/docs/internals.md @@ -197,7 +197,7 @@ The bump arena suits ArduinoJson's allocation pattern: during a parse the varian ├─ Process close/disconnect events (on_connection_lost) ├─ Process hello events (handshake completion, handoff decisions) ├─ Call loop() on current and pending connections - └─ Check hello retry timer + └─ Check per-connection hello retry timers 2. time_burst_->loop(conn) (skipped when no current connection) ├─ Send next time message if ready @@ -394,7 +394,7 @@ Both are `std::shared_ptr`, and on the ESP server path they ### Handshake and Handoff 1. A new connection (outbound or inbound) is started. If a current connection exists, the new one becomes `pending_connection_`. -2. The connection sends a CLIENT_HELLO. Retry with exponential backoff (100 ms base, 3 attempts). +2. The connection sends a CLIENT_HELLO. Retry with exponential backoff (100 ms base, 3 attempts). Each managed connection has its own retry entry in `ConnectionManager::hello_retries_`, so a handoff candidate arriving mid-handshake cannot clobber the current connection's pending hello (and vice versa). 3. SERVER_HELLO arrives on the network thread → enqueued into mutex-protected vector. 4. Main loop processes the hello event: - Stores server info on the connection. @@ -422,7 +422,7 @@ When a connection is lost (`on_connection_lost`): `disconnect_and_release()` calls `conn->disconnect(reason, nullptr)` and lets the local `shared_ptr` go out of scope. -- **ESP server**: the goodbye text is queued as an httpd worker job; the worker looks the connection up via `httpd_sess_get_ctx`, sends the frame, then runs the completion lambda that calls `trigger_close()`. The session slot installed in `open_callback` keeps the connection alive across that whole sequence even after `ConnectionManager`'s observer `shared_ptr` is dropped. The session is finally freed when httpd invokes the slot's `free_fn` (see [Server connection ownership (ESP)](#server-connection-ownership-esp)). The completion lambda captures a `weak_ptr` to make this lifetime explicit — `trigger_close()` is skipped if the conn has already been freed. +- **ESP server**: the goodbye text is queued as an httpd worker job. The worker resolves the connection by `lock()`ing the `weak_ptr` captured in the queued arg when the goodbye was enqueued; if it resolves it sends the frame, then runs the completion lambda that calls `trigger_close()`. The session slot installed in `open_callback` keeps the connection alive across that whole sequence even after `ConnectionManager`'s observer `shared_ptr` is dropped. The session is finally freed when httpd invokes the slot's `free_fn` (see [Server connection ownership (ESP)](#server-connection-ownership-esp)). The completion lambda also captures a `weak_ptr` to make this lifetime explicit — `trigger_close()` is skipped if the conn has already been freed. Goodbye is one of the two messages that pass `allow_before_hello=true`, so it is not blocked by the pre-hello send gate (a rejected connection is told to leave before it ever sends a hello). - **Host client**: the IXWebSocket send is synchronous, so the goodbye and close have both completed by the time `disconnect()` returns and the `shared_ptr` drops the last reference. ### Server connection ownership (ESP) @@ -431,10 +431,12 @@ On the ESP build, `SendspinServerConnection` lifetime is pinned to the httpd ses 1. `SendspinWsServer::open_callback` (the httpd `open_fn`) creates the `shared_ptr`, heap-allocates a `shared_ptr*` slot, and calls `httpd_sess_set_ctx(handle, sockfd, slot, free_fn)` with a deleter that `delete`s the slot. That slot is the authoritative reference. 2. The same shared_ptr is forwarded into `ConnectionManager::on_new_connection`, which stores it in `current_connection_` or `pending_connection_` as a *secondary observer*. -3. The httpd WebSocket handler (`websocket_handler`) and queued workers (`async_send_text`, `async_send_time_text`) look the connection up by `httpd_sess_get_ctx(handle, sockfd)` at run time, copying the slot's `shared_ptr` for the duration of their work. They never assume the manager's observer slot is alive. +3. The httpd WebSocket handler (`websocket_handler`) looks the connection up by `httpd_sess_get_ctx(handle, sockfd)` at run time, copying the slot's `shared_ptr` for the duration of its work; it never assumes the manager's observer slot is alive. The queued send workers (`async_send_text`, `async_send_time_text`) instead capture a `weak_ptr` to the originating connection and `lock()` it when they run. 4. When the socket closes, httpd calls the `close_fn` first (which fires `connection_closed_callback_` so `ConnectionManager` can drop its observer in the next `loop()`), then later calls the slot's `free_fn` to release the authoritative reference once no workers are queued for that session. -Queued workers carry only a `{httpd_handle_t, int sockfd}` pair (`AsyncRespArg` for text sends, `SessionLookup` for time sends) rather than capturing the connection directly. If the session has already been torn down by the time a worker runs, the lookup returns null and the worker no-ops cleanly. Both arg structs are allocated through `platform_malloc` / `platform_malloc_internal` and freed in the worker. +Queued send workers capture a `weak_ptr` to the originating connection — `AsyncRespArg` for text sends, `SessionLookup` for time sends — and `lock()` it when the worker runs. This is deliberately **not** a `{httpd_handle_t, int sockfd}` pair: identifying the target by sockfd risked binding to a *different* connection that had recycled the same fd after the original closed, sending a frame to the wrong peer. The `weak_ptr` resolves to the exact connection that queued the work, or to null if it has since been destroyed, in which case the worker no-ops cleanly. Because these structs now hold non-trivial members (the `weak_ptr`, and `AsyncRespArg`'s completion `std::function`), they are constructed with placement-new and explicitly destroyed before `platform_free` rather than treated as POD. Both are allocated through `platform_malloc` / `platform_malloc_internal`. + +The send workers also enforce the protocol's "hello is always first" rule: a frame is dropped unless `client_hello_sent_` is set on the resolved connection, *unless* the caller passed `allow_before_hello=true`. Exactly two callers do — the `client/hello` itself (which would otherwise gate its own send and deadlock) and `goodbye` — so a stale or out-of-order frame can never precede the handshake. The `weak_ptr` guards identity; the gate guards ordering; the two are independent. The host build does not need this scheme: `SendspinWsServer` (host) routes IXWebSocket messages by calling `find_connection_callback_` to resolve a synthetic sockfd back to the connection that `ConnectionManager` is holding. The ESP build keeps the `set_find_connection_callback()` setter as a no-op stub for symmetry; see the comment at the call site in `ConnectionManager::init_server`. diff --git a/src/connection.h b/src/connection.h index 8a06d5e..e5fedb6 100644 --- a/src/connection.h +++ b/src/connection.h @@ -117,7 +117,10 @@ class SendspinConnection : public std::enable_shared_from_thisshared_from_this(); const int64_t retry_time_us = platform_time_us() + HELLO_INITIAL_DELAY_US; @@ -380,6 +381,8 @@ void ConnectionManager::initiate_hello(SendspinConnection* conn) { void ConnectionManager::remove_hello_retry(SendspinConnection* conn) { // Note: caller must hold conn_ptr_mutex_ + // Safe to call unconditionally: a no-op if conn never had a retry entry (e.g. a connection + // rejected before initiate_hello, or one whose hello already sent and cleared its entry). for (auto it = this->hello_retries_.begin(); it != this->hello_retries_.end();) { if (it->conn.get() == conn) { it = this->hello_retries_.erase(it); @@ -504,6 +507,10 @@ void ConnectionManager::complete_handoff(bool switch_to_new) { this->client_->cleanup_connection_state(); auto old_current = std::move(this->current_connection_); this->current_connection_ = std::move(this->pending_connection_); + // Drop any retry entry now that this connection is leaving the managed set, mirroring + // on_connection_lost. (loop() also prunes orphaned entries, but removing it here closes + // the window where a retry could fire against an already-handed-off connection.) + this->remove_hello_retry(old_current.get()); this->disconnect_and_release(std::move(old_current), SendspinGoodbyeReason::ANOTHER_SERVER); } else { @@ -512,6 +519,7 @@ void ConnectionManager::complete_handoff(bool switch_to_new) { } else { SS_LOGD(TAG, "Completing handoff: keeping current server"); if (this->pending_connection_ != nullptr) { + this->remove_hello_retry(this->pending_connection_.get()); this->disconnect_and_release(std::move(this->pending_connection_), SendspinGoodbyeReason::ANOTHER_SERVER); } diff --git a/src/esp/server_connection.cpp b/src/esp/server_connection.cpp index 3946b3a..256b939 100644 --- a/src/esp/server_connection.cpp +++ b/src/esp/server_connection.cpp @@ -309,10 +309,12 @@ void SendspinServerConnection::async_send_text(void* arg) { // work (or null if it has since been destroyed), so a recycled sockfd can never redirect the // frame onto a different connection -- the lifetime-safe replacement for the old raw `this` // pointer that caused the use-after-free crash. Non-handshake frames are additionally gated on - // client_hello_sent_ so nothing can precede the client/hello. When the conn is gone or the - // frame is gated the completion callback is intentionally not fired; callers that key off - // completion (e.g., the hello in ConnectionManager::send_hello_message) always pass - // allow_before_hello, so their callback still runs. + // client_hello_sent_ so nothing can precede the client/hello; allow_before_hello opts the hello + // and goodbye out of that gate. The completion callback fires only when the frame is actually + // sent: it is skipped both when the gate blocks the frame AND when the connection is already + // gone (lock() is null) -- allow_before_hello bypasses the gate but not the conn-alive + // requirement, so callers must not rely on the callback as an unconditional "send finished" + // signal. auto conn = resp_arg->conn.lock(); if (conn && conn->is_connected() && (resp_arg->allow_before_hello || conn->client_hello_sent_)) { diff --git a/src/esp/server_connection.h b/src/esp/server_connection.h index 1a8e634..6c86e30 100644 --- a/src/esp/server_connection.h +++ b/src/esp/server_connection.h @@ -135,9 +135,10 @@ class SendspinServerConnection : public SendspinConnection { /// /// Captures the client_transmitted timestamp inside the worker (just before /// `httpd_ws_send_frame_async`), serializes the JSON, then sends. - /// @param arg A heap-allocated `SessionLookup` carrying the server handle and sockfd. The - /// worker looks the connection up via `httpd_sess_get_ctx`, sends the frame if the - /// session is still alive, and frees the arg on return. + /// @param arg A heap-allocated `SessionLookup` holding a `weak_ptr` to the originating + /// connection. The worker `lock()`s it: if the connection is still alive (and has + /// sent its client/hello) it sends the frame, otherwise it no-ops. The arg is + /// destroyed and freed before the worker returns. static void async_send_time_text(void* arg); // Pointer fields From 91b19a4ffaeef8a5066ac0523cb145326e3d3886 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 21 May 2026 20:59:57 -0400 Subject: [PATCH 4/5] Clear hello_retries_ under conn_ptr_mutex_ in destructor Every other access to hello_retries_ holds conn_ptr_mutex_; the destructor cleared it outside the lock. Move the clear into the same locked block as the connection-pointer resets so the locking discipline is uniform. --- src/connection_manager.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/connection_manager.cpp b/src/connection_manager.cpp index 645ba46..694c7e0 100644 --- a/src/connection_manager.cpp +++ b/src/connection_manager.cpp @@ -39,11 +39,11 @@ static constexpr int64_t HELLO_INITIAL_DELAY_US = HELLO_INITIAL_DELAY_MS * US_PE ConnectionManager::ConnectionManager(SendspinClient* client) : client_(client) {} ConnectionManager::~ConnectionManager() { - { - std::lock_guard lock(this->conn_ptr_mutex_); - this->current_connection_.reset(); - this->pending_connection_.reset(); - } + std::lock_guard lock(this->conn_ptr_mutex_); + this->current_connection_.reset(); + this->pending_connection_.reset(); + // Clear under the same lock that guards every other access to hello_retries_, keeping the + // locking discipline uniform. this->hello_retries_.clear(); } From 3f5564113df04d18c44c4e7f1391ee7decbcd463 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 21 May 2026 21:00:49 -0400 Subject: [PATCH 5/5] Make handshake flags atomic client_hello_sent_ and server_hello_received_ are written on network threads (the send-completion callback and the disconnect handlers) and read from the main loop via is_handshake_complete(), so as plain bools the cross-thread access was a data race. Make them std::atomic, matching the existing treatment of message_dispatch_enabled_. No call sites change: atomic loads and stores are implicit in the existing reads and assignments. --- src/connection.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/connection.h b/src/connection.h index e5fedb6..904243b 100644 --- a/src/connection.h +++ b/src/connection.h @@ -371,8 +371,10 @@ class SendspinConnection : public std::enable_shared_from_this client_hello_sent_{false}; /// true if the current message being assembled is text, false if binary /// Needed because WebSocket continuation frames do not carry the original frame type @@ -384,7 +386,10 @@ class SendspinConnection : public std::enable_shared_from_this server_hello_received_{false}; /// Memory placement preference for `websocket_payload_` allocations (ESP-IDF only). MemoryLocation websocket_payload_location_{MemoryLocation::PREFER_EXTERNAL};