From d0599cb2330b538d97a4d4318d03a68ec86431d3 Mon Sep 17 00:00:00 2001
From: Vadim Skipin <vadim.skipin@clickhouse.com>
Date: Fri, 8 May 2026 05:44:33 +0000
Subject: [PATCH 1/4] Fix jemalloc build on aarch64

---
 contrib/jemalloc-cmake/CMakeLists.txt                    | 6 +++++-
 .../include/jemalloc/internal/jemalloc_internal_defs.h   | 9 ++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt
index 8db14ea..b5bce78 100644
--- a/contrib/jemalloc-cmake/CMakeLists.txt
+++ b/contrib/jemalloc-cmake/CMakeLists.txt
@@ -78,4 +78,8 @@ target_compile_options(_jemalloc PRIVATE
     -Wno-redundant-decls
     -Wno-ignored-attributes)
 
-add_library(jemalloc::jemalloc ALIAS _jemalloc)
+# jemalloc overrides malloc/free/etc.; without --whole-archive the linker only
+# pulls objects that satisfy unresolved symbols, leaving glibc malloc in place.
+add_library(_jemalloc_whole INTERFACE)
+target_link_libraries(_jemalloc_whole INTERFACE "$<LINK_LIBRARY:WHOLE_ARCHIVE,_jemalloc>")
+add_library(jemalloc::jemalloc ALIAS _jemalloc_whole)
diff --git a/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_internal_defs.h b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_internal_defs.h
index 5f69b00..1a2fee4 100644
--- a/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_internal_defs.h
+++ b/contrib/jemalloc-cmake/include/jemalloc/internal/jemalloc_internal_defs.h
@@ -33,9 +33,16 @@
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
  * order to yield to another virtual CPU.
  */
+#if defined(__x86_64__) || defined(__i386__)
 #define CPU_SPINWAIT __asm__ volatile("pause")
-/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
 #define HAVE_CPU_SPINWAIT 1
+#elif defined(__aarch64__)
+#define CPU_SPINWAIT __asm__ volatile("isb")
+#define HAVE_CPU_SPINWAIT 1
+#else
+/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
+#define HAVE_CPU_SPINWAIT 0
+#endif
 
 /*
  * Number of significant bits in virtual addresses.  This may be less than the

From 7ef7f042e92efbe31ce42ccd83c03264198497de Mon Sep 17 00:00:00 2001
From: Vadim Skipin <vadim.skipin@clickhouse.com>
Date: Thu, 7 May 2026 20:50:48 +0000
Subject: [PATCH 2/4] Add http-perf server support

---
 bb                      |  99 ++++++++--
 src/perf/fiber-http.cpp |  60 +++++-
 src/perf/fiber-http.h   |  74 ++++++--
 src/perf/file-perf.cpp  |   2 +-
 src/perf/http-perf.cpp  | 391 +++++++++++++++++++++++++++++++++++++++-
 src/perf/net-perf.cpp   |   4 +-
 6 files changed, 582 insertions(+), 48 deletions(-)

diff --git a/bb b/bb
index 4f63952..16d3e6d 100755
--- a/bb
+++ b/bb
@@ -10,6 +10,7 @@ import re
 import resource
 import shutil
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -66,6 +67,17 @@ def start_process(*args: str, **kwargs: Any) -> subprocess.Popen[str]:
     return proc
 
 
+def wait_for_tcp_port(host: str, port: int, timeout: float = 5.0) -> None:
+    deadline = time.monotonic() + timeout  # monotonic: immune to wall-clock jumps
+    while time.monotonic() < deadline:
+        try:
+            with socket.create_connection((host, port), timeout=0.1):
+                return  # a listener accepted the connection: the port is ready
+        except OSError:
+            time.sleep(0.05)  # connect failed: back off briefly, then retry
+    raise TimeoutError(f"{host}:{port} not ready within {timeout}s")
+
+
 def cmd_clean() -> None:
     build_dir = os.path.join(ROOT, "build")
     if os.path.exists(build_dir):
@@ -962,6 +974,7 @@ class HttpPerfParams:
     flamegraph: bool = False
     print_counters: bool = False
     timeout: int = 180
+    nginx: bool = False
 
 
 _HP_HEADERS: list[str] = [
@@ -994,19 +1007,9 @@ http {{
 """
 
 
-def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
-    mode = "threads" if params.threads else "fibers"
-    print()
-    print(f"## http-perf ({mode}) -- HTTP/1.1 GET")
-    print()
-    print(
-        f"nginx loopback, duration={params.duration}, warmup={params.warmup}, delay={params.delay}"
-    )
-    print()
-
-    server_cpus, client_cpus = _cpu_split()
-    half = (os.cpu_count() or 2) // 2
-
+def _start_nginx_server(
+    params: HttpPerfParams, server_cpus: str, workers: int
+) -> subprocess.Popen[str]:
     delay_s = _parse_duration_s(params.delay)
     if delay_s > 0:
         load_modules = "load_module modules/ndk_http_module.so;\nload_module modules/ngx_http_lua_module.so;\n"
@@ -1023,14 +1026,14 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
         f.write(
             _NGINX_CONF.format(
                 port=params.port,
-                workers=half,
+                workers=workers,
                 handler=handler,
                 load_modules=load_modules,
                 pid_file=os.path.join(TMP_DIR, "http-perf-nginx.pid"),
             )
         )
 
-    nginx = start_process(
+    return start_process(
         "taskset",
         "-c",
         server_cpus,
@@ -1041,6 +1044,46 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
         "daemon off;",
     )
 
+
+def _start_internal_server(
+    preset: str, params: HttpPerfParams, server_cpus: str
+) -> subprocess.Popen[str]:
+    """Start `http-perf server` for *preset* under taskset and return the process."""
+    # taskset -c restricts the server to the CPUs listed in server_cpus.
+    command = ["taskset", "-c", server_cpus]
+    command.append(os.path.join(ROOT, f"build/{preset}/bin/http-perf"))
+    command.extend(["server", "--port", str(params.port)])
+
+    # Forward --delay only when the configured delay is non-zero.
+    wants_delay = _parse_duration_s(params.delay) > 0
+    if wants_delay:
+        command.extend(["--delay", params.delay])
+
+    # Pass --verbose when this script itself logs at DEBUG level.
+    if log.isEnabledFor(logging.DEBUG):
+        command.append("--verbose")
+    return start_process(*command)
+
+
+def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
+    mode = "threads" if params.threads else "fibers"
+    server_kind = "nginx" if params.nginx else "internal"
+    print()
+    print(f"## http-perf (server={server_kind}, client={mode}) -- HTTP/1.1 GET")
+    print()
+    print(f"duration={params.duration}, warmup={params.warmup}, delay={params.delay}")
+    print()
+
+    server_cpus, client_cpus = _cpu_split()
+    workers = (os.cpu_count() or 2) // 2
+
+    if params.nginx:
+        server = _start_nginx_server(params, server_cpus, workers)
+    else:
+        server = _start_internal_server(preset, params, server_cpus)
+
+    wait_for_tcp_port(params.host, params.port)
+
     http_perf = os.path.join(ROOT, f"build/{preset}/bin/http-perf")
     threads_flag = ["--threads"] if params.threads else []
     verbose_flag = ["--verbose"] if log.isEnabledFor(logging.DEBUG) else []
@@ -1112,8 +1155,8 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
                 if params.print_counters:
                     _print_counters(data)
     finally:
-        nginx.terminate()
-        nginx.wait()
+        server.terminate()
+        server.wait()
 
 
 def _ensure_minio() -> tuple[str, str]:
@@ -1482,9 +1525,16 @@ def _build_parser() -> argparse.ArgumentParser:
         "--net-asio", action="store_true", help="run net-perf-asio"
     )
     perf_parser.add_argument("--file", action="store_true", help="run file-perf")
-    perf_parser.add_argument("--http", action="store_true", help="run http-perf")
     perf_parser.add_argument(
-        "--http-threads", action="store_true", help="run http-perf (threads)"
+        "--http", action="store_true", help="run http-perf (internal server, fibers)"
+    )
+    perf_parser.add_argument(
+        "--http-threads",
+        action="store_true",
+        help="run http-perf (internal server, thread client)",
+    )
+    perf_parser.add_argument(
+        "--http-nginx", action="store_true", help="run http-perf against nginx"
     )
     perf_parser.add_argument("--fio", action="store_true", help="run fio comparison")
     perf_parser.add_argument(
@@ -1783,7 +1833,13 @@ def _build_parser() -> argparse.ArgumentParser:
         dest="http_delay",
         default=http_params.delay,
         metavar="DURATION",
-        help="server-side nginx delay per request (e.g. 1ms, 100us)",
+        help="server-side response delay per request (e.g. 1ms, 100us)",
+    )
+    http_perf_parser.add_argument(
+        "--nginx",
+        dest="http_nginx",
+        action="store_true",
+        help="run client against nginx instead of the internal server",
     )
     http_perf_parser.add_argument(
         "--threads",
@@ -2009,6 +2065,9 @@ def main() -> None:
         if args.http_threads or args.all:
             cmd_build(preset, ["http-perf"])
             cmd_http_perf(preset, replace(http_params, threads=True))
+        if args.http_nginx or args.all:
+            cmd_build(preset, ["http-perf"])
+            cmd_http_perf(preset, replace(http_params, nginx=True))
         s3_params = S3PerfParams(
             numjobs=[1, 16],
             iodepth=[1, 64],
diff --git a/src/perf/fiber-http.cpp b/src/perf/fiber-http.cpp
index 5997ad0..75ca88a 100644
--- a/src/perf/fiber-http.cpp
+++ b/src/perf/fiber-http.cpp
@@ -7,14 +7,27 @@
 
 #include <Poco/Net/StreamSocket.h>
 
+#include <cerrno>
+
 #include <poll.h>
 #include <unistd.h>
 
 #include <sys/socket.h>
 
-// Use silk::FiberScheduler::read/write (io_uring) instead of recv/send + poll.
+// Use FiberScheduler::read/write (io_uring) instead of recv/send + poll.
 #define USE_IO_URING_RW
 
+FiberSocketImpl::FiberSocketImpl(int sockfd)
+    : StreamSocketImpl(sockfd)
+{
+    // SocketImpl(sockfd) initializes _blocking=true even though the fd is
+    // already non-blocking; sync the flag so getBlocking() reflects reality.
+    setBlocking(false);
+    setNoDelay(true);
+    // Mirror the fd into atomicFd; shutdown() reads that copy instead of sockfd().
+    atomicFd.store(sockfd, std::memory_order_relaxed);
+}
+
 void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address)
 {
     connect(address, Poco::Timespan(-1));
@@ -26,6 +39,8 @@ void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address, const Po
     setNoDelay(true);
     setBlocking(false);
 
+    atomicFd.store(sockfd(), std::memory_order_relaxed);
+
     int r = ::connect(sockfd(), address.addr(), address.length());
     if (r < 0)
     {
@@ -69,8 +84,6 @@ void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address, const Po
 
 bool FiberSocketImpl::poll(const Poco::Timespan & timeout, int mode)
 {
-    ASSERT(!getBlocking());
-
     uint32_t events = 0;
     if (mode & SELECT_READ)
     {
@@ -116,7 +129,6 @@ bool FiberSocketImpl::poll(const Poco::Timespan & timeout, int mode)
 int FiberSocketImpl::sendBytes(const void * buffer, int length, int flags)
 {
     UNUSED(flags);
-    ASSERT(!getBlocking());
 
     int total = 0;
     const char * ptr = static_cast<const char *>(buffer);
@@ -153,10 +165,48 @@ int FiberSocketImpl::sendBytes(const void * buffer, int length, int flags)
     return total;
 }
 
+void FiberSocketImpl::shutdown()
+{
+    int fd = atomicFd.load(std::memory_order_relaxed);
+    if (fd < 0) // no fd has been mirrored yet: nothing to shut down
+    {
+        return;
+    }
+    // Act on the mirrored fd rather than sockfd(), and stop both directions.
+    int r = ::shutdown(fd, SHUT_RDWR);
+    if (r < 0)
+    {
+        r = errno;
+        error(r, "shutdown"); // presumably throws the matching Poco exception -- confirm
+    }
+}
+
+Poco::Net::SocketImpl * FiberServerSocketImpl::acceptConnection(Poco::Net::SocketAddress & clientAddr)
+{
+    silk::FiberScheduler::IoFuture pollFuture;
+    silk::FiberScheduler::poll(sockfd(), POLLIN, nullptr, &pollFuture);
+    int r = pollFuture.wait(); // wait until the scheduler reports the POLLIN poll on the listen fd
+    if (r)
+    {
+        error(r, "accept poll");
+    }
+    // Take the pending connection; SOCK_NONBLOCK because FiberSocketImpl(int) expects a non-blocking fd.
+    sockaddr_storage storage;
+    socklen_t addrLen = sizeof(storage);
+    int fd = ::accept4(sockfd(), reinterpret_cast<sockaddr *>(&storage), &addrLen, SOCK_NONBLOCK | SOCK_CLOEXEC);
+    if (fd < 0) // NOTE(review): also hit by transient EAGAIN/ECONNABORTED; not retried -- confirm intended
+    {
+        r = errno;
+        error(r, "accept");
+    }
+    // Report the peer address to the caller and wrap the new fd.
+    clientAddr = Poco::Net::SocketAddress(reinterpret_cast<sockaddr *>(&storage), addrLen);
+    return new FiberSocketImpl(fd);
+}
+
 int FiberSocketImpl::receiveBytes(void * buffer, int length, int flags)
 {
     UNUSED(flags);
-    ASSERT(!getBlocking());
 
 #if defined(USE_IO_URING_RW)
     uint64_t bytesRead = 0;
diff --git a/src/perf/fiber-http.h b/src/perf/fiber-http.h
index ac58de6..d9cb8fd 100644
--- a/src/perf/fiber-http.h
+++ b/src/perf/fiber-http.h
@@ -1,32 +1,49 @@
 #pragma once
 
 #include <Poco/Net/HTTPClientSession.h>
+#include <Poco/Net/ServerSocket.h>
+#include <Poco/Net/ServerSocketImpl.h>
 #include <Poco/Net/StreamSocketImpl.h>
 
+#include <atomic>
 #include <cstdint>
 #include <string>
 
-//
-// FiberSocketImpl - fiber-aware StreamSocketImpl backed by silk::FiberScheduler.
-//
-// Overrides connect/poll/sendBytes/receiveBytes to suspend the calling fiber
-// during I/O instead of blocking the OS thread.
-//
-
+/**
+ * Fiber-aware StreamSocketImpl backed by silk::FiberScheduler.
+ *
+ * Overrides connect/poll/sendBytes/receiveBytes to suspend the calling fiber
+ * during I/O instead of blocking the OS thread.
+ */
 class FiberSocketImpl final : public Poco::Net::StreamSocketImpl
 {
 public:
+    FiberSocketImpl() = default;
+
+    /** Wrap an already-accepted fd (must be non-blocking). */
+    explicit FiberSocketImpl(int sockfd);
+
     void connect(const Poco::Net::SocketAddress & address) override;
     void connect(const Poco::Net::SocketAddress & address, const Poco::Timespan & timeout) override;
     bool poll(const Poco::Timespan & timeout, int mode) override;
     int sendBytes(const void * buffer, int length, int flags) override;
     int receiveBytes(void * buffer, int length, int flags) override;
-};
 
-//
-// FiberHTTPClientSession - HTTPClientSession that uses FiberSocketImpl.
-//
+    /**
+     * Race-free shutdown for cross-thread teardown: SocketImpl::_sockfd is
+     * written by Poco's close() inside the fiber's I/O error path, so reading
+     * it from another thread (as the base shutdown() does) races. We mirror
+     * the fd into an atomic on init and ::shutdown that copy directly.
+     */
+    void shutdown() override;
+
+private:
+    std::atomic<int> atomicFd{-1};
+};
 
+/**
+ * HTTPClientSession that uses FiberSocketImpl.
+ */
 class FiberHTTPClientSession : public Poco::Net::HTTPClientSession
 {
 public:
@@ -36,3 +53,38 @@ class FiberHTTPClientSession : public Poco::Net::HTTPClientSession
         attachSocket(new FiberSocketImpl());
     }
 };
+
+/**
+ * Fiber-aware ServerSocketImpl. acceptConnection waits for a pending connection
+ * via silk::FiberScheduler::poll, then accepts it as a FiberSocketImpl-backed
+ * StreamSocket.
+ */
+class FiberServerSocketImpl final : public Poco::Net::ServerSocketImpl
+{
+public:
+    Poco::Net::SocketImpl * acceptConnection(Poco::Net::SocketAddress & clientAddr) override;
+};
+
+/**
+ * ServerSocket that uses FiberServerSocketImpl.
+ */
+class FiberServerSocket : public Poco::Net::ServerSocket
+{
+public:
+    FiberServerSocket()
+        : Poco::Net::ServerSocket(new FiberServerSocketImpl(), true)
+    {
+    }
+    /** Bind to the wildcard address on `port`, listen, and switch to non-blocking mode. */
+    explicit FiberServerSocket(uint16_t port, int backlog = 64)
+        : Poco::Net::ServerSocket(new FiberServerSocketImpl(), true)
+    {
+        bind(Poco::Net::SocketAddress(Poco::Net::IPAddress(), port), true);
+        listen(backlog);
+        // FiberServerSocketImpl::acceptConnection requires non-blocking mode
+        // so accept4 returns immediately after silk's poll fires.
+        setBlocking(false);
+    }
+
+    void shutdown() { impl()->shutdown(); } // forwards to the underlying SocketImpl
+};
diff --git a/src/perf/file-perf.cpp b/src/perf/file-perf.cpp
index 62e8dbc..78ddec4 100644
--- a/src/perf/file-perf.cpp
+++ b/src/perf/file-perf.cpp
@@ -167,7 +167,7 @@ class Benchmark
     void submit(Job * job, Slot * slot);
 
     //
-    // silk::Fiber main functions.
+    // Fiber main functions.
     //
 
     struct WorkerFiberParams
diff --git a/src/perf/http-perf.cpp b/src/perf/http-perf.cpp
index 90c2599..eb2444b 100644
--- a/src/perf/http-perf.cpp
+++ b/src/perf/http-perf.cpp
@@ -3,8 +3,10 @@
 
 #include <silk/fibers/fiber.h>
 #include <silk/fibers/future.h>
+#include <silk/fibers/mutex.h>
 #include <silk/util/assert.h>
 #include <silk/util/init.h>
+#include <silk/util/list.h>
 #include <silk/util/logger.h>
 #include <silk/util/perf.h>
 #include <silk/util/platform.h>
@@ -12,11 +14,21 @@
 
 #include <Poco/Net/HTTPClientSession.h>
 #include <Poco/Net/HTTPRequest.h>
+#include <Poco/Net/HTTPRequestHandler.h>
+#include <Poco/Net/HTTPRequestHandlerFactory.h>
 #include <Poco/Net/HTTPResponse.h>
+#include <Poco/Net/HTTPServer.h>
+#include <Poco/Net/HTTPServerConnection.h>
+#include <Poco/Net/HTTPServerParams.h>
+#include <Poco/Net/HTTPServerRequest.h>
+#include <Poco/Net/HTTPServerResponse.h>
 #include <Poco/Net/NetException.h>
+#include <Poco/Net/ServerSocket.h>
+#include <Poco/Net/StreamSocket.h>
 #include <boost/program_options.hpp>
 
 #include <atomic>
+#include <chrono>
 #include <cmath>
 #include <csignal>
 #include <cstdint>
@@ -30,12 +42,10 @@
 #include <thread>
 #include <vector>
 
-#include <poll.h>
 #include <pthread.h>
-#include <unistd.h>
 
 //
-// Benchmark
+// Client
 //
 
 struct ClientConfig
@@ -74,7 +84,7 @@ class Client
     void runLoop(Connection * connection) noexcept;
 
     //
-    // silk::Fiber main functions.
+    // Fiber main functions.
     //
 
     struct FiberParams
@@ -139,13 +149,13 @@ void Client::stop()
     {
         try
         {
-            conn.session->abort();
+            conn.session->socket().shutdown();
         }
         catch (const Poco::Exception & e)
         {
             if (!isExpectedShutdown(e.code()))
             {
-                LOG_ERROR("abort failed: {}", e.displayText());
+                LOG_ERROR("shutdown failed: {}", e.displayText());
             }
         }
     }
@@ -325,6 +335,365 @@ static void runClient(int argc, char ** argv)
     silk::destroy();
 }
 
+//
+// Server
+//
+
+struct ServerConfig
+{
+    uint16_t port = 8080; // listen port (--port)
+    uint32_t maxQueued = 0; // --queued; 0 is replaced by 4 * available CPUs in runServer
+    uint64_t delayNs = 0; // per-request response delay in nanoseconds, parsed from --delay
+    bool useThreads = false; // --threads: serve with Poco's HTTPServer instead of fibers
+};
+
+struct EchoHandlerConfig
+{
+    uint64_t delayNs = 0; // delay before each response, in nanoseconds; 0 disables it
+    bool fiberSleep = false; // true: silk::FiberScheduler::sleep; false: std::this_thread::sleep_for
+};
+
+class EchoHandler final : public Poco::Net::HTTPRequestHandler
+{
+public:
+    explicit EchoHandler(const EchoHandlerConfig & cfg)
+        : cfg(cfg)
+    {
+    }
+    /** Optionally delay, then answer 200 OK with an empty body; the request itself is ignored. */
+    void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override
+    {
+        UNUSED(request);
+        // Configured delay: fiber-scheduler sleep, or a blocking thread sleep when fiberSleep is off.
+        if (cfg.delayNs)
+        {
+            if (cfg.fiberSleep)
+            {
+                silk::FiberScheduler::sleep(cfg.delayNs);
+            }
+            else
+            {
+                std::this_thread::sleep_for(std::chrono::nanoseconds(cfg.delayNs));
+            }
+        }
+
+        response.setStatus(Poco::Net::HTTPResponse::HTTP_OK);
+        response.setContentLength(0);
+        response.send();
+    }
+
+private:
+    const EchoHandlerConfig & cfg; // borrowed, not copied: the referenced config must outlive this handler
+};
+
+class EchoHandlerFactory final : public Poco::Net::HTTPRequestHandlerFactory
+{
+public:
+    explicit EchoHandlerFactory(const EchoHandlerConfig & cfg)
+        : cfg(cfg)
+    {
+    }
+    /** Returns a new EchoHandler that references this factory's cfg; the request is ignored. */
+    Poco::Net::HTTPRequestHandler * createRequestHandler(const Poco::Net::HTTPServerRequest & request) override
+    {
+        UNUSED(request);
+        return new EchoHandler(cfg);
+    }
+
+private:
+    EchoHandlerConfig cfg; // owned copy of the config passed to the constructor
+};
+
+/**
+ * One accept fiber, one fiber per connection.
+ *
+ * Per-connection logic is delegated to Poco::Net::HTTPServerConnection over a
+ * FiberSocketImpl-backed StreamSocket so all read/write I/O suspends fibers
+ * instead of blocking threads.
+ */
+class FiberHTTPServer
+{
+public:
+    FiberHTTPServer(Poco::Net::HTTPRequestHandlerFactory::Ptr factory, FiberServerSocket socket, Poco::Net::HTTPServerParams::Ptr params)
+        : factory(std::move(factory))
+        , params(std::move(params))
+        , socket(std::move(socket))
+    {
+    }
+
+    void start(); // spawns the accept fiber
+    void stop(); // shuts down the listener and all tracked connections, then joins their fibers
+
+private:
+    struct Conn // one per accepted connection; linked into conns
+    {
+        silk::ListEntry listEntry;
+        Poco::Net::StreamSocket socket; // kept so stop() can shut the connection down
+        silk::FiberFuture future; // completion of the connection's fiber; waited on in stop()
+    };
+
+    //
+    // Fiber main functions.
+    //
+
+    struct AcceptFiberParams
+    {
+        FiberHTTPServer * server;
+    };
+    static int acceptFiberMain(AcceptFiberParams * params) noexcept
+    {
+        params->server->acceptLoop();
+        return 0;
+    }
+
+    struct ConnFiberParams
+    {
+        FiberHTTPServer * server;
+        Poco::Net::StreamSocket socket;
+    };
+    static int connFiberMain(ConnFiberParams * params) noexcept
+    {
+        params->server->connectionLoop(params->socket);
+        return 0;
+    }
+
+    //
+    // Helpers.
+    //
+
+    void acceptLoop() noexcept;
+    void connectionLoop(Poco::Net::StreamSocket socket) noexcept;
+
+    //
+    // State.
+    //
+
+    Poco::Net::HTTPRequestHandlerFactory::Ptr factory;
+    Poco::Net::HTTPServerParams::Ptr params;
+
+    FiberServerSocket socket;
+    std::atomic<bool> stopping{false}; // set by stop(); read by the accept and connection loops
+    silk::FiberFuture acceptFuture;
+
+    silk::FiberMutex connsMutex; // held around push/remove/iteration of conns
+    silk::List<Conn, &Conn::listEntry> conns; // NOTE(review): entries of finished connections stay until stop()
+};
+
+void FiberHTTPServer::start()
+{
+    int r = silk::FiberScheduler::run(acceptFiberMain, AcceptFiberParams{this}, &acceptFuture);
+    ASSERT(r == 0, "spawn accept fiber: {}", std::strerror(r)); // failing to spawn the accept fiber is fatal
+}
+
+void FiberHTTPServer::stop()
+{
+    stopping.store(true, std::memory_order_relaxed);
+
+    // shutdown wakes a pending io_uring poll on the listen socket via POLLHUP;
+    // close() alone is not guaranteed to deliver a CQE to the accept fiber.
+    try
+    {
+        socket.shutdown();
+    }
+    catch (const Poco::Exception & e)
+    {
+        if (!isExpectedShutdown(e.code()))
+        {
+            LOG_ERROR("shutdown failed: {}", e.displayText());
+        }
+    }
+    // Join the accept fiber first: once it has exited, nothing adds to conns any more.
+    int r = acceptFuture.wait();
+    ASSERT(r == 0, "accept fiber: {}", std::strerror(r));
+
+    socket.close();
+    // Shut down every tracked connection (presumably to unblock its fiber's pending I/O).
+    {
+        std::lock_guard lock(connsMutex); // NOTE(review): runServer calls stop() off-fiber; confirm FiberMutex allows that
+        for (Conn * c = conns.front(); c; c = conns.next(c))
+        {
+            try
+            {
+                c->socket.shutdown();
+            }
+            catch (const Poco::Exception & e)
+            {
+                if (!isExpectedShutdown(e.code()))
+                {
+                    LOG_ERROR("shutdown failed: {}", e.displayText());
+                }
+            }
+        }
+    }
+    // Join each connection fiber, then free its entry; done without connsMutex, the accept fiber is gone.
+    while (Conn * c = conns.pop_front())
+    {
+        c->future.wait();
+        delete c;
+    }
+}
+
+void FiberHTTPServer::acceptLoop() noexcept
+{
+    while (!stopping.load(std::memory_order_relaxed))
+    {
+        Poco::Net::StreamSocket clientSocket;
+        try
+        {
+            clientSocket = socket.acceptConnection();
+        }
+        catch (const Poco::Exception & e)
+        {
+            if (!stopping.load(std::memory_order_relaxed) && !isExpectedShutdown(e.code()))
+            {
+                LOG_ERROR("accept failed: {}", e.displayText());
+            }
+            return; // any accept failure ends the accept loop
+        }
+        // Track the connection in conns (under connsMutex) before its fiber is spawned.
+        Conn * conn = new Conn();
+        conn->socket = clientSocket;
+        {
+            std::lock_guard lock(connsMutex);
+            conns.push_back(conn); // NOTE(review): not removed when the connection ends; freed only in stop()
+        }
+        // conn->future receives the fiber's completion; stop() waits on it.
+        int r = silk::FiberScheduler::run(connFiberMain, ConnFiberParams{this, clientSocket}, &conn->future);
+        if (r != 0)
+        {
+            LOG_ERROR("spawn conn fiber: {}", std::strerror(r));
+            {
+                std::lock_guard lock(connsMutex);
+                conns.remove(conn);
+            }
+            delete conn;
+            return; // a spawn failure also ends the accept loop
+        }
+    }
+}
+
+void FiberHTTPServer::connectionLoop(Poco::Net::StreamSocket socket) noexcept
+{
+    try
+    {
+        Poco::Net::HTTPServerConnection conn(socket, params, factory);
+        conn.run(); // all per-connection HTTP handling is delegated to Poco
+    }
+    catch (const Poco::Exception & e)
+    {
+        if (!stopping.load(std::memory_order_relaxed) && !isExpectedShutdown(e.code())) // stay quiet during stop()
+        {
+            LOG_ERROR("connection error: {}", e.displayText());
+        }
+    }
+}
+
+/**
+ * Server entry point: parse options, serve (threads or fibers) until a signal arrives, then stop.
+ */
+static void runServer(int argc, char ** argv)
+{
+    ServerConfig cfg;
+    std::string delayStr = "0";
+    bool verbose = false;
+
+    namespace po = boost::program_options;
+    po::options_description desc("http-perf server options");
+
+    // clang-format off
+    desc.add_options()
+        ("help,h",    "show this help")
+        ("port",      po::value(&cfg.port),       "listen port")
+        ("queued",    po::value(&cfg.maxQueued),  "max queued connections (default: 4 * available CPUs)")
+        ("delay",     po::value(&delayStr),       "per-request response delay (e.g. 5ms, 100us)")
+        ("threads",   po::bool_switch(&cfg.useThreads), "use OS threads instead of fibers")
+        ("verbose,v", po::bool_switch(&verbose),  "enable debug logging")
+        ;
+    // clang-format on
+
+    po::variables_map vm;
+    try
+    {
+        po::store(po::parse_command_line(argc, argv, desc), vm);
+        if (vm.count("help"))
+        {
+            std::cout << "usage: http-perf server [options]\n" << desc << "\n";
+            return;
+        }
+        po::notify(vm);
+        cfg.delayNs = parseDuration(delayStr); // NOTE(review): only po::error is caught below; confirm what parseDuration throws
+        if (verbose)
+        {
+            silk::Logger::setLevel(silk::LogLevel::DEBUG);
+        }
+    }
+    catch (const po::error & ex)
+    {
+        std::cerr << "error: " << ex.what() << "\n" << desc << "\n";
+        exit(1);
+    }
+    // maxQueued == 0 (the default) selects 4 * available CPUs.
+    uint32_t numProcessors = silk::getAvailableProcessorCount();
+    if (cfg.maxQueued == 0)
+    {
+        cfg.maxQueued = numProcessors * 4;
+    }
+    // The mask returned here is what sigwait() below waits on.
+    sigset_t mask = blockSignals();
+
+    silk::initialize();
+    if (!cfg.useThreads)
+    {
+        silk::FiberScheduler::initialize();
+    }
+
+    Poco::Net::HTTPServerParams::Ptr params = new Poco::Net::HTTPServerParams();
+    params->setMaxThreads(static_cast<int>(numProcessors));
+    params->setMaxQueued(static_cast<int>(cfg.maxQueued));
+    params->setKeepAlive(true);
+
+    EchoHandlerConfig handlerCfg{.delayNs = cfg.delayNs, .fiberSleep = !cfg.useThreads};
+    Poco::Net::HTTPRequestHandlerFactory::Ptr factory = new EchoHandlerFactory(handlerCfg);
+
+    LOG_INFO(
+        "starting {} http server on port {}, queued={}, delay={}",
+        cfg.useThreads ? "threaded" : "fiber",
+        cfg.port,
+        cfg.maxQueued,
+        formatDuration(cfg.delayNs));
+    // Threaded mode: stock Poco HTTPServer. Fiber mode: FiberHTTPServer over a FiberServerSocket.
+    if (cfg.useThreads)
+    {
+        Poco::Net::ServerSocket socket(cfg.port);
+        Poco::Net::HTTPServer server(factory, socket, params);
+        server.start();
+
+        int sig = 0;
+        sigwait(&mask, &sig);
+
+        LOG_INFO("stopping http server");
+        server.stopAll();
+    }
+    else
+    {
+        FiberServerSocket socket(cfg.port);
+        FiberHTTPServer server(factory, socket, params);
+        server.start();
+
+        int sig = 0;
+        sigwait(&mask, &sig);
+
+        LOG_INFO("stopping http server");
+        server.stop();
+    }
+
+    if (!cfg.useThreads)
+    {
+        silk::FiberScheduler::destroy();
+    }
+    silk::destroy();
+}
+
 /**
  * Main entry point.
  */
@@ -332,8 +701,8 @@ int main(int argc, char ** argv)
 {
     if (argc < 2)
     {
-        std::cerr << "usage: http-perf <client> [options]\n"
-                  << "       http-perf <client> --help\n";
+        std::cerr << "usage: http-perf <client|server> [options]\n"
+                  << "       http-perf <client|server> --help\n";
         return 1;
     }
 
@@ -342,10 +711,14 @@ int main(int argc, char ** argv)
     {
         runClient(argc - 1, argv + 1);
     }
+    else if (strcmp(subcmd, "server") == 0)
+    {
+        runServer(argc - 1, argv + 1);
+    }
     else
     {
         std::cerr << "unknown subcommand: " << subcmd << "\n"
-                  << "usage: http-perf <client> [options]\n";
+                  << "usage: http-perf <client|server> [options]\n";
         return 1;
     }
     return 0;
diff --git a/src/perf/net-perf.cpp b/src/perf/net-perf.cpp
index 271d0ca..920c8ae 100644
--- a/src/perf/net-perf.cpp
+++ b/src/perf/net-perf.cpp
@@ -384,7 +384,7 @@ class Server
     };
 
     //
-    // silk::Fiber main functions.
+    // Fiber main functions.
     //
 
     struct AcceptFiberParams
@@ -558,7 +558,7 @@ class Client
     };
 
     //
-    // silk::Fiber main functions.
+    // Fiber main functions.
     //
 
     struct ClientFiberParams

From c4cf6f34de15dc7be4a5449d2c0b91562140d06f Mon Sep 17 00:00:00 2001
From: Vadim Skipin <vadim.skipin@clickhouse.com>
Date: Thu, 7 May 2026 20:51:07 +0000
Subject: [PATCH 3/4] Run http-perf in CI

---
 .github/workflows/ci.yml | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2a302df..a53b2c1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,10 +40,12 @@ jobs:
           - name: coverage
             cmd: ./bb -b debug test --coverage
           - name: release
+            configure_cmd: ./bb -b release configure --build-poco --build-jemalloc
             cmd: ./bb -b release test
             bench_cmd: ./bb -b release bench
             perf_cmd: ./bb -v -b release perf
           - name: tsan
+            configure_cmd: ./bb -b release -s thread configure --build-poco
             cmd: ./bb -b release -s thread test
             bench_cmd: ./bb -b release -s thread bench
             perf_cmd: ./bb -v -b release -s thread perf
@@ -68,6 +70,14 @@ jobs:
             contrib/googletest \
             contrib/benchmark
 
+      - name: Init Poco submodule
+        if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
+        run: git submodule update --init --depth=1 contrib/poco
+
+      - name: Init jemalloc submodule
+        if: matrix.build.name == 'release'
+        run: git submodule update --init --depth=1 contrib/jemalloc
+
       - name: Init libc++ submodule (MSan only)
         if: matrix.build.name == 'msan'
         run: git submodule update --init --depth=1 contrib/llvm-project
@@ -101,7 +111,9 @@ jobs:
             ccache \
             libboost-context-dev \
             libboost-program-options-dev \
-            libbpf-dev
+            libbpf-dev \
+            libdouble-conversion-dev \
+            zlib1g-dev
 
       - name: Cache ccache
         uses: actions/cache@v5
@@ -112,6 +124,10 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-ccache-${{ matrix.arch.name }}-${{ matrix.build.name }}-
 
+      - name: Configure with Poco
+        if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
+        run: ${{ matrix.build.configure_cmd }}
+
       - name: Build and test
         run: ${{ matrix.build.cmd }}
 
@@ -121,7 +137,7 @@ jobs:
 
       - name: perf
         if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
-        run: ${{ matrix.build.perf_cmd }} --file --net
+        run: ${{ matrix.build.perf_cmd }} --file --net --http
 
       - name: Upload coverage report
         if: matrix.build.name == 'coverage' && matrix.arch.name == 'amd64'

From 8ca8197ba55b7fe3a1ab93dcd10082aef8e764cf Mon Sep 17 00:00:00 2001
From: Vadim Skipin <vadim.skipin@clickhouse.com>
Date: Fri, 8 May 2026 06:41:08 +0000
Subject: [PATCH 4/4] Add http-perf results

---
 docs/perf.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/perf.md b/docs/perf.md
index 83c902f..680922b 100644
--- a/docs/perf.md
+++ b/docs/perf.md
@@ -125,6 +125,23 @@ nginx `return 200` (Content-Length: 0), loopback, 10 s measurement, 2 s warmup.
 
 At 1 connection both modes are identical (~36-38k RPS, ~24-27 µs p50): baseline is Poco's HTTP parsing overhead. At higher concurrency both clients saturate nginx at ~1M RPS, so throughput is similar. The difference is latency: fiber p50 stays nearly flat across all concurrency levels (24-84 µs) while thread p50 grows linearly with thread count, reaching 10x worse at 1024 connections (878 µs vs 84 µs). The fiber scheduler multiplexes all connections across 16 scheduler threads with zero per-fiber context-switch cost; each additional thread adds OS scheduling overhead proportional to the total thread count.
 
+### Server: internal (silk fibers) vs nginx
+
+**Not a production HTTP server.** `http-perf server` is benchmark scaffolding: each accepted connection runs Poco's stock `HTTPServerConnection::run` on a fiber over `FiberSocketImpl`. Poco's HTTP server is allocation-heavy — `std::stringstream`-driven request/response parsing, per-request buffer churn even after our `MemoryPool` patches, virtual dispatch on every byte. Nobody should ship this; we use it because reusing Poco on both the client and the internal server exercises silk's I/O loop (accept fiber + per-conn fibers + io_uring read/write) without writing a new HTTP stack. The client is held constant across both servers in the table below; the server side differs in its I/O loop (silk vs nginx's tuned C event loop) and in its HTTP machinery (Poco's request parsing and response building vs nginx's), so the comparison is not purely about the I/O loop.
+
+| connections | server | RPS | avg | p50 | p95 | p99 | p99.9 |
+|---|---|---|---|---|---|---|---|
+| 1 | internal | 27k | 37 µs | 36 µs | 43 µs | 49 µs | 64 µs |
+| 256 | internal | 1023k | 250 µs | 152 µs | 1478 µs | 1960 µs | 2248 µs |
+| 512 | internal | 1011k | 506 µs | 88 µs | 5023 µs | 5146 µs | 5277 µs |
+| 1024 | internal | 964k | 1020 µs | 94 µs | 12445 µs | 16772 µs | 19661 µs |
+| 1 | nginx | 36k | 28 µs | 25 µs | 35 µs | 42 µs | 97 µs |
+| 256 | nginx | 1290k | 198 µs | 72 µs | 1700 µs | 2050 µs | 2137 µs |
+| 512 | nginx | 1248k | 410 µs | 63 µs | 4557 µs | 5738 µs | 5904 µs |
+| 1024 | nginx | 1254k | 816 µs | 58 µs | 9979 µs | 12599 µs | 13163 µs |
+
+The internal server lands at ~80% of nginx RPS at high concurrency (964–1023k vs 1248–1290k). The gap is Poco overhead, not silk overhead: nginx's `return 200` handler skips most of HTTP/1.1 parsing, while Poco constructs `HTTPServerRequestImpl`/`HTTPServerResponseImpl` plus heap-allocated stream buffers per request. At high concurrency the internal server's p50 is 25–80 µs higher than nginx's (88–152 µs vs 58–72 µs); tail latencies are dominated by client-side queuing in both cases. The takeaway is that silk's accept-fiber + per-connection-fiber I/O loop has negligible overhead on top of whatever HTTP machinery you put on it — to beat nginx you'd swap Poco for a hand-rolled state machine that allocates nothing per request, which is a different project.
+
 ### High-concurrency throughput (connections=10000, delay=10ms, duration=60s)
 
 | connections | mode | RPS | avg | p50 | p95 | p99 | p99.9 |