20 changes: 18 additions & 2 deletions .github/workflows/ci.yml
@@ -40,10 +40,12 @@ jobs:
- name: coverage
cmd: ./bb -b debug test --coverage
- name: release
configure_cmd: ./bb -b release configure --build-poco --build-jemalloc
cmd: ./bb -b release test
bench_cmd: ./bb -b release bench
perf_cmd: ./bb -v -b release perf
- name: tsan
configure_cmd: ./bb -b release -s thread configure --build-poco
cmd: ./bb -b release -s thread test
bench_cmd: ./bb -b release -s thread bench
perf_cmd: ./bb -v -b release -s thread perf
@@ -68,6 +70,14 @@ jobs:
contrib/googletest \
contrib/benchmark

- name: Init Poco submodule
if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
run: git submodule update --init --depth=1 contrib/poco

- name: Init jemalloc submodule
if: matrix.build.name == 'release'
run: git submodule update --init --depth=1 contrib/jemalloc

- name: Init libc++ submodule (MSan only)
if: matrix.build.name == 'msan'
run: git submodule update --init --depth=1 contrib/llvm-project
@@ -101,7 +111,9 @@ jobs:
ccache \
libboost-context-dev \
libboost-program-options-dev \
libbpf-dev
libbpf-dev \
libdouble-conversion-dev \
zlib1g-dev

- name: Cache ccache
uses: actions/cache@v5
@@ -112,6 +124,10 @@ jobs:
restore-keys: |
${{ runner.os }}-ccache-${{ matrix.arch.name }}-${{ matrix.build.name }}-

- name: Configure with Poco
if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
run: ${{ matrix.build.configure_cmd }}

- name: Build and test
run: ${{ matrix.build.cmd }}

@@ -121,7 +137,7 @@ jobs:

- name: perf
if: matrix.build.name == 'release' || matrix.build.name == 'tsan'
run: ${{ matrix.build.perf_cmd }} --file --net
run: ${{ matrix.build.perf_cmd }} --file --net --http

- name: Upload coverage report
if: matrix.build.name == 'coverage' && matrix.arch.name == 'amd64'
99 changes: 79 additions & 20 deletions bb
@@ -10,6 +10,7 @@ import re
import resource
import shutil
import signal
import socket
import subprocess
import sys
import tempfile
@@ -66,6 +67,17 @@ def start_process(*args: str, **kwargs: Any) -> subprocess.Popen[str]:
return proc


def wait_for_tcp_port(host: str, port: int, timeout: float = 5.0) -> None:
deadline = time.time() + timeout
while time.time() < deadline:
try:
with socket.create_connection((host, port), timeout=0.1):
return
except OSError:
time.sleep(0.05)
raise TimeoutError(f"{host}:{port} not ready within {timeout}s")


def cmd_clean() -> None:
build_dir = os.path.join(ROOT, "build")
if os.path.exists(build_dir):
@@ -962,6 +974,7 @@ class HttpPerfParams:
flamegraph: bool = False
print_counters: bool = False
timeout: int = 180
nginx: bool = False


_HP_HEADERS: list[str] = [
@@ -994,19 +1007,9 @@ http {{
"""


def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
mode = "threads" if params.threads else "fibers"
print()
print(f"## http-perf ({mode}) -- HTTP/1.1 GET")
print()
print(
f"nginx loopback, duration={params.duration}, warmup={params.warmup}, delay={params.delay}"
)
print()

server_cpus, client_cpus = _cpu_split()
half = (os.cpu_count() or 2) // 2

def _start_nginx_server(
params: HttpPerfParams, server_cpus: str, workers: int
) -> subprocess.Popen[str]:
delay_s = _parse_duration_s(params.delay)
if delay_s > 0:
load_modules = "load_module modules/ndk_http_module.so;\nload_module modules/ngx_http_lua_module.so;\n"
@@ -1023,14 +1026,14 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
f.write(
_NGINX_CONF.format(
port=params.port,
workers=half,
workers=workers,
handler=handler,
load_modules=load_modules,
pid_file=os.path.join(TMP_DIR, "http-perf-nginx.pid"),
)
)

nginx = start_process(
return start_process(
"taskset",
"-c",
server_cpus,
@@ -1041,6 +1044,46 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
"daemon off;",
)


def _start_internal_server(
preset: str, params: HttpPerfParams, server_cpus: str
) -> subprocess.Popen[str]:
http_perf = os.path.join(ROOT, f"build/{preset}/bin/http-perf")
args = [
"taskset",
"-c",
server_cpus,
http_perf,
"server",
"--port",
str(params.port),
]
if _parse_duration_s(params.delay) > 0:
args += ["--delay", params.delay]
if log.isEnabledFor(logging.DEBUG):
args += ["--verbose"]
return start_process(*args)


def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
mode = "threads" if params.threads else "fibers"
server_kind = "nginx" if params.nginx else "internal"
print()
print(f"## http-perf (server={server_kind}, client={mode}) -- HTTP/1.1 GET")
print()
print(f"duration={params.duration}, warmup={params.warmup}, delay={params.delay}")
print()

server_cpus, client_cpus = _cpu_split()
workers = (os.cpu_count() or 2) // 2

if params.nginx:
server = _start_nginx_server(params, server_cpus, workers)
else:
server = _start_internal_server(preset, params, server_cpus)

wait_for_tcp_port(params.host, params.port)

http_perf = os.path.join(ROOT, f"build/{preset}/bin/http-perf")
threads_flag = ["--threads"] if params.threads else []
verbose_flag = ["--verbose"] if log.isEnabledFor(logging.DEBUG) else []
@@ -1112,8 +1155,8 @@ def cmd_http_perf(preset: str, params: HttpPerfParams) -> None:
if params.print_counters:
_print_counters(data)
finally:
nginx.terminate()
nginx.wait()
server.terminate()
server.wait()


def _ensure_minio() -> tuple[str, str]:
@@ -1482,9 +1525,16 @@ def _build_parser() -> argparse.ArgumentParser:
"--net-asio", action="store_true", help="run net-perf-asio"
)
perf_parser.add_argument("--file", action="store_true", help="run file-perf")
perf_parser.add_argument("--http", action="store_true", help="run http-perf")
perf_parser.add_argument(
"--http-threads", action="store_true", help="run http-perf (threads)"
"--http", action="store_true", help="run http-perf (internal server, fibers)"
)
perf_parser.add_argument(
"--http-threads",
action="store_true",
help="run http-perf (internal server, thread client)",
)
perf_parser.add_argument(
"--http-nginx", action="store_true", help="run http-perf against nginx"
)
perf_parser.add_argument("--fio", action="store_true", help="run fio comparison")
perf_parser.add_argument(
@@ -1783,7 +1833,13 @@ def _build_parser() -> argparse.ArgumentParser:
dest="http_delay",
default=http_params.delay,
metavar="DURATION",
help="server-side nginx delay per request (e.g. 1ms, 100us)",
help="server-side response delay per request (e.g. 1ms, 100us)",
)
http_perf_parser.add_argument(
"--nginx",
dest="http_nginx",
action="store_true",
help="run client against nginx instead of the internal server",
)
http_perf_parser.add_argument(
"--threads",
@@ -2009,6 +2065,9 @@ def main() -> None:
if args.http_threads or args.all:
cmd_build(preset, ["http-perf"])
cmd_http_perf(preset, replace(http_params, threads=True))
if args.http_nginx or args.all:
cmd_build(preset, ["http-perf"])
cmd_http_perf(preset, replace(http_params, nginx=True))
s3_params = S3PerfParams(
numjobs=[1, 16],
iodepth=[1, 64],
6 changes: 5 additions & 1 deletion contrib/jemalloc-cmake/CMakeLists.txt
@@ -78,4 +78,8 @@ target_compile_options(_jemalloc PRIVATE
-Wno-redundant-decls
-Wno-ignored-attributes)

add_library(jemalloc::jemalloc ALIAS _jemalloc)
# jemalloc overrides malloc/free/etc.; without --whole-archive the linker only
# pulls objects that satisfy unresolved symbols, leaving glibc malloc in place.
add_library(_jemalloc_whole INTERFACE)
target_link_libraries(_jemalloc_whole INTERFACE "$<LINK_LIBRARY:WHOLE_ARCHIVE,_jemalloc>")
add_library(jemalloc::jemalloc ALIAS _jemalloc_whole)
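Note that the `$<LINK_LIBRARY:WHOLE_ARCHIVE,...>` generator expression requires CMake 3.24 or newer. For older CMake, a minimal fallback sketch is to spell the linker flags out by hand — the `_jemalloc` target name comes from the diff above, the flag spelling is an assumption and applies only to GNU-compatible linkers (GNU ld, lld):

```cmake
# Fallback sketch for CMake < 3.24: wrap the archive in --whole-archive
# directly. Linking the _jemalloc target by name (rather than
# $<TARGET_FILE:_jemalloc>) preserves the build-order dependency, and
# target_link_libraries keeps the items in order on the link line.
add_library(_jemalloc_whole INTERFACE)
target_link_libraries(_jemalloc_whole INTERFACE
    "-Wl,--whole-archive" _jemalloc "-Wl,--no-whole-archive")
add_library(jemalloc::jemalloc ALIAS _jemalloc_whole)
```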
@@ -33,9 +33,16 @@
* Hyper-threaded CPUs may need a special instruction inside spin loops in
* order to yield to another virtual CPU.
*/
#if defined(__x86_64__) || defined(__i386__)
#define CPU_SPINWAIT __asm__ volatile("pause")
/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
#define HAVE_CPU_SPINWAIT 1
#elif defined(__aarch64__)
#define CPU_SPINWAIT __asm__ volatile("isb")
#define HAVE_CPU_SPINWAIT 1
#else
/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
#define HAVE_CPU_SPINWAIT 0
#endif

/*
* Number of significant bits in virtual addresses. This may be less than the
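For context on how the macro pair is meant to be consumed, here is a hedged sketch of a bounded spin-then-yield wait — the function, flag, and spin bound are illustrative, not jemalloc's actual loop:

```c
#include <sched.h>
#include <stdatomic.h>

/* Sketch assuming the CPU_SPINWAIT / HAVE_CPU_SPINWAIT definitions above.
 * "pause" (x86) and "isb" (aarch64) de-prioritize the spinning hardware
 * thread so a sibling hyper-thread or the memory system can make progress. */
static void spin_until_set(atomic_int *flag)
{
    for (int i = 0; atomic_load_explicit(flag, memory_order_acquire) == 0; i++) {
#if HAVE_CPU_SPINWAIT
        CPU_SPINWAIT;
#endif
        if (i > 1000)
            sched_yield(); /* bounded spin, then yield to the OS scheduler */
    }
}
```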
17 changes: 17 additions & 0 deletions docs/perf.md
@@ -125,6 +125,23 @@ nginx `return 200` (Content-Length: 0), loopback, 10 s measurement, 2 s warmup.

At 1 connection both modes are identical (~36-38k RPS, ~24-27 µs p50): the baseline is Poco's HTTP parsing overhead. At higher concurrency both clients saturate nginx at ~1M RPS, so throughput is similar. The difference is latency: fiber p50 stays nearly flat across all concurrency levels (24-84 µs) while thread p50 grows linearly with thread count, reaching 10x worse at 1024 connections (878 µs vs 84 µs). The fiber scheduler multiplexes all connections across 16 scheduler threads without a kernel context switch per fiber; each additional OS thread adds scheduling overhead proportional to the total thread count.

### Server: internal (silk fibers) vs nginx

**Not a production HTTP server.** `http-perf server` is benchmark scaffolding: each accepted connection runs Poco's stock `HTTPServerConnection::run` on a fiber over `FiberSocketImpl`. Poco's HTTP server is allocation-heavy — `std::stringstream`-driven request/response parsing, per-request buffer churn even after our `MemoryPool` patches, virtual dispatch on every byte. Nobody should ship this; we use it because reusing Poco's parser on both ends gives an apples-to-apples comparison: the only thing varying between the internal and nginx rows of the table below is the server's I/O loop (silk's accept fiber + per-connection fibers + io_uring read/write vs nginx's tuned C event loop). Everything else — request parsing, response building, the client — is held constant.

| connections | server | RPS | avg | p50 | p95 | p99 | p99.9 |
|---|---|---|---|---|---|---|---|
| 1 | internal | 27k | 37 µs | 36 µs | 43 µs | 49 µs | 64 µs |
| 256 | internal | 1023k | 250 µs | 152 µs | 1478 µs | 1960 µs | 2248 µs |
| 512 | internal | 1011k | 506 µs | 88 µs | 5023 µs | 5146 µs | 5277 µs |
| 1024 | internal | 964k | 1020 µs | 94 µs | 12445 µs | 16772 µs | 19661 µs |
| 1 | nginx | 36k | 28 µs | 25 µs | 35 µs | 42 µs | 97 µs |
| 256 | nginx | 1290k | 198 µs | 72 µs | 1700 µs | 2050 µs | 2137 µs |
| 512 | nginx | 1248k | 410 µs | 63 µs | 4557 µs | 5738 µs | 5904 µs |
| 1024 | nginx | 1254k | 816 µs | 58 µs | 9979 µs | 12599 µs | 13163 µs |

The internal server lands at ~80% of nginx RPS at high concurrency (964–1023k vs 1248–1290k). The gap is Poco overhead, not silk overhead: nginx's `return 200` handler skips most of HTTP/1.1 parsing, while Poco constructs `HTTPServerRequestImpl`/`HTTPServerResponseImpl` plus heap-allocated stream buffers per request. p50 latencies stay in the same range at high concurrency (88–152 µs internal vs 58–72 µs nginx); tail latencies are dominated by client-side queuing in both cases. The takeaway is that silk's accept-fiber + per-connection-fiber I/O loop adds negligible overhead on top of whatever HTTP machinery you put on it — to beat nginx you'd swap Poco for a hand-rolled state machine that allocates nothing per request, which is a different project.
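To make the shape of that I/O loop concrete, a condensed sketch follows; `silk::spawn` and the glue function are assumptions for illustration — only `FiberServerSocketImpl`, `FiberSocketImpl`, and Poco's `HTTPServerConnection` machinery appear in this PR:

```cpp
// Sketch of the accept-fiber + per-connection-fiber server loop. Each blocking
// call suspends only the current fiber; the scheduler threads keep running
// other fibers while io_uring completes the operation.
void acceptLoop(FiberServerSocketImpl & listener)
{
    for (;;)
    {
        Poco::Net::SocketAddress peer;
        // Suspends this fiber on POLLIN, then accept4()s a non-blocking fd
        // wrapped in a FiberSocketImpl (see acceptConnection in this PR).
        Poco::Net::SocketImpl * conn = listener.acceptConnection(peer);
        silk::spawn([conn] {
            // Hypothetical glue: run Poco's stock HTTP parsing/response
            // machinery (HTTPServerConnection::run) over the fiber socket.
            servePocoHttpConnection(conn);
        });
    }
}
```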

### High-concurrency throughput (connections=10000, delay=10ms, duration=60s)

| connections | mode | RPS | avg | p50 | p95 | p99 | p99.9 |
60 changes: 55 additions & 5 deletions src/perf/fiber-http.cpp
@@ -7,14 +7,27 @@

#include <Poco/Net/StreamSocket.h>

#include <cerrno>

#include <poll.h>
#include <unistd.h>

#include <sys/socket.h>

// Use silk::FiberScheduler::read/write (io_uring) instead of recv/send + poll.
// Use FiberScheduler::read/write (io_uring) instead of recv/send + poll.
#define USE_IO_URING_RW

FiberSocketImpl::FiberSocketImpl(int sockfd)
: StreamSocketImpl(sockfd)
{
// SocketImpl(sockfd) initializes _blocking=true even though the fd is
// already non-blocking; sync the flag so getBlocking() reflects reality.
setBlocking(false);
setNoDelay(true);

atomicFd.store(sockfd, std::memory_order_relaxed);
}

void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address)
{
connect(address, Poco::Timespan(-1));
@@ -26,6 +39,8 @@ void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address, const Po
setNoDelay(true);
setBlocking(false);

atomicFd.store(sockfd(), std::memory_order_relaxed);

int r = ::connect(sockfd(), address.addr(), address.length());
if (r < 0)
{
@@ -69,8 +84,6 @@ void FiberSocketImpl::connect(const Poco::Net::SocketAddress & address, const Po

bool FiberSocketImpl::poll(const Poco::Timespan & timeout, int mode)
{
ASSERT(!getBlocking());

uint32_t events = 0;
if (mode & SELECT_READ)
{
@@ -116,7 +129,6 @@ bool FiberSocketImpl::poll(const Poco::Timespan & timeout, int mode)
int FiberSocketImpl::sendBytes(const void * buffer, int length, int flags)
{
UNUSED(flags);
ASSERT(!getBlocking());

int total = 0;
const char * ptr = static_cast<const char *>(buffer);
@@ -153,10 +165,48 @@ int FiberSocketImpl::sendBytes(const void * buffer, int length, int flags)
return total;
}

void FiberSocketImpl::shutdown()
{
int fd = atomicFd.load(std::memory_order_relaxed);
if (fd < 0)
{
return;
}

int r = ::shutdown(fd, SHUT_RDWR);
if (r < 0)
{
r = errno;
error(r, "shutdown");
}
}

Poco::Net::SocketImpl * FiberServerSocketImpl::acceptConnection(Poco::Net::SocketAddress & clientAddr)
{
silk::FiberScheduler::IoFuture pollFuture;
silk::FiberScheduler::poll(sockfd(), POLLIN, nullptr, &pollFuture);
int r = pollFuture.wait();
if (r)
{
error(r, "accept poll");
}

sockaddr_storage storage;
socklen_t addrLen = sizeof(storage);
int fd = ::accept4(sockfd(), reinterpret_cast<sockaddr *>(&storage), &addrLen, SOCK_NONBLOCK | SOCK_CLOEXEC);
if (fd < 0)
{
r = errno;
error(r, "accept");
}

clientAddr = Poco::Net::SocketAddress(reinterpret_cast<sockaddr *>(&storage), addrLen);
return new FiberSocketImpl(fd);
}

int FiberSocketImpl::receiveBytes(void * buffer, int length, int flags)
{
UNUSED(flags);
ASSERT(!getBlocking());

#if defined(USE_IO_URING_RW)
uint64_t bytesRead = 0;