2 changes: 1 addition & 1 deletion cpp/include/rapidsmpf/bootstrap/backend.hpp
@@ -21,7 +21,7 @@ enum class BackendType {
* @brief Automatically detect the best backend based on environment.
*
* Detection order:
* 1. File-based (if RRUN_COORD_DIR set by rrun)
* 1. File-based (if RRUN_COORD_DIR or RRUN_ROOT_ADDRESS set by rrun)
* 2. Slurm/PMIx (if SLURM environment detected)
* 3. File-based (default fallback)
*/
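A minimal standalone sketch of the documented detection order (hypothetical: the real `detect_backend` lives in `cpp/src/bootstrap/bootstrap.cpp`, and the `SLURM_JOB_ID` check here is an assumption about how Slurm detection might be keyed):

```cpp
#include <cassert>
#include <cstdlib>
#include <optional>
#include <string>

enum class BackendType { FILE, SLURM };

std::optional<std::string> getenv_optional(char const* name) {
    char const* value = std::getenv(name);
    return value ? std::optional<std::string>{value} : std::nullopt;
}

BackendType detect_backend() {
    // 1. File-based if rrun is coordinating (explicit configuration wins).
    if (getenv_optional("RRUN_COORD_DIR") || getenv_optional("RRUN_ROOT_ADDRESS")) {
        return BackendType::FILE;
    }
    // 2. Slurm/PMIx if a Slurm job environment is detected (assumed variable).
    if (getenv_optional("SLURM_JOB_ID")) {
        return BackendType::SLURM;
    }
    // 3. File-based default fallback.
    return BackendType::FILE;
}
```

Note that step 1 and step 3 return the same backend; the difference is that in step 1 rrun has explicitly configured the coordination, while step 3 is a last-resort default.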
12 changes: 11 additions & 1 deletion cpp/include/rapidsmpf/bootstrap/slurm_backend.hpp
@@ -31,7 +31,7 @@ namespace rapidsmpf::bootstrap::detail {
*
* Usage:
* ```bash
* # Multiple (4) tasks per node, one task per GPU, two nodes.
* # Passthrough: multiple (4) tasks per node, one task per GPU, two nodes.
* srun \
* --mpi=pmix \
* --nodes=2 \
@@ -40,6 +40,16 @@ namespace rapidsmpf::bootstrap::detail {
* --gpus-per-task=1 \
* --gres=gpu:4 \
* rrun ./benchmarks/bench_shuffle -C ucxx
*
* # Hybrid mode: one task per node, 4 GPUs per task, two nodes.
* srun \
* --mpi=pmix \
* --nodes=2 \
* --ntasks-per-node=1 \
* --cpus-per-task=144 \
* --gpus-per-task=4 \
* --gres=gpu:4 \
* rrun -n 4 ./benchmarks/bench_shuffle -C ucxx
* ```
Comment on lines +44 to 53
Contributor:

Can you add some documentation on why I might want to use hybrid launch mode? IIUC, it is because then I only need to ensure I launch the correct number of processes per node, and need not worry about binding via Slurm (the rrun launch takes care of that).

But one thing I don't understand: why can't we just launch in passthrough mode and then apply bindings to all the processes via rrun's topology detection?

What, concretely, is hybrid mode buying us?

Member Author:

In passthrough mode we can do that, although in a somewhat limited manner. Take CPUs as a resource, for example; you have two options:

  1. Set up the job itself to partition the CPUs: I believe this works as expected if the cluster is properly configured, but it puts the burden on the user to define the resources appropriately, which requires knowing a priori how many CPUs are available and dividing them evenly across tasks; or
  2. Just pass through all CPUs: in this case rrun can determine the CPUs to bind to; however, the GPU index is not known (each task always sees its GPU as index 0), so we cannot partition the CPUs per GPU, only bind to the whole CPU socket/NUMA node. Note that CPU partitioning is not currently implemented in rrun, but I intend to add it. We could technically do the partitioning based on the Slurm local ID, but that would also require a different specialization.
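To make option 2 concrete, the per-GPU CPU partitioning I have in mind could look roughly like this (hypothetical sketch; `cpus_for_gpu` is not part of rrun, and an even split of CPUs across GPUs is assumed):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Assign a contiguous, even slice of the node's CPUs to each local GPU rank.
// Hypothetical helper: rrun does not implement this yet.
std::vector<int> cpus_for_gpu(int local_rank, int gpus_per_node, int ncpus) {
    int const per_gpu = ncpus / gpus_per_node;  // even division assumed
    std::vector<int> cpus;
    cpus.reserve(static_cast<std::size_t>(per_gpu));
    for (int c = local_rank * per_gpu; c < (local_rank + 1) * per_gpu; ++c) {
        cpus.push_back(c);
    }
    return cpus;
}
```

With 144 CPUs and 4 GPUs per node, local rank 1 would bind to CPUs 36–71. In passthrough mode this is exactly the computation we cannot do, because every task sees its GPU as index 0 and the true local rank-to-GPU mapping is hidden.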

However, there are a few more reasons why I would like to have hybrid mode:

  1. If we have another process coordinating work submission (analogous to a Dask/Distributed client, as we previously discussed), launching a single task per node greatly simplifies coordination, since it can all be prepared via the hybrid mode implementation.
  2. I'm leaning towards using PMIx in other future specializations as a sort of distributed KV store. It seems like a good fit for extending rrun to other distributed systems (e.g., when using SSH). I know this will require more scaffolding in rrun (like setting up a PMIx server), but PMIx is probably a better alternative than writing our own KV store for launching purposes, and may also let us avoid file-based synchronization. This existing Slurm implementation therefore also serves as a test ground for future PMIx use.

Contributor:

however, the GPU index is not known (since the GPU always appears as index 0 to each task) and thus we cannot partition the CPUs per GPU but only bind to the whole CPU socket/NUMA node.

Can we not map the CUDA runtime device ID to its NVML counterpart and thereby determine the physical CPUs to bind to?

Member Author:

I think not. This is an intentional isolation layer provided by Slurm (or rather enroot/pyxis): only the resources actually available to the allocation are visible at runtime, and that includes NVML. NVML reports a single device being available, so you cannot determine the device's true index or "order".

*/
class SlurmBackend : public Backend {
5 changes: 3 additions & 2 deletions cpp/src/bootstrap/bootstrap.cpp
@@ -28,8 +28,9 @@ namespace {
*/
BackendType detect_backend() {
// Check for rrun coordination first (explicit configuration takes priority).
// If RRUN_COORD_DIR is set, rrun is coordinating and we should use FILE backend.
if (getenv_optional("RRUN_COORD_DIR")) {
// If RRUN_COORD_DIR or RRUN_ROOT_ADDRESS is set, rrun is coordinating and we
// should use FILE backend (with or without pre-coordinated address).
if (getenv_optional("RRUN_COORD_DIR") || getenv_optional("RRUN_ROOT_ADDRESS")) {
return BackendType::FILE;
}

107 changes: 103 additions & 4 deletions cpp/src/bootstrap/ucxx.cpp
@@ -9,10 +9,15 @@
#ifdef RAPIDSMPF_HAVE_UCXX

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <memory>
#include <sstream>
#include <string>

#include <cuda_device_runtime_api.h>
#include <unistd.h> // for unsetenv

#include <rapidsmpf/bootstrap/bootstrap.hpp>
#include <rapidsmpf/bootstrap/ucxx.hpp>
@@ -23,6 +28,36 @@

namespace rapidsmpf::bootstrap {

namespace {
// Hex encoding for binary-safe address transmission
std::string hex_encode(std::string_view input) {
static constexpr const char* hex_chars = "0123456789abcdef";
std::string result;
result.reserve(input.size() * 2);
for (char ch : input) {
auto c = static_cast<unsigned char>(ch);
result.push_back(hex_chars[c >> 4]);
result.push_back(hex_chars[c & 0x0F]);
}
return result;
}

std::string hex_decode(std::string_view const& input) {
std::string result;
result.reserve(input.size() / 2);
for (size_t i = 0; i < input.size(); i += 2) {
auto high = static_cast<unsigned char>(
(input[i] >= 'a') ? (input[i] - 'a' + 10) : (input[i] - '0')
);
auto low = static_cast<unsigned char>(
(input[i + 1] >= 'a') ? (input[i + 1] - 'a' + 10) : (input[i + 1] - '0')
);
result.push_back(static_cast<char>((high << 4) | low));
}
return result;
}
} // namespace
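For illustration, the helpers above round-trip arbitrary bytes such as a UCX worker address blob. A standalone copy (not part of the diff; the lowercase-only nibble decoding matches what `hex_encode` produces):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <string_view>

std::string hex_encode(std::string_view input) {
    static constexpr char const* hex_chars = "0123456789abcdef";
    std::string result;
    result.reserve(input.size() * 2);
    for (char ch : input) {
        auto c = static_cast<unsigned char>(ch);
        result.push_back(hex_chars[c >> 4]);
        result.push_back(hex_chars[c & 0x0F]);
    }
    return result;
}

std::string hex_decode(std::string_view input) {
    auto nibble = [](char c) {
        // Lowercase hex only, matching hex_encode's output.
        return static_cast<unsigned char>(c >= 'a' ? c - 'a' + 10 : c - '0');
    };
    std::string result;
    result.reserve(input.size() / 2);
    for (std::size_t i = 0; i + 1 < input.size(); i += 2) {
        result.push_back(
            static_cast<char>((nibble(input[i]) << 4) | nibble(input[i + 1]))
        );
    }
    return result;
}
```

The encoding is binary-safe because every byte, including NUL and non-ASCII values, maps to two printable characters, which is what makes the address safe to pass through an environment variable.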

std::shared_ptr<ucxx::UCXX> create_ucxx_comm(
std::shared_ptr<ProgressThread> progress_thread,
BackendType type,
@@ -35,8 +70,69 @@ std::shared_ptr<ucxx::UCXX> create_ucxx_comm(

std::shared_ptr<ucxx::UCXX> comm;

// Root rank: Create listener and publish address via put() for non-root ranks.
if (ctx.rank == 0) {
auto precomputed_address_encoded = getenv_optional("RRUN_ROOT_ADDRESS");
auto address_file = getenv_optional("RRUN_ROOT_ADDRESS_FILE");

// Path 1: Early address mode for root rank in Slurm hybrid mode.
// Rank 0 is launched first to create its address and write it to a file.
// Parent will coordinate with other parents via PMIx, then launch worker ranks
// with RRUN_ROOT_ADDRESS set. No PMIx put/barrier/get bootstrap coordination.
if (ctx.rank == 0 && address_file.has_value()) {
auto ucxx_initialized_rank =
ucxx::init(nullptr, ctx.nranks, std::nullopt, options);
comm = std::make_shared<ucxx::UCXX>(
std::move(ucxx_initialized_rank), options, progress_thread
);

auto listener_address = comm->listener_address();
auto root_worker_address_str =
std::get<std::shared_ptr<::ucxx::Address>>(listener_address.address)
->getStringView();

std::string encoded_address = hex_encode(root_worker_address_str);
// Write to a temp file then rename so the reader never sees partial content.
std::string const temp_path = *address_file + ".tmp";
std::ofstream addr_file(temp_path);
if (!addr_file) {
throw std::runtime_error(
"Failed to write root address to file: " + temp_path
);
}
addr_file << encoded_address << std::endl;
addr_file.close();
if (std::rename(temp_path.c_str(), address_file->c_str()) != 0) {
std::remove(temp_path.c_str());
throw std::runtime_error(
"Failed to rename root address file to: " + *address_file
);
}

auto verbose = getenv_optional("RAPIDSMPF_VERBOSE");
if (verbose && *verbose == "1") {
std::cerr << "[rank 0] Wrote address to " << *address_file
<< ", skipping bootstrap coordination" << std::endl;
}

// Unset now that bootstrap is complete; the variable is no longer used.
unsetenv("RRUN_ROOT_ADDRESS_FILE");
}
// Path 2: Slurm hybrid mode for non-root ranks.
// Parent process already coordinated the root address via PMIx and provided it
// via RRUN_ROOT_ADDRESS environment variable (hex-encoded).
else if (precomputed_address_encoded.has_value() && ctx.rank != 0)
{
std::string precomputed_address = hex_decode(*precomputed_address_encoded);
auto root_worker_address = ::ucxx::createAddressFromString(precomputed_address);
auto ucxx_initialized_rank =
ucxx::init(nullptr, ctx.nranks, root_worker_address, options);
comm = std::make_shared<ucxx::UCXX>(
std::move(ucxx_initialized_rank), options, progress_thread
);
}
// Path 3: Normal bootstrap mode for root rank.
// Create listener and publish address via put() for non-root ranks to retrieve.
else if (ctx.rank == 0)
{
auto ucxx_initialized_rank =
ucxx::init(nullptr, ctx.nranks, std::nullopt, options);
comm = std::make_shared<ucxx::UCXX>(
@@ -49,8 +145,11 @@ std::shared_ptr<ucxx::UCXX> create_ucxx_comm(
std::get<std::shared_ptr<::ucxx::Address>>(listener_address.address)
->getStringView());
sync(ctx);
} else {
// Non-root ranks: Retrieve root address via get() and connect.
}
// Path 4: Normal bootstrap mode for non-root ranks.
// Retrieve root address via get() and connect.
else
{
sync(ctx);

auto root_worker_address_str =
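The root-address publication in Path 1 hinges on a write-temp-then-rename pattern so the parent never observes a partially written file. Isolated, the pattern looks like this (sketch; `publish_atomically` is an illustrative name, and atomicity assumes the temp file and destination are on the same filesystem):

```cpp
#include <cassert>
#include <cstdio>
#include <fstream>
#include <stdexcept>
#include <string>

// Write content to path + ".tmp", then rename over path. On POSIX, rename(2)
// within one filesystem is atomic, so a concurrent reader sees either the old
// file state or the complete new content, never a partial write.
void publish_atomically(std::string const& path, std::string const& content) {
    std::string const temp_path = path + ".tmp";
    std::ofstream out(temp_path);
    if (!out) {
        throw std::runtime_error("Failed to write to file: " + temp_path);
    }
    out << content << "\n";
    out.close();
    if (std::rename(temp_path.c_str(), path.c_str()) != 0) {
        std::remove(temp_path.c_str());  // clean up the temp file on failure
        throw std::runtime_error("Failed to rename file to: " + path);
    }
}
```

The companion reader (here, the rrun parent waiting for `RRUN_ROOT_ADDRESS_FILE` to appear) can then simply poll for the final path's existence, with no length or checksum protocol needed.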