From 6e812ccf70d59ad79205cbba886e1fbc8fcf1871 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Sun, 24 May 2026 18:46:19 -0400
Subject: [PATCH 01/14] feat(spider-task-executor): Add executor binary with
 bincode wire protocol and integration tests. (#325)

---
 Cargo.lock                                    | 153 ++++++++++
 Cargo.toml                                    |   2 +
 components/spider-task-executor/Cargo.toml    |  31 +-
 .../src/bin/spider_task_executor.rs           | 146 ++++++++++
 components/spider-task-executor/src/error.rs  |  26 +-
 components/spider-task-executor/src/lib.rs    |   1 +
 .../spider-task-executor/src/manager.rs       |   7 +-
 .../spider-task-executor/src/protocol.rs      |  49 ++++
 taskfiles/test.yaml                           |  25 +-
 .../integration-test-tasks/Cargo.toml         |  16 +
 .../integration-test-tasks/src/lib.rs         |  75 +++++
 tests/huntsman/task-executor/Cargo.toml       |  42 +++
 tests/huntsman/task-executor/src/lib.rs       | 275 ++++++++++++++++++
 .../tests/overhead_instrument.rs              | 228 +++++++++++++++
 .../task-executor/tests/test_executor.rs      |  90 ++++++
 .../huntsman/tdl-integration/tests/complex.rs |   4 +-
 16 files changed, 1155 insertions(+), 15 deletions(-)
 create mode 100644 components/spider-task-executor/src/bin/spider_task_executor.rs
 create mode 100644 components/spider-task-executor/src/protocol.rs
 create mode 100644 tests/huntsman/integration-test-tasks/Cargo.toml
 create mode 100644 tests/huntsman/integration-test-tasks/src/lib.rs
 create mode 100644 tests/huntsman/task-executor/Cargo.toml
 create mode 100644 tests/huntsman/task-executor/src/lib.rs
 create mode 100644 tests/huntsman/task-executor/tests/overhead_instrument.rs
 create mode 100644 tests/huntsman/task-executor/tests/test_executor.rs

diff --git a/Cargo.lock b/Cargo.lock
index 516efac4..1c7f0093 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "allocator-api2"
 version = "0.2.21"
@@ -64,6 +73,15 @@ version = "1.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06"
 
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "bitflags"
 version = "2.11.1"
@@ -283,6 +301,16 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
 
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "etcetera"
 version = "0.8.0"
@@ -695,6 +723,14 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "integration-test-tasks"
+version = "0.1.0"
+dependencies = [
+ "serde",
+ "spider-tdl",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.18"
@@ -808,6 +844,15 @@ version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
 
+[[package]]
+name = "matchers"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
+dependencies = [
+ "regex-automata",
+]
+
 [[package]]
 name = "md-5"
 version = "0.10.6"
@@ -1148,6 +1193,23 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
 [[package]]
 name = "rmp"
 version = "0.8.15"
@@ -1340,6 +1402,25 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
 [[package]]
 name = "signature"
 version = "2.2.0"
@@ -1431,10 +1512,19 @@ name = "spider-task-executor"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "bincode",
+ "bytes",
+ "futures-util",
  "libloading",
  "rmp-serde",
+ "serde",
+ "spider-core",
  "spider-tdl",
  "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
 ]
 
 [[package]]
@@ -1771,6 +1861,24 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "task-executor-tests"
+version = "0.1.0"
+dependencies = [
+ "bincode",
+ "bytes",
+ "futures-util",
+ "integration-test-tasks",
+ "rmp-serde",
+ "serde",
+ "spider-core",
+ "spider-task-executor",
+ "spider-tdl",
+ "tabled",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "tdl-integration"
 version = "0.1.0"
@@ -1812,6 +1920,15 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "thread_local"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "tinystr"
 version = "0.8.3"
@@ -1847,6 +1964,7 @@ dependencies = [
  "libc",
  "mio",
  "pin-project-lite",
+ "signal-hook-registry",
  "socket2",
  "tokio-macros",
  "windows-sys 0.61.2",
@@ -1918,6 +2036,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
 dependencies = [
  "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-serde"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319"
+dependencies = [
+ "matchers",
+ "once_cell",
+ "regex-automata",
+ "serde",
+ "serde_json",
+ "sharded-slab",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-serde",
 ]
 
 [[package]]
@@ -1995,6 +2142,12 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
diff --git a/Cargo.toml b/Cargo.toml
index 30796143..67362f87 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,5 +9,7 @@ members = [
   "components/spider-tdl-derive",
   "examples/huntsman/complex/tasks",
   "examples/huntsman/complex/types",
+  "tests/huntsman/integration-test-tasks",
+  "tests/huntsman/task-executor",
   "tests/huntsman/tdl-integration",
 ]
diff --git a/components/spider-task-executor/Cargo.toml b/components/spider-task-executor/Cargo.toml
index c51c09b2..789308ca 100644
--- a/components/spider-task-executor/Cargo.toml
+++ b/components/spider-task-executor/Cargo.toml
@@ -7,11 +7,36 @@ edition = "2024"
 name = "spider_task_executor"
 path = "src/lib.rs"
 
+[[bin]]
+name = "spider-task-executor"
+path = "src/bin/spider_task_executor.rs"
+
 [dependencies]
+anyhow = "1.0.98"
+bincode = "1.3.3"
+bytes = "1.10"
+futures-util = { version = "0.3.31", default-features = false, features = [
+  "sink",
+  "std"
+] }
 libloading = "0.8.5"
 rmp-serde = "1.3.1"
+serde = { version = "1.0.228", features = ["derive"] }
+spider-core = { path = "../spider-core" }
 spider-tdl = { path = "../spider-tdl" }
 thiserror = "2.0.18"
-
-[dev-dependencies]
-anyhow = "1.0.98"
+tokio = { version = "1.50.0", features = [
+  "io-std",
+  "io-util",
+  "macros",
+  "rt",
+  "sync",
+  "time"
+] }
+tokio-util = { version = "0.7", features = ["codec"] }
+tracing = { version = "0.1.41", default-features = false, features = ["std"] }
+tracing-subscriber = { version = "0.3.19", default-features = false, features = [
+  "env-filter",
+  "fmt",
+  "json"
+] }
diff --git a/components/spider-task-executor/src/bin/spider_task_executor.rs b/components/spider-task-executor/src/bin/spider_task_executor.rs
new file mode 100644
index 00000000..5be95bf0
--- /dev/null
+++ b/components/spider-task-executor/src/bin/spider_task_executor.rs
@@ -0,0 +1,146 @@
+//! Spider task-executor binary.
+//!
+//! Reads bincode-framed [`Request`](spider_task_executor::protocol::Request)s from `stdin`,
+//! dispatches them through a [`TdlPackageManager`], and writes
+//! [`Response`](spider_task_executor::protocol::Response)s to `stdout`. The execution manager
+//! spawns this process per slot and supervises it.
+//!
+//! Package resolution: each `Execute` request names a TDL package; the executor looks for
+//! `${SPIDER_TDL_PACKAGE_DIR}/${package}/lib${package}.so` and caches the loaded library by name.
+//!
+//! Execution model: requests are processed strictly sequentially on a single-threaded tokio
+//! runtime. Tokio is used only to match the async I/O surface on the execution manager side;
+//! the executor itself has no concurrency requirements, and exactly one task runs for the
+//! lifetime of the process.
+
+use std::{
+    path::{Path, PathBuf},
+    time::Instant,
+};
+
+use anyhow::{Result, anyhow};
+use bytes::Bytes;
+use futures_util::{SinkExt, StreamExt};
+use spider_task_executor::{
+    ExecutorError,
+    TdlPackageManager,
+    protocol::{ExecutorOutcome, Request, Response},
+};
+use tokio::io::{stdin, stdout};
+use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec};
+
+/// Env var that points to the directory where compiled TDL packages live.
+const SPIDER_TDL_PACKAGE_DIR: &str = "SPIDER_TDL_PACKAGE_DIR";
+
+/// Initializes tracing logging.
+fn init_tracing() {
+    // Send tracing output to stderr so it doesn't pollute the framed-stdout protocol channel.
+    tracing_subscriber::fmt()
+        .event_format(
+            tracing_subscriber::fmt::format()
+                .with_level(true)
+                .with_target(false)
+                .with_file(true)
+                .with_line_number(true)
+                .json(),
+        )
+        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
+        .with_ansi(false)
+        .with_writer(std::io::stderr)
+        .init();
+}
+
+/// Runs a task from the given TDL context and inputs.
+///
+/// # Returns
+///
+/// Forwards [`spider_task_executor::TdlPackage::execute_task`]'s return values on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`TdlPackageManager::load`]'s return values on failure.
+/// * Forwards [`spider_task_executor::TdlPackage::execute_task`]'s return values on failure.
+fn run_task(
+    manager: &mut TdlPackageManager,
+    pkg_dir: &Path,
+    package: &str,
+    task_func: &str,
+    raw_ctx: &[u8],
+    raw_inputs: &[u8],
+) -> Result<Vec<u8>, ExecutorError> {
+    let pkg = if let Some(pkg) = manager.get(package) {
+        pkg
+    } else {
+        let path = pkg_dir.join(package).join(format!("lib{package}.so"));
+        manager.load(&path)?
+    };
+    pkg.execute_task(task_func, raw_ctx, raw_inputs)
+}
+
+#[tokio::main(flavor = "current_thread")]
+async fn main() -> Result<()> {
+    init_tracing();
+
+    let pkg_dir: PathBuf = std::env::var_os(SPIDER_TDL_PACKAGE_DIR)
+        .map(PathBuf::from)
+        .ok_or_else(|| anyhow!("{SPIDER_TDL_PACKAGE_DIR} env var not set"))?;
+
+    let mut requests = FramedRead::new(stdin(), LengthDelimitedCodec::new());
+    let mut responses = FramedWrite::new(stdout(), LengthDelimitedCodec::new());
+
+    let mut manager = TdlPackageManager::new();
+
+    tracing::info!("Executor starts.");
+
+    while let Some(frame) = requests.next().await {
+        let frame = frame
+            .inspect_err(|e| tracing::error!(err = ? e, "Failed to receive request frame."))?;
+        let req: Request = bincode::deserialize(&frame)
+            .inspect_err(|e| tracing::error!(err = ? e, "Failed to deserialize request."))?;
+        match req {
+            Request::Execute {
+                tdl_context,
+                raw_ctx,
+                raw_inputs,
+            } => {
+                let started = Instant::now();
+                let outcome = match run_task(
+                    &mut manager,
+                    &pkg_dir,
+                    &tdl_context.package,
+                    &tdl_context.task_func,
+                    &raw_ctx,
+                    &raw_inputs,
+                ) {
+                    Ok(outputs) => ExecutorOutcome::Success { outputs },
+                    Err(e) => ExecutorOutcome::Failure {
+                        error: rmp_serde::to_vec(&e).inspect_err(
+                            |e| tracing::error!(err = ? e, "Failed to serialize execution result."),
+                        )?,
+                    },
+                };
+                let elapsed_us = u64::try_from(started.elapsed().as_micros()).unwrap_or(u64::MAX);
+
+                let resp = Response::Result {
+                    outcome,
+                    elapsed_us,
+                };
+                let bytes = bincode::serialize(&resp)
+                    .inspect_err(|e| tracing::error!(err = ? e, "Failed to serialize response."))?;
+                responses
+                    .send(Bytes::from(bytes))
+                    .await
+                    .inspect_err(|e| tracing::error!(err = ? e, "Failed to send response."))?;
+            }
+            Request::Shutdown => {
+                tracing::info!("Received shutdown request.");
+                break;
+            }
+        }
+    }
+
+    tracing::info!("Executor exits.");
+    Ok(())
+}
diff --git a/components/spider-task-executor/src/error.rs b/components/spider-task-executor/src/error.rs
index da582342..c8da04ef 100644
--- a/components/spider-task-executor/src/error.rs
+++ b/components/spider-task-executor/src/error.rs
@@ -6,11 +6,11 @@ use spider_tdl::{TdlError, Version};
 ///
 /// [`TdlError`] (failure inside a user task) is wrapped via [`Self::TaskError`] so callers can
 /// distinguish executor-internal failures from in-task failures.
-#[derive(Debug, thiserror::Error)]
+#[derive(Debug, thiserror::Error, serde::Serialize, serde::Deserialize)]
 pub enum ExecutorError {
     /// `dlopen` failed or a required FFI symbol was missing.
     #[error("failed to load TDL package library: {0}")]
-    InvalidLibrary(#[from] libloading::Error),
+    InvalidLibrary(String),
 
     /// The package's declared `spider-tdl` ABI version is not compatible with the executor's.
     #[error(
@@ -33,7 +33,7 @@ pub enum ExecutorError {
 
     /// The byte buffer contains invalid UTF-8 patterns.
     #[error("invalid UTF-8: {0}")]
-    InvalidUtf8(#[from] std::str::Utf8Error),
+    InvalidUtf8(String),
 
     /// A user task returned a [`TdlError`] across the FFI boundary.
     #[error("task execution failed: {0}")]
@@ -42,7 +42,25 @@ pub enum ExecutorError {
     /// The msgpack-encoded error payload returned by a failing task could not be decoded back into
     /// a [`TdlError`].
     #[error("failed to deserialize error payload: {0}")]
-    ErrorPayloadDeserializationFailure(#[from] rmp_serde::decode::Error),
+    ErrorPayloadDeserializationFailure(String),
+}
+
+impl From<libloading::Error> for ExecutorError {
+    fn from(value: libloading::Error) -> Self {
+        Self::InvalidLibrary(value.to_string())
+    }
+}
+
+impl From<std::str::Utf8Error> for ExecutorError {
+    fn from(value: std::str::Utf8Error) -> Self {
+        Self::InvalidUtf8(value.to_string())
+    }
+}
+
+impl From<rmp_serde::decode::Error> for ExecutorError {
+    fn from(value: rmp_serde::decode::Error) -> Self {
+        Self::ErrorPayloadDeserializationFailure(value.to_string())
+    }
 }
 
 impl ExecutorError {
diff --git a/components/spider-task-executor/src/lib.rs b/components/spider-task-executor/src/lib.rs
index b5b05076..3afb0484 100644
--- a/components/spider-task-executor/src/lib.rs
+++ b/components/spider-task-executor/src/lib.rs
@@ -2,6 +2,7 @@
 
 pub mod error;
 pub mod manager;
+pub mod protocol;
 
 pub use error::ExecutorError;
 pub use manager::{TdlPackage, TdlPackageManager};
diff --git a/components/spider-task-executor/src/manager.rs b/components/spider-task-executor/src/manager.rs
index 49fca52b..61060055 100644
--- a/components/spider-task-executor/src/manager.rs
+++ b/components/spider-task-executor/src/manager.rs
@@ -21,6 +21,7 @@ use crate::error::ExecutorError;
 /// avoid repeating the FFI round trip on every call. The execute fn pointer is also resolved once
 /// at load time and cached so each [`Self::execute_task`] call doesn't require `dlsym` per
 /// dispatch.
+#[derive(Debug)]
 pub struct TdlPackage {
     /// The name of the package.
     name: String,
@@ -190,7 +191,7 @@ impl TdlPackageManager {
     ///
     /// # Returns
     ///
-    /// The newly loaded package's name on success.
+    /// The newly loaded package on success.
     ///
     /// # Errors
     ///
@@ -199,14 +200,14 @@ impl TdlPackageManager {
     /// * [`ExecutorError::DuplicatePackage`] if a package with the same name is already loaded. The
     ///   freshly loaded library will be dropped (unloaded).
     /// * Forwards [`TdlPackage::load`]'s return values on failure.
-    pub fn load(&mut self, path: &Path) -> Result<String, ExecutorError> {
+    pub fn load(&mut self, path: &Path) -> Result<&TdlPackage, ExecutorError> {
         let package = TdlPackage::load(path)?;
         if self.packages.contains_key(package.name()) {
             return Err(ExecutorError::DuplicatePackage(package.name().to_owned()));
         }
         let name_key = package.name().to_owned();
         let inserted = self.packages.entry(name_key).or_insert(package);
-        Ok(inserted.name().to_owned())
+        Ok(inserted)
     }
 
     /// # Returns
diff --git a/components/spider-task-executor/src/protocol.rs b/components/spider-task-executor/src/protocol.rs
new file mode 100644
index 00000000..935d60d7
--- /dev/null
+++ b/components/spider-task-executor/src/protocol.rs
@@ -0,0 +1,49 @@
+//! Wire protocol between the execution manager and a `spider-task-executor` subprocess.
+//!
+//! The parent encodes each [`Request`] with `bincode` and writes it as one length-delimited frame
+//! over the executor's `stdin`; the executor reads frames, dispatches to the TDL package manager,
+//! and writes one [`Response`] frame back over `stdout`.
+//!
+//! `stderr` is **not** carried over this protocol. The executor writes diagnostics to its own
+//! stderr; how those bytes are disposed of (inherited, piped, redirected to a log file) is a choice
+//! made by whoever spawned the process.
+
+use serde::{Deserialize, Serialize};
+use spider_core::task::TdlContext;
+
+/// Request from the parent process (execution manager) to the executor.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum Request {
+    Execute {
+        /// TDL information for identifying which task to execute.
+        tdl_context: TdlContext,
+
+        /// Serialized task context.
+        raw_ctx: Vec<u8>,
+
+        /// Serialized task inputs.
+        raw_inputs: Vec<u8>,
+    },
+
+    Shutdown,
+}
+
+/// Reply from the executor to the parent process.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum Response {
+    Result {
+        outcome: ExecutorOutcome,
+        /// Executor-measured wall-clock microseconds for package lookup/load plus the FFI call.
+        elapsed_us: u64,
+    },
+}
+
+/// Outcome of a task execution.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum ExecutorOutcome {
+    /// Task outputs serialized in wire-format.
+    Success { outputs: Vec<u8> },
+
+    /// [`crate::ExecutorError`] serialized in msgpack.
+    Failure { error: Vec<u8> },
+}
diff --git a/taskfiles/test.yaml b/taskfiles/test.yaml
index 83807015..7d79bfdb 100644
--- a/taskfiles/test.yaml
+++ b/taskfiles/test.yaml
@@ -209,12 +209,20 @@ tasks:
   # @param {string} SPIDER_STORAGE_URL An URL pointing to the MariaDB instance.
   spider-huntsman-unit-tests-executor:
     internal: true
+    vars:
+      # TDL packages are staged under `${G_TDL_PACKAGES_DIR}/<package_name>/lib<package_name>.so`
+      # so that the `spider-task-executor` binary can resolve them via the on-disk layout it
+      # documents.
+      G_TDL_PACKAGES_DIR: "{{.G_BUILD_DIR}}/tdl_packages"
+      G_RUST_RELEASE_DIR: "{{.G_RUST_BUILD_DIR}}/release"
     env:
       MARIADB_PORT: "{{.MARIADB_PORT}}"
       MARIADB_DATABASE: "{{.MARIADB_DATABASE}}"
       MARIADB_USERNAME: "{{.MARIADB_USERNAME}}"
       MARIADB_PASSWORD: "{{.MARIADB_PASSWORD}}"
-      SPIDER_TDL_PACKAGE_COMPLEX: "{{.G_RUST_BUILD_DIR}}/release/libhuntsman_complex.so"
+      SPIDER_TDL_PACKAGE_COMPLEX: "{{.G_TDL_PACKAGES_DIR}}/complex/libcomplex.so"
+      SPIDER_TDL_PACKAGE_DIR: "{{.G_TDL_PACKAGES_DIR}}"
+      SPIDER_TASK_EXECUTOR_BIN: "{{.G_RUST_RELEASE_DIR}}/spider-task-executor"
       SPIDER_TEST_INSTRUMENT_OUTPUT_DIR:
         sh: "echo {{.G_BUILD_DIR}}/spider-instrument-$(uuidgen)"
     requires:
@@ -222,11 +230,22 @@ tasks:
     dir: "{{.ROOT_DIR}}"
     deps: ["toolchains:rust"]
     cmds:
-      - "mkdir ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}"
+      - "mkdir -p ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}"
       - defer: "rm -rf ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}"
       - |-
         . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}"
-        cargo build --package huntsman-complex --release
+        # `--bin` is a workspace-wide target filter; combining it with cdylib packages in the
+        # same `cargo build` would silently exclude the `.so` artifacts. Use one invocation per
+        # artifact to keep the target selection unambiguous.
+        cargo build --release --package huntsman-complex
+        cargo build --release --package integration-test-tasks
+        cargo build --release --package spider-task-executor --bin spider-task-executor
+        mkdir -p "{{.G_TDL_PACKAGES_DIR}}/complex" \
+                 "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks"
+        cp "{{.G_RUST_RELEASE_DIR}}/libhuntsman_complex.so" \
+           "{{.G_TDL_PACKAGES_DIR}}/complex/libcomplex.so"
+        cp "{{.G_RUST_RELEASE_DIR}}/libintegration_test_tasks.so" \
+           "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks/libintegration_test_tasks.so"
         cargo nextest run --all --all-features --run-ignored all --release
       - |-
         for f in ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}/*; do
diff --git a/tests/huntsman/integration-test-tasks/Cargo.toml b/tests/huntsman/integration-test-tasks/Cargo.toml
new file mode 100644
index 00000000..0c77122e
--- /dev/null
+++ b/tests/huntsman/integration-test-tasks/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "integration-test-tasks"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[lib]
+# `cdylib` is what the task-executor dlopens; `rlib` lets other Rust crates (the integration
+# tests) `use` constants like `INSTRUMENT_SLEEP_US`.
+crate-type = ["cdylib", "rlib"]
+name = "integration_test_tasks"
+path = "src/lib.rs"
+
+[dependencies]
+serde = { version = "1.0.228", features = ["derive"] }
+spider-tdl = { path = "../../../components/spider-tdl", features = ["derive"] }
diff --git a/tests/huntsman/integration-test-tasks/src/lib.rs b/tests/huntsman/integration-test-tasks/src/lib.rs
new file mode 100644
index 00000000..1f6bc731
--- /dev/null
+++ b/tests/huntsman/integration-test-tasks/src/lib.rs
@@ -0,0 +1,75 @@
+//! Test TDL package used by the `task-executor` integration tests.
+//!
+//! Exposes four tasks that exercise distinct executor code paths:
+//!
+//! * [`task_decl::fibonacci`] — basic compute + correctness.
+//! * [`task_decl::always_fail`] — in-task error reporting.
+//! * [`task_decl::always_panic`] — process-level crash handling.
+//! * [`task_decl::sleep_and_echo`] — fixed-cost task: sleeps for a known [`INSTRUMENT_SLEEP_US`]
+//!   duration then echoes its `Vec<String>` payload back. Used by the overhead bench so the
+//!   non-sleep portion of the executor's reported FFI time isolates the in-executor input/output
+//!   serde cost, while the parent-side delta isolates IPC framing cost.
+
+/// The constant sleep duration used by [`task_decl::sleep_and_echo`].
+///
+/// Exposed at crate scope so the overhead bench, which cannot read the value out of the dlopen-ed
+/// cdylib, can reference the same constant via the rlib and stay in sync if it changes.
+pub const INSTRUMENT_SLEEP_US: u64 = 50;
+
+mod task_decl {
+    use std::{thread::sleep, time::Duration};
+
+    use spider_tdl::{TaskContext, TdlError, task};
+
+    use crate::INSTRUMENT_SLEEP_US;
+
+    /// Computes the `index`-th Fibonacci number with a deliberately naive recursive
+    /// implementation so the call has measurable CPU cost for the overhead benchmark.
+    #[task(name = "fibonacci")]
+    pub fn fibonacci(_ctx: TaskContext, index: u64) -> Result<u64, TdlError> {
+        Ok(fib(index))
+    }
+
+    fn fib(index: u64) -> u64 {
+        if index < 2 {
+            index
+        } else {
+            fib(index - 1) + fib(index - 2)
+        }
+    }
+
+    /// Always returns a [`TdlError::ExecutionError`].
+    #[task(name = "always_fail")]
+    pub fn always_fail(_ctx: TaskContext) -> Result<u64, TdlError> {
+        Err(TdlError::ExecutionError(
+            "always_fail: intentional failure".to_owned(),
+        ))
+    }
+
+    /// Always panics. The panic crosses the `extern "C"` FFI boundary, which aborts the executor
+    /// process — the test asserts the parent observes that crash.
+    #[task(name = "always_panic")]
+    pub fn always_panic(_ctx: TaskContext) -> Result<u64, TdlError> {
+        panic!("always_panic: intentional panic")
+    }
+
+    /// Sleeps for a fixed [`INSTRUMENT_SLEEP_US`] microseconds, then echoes the input back.
+    ///
+    /// The fixed-cost body lets the overhead bench subtract the known sleep from the executor's
+    /// reported FFI duration, isolating the in-executor input/output serde overhead.
+    #[task(name = "sleep_and_echo")]
+    pub fn sleep_and_echo(_ctx: TaskContext, items: Vec<String>) -> Result<Vec<String>, TdlError> {
+        sleep(Duration::from_micros(INSTRUMENT_SLEEP_US));
+        Ok(items)
+    }
+}
+
+spider_tdl::register_tdl_package! {
+    package_name: "integration_test_tasks",
+    tasks: [
+        task_decl::fibonacci,
+        task_decl::always_fail,
+        task_decl::always_panic,
+        task_decl::sleep_and_echo,
+    ],
+}
diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml
new file mode 100644
index 00000000..0d237bef
--- /dev/null
+++ b/tests/huntsman/task-executor/Cargo.toml
@@ -0,0 +1,42 @@
+[package]
+name = "task-executor-tests"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[lib]
+name = "task_executor_tests"
+path = "src/lib.rs"
+
+[[test]]
+name = "executor"
+path = "tests/test_executor.rs"
+
+[[test]]
+name = "overhead_instrument"
+path = "tests/overhead_instrument.rs"
+
+[dependencies]
+bincode = "1.3.3"
+bytes = "1.10"
+futures-util = { version = "0.3.31", default-features = false, features = [
+  "sink",
+  "std"
+] }
+rmp-serde = "1.3.1"
+serde = { version = "1.0.228", features = ["derive"] }
+spider-core = { path = "../../../components/spider-core" }
+spider-task-executor = { path = "../../../components/spider-task-executor" }
+spider-tdl = { path = "../../../components/spider-tdl" }
+tokio = { version = "1.50.0", features = [
+  "io-util",
+  "macros",
+  "process",
+  "rt",
+  "time"
+] }
+tokio-util = { version = "0.7", features = ["codec"] }
+
+[dev-dependencies]
+integration-test-tasks = { path = "../integration-test-tasks" }
+tabled = "0.20.0"
diff --git a/tests/huntsman/task-executor/src/lib.rs b/tests/huntsman/task-executor/src/lib.rs
new file mode 100644
index 00000000..c42a20f4
--- /dev/null
+++ b/tests/huntsman/task-executor/src/lib.rs
@@ -0,0 +1,275 @@
+//! Test harness shared by the `task-executor-tests` integration tests.
+//!
+//! Spawns the `spider-task-executor` binary as a child process, frames bincode requests on its
+//! stdin and reads bincode responses from its stdout — the exact wire protocol of
+//! [`spider_task_executor::protocol`].
+//!
+//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are
+//! infrastructure, not production code, and the panic message + backtrace is more useful at the
+//! failure site than threading an error type through every helper.
+//!
+//! Environment:
+//!
+//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary.
+//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to
+//!   the child verbatim.
+
+use std::{path::PathBuf, process::Stdio};
+
+use bytes::Bytes;
+use futures_util::{SinkExt, StreamExt};
+use spider_core::{
+    task::TdlContext,
+    types::{
+        id::{JobId, ResourceGroupId, TaskId},
+        io::TaskInput,
+    },
+};
+use spider_task_executor::protocol::{Request, Response};
+use spider_tdl::{
+    TaskContext,
+    wire::{TaskInputsSerializer, TaskOutputsSerializer},
+};
+use tokio::process::{Child, ChildStdin, ChildStdout, Command};
+use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec};
+
+/// The TDL package name registered by `integration-test-tasks`.
+pub const PACKAGE_NAME: &str = "integration_test_tasks";
+
+/// A running executor subprocess together with length-delimited framing over its stdin / stdout.
+///
+/// The child is spawned with `kill_on_drop(true)`, so dropping the handle kills the process.
+pub struct ExecutorHandle {
+    child: Child,
+    requests: FramedWrite<ChildStdin, LengthDelimitedCodec>,
+    responses: FramedRead<ChildStdout, LengthDelimitedCodec>,
+}
+
+impl ExecutorHandle {
+    /// Launches the executor binary with `SPIDER_TDL_PACKAGE_DIR` forwarded, stdin / stdout piped,
+    /// and stderr inherited so the child's panic / abort messages reach the test log.
+    ///
+    /// # Returns
+    ///
+    /// A handle owning the running subprocess and framed I/O.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an env var is missing, the spawn fails, or a stdio handle cannot be claimed.
+    #[must_use]
+    pub fn spawn() -> Self {
+        let mut command = Command::new(task_executor_bin());
+        command
+            .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir())
+            .stdin(Stdio::piped())
+            .stdout(Stdio::piped())
+            .stderr(Stdio::inherit())
+            .kill_on_drop(true);
+        let mut child = command.spawn().expect("spawn executor binary");
+        let to_child = child.stdin.take().expect("stdin must be piped");
+        let from_child = child.stdout.take().expect("stdout must be piped");
+        Self {
+            child,
+            requests: FramedWrite::new(to_child, LengthDelimitedCodec::new()),
+            responses: FramedRead::new(from_child, LengthDelimitedCodec::new()),
+        }
+    }
+
+    /// Bincode-encodes `req` and writes it to the executor's stdin as one length-delimited frame.
+    ///
+    /// # Panics
+    ///
+    /// Panics if encoding fails or the stdin pipe cannot be written.
+    pub async fn send(&mut self, req: &Request) {
+        let encoded = bincode::serialize(req).expect("bincode encode Request");
+        // `SinkExt::send` also flushes, so the frame is written to the pipe before this returns.
+        self.requests
+            .send(Bytes::from(encoded))
+            .await
+            .expect("write request frame");
+    }
+
+    /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it.
+    ///
+    /// Prefer [`Self::try_recv`] when stdout EOF is an expected outcome rather than a failure.
+    ///
+    /// # Returns
+    ///
+    /// The next [`Response`] from the executor.
+    ///
+    /// # Panics
+    ///
+    /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails.
+    pub async fn recv(&mut self) -> Response {
+        // Reading and decoding live in `try_recv`, which panics on frame-I/O and decode errors;
+        // the only extra failure mode here is EOF.
+        let reply = self.try_recv().await;
+        reply.expect("executor closed stdout before reply")
+    }
+
+    /// Reads at most one frame; a clean EOF yields `None` so crash-path tests can observe it.
+    ///
+    /// # Returns
+    ///
+    /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails.
+    pub async fn try_recv(&mut self) -> Option<Response> {
+        let next = self.responses.next().await?;
+        let frame = next.expect("read response frame");
+        let response = bincode::deserialize(&frame).expect("bincode decode Response");
+        Some(response)
+    }
+
+    /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly.
+    ///
+    /// # Panics
+    ///
+    /// Panics if waiting on the child fails or the child exits non-zero.
+    pub async fn shutdown_clean(mut self) {
+        self.send(&Request::Shutdown).await;
+        // Dropping the writer closes stdin, so the child reads EOF right after `Shutdown`.
+        drop(self.requests);
+        let exit = self.child.wait().await.expect("wait for executor");
+        assert!(exit.success(), "executor exited with status {exit:?}");
+    }
+
+    /// Closes stdin and waits for the child to exit, for crash-path tests with no clean shutdown.
+    ///
+    /// # Returns
+    ///
+    /// The child's [`ExitStatus`](std::process::ExitStatus).
+    ///
+    /// # Panics
+    ///
+    /// Panics if waiting on the child fails.
+    pub async fn wait_for_exit(mut self) -> std::process::ExitStatus {
+        drop(self.requests);
+        let waited = self.child.wait().await;
+        waited.expect("wait for executor")
+    }
+}
+
+/// # Returns
+///
+/// The path of the `spider-task-executor` binary, taken verbatim from `SPIDER_TASK_EXECUTOR_BIN`.
+///
+/// # Panics
+///
+/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset.
+#[must_use]
+pub fn task_executor_bin() -> PathBuf {
+    let raw = std::env::var_os("SPIDER_TASK_EXECUTOR_BIN");
+    let raw = raw.expect("SPIDER_TASK_EXECUTOR_BIN env var not set");
+    PathBuf::from(raw)
+}
+
+/// # Returns
+///
+/// The TDL package staging directory, taken verbatim from `SPIDER_TDL_PACKAGE_DIR`. The same
+/// value is forwarded into the executor child's environment so it resolves
+/// `${SPIDER_TDL_PACKAGE_DIR}/<package>/lib<package>.so`.
+///
+/// # Panics
+///
+/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset.
+#[must_use]
+pub fn tdl_package_dir() -> PathBuf {
+    let raw = std::env::var_os("SPIDER_TDL_PACKAGE_DIR");
+    let raw = raw.expect("SPIDER_TDL_PACKAGE_DIR env var not set");
+    PathBuf::from(raw)
+}
+
+/// # Returns
+///
+/// A placeholder msgpack-encoded [`TaskContext`] with fresh ids that the executor doesn't inspect.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding fails (the test ids serialize trivially).
+#[must_use]
+pub fn build_ctx() -> Vec<u8> {
+    let placeholder = TaskContext {
+        job_id: JobId::new(),
+        task_id: TaskId::new(),
+        task_instance_id: 1,
+        resource_group_id: ResourceGroupId::new(),
+    };
+    let encoded = rmp_serde::to_vec(&placeholder);
+    encoded.expect("serialize TaskContext")
+}
+
+/// # Type Parameters
+///
+/// * `T` - The Serde-serializable type of the task's single input value.
+///
+/// # Returns
+///
+/// A wire-format buffer holding one [`TaskInput::ValuePayload`] whose bytes are the
+/// msgpack-encoded `value` — i.e. the same shape the parent ships for a single-argument task.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding or wire-format append fails.
+#[must_use]
+pub fn encode_single_input<T: serde::Serialize>(value: &T) -> Vec<u8> {
+    let mut serializer = TaskInputsSerializer::new();
+    let payload = rmp_serde::to_vec(value).expect("msgpack encode input");
+    // Exactly one entry is appended: the buffer models a single-argument call.
+    serializer
+        .append(TaskInput::ValuePayload(payload))
+        .expect("append wire-format input");
+    serializer.release()
+}
+
+/// # Returns
+///
+/// The wire-format buffer of a serializer that had nothing appended, i.e. zero inputs — for
+/// nullary tasks like `always_fail` and `always_panic`.
+#[must_use]
+pub fn encode_no_inputs() -> Vec<u8> {
+    TaskInputsSerializer::new().release()
+}
+
+/// # Type Parameters
+///
+/// * `T` - The Serde-deserializable type the output payload should decode into.
+///
+/// # Returns
+///
+/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `T`.
+///
+/// # Panics
+///
+/// Panics if `output_bytes` is not a decodable wire-format buffer, doesn't hold exactly one
+/// value, or if the msgpack decode fails.
+#[must_use]
+pub fn decode_single_output<T: serde::de::DeserializeOwned>(output_bytes: &[u8]) -> T {
+    let payloads =
+        TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs");
+    let payload_count = payloads.len();
+    assert_eq!(
+        payload_count,
+        1,
+        "expected exactly one output payload, got {payload_count}",
+    );
+    rmp_serde::from_slice(&payloads[0]).expect("msgpack decode output")
+}
+
+/// # Returns
+///
+/// A [`Request::Execute`] addressed to `task_func` in the [`PACKAGE_NAME`] package, carrying a
+/// context from [`build_ctx`] and the caller-supplied wire-format `raw_inputs`.
+#[must_use]
+pub fn execute_request(task_func: &str, raw_inputs: Vec<u8>) -> Request {
+    Request::Execute {
+        tdl_context: TdlContext {
+            package: String::from(PACKAGE_NAME),
+            task_func: String::from(task_func),
+        },
+        raw_ctx: build_ctx(),
+        raw_inputs,
+    }
+}
diff --git a/tests/huntsman/task-executor/tests/overhead_instrument.rs b/tests/huntsman/task-executor/tests/overhead_instrument.rs
new file mode 100644
index 00000000..fc4e146e
--- /dev/null
+++ b/tests/huntsman/task-executor/tests/overhead_instrument.rs
@@ -0,0 +1,228 @@
+//! Measures the round-trip overhead of one task execution through the `spider-task-executor`
+//! binary.
+//!
+//! Drives the `sleep_and_echo` task — which sleeps for a known constant
+//! [`INSTRUMENT_SLEEP_US`](integration_test_tasks::INSTRUMENT_SLEEP_US) and then echoes its
+//! `Vec<String>` payload — against a *long-lived* executor subprocess (the FFI library is
+//! cached after the first call, so subsequent dispatches measure steady-state overhead, not
+//! one-time dlopen cost). With the work portion held constant we can split the cost into:
+//!
+//! * `e2e`: parent's wall-clock around `send(Execute)` → `recv(Response::Result)`.
+//! * `executor`: the in-executor FFI duration, taken straight from
+//!   [`Response::Result::elapsed_us`]. This is `INSTRUMENT_SLEEP_US` + the executor's in-FFI
+//!   input/output serde.
+//! * `executor_internal`: `executor - INSTRUMENT_SLEEP_US`. Approximates the in-executor
+//!   input/output serde cost alone.
+//! * `ipc_overhead`: `e2e - executor`. The parent-side framing + bincode + pipe traversal.
+//!
+//! Aggregates (avg, p50, p95, p99) for each metric land in a markdown table at
+//! `${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}/task_executor_overhead.md`.
+
+use std::{
+    fs::File,
+    io::Write,
+    path::PathBuf,
+    time::{Duration, Instant},
+};
+
+use integration_test_tasks::INSTRUMENT_SLEEP_US;
+use spider_task_executor::protocol::{ExecutorOutcome, Response};
+use tabled::{Table, Tabled};
+use task_executor_tests::{
+    ExecutorHandle,
+    decode_single_output,
+    encode_single_input,
+    execute_request,
+};
+
+const PAYLOAD_LEN: usize = 100;
+const ITERATIONS: usize = 10;
+const OUTPUT_FILE: &str = "task_executor_overhead.md";
+const INSTRUMENT_OUTPUT_DIR_ENV: &str = "SPIDER_TEST_INSTRUMENT_OUTPUT_DIR";
+
+/// One table row: a metric label, its sample count, and pre-formatted latency aggregates (µs).
+#[derive(Tabled)]
+struct LatencyRow {
+    #[tabled(rename = "Metric")]
+    metric: &'static str,
+    #[tabled(rename = "Count")]
+    count: usize,
+    #[tabled(rename = "Avg (µs)")]
+    avg_us: String,
+    #[tabled(rename = "P50 (µs)")]
+    p50_us: String,
+    #[tabled(rename = "P95 (µs)")]
+    p95_us: String,
+    #[tabled(rename = "P99 (µs)")]
+    p99_us: String,
+}
+
+impl LatencyRow {
+    /// Sorts `samples` in place and derives the count, mean, and p50 / p95 / p99 in microseconds.
+    ///
+    /// # Returns
+    ///
+    /// A populated [`LatencyRow`], or a row with `"N/A"` aggregates when `samples` is empty.
+    fn from_samples(metric: &'static str, samples: &mut [Duration]) -> Self {
+        let count = samples.len();
+        if count == 0 {
+            let not_available = || "N/A".to_owned();
+            return Self {
+                metric,
+                count,
+                avg_us: not_available(),
+                p50_us: not_available(),
+                p95_us: not_available(),
+                p99_us: not_available(),
+            };
+        }
+        samples.sort_unstable();
+        let as_micros = |sample: Duration| sample.as_secs_f64() * 1_000_000.0;
+        // Sample at the `pct`-th percentile; the index is clamped to the last element.
+        let percentile = |pct: usize| as_micros(samples[(count * pct / 100).min(count - 1)]);
+        let total: Duration = samples.iter().sum();
+        #[allow(clippy::cast_precision_loss)]
+        let avg = as_micros(total) / count as f64;
+        Self {
+            metric,
+            count,
+            avg_us: format!("{avg:.2}"),
+            p50_us: format!("{:.2}", percentile(50)),
+            p95_us: format!("{:.2}", percentile(95)),
+            p99_us: format!("{:.2}", percentile(99)),
+        }
+    }
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib, `spider-task-executor` binary, and \
+            SPIDER_TEST_INSTRUMENT_OUTPUT_DIR"]
+async fn instrument_overhead() {
+    let output_dir = std::env::var_os(INSTRUMENT_OUTPUT_DIR_ENV).map_or_else(
+        || panic!("{INSTRUMENT_OUTPUT_DIR_ENV} env var not set"),
+        PathBuf::from,
+    );
+
+    let mut handle = ExecutorHandle::spawn();
+
+    let payload = path_like_payload(PAYLOAD_LEN);
+    let raw_inputs = encode_single_input(&payload);
+    let sleep_floor = Duration::from_micros(INSTRUMENT_SLEEP_US);
+
+    // Warm-up: first call dlopens the package. Assert correctness; discard timing.
+    handle
+        .send(&execute_request("sleep_and_echo", raw_inputs.clone()))
+        .await;
+    expect_echo(&handle.recv().await, &payload);
+
+    let mut e2e_samples = Vec::with_capacity(ITERATIONS);
+    let mut executor_samples = Vec::with_capacity(ITERATIONS);
+    let mut executor_internal_samples = Vec::with_capacity(ITERATIONS);
+    let mut ipc_overhead_samples = Vec::with_capacity(ITERATIONS);
+
+    for _ in 0..ITERATIONS {
+        let started = Instant::now();
+        handle
+            .send(&execute_request("sleep_and_echo", raw_inputs.clone()))
+            .await;
+        let response = handle.recv().await;
+        let e2e = started.elapsed();
+
+        let Response::Result {
+            outcome,
+            elapsed_us,
+        } = response;
+        let ExecutorOutcome::Success { outputs } = outcome else {
+            panic!("sleep_and_echo task unexpectedly failed in overhead loop");
+        };
+        let got: Vec<String> = decode_single_output(&outputs);
+        assert_eq!(got, payload);
+
+        let executor = Duration::from_micros(elapsed_us);
+        // Defensive: a coarse system clock could in principle report e2e < executor, or executor <
+        // sleep_floor (the sleep can return slightly early on some platforms). Treat both as zero
+        // overhead and keep the sample for visibility.
+        let executor_internal = executor.saturating_sub(sleep_floor);
+        let ipc_overhead = e2e.saturating_sub(executor);
+
+        e2e_samples.push(e2e);
+        executor_samples.push(executor);
+        executor_internal_samples.push(executor_internal);
+        ipc_overhead_samples.push(ipc_overhead);
+    }
+
+    handle.shutdown_clean().await;
+
+    // `from_samples` sorts each series in place. Nothing reads the samples in arrival order
+    // after this point, so the vectors are passed directly rather than through throwaway
+    // clones.
+    let rows = vec![
+        LatencyRow::from_samples("E2E (parent)", &mut e2e_samples),
+        LatencyRow::from_samples("Executor FFI", &mut executor_samples),
+        LatencyRow::from_samples(
+            "Executor internal (FFI - sleep)",
+            &mut executor_internal_samples,
+        ),
+        LatencyRow::from_samples("IPC overhead (E2E - FFI)", &mut ipc_overhead_samples),
+    ];
+    let table = Table::new(rows).to_string();
+
+    let preamble = format!(
+        "# Task-executor overhead\n\nInputs: `sleep_and_echo` task with {PAYLOAD_LEN} path-like \
+         strings echoed after a {INSTRUMENT_SLEEP_US}µs sleep, {ITERATIONS} samples (excluding \
+         warm-up).\n\n* `Executor internal` ≈ in-executor input/output serde cost.\n* `IPC \
+         overhead` ≈ parent-side framing + bincode + pipe traversal.\n\n"
+    );
+
+    let path = output_dir.join(OUTPUT_FILE);
+    let mut file =
+        File::create(&path).unwrap_or_else(|err| panic!("create {} failed: {err}", path.display()));
+    file.write_all(preamble.as_bytes()).expect("write preamble");
+    file.write_all(table.as_bytes()).expect("write table");
+    file.write_all(b"\n").expect("write trailing newline");
+}
+
+/// Generates `len` deterministic path-like strings by cycling through fixed prefix and suffix
+/// tables, which keeps the payload realistic-looking without depending on `rand`.
+///
+/// # Returns
+///
+/// A `Vec<String>` of length `len`.
+fn path_like_payload(len: usize) -> Vec<String> {
+    const PREFIXES: &[&str] = &[
+        "/var/log",
+        "/usr/local/bin",
+        "/etc/spider",
+        "/home/user/projects",
+        "/opt/data/cache",
+    ];
+    const SUFFIXES: &[&str] = &["log", "txt", "bin", "json", "tmp"];
+    let mut paths = Vec::with_capacity(len);
+    for idx in 0..len {
+        let prefix = PREFIXES[idx % PREFIXES.len()];
+        let suffix = SUFFIXES[(idx / PREFIXES.len()) % SUFFIXES.len()];
+        paths.push(format!("{prefix}/file_{:04}_{idx:05}.{suffix}", (idx * 31) % 10_000));
+    }
+    paths
+}
+
+/// Checks that `response` carries a `Success` outcome whose decoded payload equals `expected`.
+///
+/// # Panics
+///
+/// Panics if the response is a `Failure` (the decoded
+/// [`ExecutorError`](spider_task_executor::ExecutorError) is included in the panic message), or if
+/// the decoded payload doesn't match `expected`.
+fn expect_echo(response: &Response, expected: &[String]) {
+    let Response::Result { outcome, .. } = response;
+    let raw_outputs = match outcome {
+        ExecutorOutcome::Failure { error } => {
+            let err: spider_task_executor::ExecutorError =
+                rmp_serde::from_slice(error).expect("decode ExecutorError payload");
+            panic!("sleep_and_echo warm-up returned Failure: {err:?}");
+        }
+        ExecutorOutcome::Success { outputs } => outputs,
+    };
+    let echoed: Vec<String> = decode_single_output(raw_outputs);
+    assert_eq!(echoed, expected, "warm-up output mismatch");
+}
diff --git a/tests/huntsman/task-executor/tests/test_executor.rs b/tests/huntsman/task-executor/tests/test_executor.rs
new file mode 100644
index 00000000..e2eb8ec4
--- /dev/null
+++ b/tests/huntsman/task-executor/tests/test_executor.rs
@@ -0,0 +1,90 @@
+//! End-to-end correctness tests against the `spider-task-executor` binary.
+//!
+//! Each test spawns a fresh executor subprocess via [`ExecutorHandle::spawn`], exchanges one framed
+//! bincode request/response over the binary's stdin/stdout, and asserts on the result.
+
+use spider_task_executor::{
+    ExecutorError,
+    protocol::{ExecutorOutcome, Response},
+};
+use spider_tdl::TdlError;
+use task_executor_tests::{
+    ExecutorHandle,
+    decode_single_output,
+    encode_no_inputs,
+    encode_single_input,
+    execute_request,
+};
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn fibonacci_returns_correct_value() {
+    let mut executor = ExecutorHandle::spawn();
+    let n: u64 = 10;
+    let request = execute_request("fibonacci", encode_single_input(&n));
+    executor.send(&request).await;
+    let Response::Result { outcome, .. } = executor.recv().await;
+    match outcome {
+        ExecutorOutcome::Failure { error } => {
+            let err: ExecutorError =
+                rmp_serde::from_slice(&error).expect("decode ExecutorError payload");
+            panic!("expected Success for fibonacci(10), got Failure: {err:?}");
+        }
+        ExecutorOutcome::Success { outputs } => {
+            let fib: u64 = decode_single_output(&outputs);
+            // Fib(10) = 55
+            assert_eq!(fib, 55);
+        }
+    }
+    // A clean exit also shows the executor survived the request.
+    executor.shutdown_clean().await;
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn always_fail_reports_task_error() {
+    let mut executor = ExecutorHandle::spawn();
+    let request = execute_request("always_fail", encode_no_inputs());
+    executor.send(&request).await;
+    let Response::Result { outcome, .. } = executor.recv().await;
+    match outcome {
+        ExecutorOutcome::Failure { error } => {
+            let err: ExecutorError =
+                rmp_serde::from_slice(&error).expect("decode ExecutorError payload");
+            let ExecutorError::TaskError(TdlError::ExecutionError(message)) = &err else {
+                panic!("expected TaskError(ExecutionError), got {err:?}");
+            };
+            assert!(
+                message.contains("always_fail"),
+                "unexpected error message: {message}",
+            );
+        }
+        ExecutorOutcome::Success { outputs } => {
+            panic!("expected Failure, got Success with {} bytes", outputs.len());
+        }
+    }
+    // A task-level error must leave the executor alive and able to shut down cleanly.
+    executor.shutdown_clean().await;
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn always_panic_crashes_the_process() {
+    let mut handle = ExecutorHandle::spawn();
+    let request = execute_request("always_panic", encode_no_inputs());
+    handle.send(&request).await;
+
+    // A panic across the `extern "C"` boundary aborts the executor process. The parent must
+    // observe stdout EOF (no further frames) and a non-zero exit status.
+    let frame = handle.try_recv().await;
+    assert!(
+        frame.is_none(),
+        "expected stdout EOF after panic, got a response frame: {frame:?}",
+    );
+    // `wait_for_exit` consumes the handle, so this is the last interaction with the child.
+    let status = handle.wait_for_exit().await;
+    assert!(
+        !status.success(),
+        "expected non-zero exit after panic, got {status:?}",
+    );
+}
diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs
index 007cb557..513e7d75 100644
--- a/tests/huntsman/tdl-integration/tests/complex.rs
+++ b/tests/huntsman/tdl-integration/tests/complex.rs
@@ -88,8 +88,8 @@ fn decode_complex_vec(output_bytes: &[u8]) -> anyhow::Result<ComplexVec> {
 fn load_and_query_name() -> anyhow::Result<()> {
     let path = lib_path();
     let mut manager = TdlPackageManager::new();
-    let name = manager.load(&path)?;
-    assert_eq!(name, PACKAGE_NAME);
+    let pkg = manager.load(&path)?;
+    assert_eq!(pkg.name(), PACKAGE_NAME);
     let pkg = manager
         .get(PACKAGE_NAME)
         .expect("just-loaded package should be retrievable");

From 86c7bea4c1542745d8e1b36699ba6e674f8bf6d9 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Tue, 26 May 2026 16:07:06 -0400
Subject: [PATCH 02/14] feat(spider-execution-manager): Add single-process
 supervisor pool for the task executor. (#326)

---
 Cargo.lock                                    |  18 +
 Cargo.toml                                    |   1 +
 .../spider-execution-manager/Cargo.toml       |  28 ++
 .../spider-execution-manager/src/lib.rs       |   4 +
 .../src/process_pool.rs                       | 385 ++++++++++++++++++
 components/spider-storage/Cargo.toml          |  10 +-
 components/spider-task-executor/Cargo.toml    |  31 +-
 examples/huntsman/complex/tasks/Cargo.toml    |   5 +-
 tests/huntsman/task-executor/Cargo.toml       |  27 +-
 .../task-executor/tests/test_process_pool.rs  | 208 ++++++++++
 10 files changed, 682 insertions(+), 35 deletions(-)
 create mode 100644 components/spider-execution-manager/Cargo.toml
 create mode 100644 components/spider-execution-manager/src/lib.rs
 create mode 100644 components/spider-execution-manager/src/process_pool.rs
 create mode 100644 tests/huntsman/task-executor/tests/test_process_pool.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1c7f0093..2888d5e8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1482,6 +1482,23 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "spider-execution-manager"
+version = "0.1.0"
+dependencies = [
+ "bincode",
+ "bytes",
+ "futures-util",
+ "rmp-serde",
+ "spider-core",
+ "spider-task-executor",
+ "spider-tdl",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
 [[package]]
 name = "spider-storage"
 version = "0.1.0"
@@ -1872,6 +1889,7 @@ dependencies = [
  "rmp-serde",
  "serde",
  "spider-core",
+ "spider-execution-manager",
  "spider-task-executor",
  "spider-tdl",
  "tabled",
diff --git a/Cargo.toml b/Cargo.toml
index 67362f87..ea9992cf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ resolver = "3"
 members = [
   "components/spider-core",
   "components/spider-derive",
+  "components/spider-execution-manager",
   "components/spider-storage",
   "components/spider-task-executor",
   "components/spider-tdl",
diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml
new file mode 100644
index 00000000..ed8e74db
--- /dev/null
+++ b/components/spider-execution-manager/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "spider-execution-manager"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+name = "spider_execution_manager"
+path = "src/lib.rs"
+
+[dependencies]
+bincode = "1.3.3"
+bytes = "1.10"
+futures-util = {
+  version = "0.3.31",
+  default-features = false,
+  features = ["sink", "std"]
+}
+rmp-serde = "1.3.1"
+spider-core = { path = "../spider-core" }
+spider-task-executor = { path = "../spider-task-executor" }
+spider-tdl = { path = "../spider-tdl" }
+thiserror = "2.0.18"
+tokio = {
+  version = "1.50.0",
+  features = ["io-util", "macros", "process", "rt", "sync", "time"]
+}
+tokio-util = { version = "0.7", features = ["codec"] }
+tracing = { version = "0.1.41", default-features = false, features = ["std"] }
diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs
new file mode 100644
index 00000000..2d7171a9
--- /dev/null
+++ b/components/spider-execution-manager/src/lib.rs
@@ -0,0 +1,4 @@
+//! Execution manager — the per-node service that drives Spider task execution against a
+//! `spider-task-executor` subprocess.
+
+pub mod process_pool;
diff --git a/components/spider-execution-manager/src/process_pool.rs b/components/spider-execution-manager/src/process_pool.rs
new file mode 100644
index 00000000..fab51d53
--- /dev/null
+++ b/components/spider-execution-manager/src/process_pool.rs
@@ -0,0 +1,385 @@
+//! Process supervisor for `spider-task-executor` subprocesses.
+
+use std::{
+    fs::File,
+    path::PathBuf,
+    process::Stdio,
+    sync::atomic::{AtomicU64, Ordering},
+    time::Duration,
+};
+
+use bytes::Bytes;
+use futures_util::{SinkExt, StreamExt};
+use spider_core::types::{
+    id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId},
+    io::ExecutionContext,
+};
+use spider_task_executor::protocol::{ExecutorOutcome, Request, Response};
+use spider_tdl::{
+    TaskContext,
+    wire::{TaskInputsSerializer, WireError},
+};
+use tokio::{
+    process::{Child, ChildStdin, ChildStdout, Command},
+    sync::Mutex,
+};
+use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec};
+
+/// Pool configuration. Supplied once at construction time and never mutated.
+#[derive(Debug, Clone)]
+pub struct ProcessPoolConfig {
+    /// Identity of the owning execution manager.
+    pub em_id: ExecutionManagerId,
+
+    /// Absolute path to the `spider-task-executor` binary the pool will spawn.
+    pub executor_binary_path: PathBuf,
+
+    /// Directory exposed to the child via `SPIDER_TDL_PACKAGE_DIR`. The executor resolves
+    /// `${dir}/<package>/lib<package>.so` for each package it dispatches.
+    pub package_dir: PathBuf,
+
+    /// Directory the pool writes per-executor stderr log files into. Each spawn opens
+    /// `<log_dir>/<em_id>-<executor_id>.log` in create-or-append mode and routes the child's
+    /// stderr there.
+    ///
+    /// Per-spawn filenames mean each respawn naturally rotates onto a fresh file; a long-lived
+    /// healthy executor accumulates into one file.
+    pub log_dir: PathBuf,
+}
+
+/// Request to execute a task inside the spawned task executor.
+#[derive(Debug)]
+pub struct ExecuteRequest {
+    pub job_id: JobId,
+    pub task_id: TaskId,
+    pub resource_group_id: ResourceGroupId,
+    pub ctx: ExecutionContext,
+}
+
+/// Outcome of a single [`ProcessPool::execute`] call.
+#[derive(Debug)]
+pub enum Outcome {
+    /// Task ran to completion. `outputs` is the wire-format
+    /// [`spider_tdl::wire::TaskOutputsSerializer`] buffer ready to forward to storage as
+    /// `serialized_outputs`. `elapsed_us` is the in-FFI duration measured by the executor.
+    Success { outputs: Vec<u8>, elapsed_us: u64 },
+
+    /// Task ran to completion but returned an error. `error` is the msgpack-encoded
+    /// [`spider_task_executor::ExecutorError`].
+    InTaskFailure { error: Vec<u8>, elapsed_us: u64 },
+
+    /// `hard_timeout` elapsed before the executor replied. The pool has `SIGKILL`-ed the process.
+    Timeout { hard_timeout: Duration },
+
+    /// The executor process exited (or closed stdout) before replying.
+    ExecutorCrash { exit_status: Option<i32> },
+}
+
+/// Internal failure of the pool itself, distinct from a task-execution [`Outcome`]. These indicate
+/// the pool can't serve the current request (and possibly any future request).
+///
+/// This error may indicate a non-recoverable failure. The upper-level caller may need to close the
+/// entire process pool and restart the execution manager service from the ground up.
+#[derive(Debug, thiserror::Error)]
+pub enum InternalError {
+    /// The pool was entered with no running executor.
+    #[error("task executor process is not running")]
+    NotRunning,
+
+    /// Failed to spawn the executor (any I/O step during spawn — `create_dir_all`, log-file open,
+    /// [`Command::spawn`], or claiming the piped stdio handles).
+    #[error("failed to create an executor process: {0}")]
+    ExecutorCreationFailure(#[from] std::io::Error),
+
+    /// Failed to msgpack-encode the [`TaskContext`] when building the executor request.
+    #[error("failed to encode task context: {0}")]
+    EncodeTaskContext(#[from] rmp_serde::encode::Error),
+
+    /// Failed to wire-format-encode the task inputs when building the executor request.
+    #[error("failed to encode task inputs: {0}")]
+    EncodeTaskInputs(#[from] WireError),
+}
+
+/// A process pool holding one pre-spawned task executor subprocess ready for task execution.
+pub struct ProcessPool {
+    config: ProcessPoolConfig,
+    next_executor_id: AtomicU64,
+    /// Lock-serializes concurrent [`Self::execute`] callers. The single executor means each caller
+    /// takes the lock for the whole call, so the mutex is the entire concurrency gate.
+    handle: Mutex<Option<ExecutorHandle>>,
+}
+
+impl ProcessPool {
+    /// Factory function.
+    ///
+    /// Spawns the initial executor process and returns a ready-to-use pool.
+    ///
+    /// # Returns
+    ///
+    /// A pool whose handle already holds a freshly spawned executor on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * Forwards [`Self::spawn_executor`]'s return values on failure.
+    pub fn new(config: ProcessPoolConfig) -> Result<Self, InternalError> {
+        let mut this = Self {
+            config,
+            handle: Mutex::new(None),
+            next_executor_id: AtomicU64::new(0),
+        };
+        let handle = this.spawn_executor().inspect_err(|err| {
+            tracing::error!(err = ? err, "Failed to spawn executor process on construction.");
+        })?;
+        *this.handle.get_mut() = Some(handle);
+        Ok(this)
+    }
+
+    /// Runs one task on the pooled executor.
+    ///
+    /// Locks the handle so concurrent callers queue. Once inside, the request is bincode-framed
+    /// onto the child's stdin and the parent races a deadline against the response frame. On
+    /// timeout or crash the process is killed and respawned before the call returns; subsequent
+    /// calls see a fresh executor.
+    ///
+    /// # Returns
+    ///
+    /// Exactly one [`Outcome`] variant describing the dispatch result on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`InternalError::NotRunning`] if the pool's handle was empty at entry — meaning a prior
+    ///   respawn failed and the pool is unrecoverable. The pool should be discarded.
+    /// * Forwards [`build_request`]'s return values on failure.
+    /// * Forwards [`Self::spawn_executor`]'s return values on failure.
+    pub async fn execute(
+        &self,
+        request: ExecuteRequest,
+        hard_timeout: Duration,
+    ) -> Result<Outcome, InternalError> {
+        let mut handle_guard = self.handle.lock().await;
+        let handle = handle_guard.as_mut().ok_or(InternalError::NotRunning)?;
+        tracing::info!(
+            job_id = ? request.job_id,
+            task_id = ? request.task_id,
+            task_instance_id = ? request.ctx.task_instance_id,
+            executor_id = handle.executor_id,
+            "Task executor acquired for execution."
+        );
+        let frame_request = build_request(request)?;
+        let outcome = handle.run(frame_request, hard_timeout).await;
+
+        if matches!(
+            outcome,
+            Outcome::Timeout { .. } | Outcome::ExecutorCrash { .. }
+        ) {
+            // Dropping the handle will automatically kill the child process.
+            drop(handle_guard.take());
+            let new_handle = self.spawn_executor().inspect_err(|err| {
+                tracing::error!(
+                    err = ? err,
+                    "Failed to respawn the executor process after a crash or timeout."
+                );
+            })?;
+            tracing::info!(
+                executor_id = new_handle.executor_id,
+                "Executor respawned successfully."
+            );
+            *handle_guard = Some(new_handle);
+        }
+
+        drop(handle_guard);
+        Ok(outcome)
+    }
+
+    /// Spawns the executor binary, allocates the next monotonic executor-id, opens the per-executor
+    /// log file, and wraps the child's stdin/stdout in length-delimited codec frames.
+    ///
+    /// The child's stderr is redirected to `<log_dir>/<em_id>-<executor_id>.log` in
+    /// create-or-append mode.
+    ///
+    /// # Returns
+    ///
+    /// A fully wired [`ExecutorHandle`] on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`InternalError::ExecutorCreationFailure`] if the piped stdin or stdout handles cannot be
+    ///   claimed after spawn.
+    /// * Forwards [`std::fs::create_dir_all`]'s return values on failure.
+    /// * Forwards [`std::fs::OpenOptions::open`]'s return values on failure.
+    /// * Forwards [`Command::spawn`]'s return values on failure.
+    fn spawn_executor(&self) -> Result<ExecutorHandle, InternalError> {
+        let executor_id = self.next_executor_id.fetch_add(1, Ordering::Relaxed);
+        std::fs::create_dir_all(&self.config.log_dir)?;
+        let log_path = self.config.log_dir.join(format!(
+            "{}-{executor_id}.log",
+            self.config.em_id.as_uuid_ref()
+        ));
+        let log_file = File::options().create(true).append(true).open(&log_path)?;
+
+        let mut command = Command::new(&self.config.executor_binary_path);
+        command
+            .env("SPIDER_TDL_PACKAGE_DIR", &self.config.package_dir)
+            .stdin(Stdio::piped())
+            .stdout(Stdio::piped())
+            .stderr(Stdio::from(log_file))
+            .kill_on_drop(true);
+        let mut child = command.spawn()?;
+        let stdin = child
+            .stdin
+            .take()
+            .ok_or_else(|| std::io::Error::other("executor stdin not piped"))?;
+        let stdout = child
+            .stdout
+            .take()
+            .ok_or_else(|| std::io::Error::other("executor stdout not piped"))?;
+        tracing::info!(executor_id, "Executor spawned.");
+        Ok(ExecutorHandle {
+            executor_id,
+            child,
+            requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()),
+            responses: FramedRead::new(stdout, LengthDelimitedCodec::new()),
+        })
+    }
+}
+
+/// One running executor subprocess plus framed handles to its stdin / stdout.
+struct ExecutorHandle {
+    executor_id: u64,
+    child: Child,
+    requests: FramedWrite<ChildStdin, LengthDelimitedCodec>,
+    responses: FramedRead<ChildStdout, LengthDelimitedCodec>,
+}
+
+impl ExecutorHandle {
+    /// Sends `request` and awaits exactly one reply, racing it against `hard_timeout` and against
+    /// stdout EOF (process death).
+    ///
+    /// # Returns
+    ///
+    /// Exactly one [`Outcome`] variant:
+    ///
+    /// * [`Outcome::Success`] or [`Outcome::InTaskFailure`] from a well-formed reply.
+    /// * [`Outcome::Timeout`] if `hard_timeout` fires.
+    /// * [`Outcome::ExecutorCrash`] on any write/read/decode failure (which all imply the child is
+    ///   no longer usable).
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`bincode::serialize`] fails to encode `request` — the protocol types are
+    /// `derive(Serialize)` and serialize trivially, so an encoding failure indicates programmer
+    /// error rather than a runtime condition.
+    async fn run(&mut self, request: Request, hard_timeout: Duration) -> Outcome {
+        let bytes = bincode::serialize(&request).expect("bincode encode Request");
+        if let Err(err) = self.requests.send(Bytes::from(bytes)).await {
+            tracing::warn!(
+                executor_id = self.executor_id,
+                err = ? err,
+                "Failed to send request to executor."
+            );
+            return Outcome::ExecutorCrash {
+                exit_status: self.poll_exit_code(),
+            };
+        }
+
+        tokio::select! {
+            biased;
+            frame = self.responses.next() => match frame {
+                Some(Ok(bytes)) => match bincode::deserialize::<Response>(&bytes) {
+                    Ok(Response::Result { outcome, elapsed_us }) => match outcome {
+                        ExecutorOutcome::Success { outputs } => {
+                            Outcome::Success { outputs, elapsed_us }
+                        }
+                        ExecutorOutcome::Failure { error } => {
+                            Outcome::InTaskFailure { error, elapsed_us }
+                        }
+                    },
+                    Err(err) => {
+                        tracing::error!(
+                            executor_id = self.executor_id,
+                            err = ? err,
+                            "Failed to decode executor's response. Considered as crashed."
+                        );
+                        Outcome::ExecutorCrash { exit_status: self.poll_exit_code() }
+                    }
+                },
+                Some(Err(err)) => {
+                    tracing::error!(
+                        executor_id = self.executor_id,
+                        err = ? err,
+                        "Failed to receive executor's response."
+                    );
+                    Outcome::ExecutorCrash { exit_status: self.poll_exit_code() }
+                }
+                None => Outcome::ExecutorCrash { exit_status: self.poll_exit_code() },
+            },
+            () = tokio::time::sleep(hard_timeout) => {
+                tracing::warn!(executor_id = self.executor_id, "Executor time out triggered.");
+                Outcome::Timeout { hard_timeout }
+            }
+        }
+    }
+
+    /// Non-blocking peek at the child's exit status.
+    ///
+    /// # Returns
+    ///
+    /// `Some(code)` if the child has already exited with a code; `None` if it is still running, was
+    /// terminated by a signal, or `try_wait` itself errored.
+    fn poll_exit_code(&mut self) -> Option<i32> {
+        self.child
+            .try_wait()
+            .ok()
+            .flatten()
+            .and_then(|status| status.code())
+    }
+}
+
+/// Builds the wire [`Request::Execute`] from caller inputs.
+///
+/// # Returns
+///
+/// A populated [`Request::Execute`] with `raw_ctx` set to the msgpack-encoded [`TaskContext`] and
+/// `raw_inputs` set to the wire-format [`TaskInputsSerializer`] buffer on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`rmp_serde::to_vec`]'s return values on failure.
+/// * Forwards [`TaskInputsSerializer::append`]'s return values on failure.
+fn build_request(request: ExecuteRequest) -> Result<Request, InternalError> {
+    let ExecuteRequest {
+        job_id,
+        task_id,
+        resource_group_id,
+        ctx,
+    } = request;
+    let ExecutionContext {
+        task_instance_id,
+        tdl_context,
+        timeout_policy: _,
+        inputs,
+    } = ctx;
+    let raw_ctx = rmp_serde::to_vec(&TaskContext {
+        job_id,
+        task_id,
+        task_instance_id,
+        resource_group_id,
+    })?;
+    let mut inputs_ser = TaskInputsSerializer::new();
+    for input in inputs {
+        inputs_ser.append(input)?;
+    }
+    Ok(Request::Execute {
+        tdl_context,
+        raw_ctx,
+        raw_inputs: inputs_ser.release(),
+    })
+}
diff --git a/components/spider-storage/Cargo.toml b/components/spider-storage/Cargo.toml
index 2a661e89..f0a39b72 100644
--- a/components/spider-storage/Cargo.toml
+++ b/components/spider-storage/Cargo.toml
@@ -25,12 +25,10 @@ spider-derive = { path = "../spider-derive" }
 sqlx = { version = "0.8.6", features = ["mysql", "runtime-tokio"] }
 subtle = "2.6.1"
 thiserror = "2.0.18"
-tokio = { version = "1.50.0", features = [
-  "macros",
-  "rt-multi-thread",
-  "sync",
-  "time"
-] }
+tokio = {
+  version = "1.50.0",
+  features = ["macros", "rt-multi-thread", "sync", "time"]
+}
 uuid = { version = "1.19.0", features = ["serde"] }
 
 [dev-dependencies]
diff --git a/components/spider-task-executor/Cargo.toml b/components/spider-task-executor/Cargo.toml
index 789308ca..450d2567 100644
--- a/components/spider-task-executor/Cargo.toml
+++ b/components/spider-task-executor/Cargo.toml
@@ -15,28 +15,25 @@ path = "src/bin/spider_task_executor.rs"
 anyhow = "1.0.98"
 bincode = "1.3.3"
 bytes = "1.10"
-futures-util = { version = "0.3.31", default-features = false, features = [
-  "sink",
-  "std"
-] }
+futures-util = {
+  version = "0.3.31",
+  default-features = false,
+  features = ["sink", "std"]
+}
 libloading = "0.8.5"
 rmp-serde = "1.3.1"
 serde = { version = "1.0.228", features = ["derive"] }
 spider-core = { path = "../spider-core" }
 spider-tdl = { path = "../spider-tdl" }
 thiserror = "2.0.18"
-tokio = { version = "1.50.0", features = [
-  "io-std",
-  "io-util",
-  "macros",
-  "rt",
-  "sync",
-  "time"
-] }
+tokio = {
+  version = "1.50.0",
+  features = ["io-std", "io-util", "macros", "rt", "sync", "time"]
+}
 tokio-util = { version = "0.7", features = ["codec"] }
 tracing = { version = "0.1.41", default-features = false, features = ["std"] }
-tracing-subscriber = { version = "0.3.19", default-features = false, features = [
-  "env-filter",
-  "fmt",
-  "json"
-] }
+tracing-subscriber = {
+  version = "0.3.19",
+  default-features = false,
+  features = ["env-filter", "fmt", "json"]
+}
diff --git a/examples/huntsman/complex/tasks/Cargo.toml b/examples/huntsman/complex/tasks/Cargo.toml
index 71a5dfbc..df76713a 100644
--- a/examples/huntsman/complex/tasks/Cargo.toml
+++ b/examples/huntsman/complex/tasks/Cargo.toml
@@ -12,4 +12,7 @@ path = "src/lib.rs"
 [dependencies]
 huntsman-complex-types = { path = "../types" }
 serde = { version = "1.0.228", features = ["derive"] }
-spider-tdl = { path = "../../../../components/spider-tdl", features = ["derive"] }
+spider-tdl = {
+  path = "../../../../components/spider-tdl",
+  features = ["derive"]
+}
diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml
index 0d237bef..ca86c0ad 100644
--- a/tests/huntsman/task-executor/Cargo.toml
+++ b/tests/huntsman/task-executor/Cargo.toml
@@ -16,27 +16,32 @@ path = "tests/test_executor.rs"
 name = "overhead_instrument"
 path = "tests/overhead_instrument.rs"
 
+[[test]]
+name = "process_pool"
+path = "tests/test_process_pool.rs"
+
 [dependencies]
 bincode = "1.3.3"
 bytes = "1.10"
-futures-util = { version = "0.3.31", default-features = false, features = [
-  "sink",
-  "std"
-] }
+futures-util = {
+  version = "0.3.31",
+  default-features = false,
+  features = ["sink", "std"]
+}
 rmp-serde = "1.3.1"
 serde = { version = "1.0.228", features = ["derive"] }
 spider-core = { path = "../../../components/spider-core" }
 spider-task-executor = { path = "../../../components/spider-task-executor" }
 spider-tdl = { path = "../../../components/spider-tdl" }
-tokio = { version = "1.50.0", features = [
-  "io-util",
-  "macros",
-  "process",
-  "rt",
-  "time"
-] }
+tokio = {
+  version = "1.50.0",
+  features = ["io-util", "macros", "process", "rt", "time"]
+}
 tokio-util = { version = "0.7", features = ["codec"] }
 
 [dev-dependencies]
 integration-test-tasks = { path = "../integration-test-tasks" }
+spider-execution-manager = {
+  path = "../../../components/spider-execution-manager"
+}
 tabled = "0.20.0"
diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs
new file mode 100644
index 00000000..7bc5d332
--- /dev/null
+++ b/tests/huntsman/task-executor/tests/test_process_pool.rs
@@ -0,0 +1,208 @@
+//! End-to-end tests of [`spider_execution_manager::process_pool::ProcessPool`] against the real
+//! task-executor binary.
+//!
+//! Mirrors `tests/test_executor.rs` but exercises the pool's `execute` API rather than the raw
+//! [`task_executor_tests::ExecutorHandle`]. Adds coverage for the two paths that respawn the
+//! executor:
+//!
+//! * Hard timeout — a long-running task is force-killed when the parent's timer fires.
+//! * Crash — a panicking task aborts the executor process.
+//!
+//! Each of those paths is followed by a second `execute` that asserts the pool transparently
+//! respawned the child and is ready to serve again.
+
+use std::time::Duration;
+
+use spider_core::{
+    task::{TdlContext, TimeoutPolicy},
+    types::{
+        id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId},
+        io::{ExecutionContext, TaskInput},
+    },
+};
+use spider_execution_manager::process_pool::{
+    ExecuteRequest,
+    Outcome,
+    ProcessPool,
+    ProcessPoolConfig,
+};
+use spider_task_executor::ExecutorError;
+use spider_tdl::TdlError;
+use task_executor_tests::{PACKAGE_NAME, decode_single_output, task_executor_bin, tdl_package_dir};
+
+/// Generous timeout for tasks expected to finish quickly.
+const NORMAL_TIMEOUT: Duration = Duration::from_secs(5);
+
+/// Hard timeout chosen to fire well before [`SLOW_FIB_INDEX`] can complete even on a fast host.
+/// Tokio's sleep granularity is comfortably below this value.
+const SHORT_TIMEOUT: Duration = Duration::from_millis(200);
+
+/// Fibonacci index whose naive-recursive execution takes well over [`SHORT_TIMEOUT`] on any
+/// realistic host (`fib(45)` ~= 1.1×10^9 recursive calls — about a second in release mode).
+const SLOW_FIB_INDEX: u64 = 45;
+
+/// Builds a fresh [`ProcessPool`] wired to the test-harness env (executor binary + package dir)
+/// with a unique temp log directory.
+///
+/// # Returns
+///
+/// A ready-to-use pool whose handle already holds a spawned executor.
+///
+/// # Panics
+///
+/// Panics if [`ProcessPool::new`] fails — i.e., the task-executor binary cannot be spawned.
+fn build_pool() -> ProcessPool {
+    let em_id = ExecutionManagerId::new();
+    let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{}", em_id.as_uuid_ref()));
+    let config = ProcessPoolConfig {
+        em_id,
+        executor_binary_path: task_executor_bin(),
+        package_dir: tdl_package_dir(),
+        log_dir,
+    };
+    ProcessPool::new(config).expect("construct pool")
+}
+
+/// Builds an [`ExecuteRequest`] targeting `task_func` in the integration package.
+///
+/// # Returns
+///
+/// A request with fresh IDs, a placeholder [`TimeoutPolicy`] (which the pool ignores — the caller
+/// supplies `hard_timeout` directly to [`ProcessPool::execute`]), and the supplied `inputs`.
+fn make_request(task_func: &str, inputs: Vec<TaskInput>) -> ExecuteRequest {
+    ExecuteRequest {
+        job_id: JobId::new(),
+        task_id: TaskId::new(),
+        resource_group_id: ResourceGroupId::new(),
+        ctx: ExecutionContext {
+            task_instance_id: 1,
+            tdl_context: TdlContext {
+                package: PACKAGE_NAME.to_owned(),
+                task_func: task_func.to_owned(),
+            },
+            timeout_policy: TimeoutPolicy {
+                soft_timeout_ms: 100,
+                hard_timeout_ms: 1000,
+            },
+            inputs,
+        },
+    }
+}
+
+/// Wraps `value` into a single-payload input list.
+///
+/// # Type Parameters
+///
+/// * `T` - The Serde-serializable value type carried as the task's single input.
+///
+/// # Returns
+///
+/// A `Vec<TaskInput>` of length 1 carrying the msgpack-encoded `value`.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding fails.
+fn single_input<T: serde::Serialize>(value: &T) -> Vec<TaskInput> {
+    vec![TaskInput::ValuePayload(
+        rmp_serde::to_vec(value).expect("msgpack encode input"),
+    )]
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn fibonacci_succeeds() {
+    let pool = build_pool();
+    let outcome = pool
+        .execute(
+            make_request("fibonacci", single_input(&10_u64)),
+            NORMAL_TIMEOUT,
+        )
+        .await
+        .expect("execute");
+    let Outcome::Success { outputs, .. } = outcome else {
+        panic!("expected Success, got {outcome:?}");
+    };
+    assert_eq!(decode_single_output::<u64>(&outputs), 55);
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn always_fail_reports_task_error() {
+    let pool = build_pool();
+    let outcome = pool
+        .execute(make_request("always_fail", vec![]), NORMAL_TIMEOUT)
+        .await
+        .expect("execute");
+    let Outcome::InTaskFailure { error, .. } = outcome else {
+        panic!("expected InTaskFailure, got {outcome:?}");
+    };
+    let err: ExecutorError = rmp_serde::from_slice(&error).expect("decode ExecutorError");
+    let ExecutorError::TaskError(TdlError::ExecutionError(message)) = err else {
+        panic!("expected TaskError(ExecutionError), got {err:?}");
+    };
+    assert!(
+        message.contains("always_fail"),
+        "unexpected message: {message}"
+    );
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn always_panic_returns_crash_then_respawns() {
+    let pool = build_pool();
+
+    let outcome = pool
+        .execute(make_request("always_panic", vec![]), NORMAL_TIMEOUT)
+        .await
+        .expect("execute (crash)");
+    assert!(
+        matches!(outcome, Outcome::ExecutorCrash { .. }),
+        "expected ExecutorCrash, got {outcome:?}",
+    );
+
+    // The pool must have respawned the executor before returning. A follow-up call must succeed
+    // against the fresh process.
+    let outcome = pool
+        .execute(
+            make_request("fibonacci", single_input(&7_u64)),
+            NORMAL_TIMEOUT,
+        )
+        .await
+        .expect("execute (after respawn)");
+    let Outcome::Success { outputs, .. } = outcome else {
+        panic!("expected Success after respawn, got {outcome:?}");
+    };
+    assert_eq!(decode_single_output::<u64>(&outputs), 13);
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn hard_timeout_kills_then_respawns() {
+    let pool = build_pool();
+
+    let outcome = pool
+        .execute(
+            make_request("fibonacci", single_input(&SLOW_FIB_INDEX)),
+            SHORT_TIMEOUT,
+        )
+        .await
+        .expect("execute (timeout)");
+    let Outcome::Timeout { hard_timeout } = outcome else {
+        panic!("expected Timeout, got {outcome:?}");
+    };
+    assert_eq!(hard_timeout, SHORT_TIMEOUT);
+
+    // The pool must have respawned the executor before returning. A follow-up call must succeed
+    // against the fresh process.
+    let outcome = pool
+        .execute(
+            make_request("fibonacci", single_input(&7_u64)),
+            NORMAL_TIMEOUT,
+        )
+        .await
+        .expect("execute (after respawn)");
+    let Outcome::Success { outputs, .. } = outcome else {
+        panic!("expected Success after respawn, got {outcome:?}");
+    };
+    assert_eq!(decode_single_output::<u64>(&outputs), 13);
+}

From 27091f06f5384816b0be1a0c1e419399f05785a2 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Thu, 28 May 2026 22:43:54 -0400
Subject: [PATCH 03/14] feat(spider-execution-manager): Add scheduler, storage,
 and liveness client traits. (#327)

---
 Cargo.lock                                    |   1 +
 .../spider-execution-manager/Cargo.toml       |   1 +
 .../spider-execution-manager/src/client.rs    |  15 ++
 .../src/client/liveness.rs                    |  79 +++++++++++
 .../src/client/scheduler.rs                   |  59 ++++++++
 .../src/client/storage.rs                     | 134 ++++++++++++++++++
 .../spider-execution-manager/src/lib.rs       |   1 +
 7 files changed, 290 insertions(+)
 create mode 100644 components/spider-execution-manager/src/client.rs
 create mode 100644 components/spider-execution-manager/src/client/liveness.rs
 create mode 100644 components/spider-execution-manager/src/client/scheduler.rs
 create mode 100644 components/spider-execution-manager/src/client/storage.rs

diff --git a/Cargo.lock b/Cargo.lock
index 2888d5e8..e862f5dc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1486,6 +1486,7 @@ dependencies = [
 name = "spider-execution-manager"
 version = "0.1.0"
 dependencies = [
+ "async-trait",
  "bincode",
  "bytes",
  "futures-util",
diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml
index ed8e74db..6b687212 100644
--- a/components/spider-execution-manager/Cargo.toml
+++ b/components/spider-execution-manager/Cargo.toml
@@ -8,6 +8,7 @@ name = "spider_execution_manager"
 path = "src/lib.rs"
 
 [dependencies]
+async-trait = "0.1.89"
 bincode = "1.3.3"
 bytes = "1.10"
 futures-util = {
diff --git a/components/spider-execution-manager/src/client.rs b/components/spider-execution-manager/src/client.rs
new file mode 100644
index 00000000..4f335f6e
--- /dev/null
+++ b/components/spider-execution-manager/src/client.rs
@@ -0,0 +1,15 @@
+//! Network client traits used by the execution manager.
+//!
+//! Three traits cover the EM's outbound traffic:
+//!
+//! * [`scheduler::SchedulerClient`] — pulls task assignments from the scheduler.
+//! * [`storage::StorageClient`] — registers task instances and reports their outcome.
+//! * [`liveness::LivenessClient`] — registers the EM at boot and ticks the heartbeat thereafter.
+
+pub mod liveness;
+pub mod scheduler;
+pub mod storage;
+
+pub use liveness::{LivenessClient, LivenessResponseError, RegistrationResponse};
+pub use scheduler::{SchedulerClient, SchedulerError, SchedulerResponse};
+pub use storage::{StorageClient, StorageResponseError};
diff --git a/components/spider-execution-manager/src/client/liveness.rs b/components/spider-execution-manager/src/client/liveness.rs
new file mode 100644
index 00000000..3261c9d8
--- /dev/null
+++ b/components/spider-execution-manager/src/client/liveness.rs
@@ -0,0 +1,79 @@
+//! Liveness client trait.
+//!
+//! The execution manager registers itself with storage at boot, then sends a periodic heartbeat.
+//! Each heartbeat both keeps the EM marked alive and returns storage's current session id.
+
+use std::net::IpAddr;
+
+use async_trait::async_trait;
+use spider_core::types::id::{ExecutionManagerId, SessionId};
+
+/// The execution manager's identity and the storage session at registration time.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct RegistrationResponse {
+    pub em_id: ExecutionManagerId,
+    pub session_id: SessionId,
+}
+
+/// Errors returned by [`LivenessClient`] operations.
+#[derive(Debug, thiserror::Error)]
+pub enum LivenessResponseError {
+    /// Storage has reaped this execution manager.
+    #[error("execution manager already marked dead")]
+    MarkedDead,
+
+    /// Connection lost, request timeout, or wire-format serialization failure. Callers may back off
+    /// and retry.
+    #[error("transport error: {0}")]
+    Transport(String),
+
+    /// The execution manager id was rejected by storage (e.g. unknown id).
+    #[error("execution manager id rejected: {0}")]
+    IllegalId(String),
+}
+
+/// Client interface to the storage server's execution-manager liveness endpoint.
+#[async_trait]
+pub trait LivenessClient: Send + Sync {
+    /// Registers the execution manager with storage and obtains its id.
+    ///
+    /// Called once at boot.
+    ///
+    /// # Parameters
+    ///
+    /// * `ip` - The advertised IP address of the execution manager process.
+    ///
+    /// # Returns
+    ///
+    /// The freshly assigned execution manager id and the current storage session id on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`LivenessResponseError::Transport`] if the connection was lost or timed out.
+    async fn register(&self, ip: IpAddr) -> Result<RegistrationResponse, LivenessResponseError>;
+
+    /// Sends one heartbeat for `em_id` and returns the storage's current session id.
+    ///
+    /// # Parameters
+    ///
+    /// * `em_id` - The execution manager id being heartbeated.
+    ///
+    /// # Returns
+    ///
+    /// The storage's current session id on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`LivenessResponseError::MarkedDead`] if storage has already reaped this execution
+    ///   manager.
+    /// * [`LivenessResponseError::Transport`] if the connection was lost or timed out.
+    /// * [`LivenessResponseError::IllegalId`] if storage rejected the id.
+    async fn heartbeat(
+        &self,
+        em_id: ExecutionManagerId,
+    ) -> Result<SessionId, LivenessResponseError>;
+}
diff --git a/components/spider-execution-manager/src/client/scheduler.rs b/components/spider-execution-manager/src/client/scheduler.rs
new file mode 100644
index 00000000..cf13687a
--- /dev/null
+++ b/components/spider-execution-manager/src/client/scheduler.rs
@@ -0,0 +1,59 @@
+//! Scheduler client trait.
+//!
+//! The execution manager acquires tasks from the scheduler through [`SchedulerClient`].
+
+use async_trait::async_trait;
+use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId};
+
+/// A task assignment handed to the execution manager by the scheduler.
+///
+/// `session_id` is the scheduler's view of storage's session at the moment the assignment was
+/// produced. The execution manager pins this exact value on every subsequent storage call for the
+/// attempt.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct SchedulerResponse {
+    pub job_id: JobId,
+    pub task_id: TaskId,
+    pub session_id: SessionId,
+}
+
+/// Errors returned by [`SchedulerClient::next_task`].
+#[derive(Debug, thiserror::Error)]
+pub enum SchedulerError {
+    /// Connection to the scheduler was lost or the request timed out. Callers may back off and
+    /// retry.
+    #[error("transport error: {0}")]
+    Transport(String),
+
+    /// The scheduler returned a malformed reply.
+    #[error("protocol error: {0}")]
+    Protocol(String),
+}
+
+/// Client interface to the scheduler service.
+#[async_trait]
+pub trait SchedulerClient: Send + Sync {
+    /// Blocks until a task is assigned to this execution manager.
+    ///
+    /// Implementations may long-poll the scheduler; callers should treat this call as a
+    /// cancellation point.
+    ///
+    /// # Parameters
+    ///
+    /// * `em_id` - The identity of the calling execution manager.
+    ///
+    /// # Returns
+    ///
+    /// A [`SchedulerResponse`] describing the assigned task on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`SchedulerError::Transport`] if the connection was lost or the request timed out.
+    /// * [`SchedulerError::Protocol`] if the scheduler returned a malformed reply.
+    async fn next_task(
+        &self,
+        em_id: ExecutionManagerId,
+    ) -> Result<SchedulerResponse, SchedulerError>;
+}
diff --git a/components/spider-execution-manager/src/client/storage.rs b/components/spider-execution-manager/src/client/storage.rs
new file mode 100644
index 00000000..89732c8c
--- /dev/null
+++ b/components/spider-execution-manager/src/client/storage.rs
@@ -0,0 +1,134 @@
+//! Storage client trait.
+//!
+//! The execution manager interacts with the storage server through this trait to register a task
+//! instance, fetch its [`ExecutionContext`], and report success or failure.
+
+use async_trait::async_trait;
+use spider_core::types::{
+    id::{ExecutionManagerId, JobId, SessionId, TaskId},
+    io::ExecutionContext,
+};
+
+/// Errors returned by [`StorageClient`] operations.
+///
+/// The variants intentionally mirror the storage server's externally visible failure modes (see
+/// `spider_storage::state::error::StorageServerError`) plus a transport bucket for connection /
+/// serialization failures.
+#[derive(Debug, thiserror::Error)]
+pub enum StorageResponseError {
+    /// The `session_id` carried with the request does not match storage's current session.
+    #[error("stale session (storage now at {storage_session})")]
+    StaleSession { storage_session: SessionId },
+
+    /// Storage's job cache rejected the operation as stale (e.g. the task or its job has already
+    /// terminated).
+    #[error("cache stale: {0}")]
+    CacheStale(String),
+
+    /// Connection lost, request timeout, or wire-format serialization failure. Callers may back off
+    /// and retry.
+    #[error("transport error: {0}")]
+    Transport(String),
+
+    /// The storage server returned an otherwise-uncategorized error.
+    #[error("storage server: {0}")]
+    Server(String),
+
+    /// The input to the operation is invalid.
+    #[error("invalid input: {0}")]
+    InvalidInput(String),
+}
+
+/// Client interface to the storage server.
+#[async_trait]
+pub trait StorageClient: Send + Sync {
+    /// Registers a task instance and fetches its execution context.
+    ///
+    /// # Parameters
+    ///
+    /// * `job_id` - The owning job.
+    /// * `task_id` - The task being instantiated.
+    /// * `em_id` - The identity of the calling execution manager.
+    /// * `session_id` - The session id captured from the scheduler assignment, pinned for the
+    ///   lifetime of the attempt.
+    ///
+    /// # Returns
+    ///
+    /// The [`ExecutionContext`] for the task instance on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current
+    ///   session.
+    /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the registration.
+    /// * [`StorageResponseError::Transport`] if the connection was lost or timed out.
+    /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error.
+    async fn register_task_instance(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+    ) -> Result<ExecutionContext, StorageResponseError>;
+
+    /// Reports successful execution of a task instance.
+    ///
+    /// # Parameters
+    ///
+    /// * `job_id` - The owning job.
+    /// * `task_id` - The task that ran.
+    /// * `em_id` - The identity of the calling execution manager.
+    /// * `session_id` - The session id captured from the scheduler assignment.
+    /// * `serialized_outputs` - The wire-format encoded task outputs buffer, forwarded verbatim to
+    ///   storage. For commit tasks and cleanup tasks, this must be `None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current
+    ///   session.
+    /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the report.
+    /// * [`StorageResponseError::Transport`] if the connection was lost or timed out.
+    /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error.
+    /// * [`StorageResponseError::InvalidInput`] if `serialized_outputs` is `Some` for a commit or
+    ///   cleanup task.
+    async fn report_task_success(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        serialized_outputs: Option<Vec<u8>>,
+    ) -> Result<(), StorageResponseError>;
+
+    /// Reports failed execution of a task instance.
+    ///
+    /// # Parameters
+    ///
+    /// * `job_id` - The owning job.
+    /// * `task_id` - The task that ran.
+    /// * `em_id` - The identity of the calling execution manager.
+    /// * `session_id` - The session id captured from the scheduler assignment.
+    /// * `error_message` - The formatted error message.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current
+    ///   session.
+    /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the report.
+    /// * [`StorageResponseError::Transport`] if the connection was lost or timed out.
+    /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error.
+    async fn report_task_failure(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        error_message: String,
+    ) -> Result<(), StorageResponseError>;
+}
diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs
index 2d7171a9..84a2b6b2 100644
--- a/components/spider-execution-manager/src/lib.rs
+++ b/components/spider-execution-manager/src/lib.rs
@@ -1,4 +1,5 @@
 //! Execution manager — the per-node service that drives Spider task execution against a
 //! `spider-task-executor` subprocess.
 
+pub mod client;
 pub mod process_pool;

From 85a5130cb77af8e2611bb7536e8b511174bb63a5 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Mon, 1 Jun 2026 22:10:03 -0400
Subject: [PATCH 04/14] feat(spider-execution-manager): Add liveness actor with
 session ID tracker; Refactor integration tests to extract common helpers into
 `test-utils`. (#328)

---
 Cargo.lock                                    |  26 +-
 Cargo.toml                                    |   1 +
 components/spider-core/Cargo.toml             |   4 +
 components/spider-core/src/lib.rs             |   1 +
 components/spider-core/src/session.rs         | 107 +++++
 .../spider-execution-manager/Cargo.toml       |   2 +-
 .../spider-execution-manager/src/lib.rs       |   1 +
 .../spider-execution-manager/src/liveness.rs  | 398 ++++++++++++++++++
 tests/huntsman/task-executor/Cargo.toml       |  26 +-
 tests/huntsman/task-executor/src/lib.rs       | 278 +-----------
 .../tests/overhead_instrument.rs              |   7 +-
 .../task-executor/tests/test_executor.rs      |   2 +-
 .../task-executor/tests/test_process_pool.rs  |  27 +-
 tests/huntsman/test-utils/Cargo.toml          |  32 ++
 tests/huntsman/test-utils/src/executor.rs     | 297 +++++++++++++
 tests/huntsman/test-utils/src/lib.rs          |  16 +
 tests/huntsman/test-utils/src/mock.rs         | 195 +++++++++
 17 files changed, 1095 insertions(+), 325 deletions(-)
 create mode 100644 components/spider-core/src/session.rs
 create mode 100644 components/spider-execution-manager/src/liveness.rs
 create mode 100644 tests/huntsman/test-utils/Cargo.toml
 create mode 100644 tests/huntsman/test-utils/src/executor.rs
 create mode 100644 tests/huntsman/test-utils/src/lib.rs
 create mode 100644 tests/huntsman/test-utils/src/mock.rs

diff --git a/Cargo.lock b/Cargo.lock
index e862f5dc..bb6a1c31 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1469,6 +1469,8 @@ dependencies = [
  "sqlx",
  "strum",
  "thiserror",
+ "tokio",
+ "tokio-util",
  "uuid",
 ]
 
@@ -1883,19 +1885,15 @@ dependencies = [
 name = "task-executor-tests"
 version = "0.1.0"
 dependencies = [
- "bincode",
- "bytes",
- "futures-util",
  "integration-test-tasks",
  "rmp-serde",
- "serde",
  "spider-core",
  "spider-execution-manager",
  "spider-task-executor",
  "spider-tdl",
  "tabled",
+ "test-utils",
  "tokio",
- "tokio-util",
 ]
 
 [[package]]
@@ -1910,6 +1908,24 @@ dependencies = [
  "spider-tdl",
 ]
 
+[[package]]
+name = "test-utils"
+version = "0.1.0"
+dependencies = [
+ "async-trait",
+ "bincode",
+ "bytes",
+ "futures-util",
+ "rmp-serde",
+ "serde",
+ "spider-core",
+ "spider-execution-manager",
+ "spider-task-executor",
+ "spider-tdl",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "testing_table"
 version = "0.3.0"
diff --git a/Cargo.toml b/Cargo.toml
index ea9992cf..5eb18596 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,4 +13,5 @@ members = [
   "tests/huntsman/integration-test-tasks",
   "tests/huntsman/task-executor",
   "tests/huntsman/tdl-integration",
+  "tests/huntsman/test-utils",
 ]
diff --git a/components/spider-core/Cargo.toml b/components/spider-core/Cargo.toml
index c75bc4e3..87531aaa 100644
--- a/components/spider-core/Cargo.toml
+++ b/components/spider-core/Cargo.toml
@@ -18,3 +18,7 @@ sqlx = { version = "0.8.6", features = ["mysql", "uuid"] }
 strum = { version = "0.28.0", features = ["derive"] }
 thiserror = "2.0.18"
 uuid = { version = "1.19.0", features = ["serde", "v4"] }
+
+[dev-dependencies]
+tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread"] }
+tokio-util = { version = "0.7", features = ["rt"] }
diff --git a/components/spider-core/src/lib.rs b/components/spider-core/src/lib.rs
index 66ed84f0..7e546853 100644
--- a/components/spider-core/src/lib.rs
+++ b/components/spider-core/src/lib.rs
@@ -1,3 +1,4 @@
 pub mod job;
+pub mod session;
 pub mod task;
 pub mod types;
diff --git a/components/spider-core/src/session.rs b/components/spider-core/src/session.rs
new file mode 100644
index 00000000..a428e001
--- /dev/null
+++ b/components/spider-core/src/session.rs
@@ -0,0 +1,107 @@
+//! Monotonically increasing session tracker shared across services.
+//!
+//! Wraps an [`AtomicU64`] in [`Arc`] so multiple tasks (and multiple consumers such as the
+//! execution manager and the scheduler) can observe and advance a shared view of storage's current
+//! session id.
+
+use std::sync::{
+    Arc,
+    atomic::{AtomicU64, Ordering},
+};
+
+use crate::types::id::SessionId;
+
+/// Monotonically increasing counter holding a service's view of the current storage session id.
+///
+/// Cloneable; clones share the same underlying counter so writers in different tasks stay coherent.
+#[derive(Clone, Debug, Default)]
+pub struct SessionTracker {
+    inner: Arc<AtomicU64>,
+}
+
+impl SessionTracker {
+    /// Builds a tracker pre-loaded with `initial`.
+    ///
+    /// # Returns
+    ///
+    /// A newly created [`SessionTracker`] holding `initial`.
+    #[must_use]
+    pub fn new(initial: SessionId) -> Self {
+        Self {
+            inner: Arc::new(AtomicU64::new(initial)),
+        }
+    }
+
+    /// # Returns
+    ///
+    /// The currently stored session id.
+    #[must_use]
+    pub fn current(&self) -> SessionId {
+        self.inner.load(Ordering::Acquire)
+    }
+
+    /// Attempts to advance the stored session id to `new_sid`.
+    ///
+    /// CAS-loop: if the stored value is already `>= new_sid`, the call no-ops. Otherwise the
+    /// stored value is bumped to `new_sid`. Coherent under concurrent writers.
+    ///
+    /// # Returns
+    ///
+    /// Whether `new_sid` strictly advanced the stored value.
+    #[must_use]
+    pub fn try_advance(&self, new_sid: SessionId) -> bool {
+        let mut cur = self.inner.load(Ordering::Acquire);
+        loop {
+            if new_sid <= cur {
+                return false;
+            }
+            match self.inner.compare_exchange_weak(
+                cur,
+                new_sid,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            ) {
+                Ok(_) => return true,
+                Err(actual) => cur = actual,
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tokio_util::task::TaskTracker;
+
+    use super::SessionTracker;
+
+    #[test]
+    fn try_advance_forward() {
+        let tracker = SessionTracker::new(1);
+        assert!(tracker.try_advance(5));
+        assert_eq!(tracker.current(), 5);
+    }
+
+    #[test]
+    fn try_advance_stale_or_equal() {
+        let tracker = SessionTracker::new(10);
+        assert!(!tracker.try_advance(10));
+        assert!(!tracker.try_advance(7));
+        assert_eq!(tracker.current(), 10);
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn concurrent_advance_converges_to_max() {
+        const MAX_TARGET: u64 = 1_000;
+        let tracker = SessionTracker::new(0);
+        let task_tracker = TaskTracker::new();
+        for i in 1..=MAX_TARGET {
+            let t = tracker.clone();
+            task_tracker.spawn(async move {
+                let _ = t.try_advance(i);
+            });
+        }
+        task_tracker.close();
+        task_tracker.wait().await;
+        assert_eq!(tracker.current(), MAX_TARGET);
+    }
+}
diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml
index 6b687212..10f0e3ac 100644
--- a/components/spider-execution-manager/Cargo.toml
+++ b/components/spider-execution-manager/Cargo.toml
@@ -25,5 +25,5 @@ tokio = {
   version = "1.50.0",
   features = ["io-util", "macros", "process", "rt", "sync", "time"]
 }
-tokio-util = { version = "0.7", features = ["codec"] }
+tokio-util = { version = "0.7", features = ["codec", "rt"] }
 tracing = { version = "0.1.41", default-features = false, features = ["std"] }
diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs
index 84a2b6b2..259fc8a9 100644
--- a/components/spider-execution-manager/src/lib.rs
+++ b/components/spider-execution-manager/src/lib.rs
@@ -2,4 +2,5 @@
 //! `spider-task-executor` subprocess.
 
 pub mod client;
+pub mod liveness;
 pub mod process_pool;
diff --git a/components/spider-execution-manager/src/liveness.rs b/components/spider-execution-manager/src/liveness.rs
new file mode 100644
index 00000000..411931cd
--- /dev/null
+++ b/components/spider-execution-manager/src/liveness.rs
@@ -0,0 +1,398 @@
+//! Liveness actor — owns the periodic heartbeat to storage and the runtime's view of the current
+//! storage session id.
+//!
+//! The actor runs as a dedicated tokio task driven by [`tokio::select!`] over three sources:
+//!
+//! 1. A [`tokio::time::interval`] driving periodic heartbeat ticks.
+//! 2. An [`mpsc`] command channel from the rest of the runtime.
+//! 3. A [`CancellationToken`] that the runtime flips on shutdown.
+
+use std::{sync::Arc, time::Duration};
+
+use spider_core::{session::SessionTracker, types::id::ExecutionManagerId};
+use tokio::{
+    sync::mpsc,
+    task::JoinHandle,
+    time::{Interval, MissedTickBehavior},
+};
+use tokio_util::sync::CancellationToken;
+
+use crate::client::{LivenessClient, LivenessResponseError};
+
+/// Commands the runtime sends to the actor.
+#[derive(Debug)]
+pub enum LivenessCommand {
+    /// Asks the actor to send an immediate heartbeat to storage instead of waiting for the next
+    /// interval tick.
+    ///
+    /// Sent by the main loop when it suspects its session view is stale (e.g. after storage replies
+    /// with a stale-session error). Storage's heartbeat response is the authoritative source of
+    /// truth for the current session id, so the actor always re-checks rather than blindly trusting
+    /// the caller's observation.
+    Refresh,
+}
+
+/// Cloneable handle for sending commands into the running actor.
+#[derive(Clone)]
+pub struct LivenessHandle {
+    cmd_sender: mpsc::Sender<LivenessCommand>,
+}
+
+impl LivenessHandle {
+    /// Asks the actor to send an immediate heartbeat to storage in a fire-and-forget manner.
+    pub async fn refresh(&self) {
+        let _ = self.cmd_sender.send(LivenessCommand::Refresh).await;
+    }
+}
+
+/// Spawns the liveness actor on the current tokio runtime.
+///
+/// The first heartbeat fires immediately when the spawned task is polled for the first time; from
+/// there it ticks every `heartbeat_interval`. Missed ticks are skipped rather than burst-replayed.
+///
+/// # Returns
+///
+/// A pair containing:
+///
+/// * A handle for sending commands to the actor.
+/// * The spawned task's [`JoinHandle`].
+pub fn spawn<LivenessClientType: LivenessClient + 'static>(
+    em_id: ExecutionManagerId,
+    client: Arc<LivenessClientType>,
+    session_tracker: SessionTracker,
+    cancellation_token: CancellationToken,
+    heartbeat_interval: Duration,
+) -> (LivenessHandle, JoinHandle<()>) {
+    let (tx, rx) = mpsc::channel(COMMAND_CHANNEL_CAP);
+    let mut interval = tokio::time::interval(heartbeat_interval);
+    interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
+    let actor = LivenessActor {
+        em_id,
+        client,
+        session_tracker,
+        cmd_receiver: rx,
+        cancellation_token,
+        interval,
+    };
+    let join = tokio::spawn(actor.run());
+    (LivenessHandle { cmd_sender: tx }, join)
+}
+
+/// Capacity of the command channel between the runtime and the actor.
+const COMMAND_CHANNEL_CAP: usize = 16;
+
+/// The actor's owned state. Lives entirely inside the spawned task.
+struct LivenessActor<LivenessClientType: LivenessClient> {
+    em_id: ExecutionManagerId,
+    client: Arc<LivenessClientType>,
+    session_tracker: SessionTracker,
+    cmd_receiver: mpsc::Receiver<LivenessCommand>,
+    cancellation_token: CancellationToken,
+    interval: Interval,
+}
+
+impl<LivenessClientType: LivenessClient> LivenessActor<LivenessClientType> {
+    /// Drives the actor until cancellation or the command channel closes.
+    async fn run(mut self) {
+        loop {
+            tokio::select! {
+                () = self.cancellation_token.cancelled() => {
+                    tracing::info!("Cancellation token received. Liveness actor shutting down.");
+                    break;
+                },
+                cmd = self.cmd_receiver.recv() => if let Some(cmd) = cmd {
+                    self.on_command(&cmd).await;
+                } else {
+                    tracing::info!("Command channel closed. Liveness actor shutting down.");
+                    break;
+                },
+                _ = self.interval.tick() => self.send_heartbeat().await,
+            }
+        }
+    }
+
+    /// Handles one command popped from the channel.
+    async fn on_command(&mut self, cmd: &LivenessCommand) {
+        match cmd {
+            LivenessCommand::Refresh => {
+                self.send_heartbeat().await;
+            }
+        }
+    }
+
+    /// Sends one heartbeat to storage, processes the response, and resets the interval so the next
+    /// scheduled tick fires one period from now.
+    ///
+    /// Resetting the interval avoids a redundant scheduled heartbeat: an off-schedule call
+    /// (driven by [`LivenessCommand::Refresh`]) postpones the next scheduled tick to a full
+    /// period from now. Refresh-triggered heartbeats themselves are not rate-limited.
+    async fn send_heartbeat(&mut self) {
+        match self.client.heartbeat(self.em_id).await {
+            Ok(session_id) => {
+                let previous = self.session_tracker.current();
+                if previous != session_id {
+                    if self.session_tracker.try_advance(session_id) {
+                        tracing::info!(
+                            from = previous,
+                            to = session_id,
+                            "Session advanced by heartbeat."
+                        );
+                    } else {
+                        tracing::error!(
+                            from = previous,
+                            to = session_id,
+                            "Session update rejected. This is unexpected since there should be no \
+                             concurrent session updates in the current implementation. Cancelling \
+                             the runtime."
+                        );
+                        self.cancellation_token.cancel();
+                    }
+                }
+            }
+            Err(LivenessResponseError::MarkedDead) => {
+                tracing::error!(
+                    "Liveness reports execution manager marked dead. Cancelling the runtime."
+                );
+                self.cancellation_token.cancel();
+            }
+            Err(LivenessResponseError::IllegalId(msg)) => {
+                tracing::error!(
+                    err = %msg,
+                    "Liveness rejected the execution manager ID. Cancelling the runtime."
+                );
+                self.cancellation_token.cancel();
+            }
+            Err(LivenessResponseError::Transport(msg)) => {
+                tracing::warn!(err = %msg, "Heartbeat transport error; retrying next tick.");
+            }
+        }
+        self.interval.reset();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        collections::VecDeque,
+        net::IpAddr,
+        sync::{Arc, Mutex},
+        time::Duration,
+    };
+
+    use async_trait::async_trait;
+    use spider_core::{
+        session::SessionTracker,
+        types::id::{ExecutionManagerId, SessionId},
+    };
+    use tokio::{sync::Notify, task::JoinHandle};
+    use tokio_util::sync::CancellationToken;
+
+    use super::{LivenessHandle, spawn};
+    use crate::client::{LivenessClient, LivenessResponseError, RegistrationResponse};
+
+    struct MockState {
+        responses: VecDeque<Result<SessionId, LivenessResponseError>>,
+        call_count: u64,
+    }
+
+    /// Mock [`LivenessClient`] that returns scripted heartbeat responses and notifies the test
+    /// once per call.
+    struct MockLivenessClient {
+        state: Mutex<MockState>,
+        notify: Notify,
+    }
+
+    impl MockLivenessClient {
+        /// Builds an empty mock. Tests prime the response queue via [`Self::push_response`] before
+        /// spawning the actor.
+        ///
+        /// # Returns
+        ///
+        /// A newly created [`MockLivenessClient`] with an empty response queue.
+        fn new() -> Self {
+            Self {
+                state: Mutex::new(MockState {
+                    responses: VecDeque::new(),
+                    call_count: 0,
+                }),
+                notify: Notify::new(),
+            }
+        }
+
+        /// Pushes one scripted heartbeat response onto the queue.
+        ///
+        /// Responses are returned in FIFO order, one per [`LivenessClient::heartbeat`] call. If the
+        /// queue is exhausted, the mock returns a synthetic [`LivenessResponseError::Transport`] so
+        /// a misconfigured test fails loudly rather than hanging.
+        fn push_response(&self, response: Result<SessionId, LivenessResponseError>) {
+            self.state
+                .lock()
+                .expect("mock state lock poisoned")
+                .responses
+                .push_back(response);
+        }
+
+        /// # Returns
+        ///
+        /// The total number of [`LivenessClient::heartbeat`] invocations observed so far.
+        fn call_count(&self) -> u64 {
+            self.state
+                .lock()
+                .expect("mock state lock poisoned")
+                .call_count
+        }
+
+        /// Awaits the next [`LivenessClient::heartbeat`] invocation.
+        ///
+        /// Backed by a [`Notify`] permit, so an invocation that fires before this future is polled
+        /// can still be observed.
+        async fn wait_for_call(&self) {
+            self.notify.notified().await;
+        }
+    }
+
+    #[async_trait]
+    impl LivenessClient for MockLivenessClient {
+        async fn register(
+            &self,
+            _ip: IpAddr,
+        ) -> Result<RegistrationResponse, LivenessResponseError> {
+            unimplemented!("`LivenessClient::register` is not exercised by actor tests")
+        }
+
+        async fn heartbeat(
+            &self,
+            _em_id: ExecutionManagerId,
+        ) -> Result<SessionId, LivenessResponseError> {
+            let response = {
+                let mut state = self.state.lock().expect("mock state lock poisoned");
+                state.call_count += 1;
+                state.responses.pop_front().unwrap_or_else(|| {
+                    Err(LivenessResponseError::Transport(
+                        "MockLivenessClient: response queue exhausted".to_owned(),
+                    ))
+                })
+            };
+            self.notify.notify_one();
+            response
+        }
+    }
+
+    /// Spawns the actor with a long heartbeat interval so only the initial tick and explicit
+    /// `Refresh`-driven heartbeats fire during the test.
+    ///
+    /// # Returns
+    ///
+    /// Forwards [`spawn`]'s return values.
+    fn spawn_actor(
+        client: Arc<MockLivenessClient>,
+        tracker: SessionTracker,
+        cancellation_token: CancellationToken,
+    ) -> (LivenessHandle, JoinHandle<()>) {
+        spawn(
+            ExecutionManagerId::new(),
+            client,
+            tracker,
+            cancellation_token,
+            Duration::from_mins(1),
+        )
+    }
+
+    /// Joins the actor with a short upper bound so a stuck task surfaces as a test failure
+    /// instead of an infinite hang.
+    async fn join_actor(join: JoinHandle<()>) {
+        tokio::time::timeout(Duration::from_secs(1), join)
+            .await
+            .expect("actor did not exit within 1s")
+            .expect("actor task panicked");
+    }
+
+    #[tokio::test]
+    async fn heartbeat_advances_tracker_on_success() {
+        let client = Arc::new(MockLivenessClient::new());
+        client.push_response(Ok(7));
+        let tracker = SessionTracker::new(5);
+        let cancellation_token = CancellationToken::new();
+
+        let (_handle, join) = spawn_actor(
+            Arc::clone(&client),
+            tracker.clone(),
+            cancellation_token.clone(),
+        );
+
+        client.wait_for_call().await;
+        assert_eq!(tracker.current(), 7);
+        assert!(!cancellation_token.is_cancelled());
+
+        cancellation_token.cancel();
+        join_actor(join).await;
+    }
+
+    #[tokio::test]
+    async fn marked_dead_cancels_runtime() {
+        let client = Arc::new(MockLivenessClient::new());
+        client.push_response(Err(LivenessResponseError::MarkedDead));
+        let cancellation_token = CancellationToken::new();
+
+        let (_handle, join) = spawn_actor(
+            Arc::clone(&client),
+            SessionTracker::new(0),
+            cancellation_token.clone(),
+        );
+
+        tokio::time::timeout(Duration::from_secs(1), cancellation_token.cancelled())
+            .await
+            .expect("token was not cancelled within 1s");
+        join_actor(join).await;
+    }
+
+    #[tokio::test]
+    async fn transport_error_does_not_cancel_runtime() {
+        let client = Arc::new(MockLivenessClient::new());
+        client.push_response(Err(LivenessResponseError::Transport(
+            "simulated".to_owned(),
+        )));
+        let tracker = SessionTracker::new(5);
+        let cancellation_token = CancellationToken::new();
+
+        let (_handle, join) = spawn_actor(
+            Arc::clone(&client),
+            tracker.clone(),
+            cancellation_token.clone(),
+        );
+
+        client.wait_for_call().await;
+        assert!(!cancellation_token.is_cancelled());
+        assert_eq!(tracker.current(), 5);
+
+        cancellation_token.cancel();
+        join_actor(join).await;
+    }
+
+    #[tokio::test]
+    async fn refresh_triggers_immediate_heartbeat() {
+        let client = Arc::new(MockLivenessClient::new());
+        client.push_response(Ok(5));
+        client.push_response(Ok(7));
+        let tracker = SessionTracker::new(0);
+        let cancellation_token = CancellationToken::new();
+
+        let (handle, join) = spawn_actor(
+            Arc::clone(&client),
+            tracker.clone(),
+            cancellation_token.clone(),
+        );
+
+        client.wait_for_call().await;
+        assert_eq!(tracker.current(), 5);
+        assert_eq!(client.call_count(), 1);
+
+        handle.refresh().await;
+        client.wait_for_call().await;
+        assert_eq!(tracker.current(), 7);
+        assert_eq!(client.call_count(), 2);
+
+        cancellation_token.cancel();
+        join_actor(join).await;
+    }
+}
diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml
index ca86c0ad..94909303 100644
--- a/tests/huntsman/task-executor/Cargo.toml
+++ b/tests/huntsman/task-executor/Cargo.toml
@@ -20,28 +20,18 @@ path = "tests/overhead_instrument.rs"
 name = "process_pool"
 path = "tests/test_process_pool.rs"
 
-[dependencies]
-bincode = "1.3.3"
-bytes = "1.10"
-futures-util = {
-  version = "0.3.31",
-  default-features = false,
-  features = ["sink", "std"]
-}
+[dev-dependencies]
+integration-test-tasks = { path = "../integration-test-tasks" }
 rmp-serde = "1.3.1"
-serde = { version = "1.0.228", features = ["derive"] }
 spider-core = { path = "../../../components/spider-core" }
+spider-execution-manager = {
+  path = "../../../components/spider-execution-manager"
+}
 spider-task-executor = { path = "../../../components/spider-task-executor" }
 spider-tdl = { path = "../../../components/spider-tdl" }
+tabled = "0.20.0"
+test-utils = { path = "../test-utils" }
 tokio = {
   version = "1.50.0",
-  features = ["io-util", "macros", "process", "rt", "time"]
+  features = ["macros", "rt", "rt-multi-thread", "time"]
 }
-tokio-util = { version = "0.7", features = ["codec"] }
-
-[dev-dependencies]
-integration-test-tasks = { path = "../integration-test-tasks" }
-spider-execution-manager = {
-  path = "../../../components/spider-execution-manager"
-}
-tabled = "0.20.0"
diff --git a/tests/huntsman/task-executor/src/lib.rs b/tests/huntsman/task-executor/src/lib.rs
index c42a20f4..c69ee050 100644
--- a/tests/huntsman/task-executor/src/lib.rs
+++ b/tests/huntsman/task-executor/src/lib.rs
@@ -1,275 +1,5 @@
-//! Test harness shared by the `task-executor-tests` integration tests.
+//! Workspace member that hosts cross-crate integration tests for the `spider-task-executor`
+//! binary and the execution manager's process pool.
 //!
-//! Spawns the `spider-task-executor` binary as a child process, frames bincode requests on its
-//! stdin and reads bincode responses from its stdout — the exact wire protocol of
-//! [`spider_task_executor::protocol`].
-//!
-//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are
-//! infrastructure, not production code, and the panic message + backtrace is more useful at the
-//! failure site than threading an error type through every helper.
-//!
-//! Environment:
-//!
-//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary.
-//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to
-//!   the child verbatim.
-
-use std::{path::PathBuf, process::Stdio};
-
-use bytes::Bytes;
-use futures_util::{SinkExt, StreamExt};
-use spider_core::{
-    task::TdlContext,
-    types::{
-        id::{JobId, ResourceGroupId, TaskId},
-        io::TaskInput,
-    },
-};
-use spider_task_executor::protocol::{Request, Response};
-use spider_tdl::{
-    TaskContext,
-    wire::{TaskInputsSerializer, TaskOutputsSerializer},
-};
-use tokio::process::{Child, ChildStdin, ChildStdout, Command};
-use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec};
-
-/// The TDL package name registered by `integration-test-tasks`.
-pub const PACKAGE_NAME: &str = "integration_test_tasks";
-
-/// One running executor subprocess plus framed handles to its stdin / stdout.
-///
-/// The subprocess will be killed when the handle is dropped.
-pub struct ExecutorHandle {
-    child: Child,
-    requests: FramedWrite<ChildStdin, LengthDelimitedCodec>,
-    responses: FramedRead<ChildStdout, LengthDelimitedCodec>,
-}
-
-impl ExecutorHandle {
-    /// Spawns the executor binary with `SPIDER_TDL_PACKAGE_DIR` set; the child inherits the
-    /// parent's stderr so panic / abort messages surface in the test log.
-    ///
-    /// # Returns
-    ///
-    /// A handle owning the running subprocess and framed I/O.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the binary cannot be spawned or its stdio handles cannot be claimed.
-    #[must_use]
-    pub fn spawn() -> Self {
-        let mut child = Command::new(task_executor_bin())
-            .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir())
-            .stdin(Stdio::piped())
-            .stdout(Stdio::piped())
-            .stderr(Stdio::inherit())
-            .kill_on_drop(true)
-            .spawn()
-            .expect("spawn executor binary");
-        let stdin = child.stdin.take().expect("stdin must be piped");
-        let stdout = child.stdout.take().expect("stdout must be piped");
-        Self {
-            child,
-            requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()),
-            responses: FramedRead::new(stdout, LengthDelimitedCodec::new()),
-        }
-    }
-
-    /// Bincode-serializes `req` and writes one length-delimited frame to the executor's stdin.
-    ///
-    /// # Panics
-    ///
-    /// Panics if encoding fails or the stdin pipe cannot be written.
-    pub async fn send(&mut self, req: &Request) {
-        let bytes = bincode::serialize(req).expect("bincode encode Request");
-        self.requests
-            .send(Bytes::from(bytes))
-            .await
-            .expect("write request frame");
-    }
-
-    /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it.
-    ///
-    /// # Returns
-    ///
-    /// The next [`Response`] from the executor.
-    ///
-    /// # Panics
-    ///
-    /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails.
-    pub async fn recv(&mut self) -> Response {
-        let frame = self
-            .responses
-            .next()
-            .await
-            .expect("executor closed stdout before reply")
-            .expect("read response frame");
-        bincode::deserialize(&frame).expect("bincode decode Response")
-    }
-
-    /// Reads at most one length-delimited frame, tolerating a clean EOF (which crash-path tests
-    /// rely on to detect that the executor died).
-    ///
-    /// # Returns
-    ///
-    /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails.
-    pub async fn try_recv(&mut self) -> Option<Response> {
-        let frame = self.responses.next().await?;
-        let bytes = frame.expect("read response frame");
-        Some(bincode::deserialize(&bytes).expect("bincode decode Response"))
-    }
-
-    /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly.
-    ///
-    /// # Panics
-    ///
-    /// Panics if waiting on the child fails or the child exits non-zero.
-    pub async fn shutdown_clean(mut self) {
-        self.send(&Request::Shutdown).await;
-        // Close the stdin pipe so the child sees EOF after `Shutdown` is drained.
-        drop(self.requests);
-        let status = self.child.wait().await.expect("wait for executor");
-        assert!(status.success(), "executor exited with status {status:?}");
-    }
-
-    /// Closes stdin and waits for the child to exit. Used by crash-path tests that don't expect
-    /// a clean shutdown.
-    ///
-    /// # Returns
-    ///
-    /// The child's [`ExitStatus`](std::process::ExitStatus).
-    ///
-    /// # Panics
-    ///
-    /// Panics if waiting on the child fails.
-    pub async fn wait_for_exit(mut self) -> std::process::ExitStatus {
-        drop(self.requests);
-        self.child.wait().await.expect("wait for executor")
-    }
-}
-
-/// # Returns
-///
-/// The absolute path of the `spider-task-executor` binary, read from `SPIDER_TASK_EXECUTOR_BIN`.
-///
-/// # Panics
-///
-/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset.
-#[must_use]
-pub fn task_executor_bin() -> PathBuf {
-    std::env::var_os("SPIDER_TASK_EXECUTOR_BIN")
-        .map(PathBuf::from)
-        .expect("SPIDER_TASK_EXECUTOR_BIN env var not set")
-}
-
-/// # Returns
-///
-/// The TDL package staging directory, read from `SPIDER_TDL_PACKAGE_DIR`. Forwarded verbatim
-/// into the executor child's environment so it resolves
-/// `${SPIDER_TDL_PACKAGE_DIR}/<package>/lib<package>.so`.
-///
-/// # Panics
-///
-/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset.
-#[must_use]
-pub fn tdl_package_dir() -> PathBuf {
-    std::env::var_os("SPIDER_TDL_PACKAGE_DIR")
-        .map(PathBuf::from)
-        .expect("SPIDER_TDL_PACKAGE_DIR env var not set")
-}
-
-/// # Returns
-///
-/// A placeholder msgpack-encoded [`TaskContext`] suitable for a one-shot test invocation. The id
-/// fields are fresh per call but the executor doesn't inspect them.
-///
-/// # Panics
-///
-/// Panics if msgpack encoding fails (the test ids serialize trivially).
-#[must_use]
-pub fn build_ctx() -> Vec<u8> {
-    let ctx = TaskContext {
-        job_id: JobId::new(),
-        task_id: TaskId::new(),
-        task_instance_id: 1,
-        resource_group_id: ResourceGroupId::new(),
-    };
-    rmp_serde::to_vec(&ctx).expect("serialize TaskContext")
-}
-
-/// # Type Parameters
-///
-/// * `T` - The Serde-serializable value type passed as the task's single input.
-///
-/// # Returns
-///
-/// A wire-format buffer carrying one [`TaskInput::ValuePayload`] holding the msgpack-encoded
-/// `value` — i.e. the same shape the parent ships for a single-argument task.
-///
-/// # Panics
-///
-/// Panics if msgpack encoding or wire-format append fails.
-#[must_use]
-pub fn encode_single_input<T: serde::Serialize>(value: &T) -> Vec<u8> {
-    let mut inputs = TaskInputsSerializer::new();
-    inputs
-        .append(TaskInput::ValuePayload(
-            rmp_serde::to_vec(value).expect("msgpack encode input"),
-        ))
-        .expect("append wire-format input");
-    inputs.release()
-}
-
-/// # Returns
-///
-/// A wire-format buffer carrying zero inputs — for nullary tasks like `always_fail` and
-/// `always_panic`.
-#[must_use]
-pub fn encode_no_inputs() -> Vec<u8> {
-    TaskInputsSerializer::new().release()
-}
-
-/// # Type Parameters
-///
-/// * `T` - The Serde-deserializable type the output payload should decode into.
-///
-/// # Returns
-///
-/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `T`.
-///
-/// # Panics
-///
-/// Panics if the outputs buffer doesn't contain exactly one value, or if the msgpack decode
-/// fails.
-#[must_use]
-pub fn decode_single_output<T: serde::de::DeserializeOwned>(output_bytes: &[u8]) -> T {
-    let outputs =
-        TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs");
-    assert_eq!(
-        outputs.len(),
-        1,
-        "expected exactly one output payload, got {}",
-        outputs.len(),
-    );
-    rmp_serde::from_slice(&outputs[0]).expect("msgpack decode output")
-}
-
-/// # Returns
-///
-/// A [`Request::Execute`] targeting `task_func` in the integration package, with a fresh test
-/// `TaskContext` and the caller-supplied wire-format `raw_inputs`.
-#[must_use]
-pub fn execute_request(task_func: &str, raw_inputs: Vec<u8>) -> Request {
-    Request::Execute {
-        tdl_context: TdlContext {
-            package: PACKAGE_NAME.to_owned(),
-            task_func: task_func.to_owned(),
-        },
-        raw_ctx: build_ctx(),
-        raw_inputs,
-    }
-}
+//! Tests live under `tests/`; the shared harness and helpers live in the `test-utils` crate. The
+//! library itself is intentionally empty.
diff --git a/tests/huntsman/task-executor/tests/overhead_instrument.rs b/tests/huntsman/task-executor/tests/overhead_instrument.rs
index fc4e146e..64bba93e 100644
--- a/tests/huntsman/task-executor/tests/overhead_instrument.rs
+++ b/tests/huntsman/task-executor/tests/overhead_instrument.rs
@@ -28,12 +28,7 @@ use std::{
 use integration_test_tasks::INSTRUMENT_SLEEP_US;
 use spider_task_executor::protocol::{ExecutorOutcome, Response};
 use tabled::{Table, Tabled};
-use task_executor_tests::{
-    ExecutorHandle,
-    decode_single_output,
-    encode_single_input,
-    execute_request,
-};
+use test_utils::{ExecutorHandle, decode_single_output, encode_single_input, execute_request};
 
 const PAYLOAD_LEN: usize = 100;
 const ITERATIONS: usize = 10;
diff --git a/tests/huntsman/task-executor/tests/test_executor.rs b/tests/huntsman/task-executor/tests/test_executor.rs
index e2eb8ec4..cd91c1d6 100644
--- a/tests/huntsman/task-executor/tests/test_executor.rs
+++ b/tests/huntsman/task-executor/tests/test_executor.rs
@@ -8,7 +8,7 @@ use spider_task_executor::{
     protocol::{ExecutorOutcome, Response},
 };
 use spider_tdl::TdlError;
-use task_executor_tests::{
+use test_utils::{
     ExecutorHandle,
     decode_single_output,
     encode_no_inputs,
diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs
index 7bc5d332..7983285b 100644
--- a/tests/huntsman/task-executor/tests/test_process_pool.rs
+++ b/tests/huntsman/task-executor/tests/test_process_pool.rs
@@ -28,7 +28,13 @@ use spider_execution_manager::process_pool::{
 };
 use spider_task_executor::ExecutorError;
 use spider_tdl::TdlError;
-use task_executor_tests::{PACKAGE_NAME, decode_single_output, task_executor_bin, tdl_package_dir};
+use test_utils::{
+    PACKAGE_NAME,
+    decode_single_output,
+    single_input,
+    task_executor_bin,
+    tdl_package_dir,
+};
 
 /// Generous timeout for tasks expected to finish quickly.
 const NORMAL_TIMEOUT: Duration = Duration::from_secs(5);
@@ -89,25 +95,6 @@ fn make_request(task_func: &str, inputs: Vec<TaskInput>) -> ExecuteRequest {
     }
 }
 
-/// Wraps `value` into a single-payload input list.
-///
-/// # Type Parameters
-///
-/// * `T` - The Serde-serializable value type carried as the task's single input.
-///
-/// # Returns
-///
-/// A `Vec<TaskInput>` of length 1 carrying the msgpack-encoded `value`.
-///
-/// # Panics
-///
-/// Panics if msgpack encoding fails.
-fn single_input<T: serde::Serialize>(value: &T) -> Vec<TaskInput> {
-    vec![TaskInput::ValuePayload(
-        rmp_serde::to_vec(value).expect("msgpack encode input"),
-    )]
-}
-
 #[tokio::test]
 #[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
 async fn fibonacci_succeeds() {
diff --git a/tests/huntsman/test-utils/Cargo.toml b/tests/huntsman/test-utils/Cargo.toml
new file mode 100644
index 00000000..cd477a15
--- /dev/null
+++ b/tests/huntsman/test-utils/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "test-utils"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[lib]
+name = "test_utils"
+path = "src/lib.rs"
+
+[dependencies]
+async-trait = "0.1.89"
+bincode = "1.3.3"
+bytes = "1.10"
+futures-util = {
+  version = "0.3.31",
+  default-features = false,
+  features = ["sink", "std"]
+}
+rmp-serde = "1.3.1"
+serde = "1.0.228"
+spider-core = { path = "../../../components/spider-core" }
+spider-execution-manager = {
+  path = "../../../components/spider-execution-manager"
+}
+spider-task-executor = { path = "../../../components/spider-task-executor" }
+spider-tdl = { path = "../../../components/spider-tdl" }
+tokio = {
+  version = "1.50.0",
+  features = ["io-util", "macros", "process", "rt", "sync", "time"]
+}
+tokio-util = { version = "0.7", features = ["codec"] }
diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs
new file mode 100644
index 00000000..43ae646f
--- /dev/null
+++ b/tests/huntsman/test-utils/src/executor.rs
@@ -0,0 +1,297 @@
+//! Executor subprocess harness plus the TDL wire-payload helpers the integration suites share.
+//!
+//! [`ExecutorHandle`] spawns the `spider-task-executor` binary as a child process, frames bincode
+//! requests on its stdin and reads bincode responses from its stdout — the exact wire protocol of
+//! [`spider_task_executor::protocol`].
+//!
+//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are
+//! infrastructure, not production code, and the panic message + backtrace is more useful at the
+//! failure site than threading an error type through every helper.
+//!
+//! Environment:
+//!
+//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary.
+//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to
+//!   the child verbatim.
+
+use std::{path::PathBuf, process::Stdio};
+
+use bytes::Bytes;
+use futures_util::{SinkExt, StreamExt};
+use spider_core::{
+    task::TdlContext,
+    types::{
+        id::{JobId, ResourceGroupId, TaskId},
+        io::TaskInput,
+    },
+};
+use spider_task_executor::protocol::{Request, Response};
+use spider_tdl::{
+    TaskContext,
+    wire::{TaskInputsSerializer, TaskOutputsSerializer},
+};
+use tokio::process::{Child, ChildStdin, ChildStdout, Command};
+use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec};
+
+/// The TDL package name registered by `integration-test-tasks`.
+pub const PACKAGE_NAME: &str = "integration_test_tasks";
+
+/// One running executor subprocess plus framed handles to its stdin / stdout.
+///
+/// The subprocess will be killed when the handle is dropped.
+pub struct ExecutorHandle {
+    child: Child,
+    requests: FramedWrite<ChildStdin, LengthDelimitedCodec>,
+    responses: FramedRead<ChildStdout, LengthDelimitedCodec>,
+}
+
+impl ExecutorHandle {
+    /// Spawns the executor binary with `SPIDER_TDL_PACKAGE_DIR` set; the child inherits the
+    /// parent's stderr so panic / abort messages surface in the test log.
+    ///
+    /// # Returns
+    ///
+    /// A handle owning the running subprocess and framed I/O.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the binary cannot be spawned or its stdio handles cannot be claimed.
+    #[must_use]
+    pub fn spawn() -> Self {
+        let mut child = Command::new(task_executor_bin())
+            .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir())
+            .stdin(Stdio::piped())
+            .stdout(Stdio::piped())
+            .stderr(Stdio::inherit())
+            .kill_on_drop(true)
+            .spawn()
+            .expect("spawn executor binary");
+        let stdin = child.stdin.take().expect("stdin must be piped");
+        let stdout = child.stdout.take().expect("stdout must be piped");
+        Self {
+            child,
+            requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()),
+            responses: FramedRead::new(stdout, LengthDelimitedCodec::new()),
+        }
+    }
+
+    /// Bincode-serializes `req` and writes one length-delimited frame to the executor's stdin.
+    ///
+    /// # Panics
+    ///
+    /// Panics if encoding fails or the stdin pipe cannot be written.
+    pub async fn send(&mut self, req: &Request) {
+        let bytes = bincode::serialize(req).expect("bincode encode Request");
+        self.requests
+            .send(Bytes::from(bytes))
+            .await
+            .expect("write request frame");
+    }
+
+    /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it.
+    ///
+    /// # Returns
+    ///
+    /// The next [`Response`] from the executor.
+    ///
+    /// # Panics
+    ///
+    /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails.
+    pub async fn recv(&mut self) -> Response {
+        let frame = self
+            .responses
+            .next()
+            .await
+            .expect("executor closed stdout before reply")
+            .expect("read response frame");
+        bincode::deserialize(&frame).expect("bincode decode Response")
+    }
+
+    /// Reads at most one length-delimited frame, tolerating a clean EOF (which crash-path tests
+    /// rely on to detect that the executor died).
+    ///
+    /// # Returns
+    ///
+    /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails.
+    pub async fn try_recv(&mut self) -> Option<Response> {
+        let frame = self.responses.next().await?;
+        let bytes = frame.expect("read response frame");
+        Some(bincode::deserialize(&bytes).expect("bincode decode Response"))
+    }
+
+    /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly.
+    ///
+    /// # Panics
+    ///
+    /// Panics if waiting on the child fails or the child exits non-zero.
+    pub async fn shutdown_clean(mut self) {
+        self.send(&Request::Shutdown).await;
+        // Close the stdin pipe so the child sees EOF after `Shutdown` is drained.
+        drop(self.requests);
+        let status = self.child.wait().await.expect("wait for executor");
+        assert!(status.success(), "executor exited with status {status:?}");
+    }
+
+    /// Closes stdin and waits for the child to exit. Used by crash-path tests that don't expect
+    /// a clean shutdown.
+    ///
+    /// # Returns
+    ///
+    /// The child's [`ExitStatus`](std::process::ExitStatus).
+    ///
+    /// # Panics
+    ///
+    /// Panics if waiting on the child fails.
+    pub async fn wait_for_exit(mut self) -> std::process::ExitStatus {
+        drop(self.requests);
+        self.child.wait().await.expect("wait for executor")
+    }
+}
+
+/// # Returns
+///
+/// The absolute path of the `spider-task-executor` binary, read from `SPIDER_TASK_EXECUTOR_BIN`.
+///
+/// # Panics
+///
+/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset.
+#[must_use]
+pub fn task_executor_bin() -> PathBuf {
+    std::env::var_os("SPIDER_TASK_EXECUTOR_BIN")
+        .map(PathBuf::from)
+        .expect("SPIDER_TASK_EXECUTOR_BIN env var not set")
+}
+
+/// # Returns
+///
+/// The TDL package staging directory, read from `SPIDER_TDL_PACKAGE_DIR`.
+///
+/// # Panics
+///
+/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset.
+#[must_use]
+pub fn tdl_package_dir() -> PathBuf {
+    std::env::var_os("SPIDER_TDL_PACKAGE_DIR")
+        .map(PathBuf::from)
+        .expect("SPIDER_TDL_PACKAGE_DIR env var not set")
+}
+
+/// # Returns
+///
+/// A placeholder msgpack-encoded [`TaskContext`] suitable for a one-shot test invocation. The id
+/// fields are fresh per call but the executor doesn't inspect them.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding fails.
+#[must_use]
+pub fn build_ctx() -> Vec<u8> {
+    let ctx = TaskContext {
+        job_id: JobId::new(),
+        task_id: TaskId::new(),
+        task_instance_id: 1,
+        resource_group_id: ResourceGroupId::new(),
+    };
+    rmp_serde::to_vec(&ctx).expect("serialize TaskContext")
+}
+
+/// Wraps `value` into a single-payload [`TaskInput`] list — the shape carried in
+/// [`spider_core::types::io::ExecutionContext::inputs`] for a single-argument task.
+///
+/// # Type Parameters
+///
+/// * `ValueType` - The Serde-serializable value type carried as the task's single input.
+///
+/// # Returns
+///
+/// A [`Vec<TaskInput>`] of length 1 holding the msgpack-encoded `value`.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding fails.
+#[must_use]
+pub fn single_input<ValueType: serde::Serialize>(value: &ValueType) -> Vec<TaskInput> {
+    vec![TaskInput::ValuePayload(
+        rmp_serde::to_vec(value).expect("msgpack encode input"),
+    )]
+}
+
+/// # Type Parameters
+///
+/// * `ValueType` - The Serde-serializable value type passed as the task's single input.
+///
+/// # Returns
+///
+/// A wire-format buffer carrying one [`TaskInput::ValuePayload`] holding the msgpack-encoded
+/// `value` — i.e. the same shape the parent ships for a single-argument task.
+///
+/// # Panics
+///
+/// Panics if msgpack encoding or wire-format append fails.
+#[must_use]
+pub fn encode_single_input<ValueType: serde::Serialize>(value: &ValueType) -> Vec<u8> {
+    let mut inputs = TaskInputsSerializer::new();
+    inputs
+        .append(TaskInput::ValuePayload(
+            rmp_serde::to_vec(value).expect("msgpack encode input"),
+        ))
+        .expect("append wire-format input");
+    inputs.release()
+}
+
+/// # Returns
+///
+/// A wire-format buffer carrying zero inputs — for nullary tasks like `always_fail` and
+/// `always_panic`.
+#[must_use]
+pub fn encode_no_inputs() -> Vec<u8> {
+    TaskInputsSerializer::new().release()
+}
+
+/// # Type Parameters
+///
+/// * `OutputType` - The Serde-deserializable type the output payload should decode into.
+///
+/// # Returns
+///
+/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `OutputType`.
+///
+/// # Panics
+///
+/// Panics if:
+///
+/// * The output buffer doesn't contain exactly one value.
+/// * The msgpack decoding fails.
+#[must_use]
+pub fn decode_single_output<OutputType: serde::de::DeserializeOwned>(
+    output_bytes: &[u8],
+) -> OutputType {
+    let outputs =
+        TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs");
+    assert_eq!(
+        outputs.len(),
+        1,
+        "expected exactly one output payload, got {}",
+        outputs.len(),
+    );
+    rmp_serde::from_slice(&outputs[0]).expect("msgpack decode output")
+}
+
+/// # Returns
+///
+/// A [`Request::Execute`] targeting `task_func` in the integration package.
+#[must_use]
+pub fn execute_request(task_func: &str, raw_inputs: Vec<u8>) -> Request {
+    Request::Execute {
+        tdl_context: TdlContext {
+            package: PACKAGE_NAME.to_owned(),
+            task_func: task_func.to_owned(),
+        },
+        raw_ctx: build_ctx(),
+        raw_inputs,
+    }
+}
diff --git a/tests/huntsman/test-utils/src/lib.rs b/tests/huntsman/test-utils/src/lib.rs
new file mode 100644
index 00000000..825f3628
--- /dev/null
+++ b/tests/huntsman/test-utils/src/lib.rs
@@ -0,0 +1,16 @@
+//! Shared test utilities for the huntsman integration suites.
+//!
+//! Two concern areas:
+//!
+//! * [`executor`] — the `spider-task-executor` subprocess harness ([`ExecutorHandle`]) plus the TDL
+//!   wire-payload helpers and environment readers the suites share.
+//! * [`mock`] — in-process mock implementations of the execution manager's client traits.
+//!
+//! Both modules' items are re-exported at the crate level, so tests can import them directly from
+//! `test_utils` without naming the submodule.
+
+mod executor;
+mod mock;
+
+pub use executor::*;
+pub use mock::*;
diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs
new file mode 100644
index 00000000..19122cbe
--- /dev/null
+++ b/tests/huntsman/test-utils/src/mock.rs
@@ -0,0 +1,195 @@
+//! In-process mock implementations of the execution manager's client traits.
+//!
+//! Each mock is `Clone` (internally `Arc`-backed) so the test body retains an inspection handle
+//! while the runtime owns a clone. Response queues let the test drive deterministic call sequences;
+//! inboxes record every call so assertions can be made.
+
+use std::{
+    collections::VecDeque,
+    net::IpAddr,
+    sync::{
+        Arc,
+        Mutex,
+        MutexGuard,
+        PoisonError,
+        atomic::{AtomicU64, Ordering},
+    },
+    time::Duration,
+};
+
+use async_trait::async_trait;
+use spider_core::types::id::{ExecutionManagerId, SessionId};
+use spider_execution_manager::client::{
+    LivenessClient,
+    LivenessResponseError,
+    RegistrationResponse,
+};
+use tokio::sync::Notify;
+
+/// Mock [`LivenessClient`].
+#[derive(Clone)]
+pub struct MockLiveness {
+    inner: Arc<LivenessInner>,
+}
+
+impl MockLiveness {
+    /// Factory function.
+    ///
+    /// # Returns
+    ///
+    /// A fresh liveness mock with a freshly generated `em_id`, initial session 1, and Ok(1)
+    /// heartbeats by default.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::with_initial_session(1)
+    }
+
+    /// Factory function.
+    ///
+    /// # Returns
+    ///
+    /// A fresh liveness mock with the given initial session id (used both for the registration
+    /// response and as the default heartbeat reply).
+    #[must_use]
+    pub fn with_initial_session(initial_session: SessionId) -> Self {
+        Self {
+            inner: Arc::new(LivenessInner {
+                em_id: ExecutionManagerId::new(),
+                initial_session: AtomicU64::new(initial_session),
+                register_response: Mutex::new(None),
+                heartbeat_responses: Mutex::new(VecDeque::new()),
+                default_session: AtomicU64::new(initial_session),
+                register_calls: Mutex::new(Vec::new()),
+                heartbeat_count: AtomicU64::new(0),
+                heartbeat_notify: Notify::new(),
+            }),
+        }
+    }
+
+    /// Overrides the registration response. By default `register` returns
+    /// `Ok(RegistrationResponse { em_id, session_id: initial_session })`.
+    pub fn set_register_response(
+        &self,
+        response: Result<RegistrationResponse, LivenessResponseError>,
+    ) {
+        *lock(&self.inner.register_response) = Some(response);
+    }
+
+    /// Updates the fallback session id returned by `heartbeat` when the response queue is empty.
+    pub fn set_default_heartbeat_session(&self, session: SessionId) {
+        self.inner.default_session.store(session, Ordering::Relaxed);
+    }
+
+    /// Queues `response` for the next `heartbeat` call (takes priority over the default session).
+    pub fn push_heartbeat_response(&self, response: Result<SessionId, LivenessResponseError>) {
+        lock(&self.inner.heartbeat_responses).push_back(response);
+    }
+
+    /// # Returns
+    ///
+    /// The `em_id` baked into this mock — the same value the runtime sees through
+    /// [`LivenessClient::register`].
+    #[must_use]
+    pub fn em_id(&self) -> ExecutionManagerId {
+        self.inner.em_id
+    }
+
+    /// # Returns
+    ///
+    /// The number of `heartbeat` calls observed.
+    #[must_use]
+    pub fn heartbeat_count(&self) -> u64 {
+        self.inner.heartbeat_count.load(Ordering::Relaxed)
+    }
+
+    /// # Returns
+    ///
+    /// The list of IPs passed to `register`.
+    #[must_use]
+    pub fn register_calls(&self) -> Vec<IpAddr> {
+        lock(&self.inner.register_calls).clone()
+    }
+
+    /// Waits until at least `target` heartbeats have been observed, bounded by `timeout`.
+    ///
+    /// # Returns
+    ///
+    /// `true` if the threshold was reached, `false` if `timeout` elapsed first.
+    pub async fn wait_for_heartbeats(&self, target: u64, timeout: Duration) -> bool {
+        let deadline = tokio::time::Instant::now() + timeout;
+        loop {
+            let notified = self.inner.heartbeat_notify.notified(); // Arm first: no missed wakeup.
+            if self.heartbeat_count() >= target {
+                return true;
+            }
+            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+            if remaining.is_zero() {
+                return false;
+            }
+            tokio::select! {
+                () = notified => {}
+                () = tokio::time::sleep(remaining.min(POLL_INTERVAL)) => {}
+            }
+        }
+    }
+}
+
+impl Default for MockLiveness {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl LivenessClient for MockLiveness {
+    async fn register(&self, ip: IpAddr) -> Result<RegistrationResponse, LivenessResponseError> {
+        lock(&self.inner.register_calls).push(ip);
+        let programmed = lock(&self.inner.register_response).take();
+        if let Some(response) = programmed {
+            return response;
+        }
+        Ok(RegistrationResponse {
+            em_id: self.inner.em_id,
+            session_id: self.inner.initial_session.load(Ordering::Relaxed),
+        })
+    }
+
+    async fn heartbeat(
+        &self,
+        _em_id: ExecutionManagerId,
+    ) -> Result<SessionId, LivenessResponseError> {
+        self.inner.heartbeat_count.fetch_add(1, Ordering::Relaxed);
+        self.inner.heartbeat_notify.notify_waiters();
+        let queued = lock(&self.inner.heartbeat_responses).pop_front();
+        queued.unwrap_or_else(|| Ok(self.inner.default_session.load(Ordering::Relaxed)))
+    }
+}
+
+/// Default polling interval for wait helpers such as `wait_for_heartbeats`; keeps tests snappy.
+const POLL_INTERVAL: Duration = Duration::from_millis(5);
+
+/// Shared state behind [`MockLiveness`].
+struct LivenessInner {
+    em_id: ExecutionManagerId,
+    initial_session: AtomicU64,
+    register_response: Mutex<Option<Result<RegistrationResponse, LivenessResponseError>>>,
+    heartbeat_responses: Mutex<VecDeque<Result<SessionId, LivenessResponseError>>>,
+    default_session: AtomicU64,
+    register_calls: Mutex<Vec<IpAddr>>,
+    heartbeat_count: AtomicU64,
+    heartbeat_notify: Notify,
+}
+
+/// Acquires `mutex`, silently recovering from poisoning so the helpers never panic from a peer
+/// test's failure.
+///
+/// # Type Parameters
+///
+/// * `InnerType` - The type wrapped by `mutex`.
+///
+/// # Returns
+///
+/// A [`MutexGuard`] over `mutex`'s contents.
+fn lock<InnerType>(mutex: &Mutex<InnerType>) -> MutexGuard<'_, InnerType> {
+    mutex.lock().unwrap_or_else(PoisonError::into_inner)
+}

From 100129e6c0a3e7f54287f005ee60ec1ee7211950 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Fri, 5 Jun 2026 16:19:20 -0400
Subject: [PATCH 05/14] refactor(huntsman): Unify `TaskId` by replacing
 `spider-core`'s definition with `spider-storage`'s. (#331)

---
 components/spider-core/src/types/id.rs        | 47 ++++++-------------
 components/spider-storage/src/cache.rs        | 15 ------
 components/spider-storage/src/cache/job.rs    |  3 +-
 .../spider-storage/src/task_instance_pool.rs  |  3 +-
 .../spider-storage/tests/scheduling_infra.rs  |  3 +-
 components/spider-tdl/src/task.rs             |  2 +-
 components/spider-tdl/src/task_context.rs     |  2 +-
 .../spider-tdl/tests/test_task_macro.rs       |  4 +-
 .../task-executor/tests/test_process_pool.rs  |  2 +-
 .../huntsman/tdl-integration/tests/complex.rs |  2 +-
 tests/huntsman/test-utils/src/executor.rs     |  2 +-
 11 files changed, 24 insertions(+), 61 deletions(-)

diff --git a/components/spider-core/src/types/id.rs b/components/spider-core/src/types/id.rs
index 21821e7e..4735f798 100644
--- a/components/spider-core/src/types/id.rs
+++ b/components/spider-core/src/types/id.rs
@@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize};
 use sqlx::{Database, encode::IsNull};
 use uuid::Uuid;
 
+use crate::task::TaskIndex;
+
 /// A generic identifier type that wraps a UUID and a type marker.
 ///
 /// # Type Parameters:
@@ -96,9 +98,18 @@ pub type UuidBytes = uuid::Bytes;
 pub enum ResourceGroupIdMarker {}
 pub type ResourceGroupId = Id<ResourceGroupIdMarker>;
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum TaskIdMarker {}
-pub type TaskId = Id<TaskIdMarker>;
+/// Identifier of a task inside a job.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum TaskId {
+    /// The index of the task in the job's task graph.
+    Index(TaskIndex),
+
+    /// The commit task.
+    Commit,
+
+    /// The cleanup task.
+    Cleanup,
+}
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum JobIdMarker {}
@@ -169,33 +180,3 @@ where
 }
 
 pub type SignedJobId = SignedId<JobIdMarker>;
-
-pub type SignedTaskId = SignedId<TaskIdMarker>;
-
-#[cfg(test)]
-mod tests {
-    use std::any::TypeId;
-
-    use super::*;
-
-    #[test]
-    fn test_id_basic() {
-        let id = TaskId::new();
-        let underlying_uuid = id.as_uuid_ref().to_owned();
-        assert_eq!(id, TaskId::from(underlying_uuid));
-
-        assert_ne!(TypeId::of::<TaskId>(), TypeId::of::<JobId>());
-    }
-
-    #[test]
-    fn task_id_json_roundtrip() {
-        let id = TaskId::new();
-        let deserialized_id: TaskId = serde_json::from_str(
-            serde_json::to_string(&id)
-                .expect("JSON serialization failure")
-                .as_str(),
-        )
-        .expect("JSON deserialization failure");
-        assert_eq!(id, deserialized_id);
-    }
-}
diff --git a/components/spider-storage/src/cache.rs b/components/spider-storage/src/cache.rs
index d520f519..89a5e13d 100644
--- a/components/spider-storage/src/cache.rs
+++ b/components/spider-storage/src/cache.rs
@@ -1,21 +1,6 @@
-use spider_core::task::TaskIndex;
-
 pub mod error;
 pub mod io;
 pub mod job;
 pub mod job_submission;
 mod sync;
 pub mod task;
-
-/// Identifier of a task inside a job.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum TaskId {
-    /// The index of the task in the job's task graph.
-    Index(TaskIndex),
-
-    /// The commit task.
-    Commit,
-
-    /// The cleanup task.
-    Cleanup,
-}
diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs
index 5c575e8e..c5a06ccb 100644
--- a/components/spider-storage/src/cache/job.rs
+++ b/components/spider-storage/src/cache/job.rs
@@ -10,7 +10,7 @@ use spider_core::{
     job::JobState,
     task::{TaskIndex, TaskState},
     types::{
-        id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId},
+        id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId},
         io::{ExecutionContext, TaskOutput},
     },
 };
@@ -18,7 +18,6 @@ use tokio::sync::{RwLockReadGuard, RwLockWriteGuard};
 
 use crate::{
     cache::{
-        TaskId,
         error::{CacheError, InternalError, InternalError::UnexpectedJobState, StaleStateError},
         job_submission::ValidatedJobSubmission,
         task::TaskGraph,
diff --git a/components/spider-storage/src/task_instance_pool.rs b/components/spider-storage/src/task_instance_pool.rs
index ace45ce6..bba0cf77 100644
--- a/components/spider-storage/src/task_instance_pool.rs
+++ b/components/spider-storage/src/task_instance_pool.rs
@@ -23,12 +23,11 @@ use std::{
 };
 
 use async_trait::async_trait;
-use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId};
+use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId};
 use tokio::sync::mpsc;
 
 use crate::{
     cache::{
-        TaskId,
         error::InternalError,
         task::{SharedTaskControlBlock, SharedTerminationTaskControlBlock},
     },
diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs
index d3e5eb98..046a35eb 100644
--- a/components/spider-storage/tests/scheduling_infra.rs
+++ b/components/spider-storage/tests/scheduling_infra.rs
@@ -87,13 +87,12 @@ use spider_core::{
     job::JobState,
     task::TaskIndex,
     types::{
-        id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId},
+        id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId},
         io::{ExecutionContext, TaskOutput},
     },
 };
 use spider_storage::{
     cache::{
-        TaskId,
         error::{CacheError, InternalError},
         job::SharedJobControlBlock,
         job_submission::ValidatedJobSubmission,
diff --git a/components/spider-tdl/src/task.rs b/components/spider-tdl/src/task.rs
index 99ca904d..d4015e0c 100644
--- a/components/spider-tdl/src/task.rs
+++ b/components/spider-tdl/src/task.rs
@@ -254,7 +254,7 @@ mod tests {
     fn make_encoded_ctx() -> Vec<u8> {
         let ctx = TaskContext {
             job_id: JobId::new(),
-            task_id: TaskId::new(),
+            task_id: TaskId::Index(0),
             task_instance_id: 1,
             resource_group_id: ResourceGroupId::new(),
         };
diff --git a/components/spider-tdl/src/task_context.rs b/components/spider-tdl/src/task_context.rs
index 60348315..d412bdb4 100644
--- a/components/spider-tdl/src/task_context.rs
+++ b/components/spider-tdl/src/task_context.rs
@@ -31,7 +31,7 @@ mod tests {
     fn round_trip_msgpack() -> anyhow::Result<()> {
         let ctx = TaskContext {
             job_id: JobId::new(),
-            task_id: TaskId::new(),
+            task_id: TaskId::Index(0),
             task_instance_id: 13,
             resource_group_id: ResourceGroupId::new(),
         };
diff --git a/components/spider-tdl/tests/test_task_macro.rs b/components/spider-tdl/tests/test_task_macro.rs
index e2a070fe..9a891f19 100644
--- a/components/spider-tdl/tests/test_task_macro.rs
+++ b/components/spider-tdl/tests/test_task_macro.rs
@@ -81,7 +81,7 @@ fn translate(_ctx: TaskContext, p: Point, dx: int32, dy: int32) -> Result<(Point
 fn make_encoded_ctx() -> Vec<u8> {
     let ctx = TaskContext {
         job_id: JobId::new(),
-        task_id: TaskId::new(),
+        task_id: TaskId::Index(0),
         task_instance_id: 1,
         resource_group_id: ResourceGroupId::new(),
     };
@@ -303,7 +303,7 @@ fn direct_execute_call_round_trips() -> anyhow::Result<()> {
 
     let ctx = TaskContext {
         job_id: JobId::new(),
-        task_id: TaskId::new(),
+        task_id: TaskId::Index(0),
         task_instance_id: 1,
         resource_group_id: ResourceGroupId::new(),
     };
diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs
index 7983285b..e646352f 100644
--- a/tests/huntsman/task-executor/tests/test_process_pool.rs
+++ b/tests/huntsman/task-executor/tests/test_process_pool.rs
@@ -78,7 +78,7 @@ fn build_pool() -> ProcessPool {
 fn make_request(task_func: &str, inputs: Vec<TaskInput>) -> ExecuteRequest {
     ExecuteRequest {
         job_id: JobId::new(),
-        task_id: TaskId::new(),
+        task_id: TaskId::Index(0),
         resource_group_id: ResourceGroupId::new(),
         ctx: ExecutionContext {
             task_instance_id: 1,
diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs
index 513e7d75..0e2bc7d5 100644
--- a/tests/huntsman/tdl-integration/tests/complex.rs
+++ b/tests/huntsman/tdl-integration/tests/complex.rs
@@ -33,7 +33,7 @@ fn lib_path() -> std::path::PathBuf {
 fn encode_ctx() -> Vec<u8> {
     let ctx = TaskContext {
         job_id: JobId::new(),
-        task_id: TaskId::new(),
+        task_id: TaskId::Index(0),
         task_instance_id: 1,
         resource_group_id: ResourceGroupId::new(),
     };
diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs
index 43ae646f..24f8db5f 100644
--- a/tests/huntsman/test-utils/src/executor.rs
+++ b/tests/huntsman/test-utils/src/executor.rs
@@ -192,7 +192,7 @@ pub fn tdl_package_dir() -> PathBuf {
 pub fn build_ctx() -> Vec<u8> {
     let ctx = TaskContext {
         job_id: JobId::new(),
-        task_id: TaskId::new(),
+        task_id: TaskId::Index(0),
         task_instance_id: 1,
         resource_group_id: ResourceGroupId::new(),
     };

From d95057fc5fc4d2e12d4616d4969df1c5cb697f92 Mon Sep 17 00:00:00 2001
From: sitaowang1998 <sitaowang1998@outlook.com>
Date: Sat, 6 Jun 2026 14:50:50 -0400
Subject: [PATCH 06/14] refactor(spider-huntsman): Use auto-incrementing u64
 IDs instead of UUIDv7 for database-generated IDs. (#337)

Co-authored-by: LinZhihao-723 <pleiades3190@gmail.com>
---
 Cargo.lock                                    | 266 +-----------------
 components/spider-core/Cargo.toml             |   4 +-
 components/spider-core/src/types/id.rs        | 126 ++++++---
 .../spider-execution-manager/src/liveness.rs  |   2 +-
 .../src/process_pool.rs                       |   8 +-
 components/spider-storage/Cargo.toml          |   2 -
 components/spider-storage/src/db/mariadb.rs   |  29 +-
 .../spider-storage/src/state/job_cache.rs     |  16 +-
 .../spider-storage/src/task_instance_pool.rs  |  20 +-
 .../spider-storage/tests/mariadb_infra.rs     |   2 +-
 .../spider-storage/tests/mariadb_test.rs      |  30 +-
 .../spider-storage/tests/scheduling_infra.rs  |   4 +-
 components/spider-tdl/src/task.rs             |   4 +-
 components/spider-tdl/src/task_context.rs     |   4 +-
 .../spider-tdl/tests/test_task_macro.rs       |   8 +-
 .../task-executor/tests/test_process_pool.rs  |   8 +-
 .../huntsman/tdl-integration/tests/complex.rs |   4 +-
 tests/huntsman/test-utils/src/executor.rs     |   4 +-
 tests/huntsman/test-utils/src/mock.rs         |   2 +-
 19 files changed, 166 insertions(+), 377 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bb6a1c31..d6ac6cd1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -100,12 +100,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "bumpalo"
-version = "3.20.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
-
 [[package]]
 name = "bytecount"
 version = "0.6.9"
@@ -497,21 +491,8 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
  "libc",
- "r-efi 5.3.0",
- "wasip2",
-]
-
-[[package]]
-name = "getrandom"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
-dependencies = [
- "cfg-if",
- "libc",
- "r-efi 6.0.0",
+ "r-efi",
  "wasip2",
- "wasip3",
 ]
 
 [[package]]
@@ -684,12 +665,6 @@ dependencies = [
  "zerovec",
 ]
 
-[[package]]
-name = "id-arena"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
-
 [[package]]
 name = "idna"
 version = "1.1.0"
@@ -719,8 +694,6 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
  "hashbrown 0.17.0",
- "serde",
- "serde_core",
 ]
 
 [[package]]
@@ -737,18 +710,6 @@ version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
 
-[[package]]
-name = "js-sys"
-version = "0.3.97"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf"
-dependencies = [
- "cfg-if",
- "futures-util",
- "once_cell",
- "wasm-bindgen",
-]
-
 [[package]]
 name = "konst"
 version = "0.2.20"
@@ -773,12 +734,6 @@ dependencies = [
  "spin",
 ]
 
-[[package]]
-name = "leb128fmt"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
-
 [[package]]
 name = "libc"
 version = "0.2.186"
@@ -1054,16 +1009,6 @@ dependencies = [
  "zerocopy",
 ]
 
-[[package]]
-name = "prettyplease"
-version = "0.2.37"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
-dependencies = [
- "proc-macro2",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "proc-macro-error-attr2"
 version = "2.0.0"
@@ -1110,12 +1055,6 @@ version = "5.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
 
-[[package]]
-name = "r-efi"
-version = "6.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
-
 [[package]]
 name = "rand"
 version = "0.8.6"
@@ -1249,12 +1188,6 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "rustversion"
-version = "1.0.22"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
-
 [[package]]
 name = "ryu"
 version = "1.0.23"
@@ -1461,6 +1394,7 @@ name = "spider-core"
 version = "0.1.0"
 dependencies = [
  "non-empty-string",
+ "rand 0.9.4",
  "rmp-serde",
  "semver",
  "serde",
@@ -1471,7 +1405,6 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-util",
- "uuid",
 ]
 
 [[package]]
@@ -1524,7 +1457,6 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-util",
- "uuid",
 ]
 
 [[package]]
@@ -1633,7 +1565,6 @@ dependencies = [
  "tokio-stream",
  "tracing",
  "url",
- "uuid",
 ]
 
 [[package]]
@@ -1713,7 +1644,6 @@ dependencies = [
  "stringprep",
  "thiserror",
  "tracing",
- "uuid",
  "whoami",
 ]
 
@@ -1751,7 +1681,6 @@ dependencies = [
  "stringprep",
  "thiserror",
  "tracing",
- "uuid",
  "whoami",
 ]
 
@@ -1777,7 +1706,6 @@ dependencies = [
  "thiserror",
  "tracing",
  "url",
- "uuid",
 ]
 
 [[package]]
@@ -2165,18 +2093,6 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
 
-[[package]]
-name = "uuid"
-version = "1.23.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
-dependencies = [
- "getrandom 0.4.2",
- "js-sys",
- "serde_core",
- "wasm-bindgen",
-]
-
 [[package]]
 name = "valuable"
 version = "0.1.1"
@@ -2207,16 +2123,7 @@ version = "1.0.3+wasi-0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
 dependencies = [
- "wit-bindgen 0.57.1",
-]
-
-[[package]]
-name = "wasip3"
-version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
-dependencies = [
- "wit-bindgen 0.51.0",
+ "wit-bindgen",
 ]
 
 [[package]]
@@ -2225,85 +2132,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
 
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.120"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1"
-dependencies = [
- "cfg-if",
- "once_cell",
- "rustversion",
- "wasm-bindgen-macro",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.120"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.120"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41"
-dependencies = [
- "bumpalo",
- "proc-macro2",
- "quote",
- "syn 2.0.117",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.120"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea"
-dependencies = [
- "unicode-ident",
-]
-
-[[package]]
-name = "wasm-encoder"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
-dependencies = [
- "leb128fmt",
- "wasmparser",
-]
-
-[[package]]
-name = "wasm-metadata"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
-dependencies = [
- "anyhow",
- "indexmap",
- "wasm-encoder",
- "wasmparser",
-]
-
-[[package]]
-name = "wasmparser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
-dependencies = [
- "bitflags",
- "hashbrown 0.15.5",
- "indexmap",
- "semver",
-]
-
 [[package]]
 name = "whoami"
 version = "1.6.1"
@@ -2417,100 +2245,12 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
 
-[[package]]
-name = "wit-bindgen"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
-dependencies = [
- "wit-bindgen-rust-macro",
-]
-
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
 
-[[package]]
-name = "wit-bindgen-core"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
-dependencies = [
- "anyhow",
- "heck",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-bindgen-rust"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
-dependencies = [
- "anyhow",
- "heck",
- "indexmap",
- "prettyplease",
- "syn 2.0.117",
- "wasm-metadata",
- "wit-bindgen-core",
- "wit-component",
-]
-
-[[package]]
-name = "wit-bindgen-rust-macro"
-version = "0.51.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
-dependencies = [
- "anyhow",
- "prettyplease",
- "proc-macro2",
- "quote",
- "syn 2.0.117",
- "wit-bindgen-core",
- "wit-bindgen-rust",
-]
-
-[[package]]
-name = "wit-component"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
-dependencies = [
- "anyhow",
- "bitflags",
- "indexmap",
- "log",
- "serde",
- "serde_derive",
- "serde_json",
- "wasm-encoder",
- "wasm-metadata",
- "wasmparser",
- "wit-parser",
-]
-
-[[package]]
-name = "wit-parser"
-version = "0.244.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
-dependencies = [
- "anyhow",
- "id-arena",
- "indexmap",
- "log",
- "semver",
- "serde",
- "serde_derive",
- "serde_json",
- "unicode-xid",
- "wasmparser",
-]
-
 [[package]]
 name = "writeable"
 version = "0.6.3"
diff --git a/components/spider-core/Cargo.toml b/components/spider-core/Cargo.toml
index 87531aaa..7167cfde 100644
--- a/components/spider-core/Cargo.toml
+++ b/components/spider-core/Cargo.toml
@@ -9,15 +9,15 @@ path = "src/lib.rs"
 
 [dependencies]
 non-empty-string = { version = "0.2.6", features = ["serde"] }
+rand = "0.9.1"
 rmp-serde = "1.3.1"
 semver = "1.0.27"
 serde = { version = "1.0.228", features = ["derive"] }
 serde_json = "1.0.149"
 spider-derive = { path = "../spider-derive" }
-sqlx = { version = "0.8.6", features = ["mysql", "uuid"] }
+sqlx = { version = "0.8.6", features = ["mysql"] }
 strum = { version = "0.28.0", features = ["derive"] }
 thiserror = "2.0.18"
-uuid = { version = "1.19.0", features = ["serde", "v4"] }
 
 [dev-dependencies]
 tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread"] }
diff --git a/components/spider-core/src/types/id.rs b/components/spider-core/src/types/id.rs
index 4735f798..e205d27e 100644
--- a/components/spider-core/src/types/id.rs
+++ b/components/spider-core/src/types/id.rs
@@ -1,12 +1,14 @@
-use std::{fmt::Debug, marker::PhantomData};
+use std::{
+    fmt::{Debug, Display},
+    marker::PhantomData,
+};
 
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use sqlx::{Database, encode::IsNull};
-use uuid::Uuid;
 
 use crate::task::TaskIndex;
 
-/// A generic identifier type that wraps a UUID and a type marker.
+/// A generic identifier type that wraps a numeric ID and a type marker.
 ///
 /// # Type Parameters:
 ///
@@ -15,84 +17,109 @@ use crate::task::TaskIndex;
 /// # Examples
 ///
 /// ```rust
+/// use spider_core::types::id::Id;
+///
 /// #[derive(Debug, PartialEq, Eq)]
 /// enum SomeTypeIdMarker {}
 /// type SomeTypeId = Id<SomeTypeIdMarker>;
 /// ```
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub struct Id<TypeMarker: Debug + PartialEq + Eq>(Uuid, PhantomData<TypeMarker>);
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Id<TypeMarker: Debug + PartialEq + Eq> {
+    raw: u64,
+    _marker: PhantomData<TypeMarker>,
+}
 
 impl<TypeMarker: Debug + PartialEq + Eq> Default for Id<TypeMarker> {
     fn default() -> Self {
-        Self::new()
+        Self::from(0)
     }
 }
 
 impl<TypeMarker: Debug + PartialEq + Eq> Id<TypeMarker> {
+    /// Creates a random ID for tests.
+    ///
+    /// Production IDs should be assigned by persistent storage instead.
     #[must_use]
-    pub fn new() -> Self {
-        Self(Uuid::new_v4(), PhantomData)
-    }
-
-    #[must_use]
-    pub const fn from(uid: Uuid) -> Self {
-        Self(uid, PhantomData)
+    pub fn random() -> Self {
+        Self::from(rand::random())
     }
 
     #[must_use]
-    pub const fn as_uuid_ref(&self) -> &Uuid {
-        &self.0
+    pub const fn from(id: u64) -> Self {
+        Self {
+            raw: id,
+            _marker: PhantomData,
+        }
     }
 
     #[must_use]
-    pub const fn as_bytes(&self) -> &UuidBytes {
-        self.0.as_bytes()
+    pub const fn get(&self) -> u64 {
+        self.raw
     }
 }
 
-impl<TypeMarker, Db> sqlx::Type<Db> for Id<TypeMarker>
+impl<TypeMarker: Debug + PartialEq + Eq, Db: Database> sqlx::Type<Db> for Id<TypeMarker>
 where
-    TypeMarker: Debug + PartialEq + Eq,
-    Db: Database,
-    Uuid: sqlx::Type<Db>,
+    u64: sqlx::Type<Db>,
 {
     fn type_info() -> <Db as Database>::TypeInfo {
-        <Uuid as sqlx::Type<Db>>::type_info()
+        <u64 as sqlx::Type<Db>>::type_info()
     }
 
     fn compatible(ty: &<Db as Database>::TypeInfo) -> bool {
-        <Uuid as sqlx::Type<Db>>::compatible(ty)
+        <u64 as sqlx::Type<Db>>::compatible(ty)
     }
 }
 
-impl<'encode, TypeMarker, Db> sqlx::Encode<'encode, Db> for Id<TypeMarker>
+impl<'encode, TypeMarker: Debug + PartialEq + Eq, Db: Database> sqlx::Encode<'encode, Db>
+    for Id<TypeMarker>
 where
-    TypeMarker: Debug + PartialEq + Eq,
-    Db: Database,
-    Uuid: sqlx::Encode<'encode, Db>,
+    u64: sqlx::Encode<'encode, Db>,
 {
     fn encode_by_ref(
         &self,
         buf: &mut <Db as Database>::ArgumentBuffer<'encode>,
     ) -> Result<IsNull, sqlx::error::BoxDynError> {
-        self.0.encode_by_ref(buf)
+        self.get().encode_by_ref(buf)
     }
 }
 
-impl<'decode, TypeMarker, Db> sqlx::Decode<'decode, Db> for Id<TypeMarker>
+impl<'decode, TypeMarker: Debug + PartialEq + Eq, Db: Database> sqlx::Decode<'decode, Db>
+    for Id<TypeMarker>
 where
-    TypeMarker: Debug + PartialEq + Eq,
-    Db: Database,
-    Uuid: sqlx::Decode<'decode, Db>,
+    u64: sqlx::Decode<'decode, Db>,
 {
     fn decode(
         value: <Db as Database>::ValueRef<'decode>,
     ) -> Result<Self, sqlx::error::BoxDynError> {
-        Uuid::decode(value).map(|uuid| Self(uuid, PhantomData))
+        u64::decode(value).map(Self::from)
+    }
+}
+
+impl<TypeMarker: Debug + PartialEq + Eq> Display for Id<TypeMarker> {
+    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&self.get(), formatter)
+    }
+}
+
+impl<TypeMarker: Debug + PartialEq + Eq> Serialize for Id<TypeMarker> {
+    fn serialize<SerializerImpl: Serializer>(
+        &self,
+        serializer: SerializerImpl,
+    ) -> Result<SerializerImpl::Ok, SerializerImpl::Error> {
+        self.get().serialize(serializer)
     }
 }
 
-pub type UuidBytes = uuid::Bytes;
+impl<'deserializer_lifetime, TypeMarker: Debug + PartialEq + Eq> Deserialize<'deserializer_lifetime>
+    for Id<TypeMarker>
+{
+    fn deserialize<DeserializerImpl: Deserializer<'deserializer_lifetime>>(
+        deserializer: DeserializerImpl,
+    ) -> Result<Self, DeserializerImpl::Error> {
+        u64::deserialize(deserializer).map(Self::from)
+    }
+}
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum ResourceGroupIdMarker {}
@@ -180,3 +207,32 @@ where
 }
 
 pub type SignedJobId = SignedId<JobIdMarker>;
+
+#[cfg(test)]
+mod tests {
+    use super::{JobId, ResourceGroupId};
+
+    #[test]
+    fn id_serializes_as_u64() {
+        let job_id = JobId::from(42);
+        let serialized =
+            serde_json::to_string(&job_id).expect("job id serialization should succeed");
+
+        assert_eq!(serialized, "42");
+    }
+
+    #[test]
+    fn distinct_id_markers_can_share_numeric_values() {
+        let job_id = JobId::from(7);
+        let resource_group_id = ResourceGroupId::from(7);
+
+        assert_eq!(job_id.get(), resource_group_id.get());
+    }
+
+    #[test]
+    fn default_id_is_zero() {
+        let job_id = JobId::default();
+
+        assert_eq!(job_id.get(), 0);
+    }
+}
diff --git a/components/spider-execution-manager/src/liveness.rs b/components/spider-execution-manager/src/liveness.rs
index 411931cd..7662ae6a 100644
--- a/components/spider-execution-manager/src/liveness.rs
+++ b/components/spider-execution-manager/src/liveness.rs
@@ -290,7 +290,7 @@ mod tests {
         cancellation_token: CancellationToken,
     ) -> (LivenessHandle, JoinHandle<()>) {
         spawn(
-            ExecutionManagerId::new(),
+            ExecutionManagerId::random(),
             client,
             tracker,
             cancellation_token,
diff --git a/components/spider-execution-manager/src/process_pool.rs b/components/spider-execution-manager/src/process_pool.rs
index fab51d53..f3703153 100644
--- a/components/spider-execution-manager/src/process_pool.rs
+++ b/components/spider-execution-manager/src/process_pool.rs
@@ -217,10 +217,10 @@ impl ProcessPool {
     fn spawn_executor(&self) -> Result<ExecutorHandle, InternalError> {
         let executor_id = self.next_executor_id.fetch_add(1, Ordering::Relaxed);
         std::fs::create_dir_all(&self.config.log_dir)?;
-        let log_path = self.config.log_dir.join(format!(
-            "{}-{executor_id}.log",
-            self.config.em_id.as_uuid_ref()
-        ));
+        let log_path = self
+            .config
+            .log_dir
+            .join(format!("{}-{executor_id}.log", self.config.em_id));
         let log_file = File::options().create(true).append(true).open(&log_path)?;
 
         let mut command = Command::new(&self.config.executor_binary_path);
diff --git a/components/spider-storage/Cargo.toml b/components/spider-storage/Cargo.toml
index f0a39b72..d57b856f 100644
--- a/components/spider-storage/Cargo.toml
+++ b/components/spider-storage/Cargo.toml
@@ -29,7 +29,6 @@ tokio = {
   version = "1.50.0",
   features = ["macros", "rt-multi-thread", "sync", "time"]
 }
-uuid = { version = "1.19.0", features = ["serde"] }
 
 [dev-dependencies]
 anyhow = "1.0.98"
@@ -38,4 +37,3 @@ serial_test = { version = "3.2.0", features = ["file_locks"] }
 tabled = "0.20.0"
 tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread", "sync"] }
 tokio-util = { version = "0.7", features = ["rt"] }
-uuid = { version = "1.19.0", features = ["v4"] }
diff --git a/components/spider-storage/src/db/mariadb.rs b/components/spider-storage/src/db/mariadb.rs
index faeda2a6..6bd7017c 100644
--- a/components/spider-storage/src/db/mariadb.rs
+++ b/components/spider-storage/src/db/mariadb.rs
@@ -102,7 +102,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector {
     ) -> Result<JobId, DbError> {
         const INSERT_QUERY: &str = formatcp!(
             "INSERT INTO `{table}` (`resource_group_id`, `serialized_task_graph`, \
-             `serialized_job_inputs`) VALUES (?, ?, ?) RETURNING CAST(`id` AS BINARY(16)) AS `id`;",
+             `serialized_job_inputs`) VALUES (?, ?, ?) RETURNING `id`;",
             table = JOBS_TABLE_NAME,
         );
 
@@ -170,8 +170,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector {
 
         let outputs_bytes = serialized_outputs.ok_or_else(|| {
             DbError::CorruptedDbState(format!(
-                "job `{}` succeeded but has no serialized outputs",
-                job_id.as_uuid_ref()
+                "job `{job_id}` succeeded but has no serialized outputs"
             ))
         })?;
         let outputs: Vec<TaskOutput> =
@@ -201,10 +200,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector {
         }
 
         let message = error_message.ok_or_else(|| {
-            DbError::CorruptedDbState(format!(
-                "job `{}` failed but has no error message",
-                job_id.as_uuid_ref()
-            ))
+            DbError::CorruptedDbState(format!("job `{job_id}` failed but has no error message"))
         })?;
         Ok(message)
     }
@@ -344,7 +340,7 @@ impl InternalJobOrchestration for MariaDbStorageConnector {
         const DELETE_BATCH_SIZE: usize = 1000;
 
         const SELECT_QUERY: &str = formatcp!(
-            "SELECT CAST(`id` AS BINARY(16)) FROM `{table}` WHERE `state` IN \
+            "SELECT `id` FROM `{table}` WHERE `state` IN \
              ('{succeeded_state}','{failed_state}','{cancelled_state}') AND `ended_at` < NOW() - \
              INTERVAL ? SECOND LIMIT {DELETE_BATCH_SIZE} FOR UPDATE;",
             table = JOBS_TABLE_NAME,
@@ -394,8 +390,7 @@ impl ResourceGroupManagement for MariaDbStorageConnector {
         password: Vec<u8>,
     ) -> Result<ResourceGroupId, DbError> {
         const QUERY: &str = formatcp!(
-            "INSERT INTO `{table}` (`external_id`, `password`) VALUES (?, ?) RETURNING CAST(`id` \
-             AS BINARY(16)) AS `id`;",
+            "INSERT INTO `{table}` (`external_id`, `password`) VALUES (?, ?) RETURNING `id`;",
             table = RESOURCE_GROUPS_TABLE_NAME,
         );
 
@@ -462,7 +457,7 @@ impl ExecutionManagerLivenessManagement for MariaDbStorageConnector {
         ip_address: IpAddr,
     ) -> Result<ExecutionManagerId, DbError> {
         const INSERT_QUERY: &str = formatcp!(
-            "INSERT INTO `{table}` (`ip_address`) VALUES (?) RETURNING CAST(`id` AS BINARY(16));",
+            "INSERT INTO `{table}` (`ip_address`) VALUES (?) RETURNING `id`;",
             table = EXECUTION_MANAGERS_TABLE_NAME,
         );
 
@@ -539,8 +534,8 @@ impl ExecutionManagerLivenessManagement for MariaDbStorageConnector {
         const UPDATE_BATCH_SIZE: usize = 1000;
 
         const SELECT_QUERY: &str = formatcp!(
-            "SELECT CAST(`id` AS BINARY(16)) FROM `{table}` WHERE `state` = '{alive_state}' AND \
-             `last_heartbeat_at` < CURRENT_TIMESTAMP - INTERVAL ? SECOND FOR UPDATE;",
+            "SELECT `id` FROM `{table}` WHERE `state` = '{alive_state}' AND `last_heartbeat_at` < \
+             CURRENT_TIMESTAMP - INTERVAL ? SECOND FOR UPDATE;",
             table = EXECUTION_MANAGERS_TABLE_NAME,
             alive_state = ExecutionManagerState::Alive.as_str(),
         );
@@ -601,7 +596,7 @@ const fn resource_groups_creation_query() -> &'static str {
     formatcp!(
         r"
 CREATE TABLE IF NOT EXISTS `{RESOURCE_GROUPS_TABLE_NAME}` (
-  `id` UUID NOT NULL DEFAULT UUID_v7(),
+  `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
   `external_id` VARCHAR(256) NOT NULL,
   `password` VARBINARY(2048) NOT NULL,
   PRIMARY KEY (`id`),
@@ -615,8 +610,8 @@ const fn jobs_creation_query() -> &'static str {
     formatcp!(
         r"
 CREATE TABLE IF NOT EXISTS `{JOBS_TABLE_NAME}` (
-  `id` UUID NOT NULL DEFAULT UUID_v7(),
-  `resource_group_id` UUID NOT NULL,
+  `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+  `resource_group_id` BIGINT UNSIGNED NOT NULL,
   `state` {state_enum} NOT NULL DEFAULT {default_state},
   `serialized_task_graph` LONGTEXT NOT NULL,
   `serialized_job_inputs` LONGBLOB NOT NULL,
@@ -642,7 +637,7 @@ const fn execution_managers_creation_query() -> &'static str {
     formatcp!(
         r"
 CREATE TABLE IF NOT EXISTS `{EXECUTION_MANAGERS_TABLE_NAME}` (
-  `id` UUID NOT NULL DEFAULT UUID_v7(),
+  `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
   `ip_address` VARCHAR(45) NOT NULL,
   `state` {state_enum} NOT NULL DEFAULT {default_state},
   `last_heartbeat_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
diff --git a/components/spider-storage/src/state/job_cache.rs b/components/spider-storage/src/state/job_cache.rs
index 6ad3c7ce..5ae5e40a 100644
--- a/components/spider-storage/src/state/job_cache.rs
+++ b/components/spider-storage/src/state/job_cache.rs
@@ -282,7 +282,7 @@ mod tests {
                 .expect("job submission should be valid");
         SharedJobControlBlock::create(
             job_id,
-            spider_core::types::id::ResourceGroupId::new(),
+            spider_core::types::id::ResourceGroupId::random(),
             job_submission,
             MockReadyQueueSender,
             MockDbConnector,
@@ -296,7 +296,7 @@ mod tests {
     async fn job_cache_insert_and_get() -> anyhow::Result<()> {
         let cache: JobCache<MockReadyQueueSender, MockDbConnector, MockTaskInstancePoolConnector> =
             JobCache::new();
-        let job_id = JobId::new();
+        let job_id = JobId::random();
 
         let jcb = create_test_jcb(job_id).await;
         cache.insert(jcb)?;
@@ -310,7 +310,7 @@ mod tests {
     async fn job_cache_remove_returns_inserted_jcb() -> anyhow::Result<()> {
         let cache: JobCache<MockReadyQueueSender, MockDbConnector, MockTaskInstancePoolConnector> =
             JobCache::new();
-        let job_id = JobId::new();
+        let job_id = JobId::random();
 
         let jcb = create_test_jcb(job_id).await;
         cache.insert(jcb)?;
@@ -327,7 +327,7 @@ mod tests {
     async fn job_cache_get_returns_none_for_nonexistent_job() -> anyhow::Result<()> {
         let cache: JobCache<MockReadyQueueSender, MockDbConnector, MockTaskInstancePoolConnector> =
             JobCache::new();
-        let job_id = JobId::new();
+        let job_id = JobId::random();
 
         let result = cache.get(job_id);
         assert!(
@@ -341,7 +341,7 @@ mod tests {
     async fn job_cache_insert_duplicate_returns_error() -> anyhow::Result<()> {
         let cache: JobCache<MockReadyQueueSender, MockDbConnector, MockTaskInstancePoolConnector> =
             JobCache::new();
-        let job_id = JobId::new();
+        let job_id = JobId::random();
 
         let jcb1 = create_test_jcb(job_id).await;
         cache.insert(jcb1)?;
@@ -372,7 +372,7 @@ mod tests {
         for i in 0..num_tasks {
             let cache = Arc::clone(&cache);
             tracker.spawn(async move {
-                let job_id = JobId::new();
+                let job_id = JobId::random();
                 let jcb = create_test_jcb(job_id).await;
                 cache
                     .insert(jcb)
@@ -456,13 +456,13 @@ mod tests {
             })
             .expect("task insertion should succeed");
 
-        let job_id = JobId::new();
+        let job_id = JobId::random();
         let job_submission =
             ValidatedJobSubmission::create(submitted, vec![TaskInput::ValuePayload(vec![0u8; 4])])
                 .expect("job submission should be valid");
         let jcb = SharedJobControlBlock::create(
             job_id,
-            spider_core::types::id::ResourceGroupId::new(),
+            spider_core::types::id::ResourceGroupId::random(),
             job_submission,
             sender,
             MockDbConnector,
diff --git a/components/spider-storage/src/task_instance_pool.rs b/components/spider-storage/src/task_instance_pool.rs
index bba0cf77..930271a2 100644
--- a/components/spider-storage/src/task_instance_pool.rs
+++ b/components/spider-storage/src/task_instance_pool.rs
@@ -683,8 +683,8 @@ mod tests {
     ) -> TaskInstanceMetadata {
         const SOFT_TIMEOUT_MS: Duration = Duration::from_millis(100);
         TaskInstanceMetadata {
-            resource_group_id: ResourceGroupId::new(),
-            job_id: JobId::new(),
+            resource_group_id: ResourceGroupId::random(),
+            job_id: JobId::random(),
             task_id,
             task_instance_id,
             execution_manager_id,
@@ -767,7 +767,7 @@ mod tests {
         let metadata = make_task_instance_metadata(
             TaskId::Index(0),
             task_instance_id,
-            ExecutionManagerId::new(),
+            ExecutionManagerId::random(),
             SystemTime::now(),
         );
         let job_id = metadata.job_id;
@@ -797,7 +797,7 @@ mod tests {
             Duration::from_mins(1),
             DEFAULT_CHANNEL_SIZE,
         );
-        let execution_manager_id = ExecutionManagerId::new();
+        let execution_manager_id = ExecutionManagerId::random();
 
         let tcb1 = build_single_task_tcb().await;
         let metadata1 = make_task_instance_metadata(
@@ -840,7 +840,7 @@ mod tests {
             liveness_store,
             Duration::from_mins(1),
         );
-        let em_id = ExecutionManagerId::new();
+        let em_id = ExecutionManagerId::random();
 
         // Create a few tasks and terminate them immediately.
         for i in 0..NUM_TASKS {
@@ -884,7 +884,7 @@ mod tests {
             liveness_store,
             Duration::from_mins(1),
         );
-        let em_id = ExecutionManagerId::new();
+        let em_id = ExecutionManagerId::random();
         let gc_starting_time = SystemTime::now();
         // soft_timeout_ddl = registered_at + 100ms
         // deadline = now - 900ms
@@ -942,7 +942,7 @@ mod tests {
             liveness_store.clone(),
             Duration::from_mins(1),
         );
-        let em_id = ExecutionManagerId::new();
+        let em_id = ExecutionManagerId::random();
         let now = SystemTime::now();
 
         let mut expected_messages: Vec<ReadyMessage> = Vec::new();
@@ -1000,7 +1000,7 @@ mod tests {
             liveness_store.clone(),
             Duration::from_mins(1),
         );
-        let em_id = ExecutionManagerId::new();
+        let em_id = ExecutionManagerId::random();
         let now = SystemTime::now();
 
         for i in 0..NUM_TASKS {
@@ -1058,8 +1058,8 @@ mod tests {
             liveness_store.clone(),
             Duration::from_mins(1),
         );
-        let alive_em = ExecutionManagerId::new();
-        let dead_em = ExecutionManagerId::new();
+        let alive_em = ExecutionManagerId::random();
+        let dead_em = ExecutionManagerId::random();
         let now = SystemTime::now();
         // soft timeout deadline = now - 900ms
         let elapsed_registration = now - Duration::from_secs(1);
diff --git a/components/spider-storage/tests/mariadb_infra.rs b/components/spider-storage/tests/mariadb_infra.rs
index ef26198c..0772ec04 100644
--- a/components/spider-storage/tests/mariadb_infra.rs
+++ b/components/spider-storage/tests/mariadb_infra.rs
@@ -47,7 +47,7 @@ pub async fn create_mariadb_connector() -> MariaDbStorageConnector {
 ///
 /// Panics if the resource group creation fails.
 pub async fn create_test_resource_group(storage: &MariaDbStorageConnector) -> ResourceGroupId {
-    let external_id = uuid::Uuid::new_v4().to_string();
+    let external_id = format!("test-resource-group-{}", rand::random::<u64>());
     storage
         .add(external_id, b"test-password".to_vec())
         .await
diff --git a/components/spider-storage/tests/mariadb_test.rs b/components/spider-storage/tests/mariadb_test.rs
index 3b90ab07..88343c82 100644
--- a/components/spider-storage/tests/mariadb_test.rs
+++ b/components/spider-storage/tests/mariadb_test.rs
@@ -80,7 +80,7 @@ async fn test_register_job() {
 #[ignore = "requires MariaDB"]
 async fn test_register_job_invalid_resource_group() {
     let storage = create_mariadb_connector().await;
-    let fake_rg_id = ResourceGroupId::new();
+    let fake_rg_id = ResourceGroupId::random();
     let (graph, inputs) = single_task_graph();
     let job_submission =
         ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid");
@@ -555,7 +555,7 @@ async fn test_delete_expired_terminated_jobs() {
 #[ignore = "requires MariaDB"]
 async fn test_add_duplicate_resource_group() {
     let storage = create_mariadb_connector().await;
-    let external_id = uuid::Uuid::new_v4().to_string();
+    let external_id = format!("test-resource-group-{}", rand::random::<u64>());
 
     storage
         .add(external_id.clone(), b"password".to_vec())
@@ -576,7 +576,7 @@ async fn test_verify_correct_password() {
 
     let rg_id = storage
         .add(
-            uuid::Uuid::new_v4().to_string(),
+            format!("test-resource-group-{}", rand::random::<u64>()),
             b"correct-password".to_vec(),
         )
         .await
@@ -595,7 +595,7 @@ async fn test_verify_wrong_password() {
 
     let rg_id = storage
         .add(
-            uuid::Uuid::new_v4().to_string(),
+            format!("test-resource-group-{}", rand::random::<u64>()),
             b"correct-password".to_vec(),
         )
         .await
@@ -612,7 +612,7 @@ async fn test_verify_wrong_password() {
 #[ignore = "requires MariaDB"]
 async fn test_verify_nonexistent_resource_group() {
     let storage = create_mariadb_connector().await;
-    let fake_rg_id = ResourceGroupId::new();
+    let fake_rg_id = ResourceGroupId::random();
 
     let result = storage.verify(fake_rg_id, b"password").await;
     assert!(
@@ -625,7 +625,7 @@ async fn test_verify_nonexistent_resource_group() {
 #[ignore = "requires MariaDB"]
 async fn test_start_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = storage.start(fake_job_id).await;
     assert!(
@@ -638,7 +638,7 @@ async fn test_start_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_set_state_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result =
         InternalJobOrchestration::set_state(&storage, fake_job_id, JobState::Running).await;
@@ -652,7 +652,7 @@ async fn test_set_state_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_get_state_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = storage.get_state(fake_job_id).await;
     assert!(
@@ -665,7 +665,7 @@ async fn test_get_state_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_get_outputs_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = storage.get_outputs(fake_job_id).await;
     assert!(
@@ -678,7 +678,7 @@ async fn test_get_outputs_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_get_error_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = storage.get_error(fake_job_id).await;
     assert!(
@@ -691,7 +691,7 @@ async fn test_get_error_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_commit_outputs_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result =
         InternalJobOrchestration::commit_outputs(&storage, fake_job_id, vec![vec![]], false).await;
@@ -705,7 +705,7 @@ async fn test_commit_outputs_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_cancel_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = InternalJobOrchestration::cancel(&storage, fake_job_id, false).await;
     assert!(
@@ -718,7 +718,7 @@ async fn test_cancel_job_not_found() {
 #[ignore = "requires MariaDB"]
 async fn test_fail_job_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_job_id = JobId::new();
+    let fake_job_id = JobId::random();
 
     let result = InternalJobOrchestration::fail(&storage, fake_job_id, "error".to_string()).await;
     assert!(
@@ -822,7 +822,7 @@ async fn test_update_execution_manager_heartbeat() {
 #[ignore = "requires MariaDB"]
 async fn test_update_execution_manager_heartbeat_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_em_id = ExecutionManagerId::new();
+    let fake_em_id = ExecutionManagerId::random();
 
     let result = storage.update_execution_manager_heartbeat(fake_em_id).await;
     assert!(
@@ -873,7 +873,7 @@ async fn test_is_execution_manager_alive_em_alive() {
 #[ignore = "requires MariaDB"]
 async fn test_is_execution_manager_alive_em_not_found() {
     let storage = create_mariadb_connector().await;
-    let fake_em_id = ExecutionManagerId::new();
+    let fake_em_id = ExecutionManagerId::random();
 
     let result = storage.is_execution_manager_alive(fake_em_id).await;
     assert!(
diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs
index 046a35eb..a089d66f 100644
--- a/components/spider-storage/tests/scheduling_infra.rs
+++ b/components/spider-storage/tests/scheduling_infra.rs
@@ -359,7 +359,7 @@ pub async fn run_workload<DbConnectorType: InternalJobOrchestration + 'static>(
     let ctx = EmContext {
         receiver: ready_receiver,
         jcb: jcb.clone(),
-        execution_manager_id: ExecutionManagerId::new(),
+        execution_manager_id: ExecutionManagerId::random(),
         terminal_state_sender: terminal_state_sender.clone(),
         done_receiver: done_receiver.clone(),
         seen_tasks: Arc::new(DashMap::new()),
@@ -374,7 +374,7 @@ pub async fn run_workload<DbConnectorType: InternalJobOrchestration + 'static>(
     let mut join_set = tokio::task::JoinSet::new();
     for _ in 0..NUM_EXECUTION_MANAGERS {
         let mut em_ctx = ctx.clone();
-        em_ctx.execution_manager_id = ExecutionManagerId::new();
+        em_ctx.execution_manager_id = ExecutionManagerId::random();
         join_set.spawn(async move { run_execution_manager(em_ctx).await });
     }
 
diff --git a/components/spider-tdl/src/task.rs b/components/spider-tdl/src/task.rs
index d4015e0c..7968f4b0 100644
--- a/components/spider-tdl/src/task.rs
+++ b/components/spider-tdl/src/task.rs
@@ -253,10 +253,10 @@ mod tests {
 
     fn make_encoded_ctx() -> Vec<u8> {
         let ctx = TaskContext {
-            job_id: JobId::new(),
+            job_id: JobId::random(),
             task_id: TaskId::Index(0),
             task_instance_id: 1,
-            resource_group_id: ResourceGroupId::new(),
+            resource_group_id: ResourceGroupId::random(),
         };
         rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`")
     }
diff --git a/components/spider-tdl/src/task_context.rs b/components/spider-tdl/src/task_context.rs
index d412bdb4..d79dea6d 100644
--- a/components/spider-tdl/src/task_context.rs
+++ b/components/spider-tdl/src/task_context.rs
@@ -30,10 +30,10 @@ mod tests {
     #[test]
     fn round_trip_msgpack() -> anyhow::Result<()> {
         let ctx = TaskContext {
-            job_id: JobId::new(),
+            job_id: JobId::random(),
             task_id: TaskId::Index(0),
             task_instance_id: 13,
-            resource_group_id: ResourceGroupId::new(),
+            resource_group_id: ResourceGroupId::random(),
         };
         let encoded = rmp_serde::to_vec(&ctx)?;
         let decoded: TaskContext = rmp_serde::from_slice(&encoded)?;
diff --git a/components/spider-tdl/tests/test_task_macro.rs b/components/spider-tdl/tests/test_task_macro.rs
index 9a891f19..59eb05ea 100644
--- a/components/spider-tdl/tests/test_task_macro.rs
+++ b/components/spider-tdl/tests/test_task_macro.rs
@@ -80,10 +80,10 @@ fn translate(_ctx: TaskContext, p: Point, dx: int32, dy: int32) -> Result<(Point
 /// A mocked encoded task context for testing.
 fn make_encoded_ctx() -> Vec<u8> {
     let ctx = TaskContext {
-        job_id: JobId::new(),
+        job_id: JobId::random(),
         task_id: TaskId::Index(0),
         task_instance_id: 1,
-        resource_group_id: ResourceGroupId::new(),
+        resource_group_id: ResourceGroupId::random(),
     };
     rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`")
 }
@@ -302,10 +302,10 @@ fn direct_execute_call_round_trips() -> anyhow::Result<()> {
     const EXPECTED_SUM: int32 = OPERAND_A + OPERAND_B;
 
     let ctx = TaskContext {
-        job_id: JobId::new(),
+        job_id: JobId::random(),
         task_id: TaskId::Index(0),
         task_instance_id: 1,
-        resource_group_id: ResourceGroupId::new(),
+        resource_group_id: ResourceGroupId::random(),
     };
 
     let mut inputs = TaskInputsSerializer::new();
diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs
index e646352f..367c0c3b 100644
--- a/tests/huntsman/task-executor/tests/test_process_pool.rs
+++ b/tests/huntsman/task-executor/tests/test_process_pool.rs
@@ -58,8 +58,8 @@ const SLOW_FIB_INDEX: u64 = 45;
 ///
 /// Panics if [`ProcessPool::new`] fails — i.e., the task-executor binary cannot be spawned.
 fn build_pool() -> ProcessPool {
-    let em_id = ExecutionManagerId::new();
-    let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{}", em_id.as_uuid_ref()));
+    let em_id = ExecutionManagerId::random();
+    let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{em_id}"));
     let config = ProcessPoolConfig {
         em_id,
         executor_binary_path: task_executor_bin(),
@@ -77,9 +77,9 @@ fn build_pool() -> ProcessPool {
 /// supplies `hard_timeout` directly to [`ProcessPool::execute`]), and the supplied `inputs`.
 fn make_request(task_func: &str, inputs: Vec<TaskInput>) -> ExecuteRequest {
     ExecuteRequest {
-        job_id: JobId::new(),
+        job_id: JobId::random(),
         task_id: TaskId::Index(0),
-        resource_group_id: ResourceGroupId::new(),
+        resource_group_id: ResourceGroupId::random(),
         ctx: ExecutionContext {
             task_instance_id: 1,
             tdl_context: TdlContext {
diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs
index 0e2bc7d5..09c90020 100644
--- a/tests/huntsman/tdl-integration/tests/complex.rs
+++ b/tests/huntsman/tdl-integration/tests/complex.rs
@@ -32,10 +32,10 @@ fn lib_path() -> std::path::PathBuf {
 /// An encoded task context for testing.
 fn encode_ctx() -> Vec<u8> {
     let ctx = TaskContext {
-        job_id: JobId::new(),
+        job_id: JobId::random(),
         task_id: TaskId::Index(0),
         task_instance_id: 1,
-        resource_group_id: ResourceGroupId::new(),
+        resource_group_id: ResourceGroupId::random(),
     };
     rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`")
 }
diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs
index 24f8db5f..37133bea 100644
--- a/tests/huntsman/test-utils/src/executor.rs
+++ b/tests/huntsman/test-utils/src/executor.rs
@@ -191,10 +191,10 @@ pub fn tdl_package_dir() -> PathBuf {
 #[must_use]
 pub fn build_ctx() -> Vec<u8> {
     let ctx = TaskContext {
-        job_id: JobId::new(),
+        job_id: JobId::random(),
         task_id: TaskId::Index(0),
         task_instance_id: 1,
-        resource_group_id: ResourceGroupId::new(),
+        resource_group_id: ResourceGroupId::random(),
     };
     rmp_serde::to_vec(&ctx).expect("serialize TaskContext")
 }
diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs
index 19122cbe..e9115759 100644
--- a/tests/huntsman/test-utils/src/mock.rs
+++ b/tests/huntsman/test-utils/src/mock.rs
@@ -54,7 +54,7 @@ impl MockLiveness {
     pub fn with_initial_session(initial_session: SessionId) -> Self {
         Self {
             inner: Arc::new(LivenessInner {
-                em_id: ExecutionManagerId::new(),
+                em_id: ExecutionManagerId::random(),
                 initial_session: AtomicU64::new(initial_session),
                 register_response: Mutex::new(None),
                 heartbeat_responses: Mutex::new(VecDeque::new()),

From e030af8387b74aa028acdf4f9f12dfa923736cb2 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Sun, 7 Jun 2026 15:07:09 -0400
Subject: [PATCH 07/14] feat(spider-execution-manager): Add the runtime that
 drives the main task-dispatch loop. (#329)

---
 Cargo.lock                                    |  11 +
 Cargo.toml                                    |   1 +
 .../src/client/scheduler.rs                   |   3 +-
 .../spider-execution-manager/src/lib.rs       |   1 +
 .../spider-execution-manager/src/runtime.rs   | 493 +++++++++++++++
 tests/huntsman/em-runtime/Cargo.toml          |  25 +
 tests/huntsman/em-runtime/src/lib.rs          |   4 +
 .../huntsman/em-runtime/tests/test_runtime.rs | 591 ++++++++++++++++++
 tests/huntsman/test-utils/src/mock.rs         | 280 ++++++++-
 9 files changed, 1407 insertions(+), 2 deletions(-)
 create mode 100644 components/spider-execution-manager/src/runtime.rs
 create mode 100644 tests/huntsman/em-runtime/Cargo.toml
 create mode 100644 tests/huntsman/em-runtime/src/lib.rs
 create mode 100644 tests/huntsman/em-runtime/tests/test_runtime.rs

diff --git a/Cargo.lock b/Cargo.lock
index d6ac6cd1..a6653baa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -289,6 +289,17 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "em-runtime-tests"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "spider-core",
+ "spider-execution-manager",
+ "test-utils",
+ "tokio",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.2"
diff --git a/Cargo.toml b/Cargo.toml
index 5eb18596..08d6f85b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ members = [
   "components/spider-tdl-derive",
   "examples/huntsman/complex/tasks",
   "examples/huntsman/complex/types",
+  "tests/huntsman/em-runtime",
   "tests/huntsman/integration-test-tasks",
   "tests/huntsman/task-executor",
   "tests/huntsman/tdl-integration",
diff --git a/components/spider-execution-manager/src/client/scheduler.rs b/components/spider-execution-manager/src/client/scheduler.rs
index cf13687a..c25312b6 100644
--- a/components/spider-execution-manager/src/client/scheduler.rs
+++ b/components/spider-execution-manager/src/client/scheduler.rs
@@ -3,7 +3,7 @@
 //! The execution manager acquires tasks from the scheduler through [`SchedulerClient`].
 
 use async_trait::async_trait;
-use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId};
+use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId, TaskId};
 
 /// A task assignment handed to the execution manager by the scheduler.
 ///
@@ -14,6 +14,7 @@ use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId};
 pub struct SchedulerResponse {
     pub job_id: JobId,
     pub task_id: TaskId,
+    pub resource_group_id: ResourceGroupId,
     pub session_id: SessionId,
 }
 
diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs
index 259fc8a9..20fffe17 100644
--- a/components/spider-execution-manager/src/lib.rs
+++ b/components/spider-execution-manager/src/lib.rs
@@ -4,3 +4,4 @@
 pub mod client;
 pub mod liveness;
 pub mod process_pool;
+pub mod runtime;
diff --git a/components/spider-execution-manager/src/runtime.rs b/components/spider-execution-manager/src/runtime.rs
new file mode 100644
index 00000000..71dd95ca
--- /dev/null
+++ b/components/spider-execution-manager/src/runtime.rs
@@ -0,0 +1,493 @@
+//! Runtime — the execution manager's main loop.
+
+use std::{net::IpAddr, path::PathBuf, sync::Arc, time::Duration};
+
+use spider_core::{
+    session::SessionTracker,
+    types::{
+        id::{ExecutionManagerId, JobId, SessionId, TaskId},
+        io::ExecutionContext,
+    },
+};
+use tokio::task::JoinHandle;
+use tokio_util::sync::{CancellationToken, DropGuard};
+
+use crate::{
+    client::{
+        LivenessClient,
+        LivenessResponseError,
+        SchedulerClient,
+        SchedulerResponse,
+        StorageClient,
+        StorageResponseError,
+    },
+    liveness::{self, LivenessHandle},
+    process_pool::{self, ExecuteRequest, Outcome, ProcessPool, ProcessPoolConfig},
+};
+
+/// Static configuration for a [`Runtime`]. Consumed by value in [`Runtime::create`].
+#[derive(Debug, Clone)]
+pub struct RuntimeConfig {
+    /// IP address passed to [`LivenessClient::register`] when the runtime registers at boot.
+    pub em_ip: IpAddr,
+
+    /// Interval between liveness heartbeats. Handed verbatim to [`liveness::spawn`].
+    pub heartbeat_interval: Duration,
+
+    /// Absolute path to the `spider-task-executor` binary the process pool spawns.
+    pub executor_binary_path: PathBuf,
+
+    /// Directory of TDL packages exposed to executors via `SPIDER_TDL_PACKAGE_DIR`.
+    pub package_dir: PathBuf,
+
+    /// Directory the process pool writes per-executor stderr logs into.
+    pub log_dir: PathBuf,
+}
+
+/// Errors returned by [`Runtime`] during bootstrap or the main loop.
+#[derive(Debug, thiserror::Error)]
+pub enum RuntimeError {
+    /// Boot-time registration with storage failed.
+    #[error("failed to register with storage: {0}")]
+    Registration(#[from] LivenessResponseError),
+
+    /// The process pool failed, either at creation or while dispatching a task in the main loop.
+    #[error("process pool failure: {0}")]
+    ProcessPool(#[from] process_pool::InternalError),
+
+    /// Storage rejected a request as malformed. Indicates a contract bug in the runtime, not a
+    /// transient condition, so the runtime treats it as fatal.
+    #[error("storage rejected request as invalid: {0}")]
+    StorageInvalidInput(String),
+}
+
+/// The execution manager runtime: the main loop plus all the state it owns.
+///
+/// # Type Parameters
+///
+/// * `SchedulerClientType` - Concrete [`SchedulerClient`] the main loop pulls task assignments
+///   from.
+/// * `StorageClientType` - Concrete [`StorageClient`] used to register task instances and report
+///   their outcome.
+pub struct Runtime<
+    SchedulerClientType: SchedulerClient + Clone,
+    StorageClientType: StorageClient + Clone + 'static,
+> {
+    em_id: ExecutionManagerId, // Identity returned by registration.
+    scheduler_client: SchedulerClientType,
+    storage_client: StorageClientType, // Cloned into each detached outcome-report task.
+    process_pool: ProcessPool,
+    session_tracker: SessionTracker, // A clone is passed to `liveness::spawn`.
+    liveness_handle: LivenessHandle,
+    liveness_join: JoinHandle<()>, // Awaited by `run` during shutdown.
+    cancellation_token: CancellationToken, // Same token that `create` returns to the caller.
+    _cancel_guard: DropGuard, // Cancels the token if the runtime is dropped.
+}
+
+impl<
+    SchedulerClientType: SchedulerClient + Clone,
+    StorageClientType: StorageClient + Clone + 'static,
+> Runtime<SchedulerClientType, StorageClientType>
+{
+    /// Factory function.
+    ///
+    /// Registers the execution manager with storage, seeds the [`SessionTracker`] with the session
+    /// ID returned by registration, creates the executor [`ProcessPool`], and spawns the liveness
+    /// actor. This function does not wait for a heartbeat: the actor is only spawned here, so its
+    /// first heartbeat may still be pending when this returns.
+    ///
+    /// # Type Parameters
+    ///
+    /// * `LivenessClientType` - Concrete [`LivenessClient`] used to register at boot and, through
+    ///   the spawned liveness actor, heartbeat thereafter.
+    ///
+    /// # Returns
+    ///
+    /// A tuple on success, containing:
+    ///
+    /// * The created [`Runtime`] instance, ready to run.
+    /// * The [`CancellationToken`] that the caller can use to request shutdown.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * Forwards [`LivenessClient::register`]'s return values on failure.
+    /// * Forwards [`ProcessPool::new`]'s return values on failure.
+    pub async fn create<LivenessClientType: LivenessClient + 'static>(
+        scheduler_client: SchedulerClientType,
+        storage_client: StorageClientType,
+        liveness_client: Arc<LivenessClientType>,
+        config: RuntimeConfig,
+    ) -> Result<(Self, CancellationToken), RuntimeError> {
+        let registration = liveness_client.register(config.em_ip).await?;
+        let em_id = registration.em_id;
+        let session_tracker = SessionTracker::new(registration.session_id);
+        tracing::info!(
+            em_id = ? em_id,
+            session_id = registration.session_id,
+            "Execution manager registered with storage."
+        );
+
+        let process_pool = ProcessPool::new(ProcessPoolConfig {
+            em_id,
+            executor_binary_path: config.executor_binary_path,
+            package_dir: config.package_dir,
+            log_dir: config.log_dir,
+        })?; // NOTE(review): on failure the EM stays registered; no cleanup is visible here.
+
+        let cancellation_token = CancellationToken::new();
+        let (liveness_handle, liveness_join) = liveness::spawn(
+            em_id,
+            liveness_client,
+            session_tracker.clone(),
+            cancellation_token.clone(),
+            config.heartbeat_interval,
+        );
+
+        let cancel_guard = cancellation_token.clone().drop_guard(); // Cancels on runtime drop.
+        let runtime = Self {
+            em_id,
+            scheduler_client,
+            storage_client,
+            process_pool,
+            session_tracker,
+            liveness_handle,
+            liveness_join,
+            cancellation_token: cancellation_token.clone(),
+            _cancel_guard: cancel_guard,
+        };
+        Ok((runtime, cancellation_token))
+    }
+
+    /// Drives the main loop to completion, cancels the token, and joins the liveness actor task.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(())` if the main loop exited because of cancellation.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * Forwards [`Self::main_loop`]'s return values on failure.
+    pub async fn run(self) -> Result<(), RuntimeError> {
+        tracing::info!(em_id = ? self.em_id, "Runtime main loop starting.");
+        let loop_result = self.main_loop().await;
+        tracing::info!(em_id = ? self.em_id, "Runtime main loop exited. Shutting down.");
+        self.cancellation_token.cancel();
+        let _ = self.liveness_join.await.inspect_err(|err| {
+            tracing::warn!(err = ? err, "Liveness actor task did not exit cleanly.");
+        });
+        loop_result
+    }
+
+    /// Iterates the main loop. Each iteration pulls one assignment from the scheduler, drops it if
+    /// its session is stale, registers it with storage, executes it on the process pool, and
+    /// spawns a detached outcome report. Returns on cancellation or a fatal error.
+    ///
+    /// # Returns
+    ///
+    /// `Ok(())` when the loop exits cleanly because the runtime was cancelled.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * Forwards [`Self::register_task_instance`]'s return values on failure.
+    /// * Forwards [`ProcessPool::execute`]'s return values on failure.
+    async fn main_loop(&self) -> Result<(), RuntimeError> {
+        loop {
+            let assignment = tokio::select! {
+                biased;
+                () = self.cancellation_token.cancelled() => return Ok(()),
+                result = self.scheduler_client.next_task(self.em_id) => {
+                    match result {
+                        Ok(assignment) => assignment,
+                        Err(e) => {
+                            tracing::warn!(err = ? e, "Scheduler returned an error. Retrying.");
+                            continue; // NOTE(review): retried immediately, with no backoff.
+                        }
+                    }
+                }
+            };
+
+            tracing::info!(
+                bundle_session = assignment.session_id,
+                job_id = ? assignment.job_id,
+                task_id = ? assignment.task_id,
+                "Received a new task assignment from the scheduler."
+            );
+
+            let current_session = self.session_tracker.current();
+            if assignment.session_id < current_session {
+                tracing::warn!(
+                    bundle_session = assignment.session_id,
+                    current_session,
+                    job_id = ? assignment.job_id,
+                    task_id = ? assignment.task_id,
+                    "Dropping stale task assignment from the scheduler."
+                );
+                continue;
+            }
+            if assignment.session_id > current_session {
+                tracing::info!(
+                    new_session = assignment.session_id,
+                    "Observed a newer session via the scheduler. Refreshing liveness."
+                );
+                self.liveness_handle.refresh().await;
+            }
+
+            let Some(execution_context) = self.register_task_instance(assignment).await? else {
+                continue;
+            };
+
+            let hard_timeout =
+                Duration::from_millis(execution_context.timeout_policy.hard_timeout_ms);
+            let request = ExecuteRequest {
+                job_id: assignment.job_id,
+                task_id: assignment.task_id,
+                resource_group_id: assignment.resource_group_id,
+                ctx: execution_context,
+            };
+            let outcome = self
+                .process_pool
+                .execute(request, hard_timeout)
+                .await // NOTE(review): not raced against the cancellation token.
+                .inspect_err(|err| {
+                    tracing::error!(
+                        err = ? err,
+                        job_id = ? assignment.job_id,
+                        task_id = ? assignment.task_id,
+                        "Process pool failed to dispatch task. Bailing out."
+                    );
+                })?;
+
+            let current_session = self.session_tracker.current(); // Re-read: may have advanced.
+            if assignment.session_id < current_session {
+                tracing::warn!(
+                    bundle_session = assignment.session_id,
+                    current_session,
+                    job_id = ? assignment.job_id,
+                    task_id = ? assignment.task_id,
+                    "Dropping stale task assignment's outcome."
+                );
+                continue;
+            }
+
+            // Report the outcome from a detached task so the loop can pull the next assignment
+            // without waiting on storage. Failures are logged inside `report_outcome`.
+            tokio::spawn(report_outcome(
+                self.storage_client.clone(),
+                ReportTarget {
+                    em: self.em_id,
+                    job: assignment.job_id,
+                    task: assignment.task_id,
+                    session: assignment.session_id,
+                },
+                outcome,
+            ));
+        }
+    }
+
+    /// Registers a task instance with storage.
+    ///
+    /// Races the storage call against [`Self::cancellation_token`]: when it fires, the method
+    /// returns `Ok(None)` and the next [`Self::main_loop`] iteration observes the token via its
+    /// top-level [`tokio::select!`] and exits.
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(Some(execution_context))` if storage accepted the registration.
+    /// * `Ok(None)` if the assignment should be skipped (stale session, which also triggers a
+    ///   liveness refresh, any other non-fatal storage error, or cancellation mid-call).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`RuntimeError::StorageInvalidInput`] if storage rejects the request as malformed, which
+    ///   the runtime treats as fatal.
+    async fn register_task_instance(
+        &self,
+        assignment: SchedulerResponse,
+    ) -> Result<Option<ExecutionContext>, RuntimeError> {
+        let register_result = tokio::select! {
+            biased;
+            () = self.cancellation_token.cancelled() => return Ok(None),
+            result = self.storage_client.register_task_instance(
+                assignment.job_id,
+                assignment.task_id,
+                self.em_id,
+                assignment.session_id,
+            ) => result,
+        };
+
+        match register_result {
+            Ok(execution_context) => Ok(Some(execution_context)),
+            Err(StorageResponseError::StaleSession { storage_session }) => {
+                tracing::warn!(
+                    bundle_session = assignment.session_id,
+                    storage_session = storage_session,
+                    job_id = ? assignment.job_id,
+                    task_id = ? assignment.task_id,
+                    "Storage rejected task registration as stale. Dropping the assignment."
+                );
+                self.liveness_handle.refresh().await; // NOTE(review): not cancellation-aware.
+                Ok(None)
+            }
+            Err(StorageResponseError::InvalidInput(err)) => {
+                tracing::error!(
+                    err = % err,
+                    job_id = ? assignment.job_id,
+                    task_id = ? assignment.task_id,
+                    "Storage rejected task registration as malformed. Bailing out."
+                );
+                Err(RuntimeError::StorageInvalidInput(err))
+            }
+            Err(err) => { // Every remaining error kind is treated as recoverable.
+                tracing::warn!(
+                    err = ? err,
+                    job_id = ? assignment.job_id,
+                    task_id = ? assignment.task_id,
+                    "Storage rejected task registration. Dropping the assignment."
+                );
+                Ok(None)
+            }
+        }
+    }
+}
+
+/// Identifies the task-instance attempt an outcome report belongs to; built by `main_loop`.
+#[derive(Debug, Clone, Copy)]
+struct ReportTarget {
+    em: ExecutionManagerId, // The runtime's own id.
+    job: JobId,
+    task: TaskId,
+    session: SessionId, // Session id the scheduler attached to the assignment.
+}
+
+/// A task outcome prepared for transmission to storage. Splits the storage API's two reporting
+/// endpoints (success / failure) and carries their payloads.
+enum Report {
+    Success(Option<Vec<u8>>), // Task output bytes; `from_outcome` always fills in `Some`.
+    Failure(String), // Human-readable failure message built by `from_outcome`.
+}
+
+impl Report {
+    /// # Returns
+    ///
+    /// The [`Report`] for `outcome`, after logging it: `Success` or a `Failure` with a message.
+    fn from_outcome(outcome: Outcome, target: ReportTarget) -> Self {
+        match outcome {
+            Outcome::Success {
+                outputs,
+                elapsed_us,
+            } => {
+                tracing::info!(
+                    job_id = ? target.job,
+                    task_id = ? target.task,
+                    elapsed_us,
+                    "Task completed successfully."
+                );
+                Self::Success(Some(outputs))
+            }
+            Outcome::InTaskFailure { error, elapsed_us } => {
+                tracing::info!(
+                    job_id = ? target.job,
+                    task_id = ? target.task,
+                    elapsed_us,
+                    "Task reported an in-task failure."
+                );
+                Self::Failure(format!(
+                    "in-task failure: {}",
+                    String::from_utf8_lossy(&error) // Invalid UTF-8 is replaced, not rejected.
+                ))
+            }
+            Outcome::Timeout { hard_timeout } => {
+                tracing::warn!(
+                    job_id = ? target.job,
+                    task_id = ? target.task,
+                    hard_timeout_ms = ?hard_timeout.as_millis(),
+                    "Task hit the hard timeout."
+                );
+                Self::Failure(format!(
+                    "hard timeout ({} ms) exceeded",
+                    hard_timeout.as_millis()
+                ))
+            }
+            Outcome::ExecutorCrash { exit_status } => {
+                tracing::warn!(
+                    job_id = ? target.job,
+                    task_id = ? target.task,
+                    exit_status = ?exit_status,
+                    "Task executor crashed."
+                );
+                Self::Failure(format!("executor crashed (exit_status = {exit_status:?})"))
+            }
+        }
+    }
+
+    /// Delivers this report to storage, choosing the endpoint that matches its variant.
+    ///
+    /// # Type Parameters
+    ///
+    /// * `StorageClientType` - Concrete [`StorageClient`] that carries the report.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * Forwards [`StorageClient::report_task_success`]'s return values on failure.
+    /// * Forwards [`StorageClient::report_task_failure`]'s return values on failure.
+    async fn send<StorageClientType: StorageClient>(
+        self,
+        storage_client: &StorageClientType,
+        target: ReportTarget,
+    ) -> Result<(), StorageResponseError> {
+        // Both reporting endpoints take the identifiers in the same order: job, task, execution
+        // manager, session. Only the trailing payload differs between them.
+        let job_id = target.job;
+        let task_id = target.task;
+        let em_id = target.em;
+        let session_id = target.session;
+        match self {
+            Self::Success(task_outputs) => {
+                storage_client
+                    .report_task_success(job_id, task_id, em_id, session_id, task_outputs)
+                    .await
+            }
+            Self::Failure(reason) => {
+                storage_client
+                    .report_task_failure(job_id, task_id, em_id, session_id, reason)
+                    .await
+            }
+        }
+    }
+}
+
+/// Converts one task outcome into a [`Report`] and sends it to storage. Meant to be spawned as a
+/// detached background task by [`Runtime::main_loop`], so that reporting overlaps with dispatching
+/// the next task; a failed send is logged here and never propagated.
+///
+/// # Type Parameters
+///
+/// * `StorageClientType` - Concrete [`StorageClient`] that carries the report.
+async fn report_outcome<StorageClientType: StorageClient + 'static>(
+    storage_client: StorageClientType,
+    target: ReportTarget,
+    outcome: Outcome,
+) {
+    let send_result = Report::from_outcome(outcome, target)
+        .send(&storage_client, target)
+        .await;
+    // Best effort: a report that storage does not accept is logged and then dropped.
+    if let Err(err) = send_result {
+        tracing::error!(
+            err = ? err,
+            job_id = ? target.job,
+            task_id = ? target.task,
+            "Failed to report task outcome to storage. Dropping the report."
+        );
+    }
+}
diff --git a/tests/huntsman/em-runtime/Cargo.toml b/tests/huntsman/em-runtime/Cargo.toml
new file mode 100644
index 00000000..55489dc3
--- /dev/null
+++ b/tests/huntsman/em-runtime/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "em-runtime-tests"
+version = "0.1.0"
+edition = "2024"
+publish = false
+
+[lib]
+name = "em_runtime_tests"
+path = "src/lib.rs"
+
+[[test]]
+name = "runtime"
+path = "tests/test_runtime.rs"
+
+[dev-dependencies]
+anyhow = "1.0.98"
+spider-core = { path = "../../../components/spider-core" }
+spider-execution-manager = {
+  path = "../../../components/spider-execution-manager"
+}
+test-utils = { path = "../test-utils" }
+tokio = {
+  version = "1.50.0",
+  features = ["macros", "rt", "rt-multi-thread", "time"]
+}
diff --git a/tests/huntsman/em-runtime/src/lib.rs b/tests/huntsman/em-runtime/src/lib.rs
new file mode 100644
index 00000000..f0a43541
--- /dev/null
+++ b/tests/huntsman/em-runtime/src/lib.rs
@@ -0,0 +1,4 @@
+//! Workspace member that hosts cross-crate integration tests for the execution manager runtime.
+//!
+//! Tests live under `tests/`; the shared mocks and helpers live in the `test-utils` crate. The
+//! library itself is intentionally empty.
diff --git a/tests/huntsman/em-runtime/tests/test_runtime.rs b/tests/huntsman/em-runtime/tests/test_runtime.rs
new file mode 100644
index 00000000..89e9229f
--- /dev/null
+++ b/tests/huntsman/em-runtime/tests/test_runtime.rs
@@ -0,0 +1,591 @@
+//! Integration tests for [`spider_execution_manager::runtime::Runtime`].
+//!
+//! Each test wires up the runtime with the in-process mocks from `test_utils` plus a real
+//! `spider-task-executor` binary spawned by the runtime's owned process pool. The binary path and
+//! the TDL package staging directory are read from the same env vars the rest of the huntsman
+//! integration suite uses (`SPIDER_TASK_EXECUTOR_BIN`, `SPIDER_TDL_PACKAGE_DIR`).
+//!
+//! All tests are `#[ignore]` so the workspace's plain `cargo test` doesn't run them.
+
+use std::{path::PathBuf, sync::Arc, time::Duration};
+
+use anyhow::Context;
+use spider_core::{
+    task::{TdlContext, TimeoutPolicy},
+    types::{
+        id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId, TaskId},
+        io::{ExecutionContext, TaskInput},
+    },
+};
+use spider_execution_manager::{
+    client::{SchedulerError, SchedulerResponse, StorageResponseError},
+    runtime::{Runtime, RuntimeConfig, RuntimeError},
+};
+use test_utils::{
+    MockLiveness,
+    MockScheduler,
+    MockStorage,
+    PACKAGE_NAME,
+    decode_single_output,
+    single_input,
+    task_executor_bin,
+    tdl_package_dir,
+};
+
+const HEARTBEAT_INTERVAL: Duration = Duration::from_millis(100);
+const SLOW_HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
+const BOUNDED_WAIT: Duration = Duration::from_secs(2);
+const TIGHT_WAIT: Duration = Duration::from_millis(500);
+
+/// Builds a [`SchedulerResponse`] tagged with `session_id`; the other ids are random or fixed.
+///
+/// # Returns
+///
+/// A scheduler assignment with random `job_id` and `resource_group_id`, the fixed `task_id`
+/// `TaskId::Index(0)`, and the requested `session_id`.
+fn assignment_with_session(session_id: SessionId) -> SchedulerResponse {
+    SchedulerResponse {
+        job_id: JobId::random(),
+        task_id: TaskId::Index(0),
+        resource_group_id: ResourceGroupId::random(),
+        session_id,
+    }
+}
+
+/// Builds an [`ExecutionContext`] pointing at `task_func` in the integration package with the
+/// given inputs. Uses a generous hard timeout so well-behaved tasks always finish before the
+/// process pool kills them.
+///
+/// # Returns
+///
+/// A populated [`ExecutionContext`] suitable for handing to the runtime via
+/// [`MockStorage::push_register_response`].
+fn execution_context(task_func: &str, inputs: Vec<TaskInput>) -> ExecutionContext {
+    ExecutionContext {
+        task_instance_id: 1, // Fixed: every context built here shares this id.
+        tdl_context: TdlContext {
+            package: PACKAGE_NAME.to_owned(),
+            task_func: task_func.to_owned(),
+        },
+        timeout_policy: TimeoutPolicy {
+            soft_timeout_ms: 1_000,
+            hard_timeout_ms: 5_000, // The runtime hands this to the process pool as the timeout.
+        },
+        inputs,
+    }
+}
+
+/// Re-evaluates `predicate` at 5 ms intervals until it holds or `timeout` has passed.
+///
+/// # Returns
+///
+/// `true` if `predicate` held on some attempt, `false` if the deadline passed first.
+async fn wait_until(predicate: impl Fn() -> bool, timeout: Duration) -> bool {
+    let give_up_at = tokio::time::Instant::now() + timeout;
+    loop {
+        let satisfied = predicate();
+        if satisfied || tokio::time::Instant::now() >= give_up_at {
+            return satisfied;
+        }
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+}
+
+/// Builds a fresh [`RuntimeConfig`] pointing at the real executor binary, with a log directory
+/// under the system temp dir named after a random id, and the requested `heartbeat_interval`.
+///
+/// # Returns
+///
+/// A [`RuntimeConfig`] ready to hand to [`Runtime::create`].
+///
+/// # Panics
+///
+/// Panics if the hard-coded loopback ip fails to parse — never in practice.
+fn runtime_config(heartbeat_interval: Duration) -> RuntimeConfig {
+    let unique = ExecutionManagerId::random();
+    let log_dir = std::env::temp_dir().join(format!("spider-em-runtime-test-{unique}"));
+    RuntimeConfig {
+        em_ip: "127.0.0.1".parse().expect("parse loopback"),
+        heartbeat_interval,
+        executor_binary_path: task_executor_bin(),
+        package_dir: tdl_package_dir(),
+        log_dir, // NOTE(review): nothing visible in this file removes the directory afterwards.
+    }
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn create_registers_and_starts_heartbeats() -> anyhow::Result<()> {
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::new();
+
+    let (_runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness.clone()),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    assert_eq!(liveness.register_calls().len(), 1);
+    assert!(
+        liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await,
+        "liveness actor should send at least one heartbeat after create returns; observed {} so \
+         far",
+        liveness.heartbeat_count()
+    );
+
+    token.cancel(); // `_runtime` stayed bound so its drop guard could not cancel earlier.
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn create_propagates_pool_init_error() {
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::new();
+    let bad_config = RuntimeConfig { // Valid config except for the executor path.
+        executor_binary_path: PathBuf::from("/nonexistent/spider-task-executor"),
+        ..runtime_config(HEARTBEAT_INTERVAL)
+    };
+
+    let result = Runtime::create(scheduler, storage, Arc::new(liveness), bad_config).await;
+    match result {
+        Err(RuntimeError::ProcessPool(_)) => {}
+        Err(other) => panic!("expected ProcessPool error, got {other:?}"),
+        Ok(_) => panic!("expected ProcessPool error, got Ok"),
+    }
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn external_cancellation_returns_ok() -> anyhow::Result<()> {
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::new();
+
+    let (runtime, token) = Runtime::create(
+        scheduler,
+        storage,
+        Arc::new(liveness.clone()),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    let join = tokio::spawn(runtime.run());
+    // Wait for a heartbeat first so the liveness actor is known to be running before cancelling.
+    assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await);
+
+    token.cancel();
+    let result = tokio::time::timeout(BOUNDED_WAIT, join)
+        .await
+        .context("run did not return within bounded time")?
+        .context("run task panicked")?;
+    assert!(matches!(result, Ok(())), "expected Ok(()), got {result:?}");
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn scheduler_error_is_retried() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    // The first poll errors; the loop should log it and poll again rather than bail. The second
+    // poll returns a real assignment, which mock storage rejects so that no task is executed.
+    scheduler.push(Err(SchedulerError::Transport("boom".to_owned())));
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned())));
+
+    let (runtime, token) = Runtime::create(
+        scheduler,
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    let join = tokio::spawn(runtime.run());
+
+    // A register call can only follow the second poll, so seeing one proves the loop retried.
+    assert!(
+        wait_until(|| !storage.register_calls().is_empty(), BOUNDED_WAIT).await,
+        "expected the loop to retry past the scheduler error and register the next assignment"
+    );
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn stale_bundle_is_dropped_without_register() -> anyhow::Result<()> {
+    const CURRENT_SESSION: SessionId = 10;
+    const STALE_SESSION: SessionId = 5;
+    const { assert!(CURRENT_SESSION > STALE_SESSION) }; // Compile-time fixture check.
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(CURRENT_SESSION);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    scheduler.push(Ok(assignment_with_session(STALE_SESSION)));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(
+        wait_until(|| scheduler.call_count() >= 2, BOUNDED_WAIT).await,
+        "expected scheduler to be polled again after dropping stale bundle; call_count = {}",
+        scheduler.call_count()
+    );
+    assert!(
+        storage.register_calls().is_empty(),
+        "storage should not be touched for a stale bundle"
+    );
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn newer_bundle_triggers_liveness_refresh() -> anyhow::Result<()> {
+    const CURRENT_SESSION: SessionId = 5;
+    const LATEST_SESSION: SessionId = 10;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(CURRENT_SESSION);
+
+    // Slow interval so the second observed heartbeat can only be the refresh-induced one: the
+    // next periodic tick is 5 s away, far beyond `TIGHT_WAIT`.
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness.clone()),
+        runtime_config(SLOW_HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // Wait for the periodic-interval's leading tick to settle so the count is a clean baseline.
+    assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await);
+    let baseline = liveness.heartbeat_count();
+
+    // The newer-session bundle: the runtime should call `LivenessHandle::refresh` before
+    // registering. Mock storage then rejects the registration, so no task is executed.
+    scheduler.push(Ok(assignment_with_session(LATEST_SESSION)));
+    storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned())));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(
+        liveness.wait_for_heartbeats(baseline + 1, TIGHT_WAIT).await,
+        "expected an extra heartbeat (refresh) within {TIGHT_WAIT:?}; heartbeats = {}",
+        liveness.heartbeat_count()
+    );
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn equal_session_passes_through_to_register() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // Bundle session equals the tracker's, so neither the stale-drop nor the refresh branch
+    // applies and register is called directly. Mock storage rejects it, so nothing executes.
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned())));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(
+        wait_until(|| !storage.register_calls().is_empty(), BOUNDED_WAIT).await,
+        "expected register_task_instance to be called with the bundle's session id"
+    );
+    let calls = storage.register_calls();
+    assert_eq!(calls.len(), 1);
+    assert_eq!(calls[0].session_id, SESSION_ID);
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn stale_session_drops_assignment_and_refreshes() -> anyhow::Result<()> {
+    const CURRENT_SESSION: SessionId = 10;
+    const STALE_SESSION: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(STALE_SESSION);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness.clone()),
+        runtime_config(SLOW_HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await);
+    let baseline = liveness.heartbeat_count();
+
+    scheduler.push(Ok(assignment_with_session(STALE_SESSION)));
+    storage.push_register_response(Err(StorageResponseError::StaleSession {
+        storage_session: CURRENT_SESSION,
+    }));
+    let join = tokio::spawn(runtime.run());
+
+    // Stale-session response triggers liveness refresh and drops the assignment.
+    assert!(
+        liveness.wait_for_heartbeats(baseline + 1, TIGHT_WAIT).await,
+        "expected refresh-induced heartbeat after StaleSession; heartbeats = {}",
+        liveness.heartbeat_count()
+    );
+    assert!(
+        wait_until(|| scheduler.call_count() >= 2, BOUNDED_WAIT).await,
+        "expected scheduler to be polled again after stale assignment was dropped"
+    );
+    assert_eq!(storage.register_calls().len(), 1);
+    assert!(storage.success_reports().is_empty());
+    assert!(storage.failure_reports().is_empty());
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn recoverable_storage_errors_drop_assignment() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // Three bundles, three recoverable register failures. Each one should cause the loop to drop
+    // the assignment and poll the scheduler again.
+    let recoverable_errors = [
+        StorageResponseError::Transport("net blip".to_owned()),
+        StorageResponseError::CacheStale("stale cache".to_owned()),
+        StorageResponseError::Server("server boom".to_owned()),
+    ];
+    for err in recoverable_errors {
+        scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+        storage.push_register_response(Err(err));
+    }
+    let join = tokio::spawn(runtime.run());
+
+    // After all three are drained, the next scheduler call blocks because the queue is empty.
+    assert!(
+        wait_until(|| scheduler.call_count() >= 4, BOUNDED_WAIT).await,
+        "expected 3 drops + 1 idle poll; call_count = {}",
+        scheduler.call_count()
+    );
+    assert_eq!(storage.register_calls().len(), 3);
+    assert!(storage.success_reports().is_empty());
+    assert!(storage.failure_reports().is_empty());
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn success_outcome_reports_outputs() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness.clone()),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+    let em_id = liveness.em_id();
+
+    let assignment = assignment_with_session(SESSION_ID);
+    scheduler.push(Ok(assignment));
+    storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64))));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(storage.wait_for_any_report(BOUNDED_WAIT).await);
+    let reports = storage.success_reports();
+    assert_eq!(reports.len(), 1);
+    let report = &reports[0];
+    assert_eq!(report.job_id, assignment.job_id);
+    assert_eq!(report.task_id, assignment.task_id);
+    assert_eq!(report.em_id, em_id);
+    assert_eq!(report.session_id, SESSION_ID);
+    let outputs = report
+        .serialized_outputs
+        .as_ref()
+        .context("success report should carry outputs")?;
+    assert_eq!(decode_single_output::<u64>(outputs), 55);
+    assert!(storage.failure_reports().is_empty());
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn non_success_outcome_keeps_loop_serving() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // First bundle: always_fail. Second bundle: fibonacci. If the loop bails after a failure
+    // outcome, the second bundle never reaches register / report.
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Ok(execution_context("always_fail", vec![])));
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64))));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(
+        wait_until(
+            || !storage.failure_reports().is_empty() && !storage.success_reports().is_empty(),
+            BOUNDED_WAIT,
+        )
+        .await,
+        "expected one failure (always_fail) and one success (fibonacci) report; got success={} \
+         failure={}",
+        storage.success_reports().len(),
+        storage.failure_reports().len()
+    );
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn storage_report_error_does_not_kill_runtime() -> anyhow::Result<()> {
+    const SESSION_ID: SessionId = 5;
+
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::with_initial_session(SESSION_ID);
+
+    let (runtime, token) = Runtime::create(
+        scheduler.clone(),
+        storage.clone(),
+        Arc::new(liveness),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // The first success report fails, the second succeeds. The runtime should keep serving
+    // assignments either way.
+    storage.push_success_response(Err(StorageResponseError::Server("report boom".to_owned())));
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64))));
+    scheduler.push(Ok(assignment_with_session(SESSION_ID)));
+    storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64))));
+    let join = tokio::spawn(runtime.run());
+
+    assert!(
+        wait_until(|| storage.success_reports().len() >= 2, BOUNDED_WAIT).await,
+        "expected two success reports; got {}",
+        storage.success_reports().len()
+    );
+
+    token.cancel();
+    join.await??;
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"]
+async fn drop_guard_cancels_token_when_run_future_dropped() -> anyhow::Result<()> {
+    let scheduler = MockScheduler::new();
+    let storage = MockStorage::new();
+    let liveness = MockLiveness::new();
+
+    let (runtime, _token) = Runtime::create(
+        scheduler,
+        storage,
+        Arc::new(liveness.clone()),
+        runtime_config(HEARTBEAT_INTERVAL),
+    )
+    .await?;
+
+    // Make sure the actor is actively ticking before we drop the runtime.
+    assert!(liveness.wait_for_heartbeats(2, BOUNDED_WAIT).await);
+
+    // Dropping the `runtime.run()` future inside a short timeout drops the Runtime itself, which
+    // fires the `DropGuard` and cancels the token the liveness actor watches.
+    let timeout_result = tokio::time::timeout(Duration::from_millis(150), runtime.run()).await;
+    assert!(
+        timeout_result.is_err(),
+        "run unexpectedly returned within the timeout window: {timeout_result:?}"
+    );
+
+    // Give the actor a moment to observe cancellation and drain any in-flight heartbeat call.
+    tokio::time::sleep(2 * HEARTBEAT_INTERVAL).await;
+    let snapshot = liveness.heartbeat_count();
+
+    // Five heartbeat intervals must elapse without the counter advancing.
+    tokio::time::sleep(5 * HEARTBEAT_INTERVAL).await;
+    let current = liveness.heartbeat_count();
+    assert_eq!(
+        current, snapshot,
+        "liveness actor kept heartbeating after Runtime drop; was {snapshot}, now {current}"
+    );
+    Ok(())
+}
diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs
index e9115759..36db4137 100644
--- a/tests/huntsman/test-utils/src/mock.rs
+++ b/tests/huntsman/test-utils/src/mock.rs
@@ -18,14 +18,274 @@ use std::{
 };
 
 use async_trait::async_trait;
-use spider_core::types::id::{ExecutionManagerId, SessionId};
+use spider_core::types::{
+    id::{ExecutionManagerId, JobId, SessionId, TaskId},
+    io::ExecutionContext,
+};
 use spider_execution_manager::client::{
     LivenessClient,
     LivenessResponseError,
     RegistrationResponse,
+    SchedulerClient,
+    SchedulerError,
+    SchedulerResponse,
+    StorageClient,
+    StorageResponseError,
 };
 use tokio::sync::Notify;
 
+/// Mock [`SchedulerClient`].
+#[derive(Clone)]
+pub struct MockScheduler {
+    inner: Arc<SchedulerInner>,
+}
+
+impl MockScheduler {
+    /// Factory function.
+    ///
+    /// # Returns
+    ///
+    /// A fresh scheduler mock with an empty response queue. `next_task` blocks until the test
+    /// pushes a response.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            inner: Arc::new(SchedulerInner {
+                responses: Mutex::new(VecDeque::new()),
+                notify: Notify::new(),
+                call_count: AtomicU64::new(0),
+            }),
+        }
+    }
+
+    /// Queues `response` for the next pending or future [`SchedulerClient::next_task`] call.
+    pub fn push(&self, response: Result<SchedulerResponse, SchedulerError>) {
+        lock(&self.inner.responses).push_back(response);
+        self.inner.notify.notify_waiters();
+    }
+
+    /// # Returns
+    ///
+    /// The number of `next_task` calls received so far. Calls are counted on entry, so ones still
+    /// blocked waiting on the response queue are included.
+    #[must_use]
+    pub fn call_count(&self) -> u64 {
+        self.inner.call_count.load(Ordering::Relaxed)
+    }
+}
+
+impl Default for MockScheduler {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl SchedulerClient for MockScheduler {
+    async fn next_task(
+        &self,
+        _em_id: ExecutionManagerId,
+    ) -> Result<SchedulerResponse, SchedulerError> {
+        self.inner.call_count.fetch_add(1, Ordering::Relaxed);
+        loop {
+            let notified = self.inner.notify.notified();
+            let popped = lock(&self.inner.responses).pop_front();
+            if let Some(response) = popped {
+                return response;
+            }
+            notified.await;
+        }
+    }
+}
+
+/// Captured arguments of one `register_task_instance` call.
+#[derive(Debug, Clone)]
+pub struct RegisterCall {
+    pub job_id: JobId,
+    pub task_id: TaskId,
+    pub em_id: ExecutionManagerId,
+    pub session_id: SessionId,
+}
+
+/// Captured arguments of one `report_task_success` call.
+#[derive(Debug, Clone)]
+pub struct SuccessReport {
+    pub job_id: JobId,
+    pub task_id: TaskId,
+    pub em_id: ExecutionManagerId,
+    pub session_id: SessionId,
+    pub serialized_outputs: Option<Vec<u8>>,
+}
+
+/// Captured arguments of one `report_task_failure` call.
+#[derive(Debug, Clone)]
+pub struct FailureReport {
+    pub job_id: JobId,
+    pub task_id: TaskId,
+    pub em_id: ExecutionManagerId,
+    pub session_id: SessionId,
+    pub error_message: String,
+}
+
+/// Mock [`StorageClient`].
+#[derive(Clone)]
+pub struct MockStorage {
+    inner: Arc<StorageInner>,
+}
+
+impl MockStorage {
+    /// Factory function.
+    ///
+    /// # Returns
+    ///
+    /// A storage mock with no programmed responses. `register_task_instance` panics when its
+    /// response queue is empty; success / failure reports default to `Ok(())`.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            inner: Arc::new(StorageInner {
+                register_responses: Mutex::new(VecDeque::new()),
+                success_responses: Mutex::new(VecDeque::new()),
+                failure_responses: Mutex::new(VecDeque::new()),
+                register_calls: Mutex::new(Vec::new()),
+                success_reports: Mutex::new(Vec::new()),
+                failure_reports: Mutex::new(Vec::new()),
+                notify: Notify::new(),
+            }),
+        }
+    }
+
+    /// Queues `response` for the next `register_task_instance` call.
+    pub fn push_register_response(&self, response: Result<ExecutionContext, StorageResponseError>) {
+        lock(&self.inner.register_responses).push_back(response);
+    }
+
+    /// Queues `response` for the next `report_task_success` call.
+    pub fn push_success_response(&self, response: Result<(), StorageResponseError>) {
+        lock(&self.inner.success_responses).push_back(response);
+    }
+
+    /// Queues `response` for the next `report_task_failure` call.
+    pub fn push_failure_response(&self, response: Result<(), StorageResponseError>) {
+        lock(&self.inner.failure_responses).push_back(response);
+    }
+
+    /// # Returns
+    ///
+    /// A snapshot of every `register_task_instance` call recorded so far.
+    #[must_use]
+    pub fn register_calls(&self) -> Vec<RegisterCall> {
+        lock(&self.inner.register_calls).clone()
+    }
+
+    /// # Returns
+    ///
+    /// A snapshot of every `report_task_success` call recorded so far.
+    #[must_use]
+    pub fn success_reports(&self) -> Vec<SuccessReport> {
+        lock(&self.inner.success_reports).clone()
+    }
+
+    /// # Returns
+    ///
+    /// A snapshot of every `report_task_failure` call recorded so far.
+    #[must_use]
+    pub fn failure_reports(&self) -> Vec<FailureReport> {
+        lock(&self.inner.failure_reports).clone()
+    }
+
+    /// Waits for at least one `report_*` call to be recorded, with a bounded total wait time.
+    ///
+    /// # Returns
+    ///
+    /// Whether a report was observed before `timeout` elapsed.
+    pub async fn wait_for_any_report(&self, timeout: Duration) -> bool {
+        let deadline = tokio::time::Instant::now() + timeout;
+        loop {
+            let notified = self.inner.notify.notified(); // Arm first so no report is missed.
+            if !self.success_reports().is_empty() || !self.failure_reports().is_empty() {
+                return true;
+            }
+            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+            if remaining.is_zero() {
+                return false;
+            }
+            tokio::select! {
+                () = notified => {}
+                () = tokio::time::sleep(remaining.min(POLL_INTERVAL)) => {}
+            }
+        }
+    }
+}
+
+impl Default for MockStorage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl StorageClient for MockStorage {
+    async fn register_task_instance(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+    ) -> Result<ExecutionContext, StorageResponseError> {
+        lock(&self.inner.register_calls).push(RegisterCall {
+            job_id,
+            task_id,
+            em_id,
+            session_id,
+        });
+        let response = lock(&self.inner.register_responses).pop_front();
+        response.expect("mock storage exhausted register responses")
+    }
+
+    async fn report_task_success(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        serialized_outputs: Option<Vec<u8>>,
+    ) -> Result<(), StorageResponseError> {
+        lock(&self.inner.success_reports).push(SuccessReport {
+            job_id,
+            task_id,
+            em_id,
+            session_id,
+            serialized_outputs,
+        });
+        self.inner.notify.notify_waiters();
+        lock(&self.inner.success_responses)
+            .pop_front()
+            .unwrap_or(Ok(()))
+    }
+
+    async fn report_task_failure(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        error_message: String,
+    ) -> Result<(), StorageResponseError> {
+        lock(&self.inner.failure_reports).push(FailureReport {
+            job_id,
+            task_id,
+            em_id,
+            session_id,
+            error_message,
+        });
+        self.inner.notify.notify_waiters();
+        lock(&self.inner.failure_responses)
+            .pop_front()
+            .unwrap_or(Ok(()))
+    }
+}
+
 /// Mock [`LivenessClient`].
 #[derive(Clone)]
 pub struct MockLiveness {
@@ -168,6 +428,24 @@ impl LivenessClient for MockLiveness {
 /// Default polling interval for `wait_until_*` helpers. Short enough to keep tests snappy.
 const POLL_INTERVAL: Duration = Duration::from_millis(5);
 
+/// Shared state behind [`MockScheduler`].
+struct SchedulerInner {
+    responses: Mutex<VecDeque<Result<SchedulerResponse, SchedulerError>>>,
+    notify: Notify,
+    call_count: AtomicU64,
+}
+
+/// Shared state behind [`MockStorage`].
+struct StorageInner {
+    register_responses: Mutex<VecDeque<Result<ExecutionContext, StorageResponseError>>>,
+    success_responses: Mutex<VecDeque<Result<(), StorageResponseError>>>,
+    failure_responses: Mutex<VecDeque<Result<(), StorageResponseError>>>,
+    register_calls: Mutex<Vec<RegisterCall>>,
+    success_reports: Mutex<Vec<SuccessReport>>,
+    failure_reports: Mutex<Vec<FailureReport>>,
+    notify: Notify,
+}
+
 /// Shared state behind [`MockLiveness`].
 struct LivenessInner {
     em_id: ExecutionManagerId,

From 0683275d84d7c2e318feb3788f0223792ae90b8d Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Sun, 7 Jun 2026 15:58:48 -0400
Subject: [PATCH 08/14] build: Split the Ubuntu dev-dependency install script
 into common, huntsman, and wolf variants. (#336)

Co-authored-by: sitaowang1998 <sitaowang1998@outlook.com>
---
 .devcontainer/Dockerfile                      |  3 +-
 .github/workflows/code-linting-checks.yaml    |  8 ++---
 .github/workflows/tests.yaml                  |  4 +--
 .../lib_install/ubuntu/install-dev-common.sh  | 29 +++++++++++++++++++
 .../ubuntu/install-dev-huntsman.sh            | 26 +++++++++++++++++
 .../install-dev-wolf.sh}                      | 22 ++++++--------
 6 files changed, 72 insertions(+), 20 deletions(-)
 create mode 100755 tools/scripts/lib_install/ubuntu/install-dev-common.sh
 create mode 100755 tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
 rename tools/scripts/lib_install/{linux/install-dev.sh => ubuntu/install-dev-wolf.sh} (70%)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index a47c1a5e..74610fb6 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -5,7 +5,8 @@ WORKDIR /root
 RUN mkdir -p ./tools/scripts/lib_install
 COPY ./tools/scripts/lib_install ./tools/scripts/lib_install
 
-RUN ./tools/scripts/lib_install/linux/install-dev.sh
+RUN ./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh
+RUN ./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
 
 # NOTE:
 # `task` doesn't have an apt/yum package so we use its install script.
diff --git a/.github/workflows/code-linting-checks.yaml b/.github/workflows/code-linting-checks.yaml
index 2890ea61..fa37293f 100644
--- a/.github/workflows/code-linting-checks.yaml
+++ b/.github/workflows/code-linting-checks.yaml
@@ -47,7 +47,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-common.sh"
 
       - run: "task lint:toml-check"
 
@@ -75,7 +75,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh"
 
       - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions"
 
@@ -151,7 +151,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-common.sh"
 
       - run: "task lint:py-check"
 
@@ -174,7 +174,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh"
 
       - run: "task lint:check-rust"
 
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index e8ee9c47..46870c49 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -59,7 +59,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh"
 
       - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions"
 
@@ -97,7 +97,7 @@ jobs:
       - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv"
 
       - name: "Install dev dependencies"
-        run: "./tools/scripts/lib_install/linux/install-dev.sh"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh"
 
       - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions"
 
diff --git a/tools/scripts/lib_install/ubuntu/install-dev-common.sh b/tools/scripts/lib_install/ubuntu/install-dev-common.sh
new file mode 100755
index 00000000..4a24c8e4
--- /dev/null
+++ b/tools/scripts/lib_install/ubuntu/install-dev-common.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Installs the dev dependencies shared by all versions of Spider.
+
+# Exit on any error
+set -e
+
+# Error on undefined variable
+set -u
+
+echo "Checking for elevated privileges..."
+privileged_command_prefix=""
+if [ ${EUID:-$(id -u)} -ne 0 ] ; then
+  sudo echo "Script can elevate privileges."
+  privileged_command_prefix="${privileged_command_prefix} sudo"
+fi
+
+${privileged_command_prefix} apt-get update
+DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \
+apt-get install --no-install-recommends -y \
+    ca-certificates \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv
+
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
diff --git a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
new file mode 100755
index 00000000..73b64eab
--- /dev/null
+++ b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+# Installs the dev dependencies for Spider Huntsman.
+
+# Exit on any error
+set -e
+
+# Error on undefined variable
+set -u
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+"$script_dir"/install-dev-common.sh
+
+echo "Checking for elevated privileges..."
+privileged_command_prefix=""
+if [ ${EUID:-$(id -u)} -ne 0 ] ; then
+  sudo echo "Script can elevate privileges."
+  privileged_command_prefix="${privileged_command_prefix} sudo"
+fi
+
+# `gcc` and `libc6-dev` are required by `rustc`, which invokes the system C compiler driver to
+# link binaries against libc.
+DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \
+apt-get install --no-install-recommends -y \
+    gcc \
+    libc6-dev
diff --git a/tools/scripts/lib_install/linux/install-dev.sh b/tools/scripts/lib_install/ubuntu/install-dev-wolf.sh
similarity index 70%
rename from tools/scripts/lib_install/linux/install-dev.sh
rename to tools/scripts/lib_install/ubuntu/install-dev-wolf.sh
index c1f1b4f7..2c816b0a 100755
--- a/tools/scripts/lib_install/linux/install-dev.sh
+++ b/tools/scripts/lib_install/ubuntu/install-dev-wolf.sh
@@ -1,25 +1,28 @@
 #!/usr/bin/env bash
 
+# Installs the dev dependencies for Spider Wolf.
+
 # Exit on any error
 set -e
 
 # Error on undefined variable
 set -u
 
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+"$script_dir"/install-dev-common.sh
+
 echo "Checking for elevated privileges..."
 privileged_command_prefix=""
 if [ ${EUID:-$(id -u)} -ne 0 ] ; then
   sudo echo "Script can elevate privileges."
   privileged_command_prefix="${privileged_command_prefix} sudo"
 fi
-${privileged_command_prefix} apt-get update
-DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} apt-get install --no-install-recommends -y \
-    ca-certificates \
+
+DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \
+apt-get install --no-install-recommends -y \
     checkinstall \
-    curl \
     g++ \
     gcc \
-    git \
     jq \
     libcurl4 \
     libcurl4-openssl-dev \
@@ -27,16 +30,9 @@ DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} apt-get install --no
     libssl-dev \
     make \
     openjdk-11-jdk \
-    pkg-config \
-    python3 \
-    python3-pip \
-    python3-venv
+    pkg-config
 
-script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 lib_install_scripts_dir="$script_dir/.."
 ${privileged_command_prefix} "$lib_install_scripts_dir"/install-cmake.sh 3.23.5
 # TODO https://github.com/y-scope/spider/issues/86
 "$lib_install_scripts_dir"/check-cmake-version.sh
-
-# Install uv
-curl -LsSf https://astral.sh/uv/install.sh | sh

From e39e319b91e7f6623af9022ba12446d8dba637f7 Mon Sep 17 00:00:00 2001
From: sitaowang1998 <sitaowang1998@outlook.com>
Date: Sun, 7 Jun 2026 16:37:34 -0400
Subject: [PATCH 09/14] feat(huntsman): Add protobuf scaffolding and gRPC
 StorageClient for task instance management: (#333)

* Add `spider-proto` component for protobuf source files.
* Add `spider-proto-rust` component for the generated Rust code.
* Add tasks to build the generated Rust code.
* Add a GitHub workflow to verify the generated Rust code is up-to-date.
* Implement a gRPC-backed `StorageClient` for the execution manager.

Co-authored-by: LinZhihao-723 <pleiades3190@gmail.com>
---
 .../proto-generated-code-checks.yaml          |  38 +
 Cargo.lock                                    | 889 ++++++++++++++++--
 Cargo.toml                                    |  11 +
 .../spider-execution-manager/Cargo.toml       |   2 +
 .../spider-execution-manager/src/client.rs    |   2 +
 .../src/client/grpc/mod.rs                    |   5 +
 .../src/client/grpc/storage.rs                | 225 +++++
 components/spider-proto-rust/Cargo.toml       |  16 +
 components/spider-proto-rust/build.rs         |  81 ++
 .../src/generated/storage.rs                  | 632 +++++++++++++
 components/spider-proto-rust/src/id.rs        |  44 +
 components/spider-proto-rust/src/lib.rs       |   8 +
 components/spider-proto/storage/storage.proto |  71 ++
 taskfiles/build.yaml                          |  20 +
 taskfiles/lint.yaml                           |   2 +-
 taskfiles/test.yaml                           |   9 +-
 .../ubuntu/install-dev-huntsman.sh            |   3 +-
 17 files changed, 1978 insertions(+), 80 deletions(-)
 create mode 100644 .github/workflows/proto-generated-code-checks.yaml
 create mode 100644 components/spider-execution-manager/src/client/grpc/mod.rs
 create mode 100644 components/spider-execution-manager/src/client/grpc/storage.rs
 create mode 100644 components/spider-proto-rust/Cargo.toml
 create mode 100644 components/spider-proto-rust/build.rs
 create mode 100644 components/spider-proto-rust/src/generated/storage.rs
 create mode 100644 components/spider-proto-rust/src/id.rs
 create mode 100644 components/spider-proto-rust/src/lib.rs
 create mode 100644 components/spider-proto/storage/storage.proto

diff --git a/.github/workflows/proto-generated-code-checks.yaml b/.github/workflows/proto-generated-code-checks.yaml
new file mode 100644
index 00000000..823639fa
--- /dev/null
+++ b/.github/workflows/proto-generated-code-checks.yaml
@@ -0,0 +1,38 @@
+name: "proto-generated-code-checks"
+
+on:
+  pull_request:
+  push:
+  schedule:
+    # Run daily at 00:15 UTC (the 15 is to avoid periods of high load)
+    - cron: "15 0 * * *"
+  workflow_dispatch:
+
+concurrency:
+  group: "${{github.workflow}}-${{github.ref}}"
+  # Cancel in-progress jobs for efficiency
+  cancel-in-progress: true
+
+jobs:
+  proto-code-committed:
+    name: "proto-code-committed"
+    runs-on: "ubuntu-latest"
+    steps:
+      - uses: "actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd"  # v6.0.2
+        with:
+          submodules: "recursive"
+
+      - uses: "./tools/yscope-dev-utils/exports/github/actions/install-python"
+
+      - uses: "./tools/yscope-dev-utils/exports/github/actions/install-go-task"
+
+      - name: "Install dev dependencies"
+        run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh"
+
+      - name: "`spider-proto-rust` code generation"
+        shell: "bash"
+        run: "task build:spider-proto-rust-codegen"
+
+      - name: "Check if the generated proto code is the latest"
+        shell: "bash"
+        run: "git diff --exit-code components/spider-proto-rust/src/generated"
diff --git a/Cargo.lock b/Cargo.lock
index a6653baa..0f3763a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -35,6 +35,28 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "async-stream"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476"
+dependencies = [
+ "async-stream-impl",
+ "futures-core",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "async-stream-impl"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "async-trait"
 version = "0.1.89"
@@ -55,11 +77,64 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
 [[package]]
 name = "autocfg"
-version = "1.5.0"
+version = "1.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
+
+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "sync_wrapper",
+ "tower 0.5.3",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+]
 
 [[package]]
 name = "base64"
@@ -84,9 +159,9 @@ dependencies = [
 
 [[package]]
 name = "bitflags"
-version = "2.11.1"
+version = "2.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8"
 dependencies = [
  "serde_core",
 ]
@@ -217,9 +292,9 @@ dependencies = [
 
 [[package]]
 name = "dashmap"
-version = "6.1.0"
+version = "6.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c"
 dependencies = [
  "cfg-if",
  "crossbeam-utils",
@@ -265,9 +340,9 @@ dependencies = [
 
 [[package]]
 name = "displaydoc"
-version = "0.2.5"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -282,9 +357,9 @@ checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
 
 [[package]]
 name = "either"
-version = "1.15.0"
+version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
 dependencies = [
  "serde",
 ]
@@ -348,6 +423,18 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "fastrand"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
 [[package]]
 name = "flume"
 version = "0.11.1"
@@ -502,10 +589,48 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
  "libc",
- "r-efi",
+ "r-efi 5.3.0",
+ "wasip2",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
  "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "h2"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap 2.14.0",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
 [[package]]
 name = "hashbrown"
 version = "0.14.5"
@@ -525,9 +650,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.17.0"
+version = "0.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
 
 [[package]]
 name = "hashlink"
@@ -577,6 +702,51 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "http"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
 [[package]]
 name = "huntsman-complex"
 version = "0.1.0"
@@ -594,6 +764,61 @@ dependencies = [
  "spider-tdl",
 ]
 
+[[package]]
+name = "hyper"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-timeout"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
+dependencies = [
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "libc",
+ "pin-project-lite",
+ "socket2 0.6.4",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "icu_collections"
 version = "2.2.0"
@@ -676,6 +901,12 @@ dependencies = [
  "zerovec",
 ]
 
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
 [[package]]
 name = "idna"
 version = "1.1.0"
@@ -697,6 +928,16 @@ dependencies = [
  "icu_properties",
 ]
 
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
 [[package]]
 name = "indexmap"
 version = "2.14.0"
@@ -704,7 +945,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.17.0",
+ "hashbrown 0.17.1",
+ "serde",
+ "serde_core",
 ]
 
 [[package]]
@@ -715,6 +958,15 @@ dependencies = [
  "spider-tdl",
 ]
 
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.18"
@@ -745,6 +997,12 @@ dependencies = [
  "spin",
 ]
 
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
 [[package]]
 name = "libc"
 version = "0.2.186"
@@ -769,14 +1027,14 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
 [[package]]
 name = "libredox"
-version = "0.1.16"
+version = "0.1.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
+checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3"
 dependencies = [
  "bitflags",
  "libc",
  "plain",
- "redox_syscall 0.7.5",
+ "redox_syscall 0.8.1",
 ]
 
 [[package]]
@@ -789,6 +1047,12 @@ dependencies = [
  "vcpkg",
 ]
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
 [[package]]
 name = "litemap"
 version = "0.8.2"
@@ -806,9 +1070,9 @@ dependencies = [
 
 [[package]]
 name = "log"
-version = "0.4.29"
+version = "0.4.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a"
 
 [[package]]
 name = "matchers"
@@ -819,6 +1083,12 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
 [[package]]
 name = "md-5"
 version = "0.10.6"
@@ -831,21 +1101,33 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.8.0"
+version = "2.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
 
 [[package]]
 name = "mio"
-version = "1.2.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1"
+checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda"
 dependencies = [
  "libc",
  "wasi",
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "multimap"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
+
 [[package]]
 name = "non-empty-string"
 version = "0.2.6"
@@ -963,6 +1245,36 @@ version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
 
+[[package]]
+name = "petgraph"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
+dependencies = [
+ "fixedbitset",
+ "indexmap 2.14.0",
+]
+
+[[package]]
+name = "pin-project"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.17"
@@ -1020,6 +1332,16 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "proc-macro-error-attr2"
 version = "2.0.0"
@@ -1051,6 +1373,58 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "prost"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
+dependencies = [
+ "bytes",
+ "prost-derive",
+]
+
+[[package]]
+name = "prost-build"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
+dependencies = [
+ "heck",
+ "itertools",
+ "log",
+ "multimap",
+ "once_cell",
+ "petgraph",
+ "prettyplease",
+ "prost",
+ "prost-types",
+ "regex",
+ "syn 2.0.117",
+ "tempfile",
+]
+
+[[package]]
+name = "prost-derive"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
+dependencies = [
+ "anyhow",
+ "itertools",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "prost-types"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
+dependencies = [
+ "prost",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.45"
@@ -1066,6 +1440,12 @@ version = "5.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
 
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
 [[package]]
 name = "rand"
 version = "0.8.6"
@@ -1136,13 +1516,25 @@ dependencies = [
 
 [[package]]
 name = "redox_syscall"
-version = "0.7.5"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4666a1a60d8412eab19d94f6d13dcc9cea0a5ef4fdf6a5db306537413c661b1b"
+checksum = "5b44b894f2a6e36457d665d1e08c3866add6ed5e70050c1b4ba8a8ddedb02ce7"
 dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
 [[package]]
 name = "regex-automata"
 version = "0.4.14"
@@ -1200,31 +1592,35 @@ dependencies = [
 ]
 
 [[package]]
-name = "ryu"
-version = "1.0.23"
+name = "rustix"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
 
 [[package]]
-name = "scc"
-version = "2.4.0"
+name = "rustversion"
+version = "1.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc"
-dependencies = [
- "sdd",
-]
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
 
 [[package]]
-name = "scopeguard"
-version = "1.2.0"
+name = "ryu"
+version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
 
 [[package]]
-name = "sdd"
-version = "3.0.10"
+name = "scopeguard"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "secrecy"
@@ -1274,9 +1670,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.149"
+version = "1.0.150"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
 dependencies = [
  "itoa",
  "memchr",
@@ -1299,9 +1695,9 @@ dependencies = [
 
 [[package]]
 name = "serial_test"
-version = "3.4.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f"
+checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d"
 dependencies = [
  "fslock",
  "futures-executor",
@@ -1309,15 +1705,14 @@ dependencies = [
  "log",
  "once_cell",
  "parking_lot",
- "scc",
  "serial_test_derive",
 ]
 
 [[package]]
 name = "serial_test_derive"
-version = "3.4.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9"
+checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1392,9 +1787,19 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.6.3"
+version = "0.5.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
+checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "socket2"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51"
 dependencies = [
  "libc",
  "windows-sys 0.61.2",
@@ -1438,14 +1843,26 @@ dependencies = [
  "futures-util",
  "rmp-serde",
  "spider-core",
+ "spider-proto-rust",
  "spider-task-executor",
  "spider-tdl",
  "thiserror",
  "tokio",
  "tokio-util",
+ "tonic",
  "tracing",
 ]
 
+[[package]]
+name = "spider-proto-rust"
+version = "0.1.0"
+dependencies = [
+ "prost",
+ "spider-core",
+ "tonic",
+ "tonic-build",
+]
+
 [[package]]
 name = "spider-storage"
 version = "0.1.0"
@@ -1562,7 +1979,7 @@ dependencies = [
  "futures-util",
  "hashbrown 0.15.5",
  "hashlink",
- "indexmap",
+ "indexmap 2.14.0",
  "log",
  "memchr",
  "once_cell",
@@ -1785,6 +2202,12 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+
 [[package]]
 name = "synstructure"
 version = "0.13.2"
@@ -1847,6 +2270,19 @@ dependencies = [
  "spider-tdl",
 ]
 
+[[package]]
+name = "tempfile"
+version = "3.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.2",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "test-utils"
 version = "0.1.0"
@@ -1930,16 +2366,16 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.52.2"
+version = "1.52.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "110a78583f19d5cdb2c5ccf321d1290344e71313c6c37d43520d386027d18386"
+checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
 dependencies = [
  "bytes",
  "libc",
  "mio",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2",
+ "socket2 0.6.4",
  "tokio-macros",
  "windows-sys 0.61.2",
 ]
@@ -1980,6 +2416,96 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tonic"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
+dependencies = [
+ "async-stream",
+ "async-trait",
+ "axum",
+ "base64",
+ "bytes",
+ "h2",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-timeout",
+ "hyper-util",
+ "percent-encoding",
+ "pin-project",
+ "prost",
+ "socket2 0.5.10",
+ "tokio",
+ "tokio-stream",
+ "tower 0.4.13",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic-build"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
+dependencies = [
+ "prettyplease",
+ "proc-macro2",
+ "prost-build",
+ "prost-types",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tower"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "indexmap 1.9.3",
+ "pin-project",
+ "pin-project-lite",
+ "rand 0.8.6",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
 [[package]]
 name = "tracing"
 version = "0.1.44"
@@ -2041,11 +2567,17 @@ dependencies = [
  "tracing-serde",
 ]
 
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
 [[package]]
 name = "typenum"
-version = "1.20.0"
+version = "1.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
 
 [[package]]
 name = "unicode-bidi"
@@ -2122,6 +2654,15 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
 
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
 [[package]]
 name = "wasi"
 version = "0.11.1+wasi-snapshot-preview1"
@@ -2134,7 +2675,16 @@ version = "1.0.3+wasi-0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
 dependencies = [
- "wit-bindgen",
+ "wit-bindgen 0.57.1",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen 0.51.0",
 ]
 
 [[package]]
@@ -2143,6 +2693,40 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
 
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.14.0",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap 2.14.0",
+ "semver",
+]
+
 [[package]]
 name = "whoami"
 version = "1.6.1"
@@ -2187,7 +2771,16 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
@@ -2205,13 +2798,29 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
 ]
 
 [[package]]
@@ -2220,48 +2829,184 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
 
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
 [[package]]
 name = "wit-bindgen"
 version = "0.57.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
 
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap 2.14.0",
+ "prettyplease",
+ "syn 2.0.117",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap 2.14.0",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.14.0",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
 [[package]]
 name = "writeable"
 version = "0.6.3"
@@ -2270,9 +3015,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
 
 [[package]]
 name = "yoke"
-version = "0.8.2"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5"
 dependencies = [
  "stable_deref_trait",
  "yoke-derive",
@@ -2293,18 +3038,18 @@ dependencies = [
 
 [[package]]
 name = "zerocopy"
-version = "0.8.48"
+version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.48"
+version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2313,9 +3058,9 @@ dependencies = [
 
 [[package]]
 name = "zerofrom"
-version = "0.1.7"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df"
+checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272"
 dependencies = [
  "zerofrom-derive",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 08d6f85b..86489204 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
   "components/spider-core",
   "components/spider-derive",
   "components/spider-execution-manager",
+  "components/spider-proto-rust",
   "components/spider-storage",
   "components/spider-task-executor",
   "components/spider-tdl",
@@ -16,3 +17,13 @@ members = [
   "tests/huntsman/tdl-integration",
   "tests/huntsman/test-utils",
 ]
+default-members = [
+  "components/spider-core",
+  "components/spider-derive",
+  "components/spider-execution-manager",
+  "components/spider-proto-rust",
+  "components/spider-storage",
+  "components/spider-task-executor",
+  "components/spider-tdl",
+  "components/spider-tdl-derive",
+]
diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml
index 10f0e3ac..13562728 100644
--- a/components/spider-execution-manager/Cargo.toml
+++ b/components/spider-execution-manager/Cargo.toml
@@ -18,6 +18,7 @@ futures-util = {
 }
 rmp-serde = "1.3.1"
 spider-core = { path = "../spider-core" }
+spider-proto-rust = { path = "../spider-proto-rust" }
 spider-task-executor = { path = "../spider-task-executor" }
 spider-tdl = { path = "../spider-tdl" }
 thiserror = "2.0.18"
@@ -26,4 +27,5 @@ tokio = {
   features = ["io-util", "macros", "process", "rt", "sync", "time"]
 }
 tokio-util = { version = "0.7", features = ["codec", "rt"] }
+tonic = "0.12.3"
 tracing = { version = "0.1.41", default-features = false, features = ["std"] }
diff --git a/components/spider-execution-manager/src/client.rs b/components/spider-execution-manager/src/client.rs
index 4f335f6e..63b132ce 100644
--- a/components/spider-execution-manager/src/client.rs
+++ b/components/spider-execution-manager/src/client.rs
@@ -6,10 +6,12 @@
 //! * [`storage::StorageClient`] — registers task instances and reports their outcome.
 //! * [`liveness::LivenessClient`] — registers the EM at boot and ticks the heartbeat thereafter.
 
+pub mod grpc;
 pub mod liveness;
 pub mod scheduler;
 pub mod storage;
 
+pub use grpc::GrpcStorageClient;
 pub use liveness::{LivenessClient, LivenessResponseError, RegistrationResponse};
 pub use scheduler::{SchedulerClient, SchedulerError, SchedulerResponse};
 pub use storage::{StorageClient, StorageResponseError};
diff --git a/components/spider-execution-manager/src/client/grpc/mod.rs b/components/spider-execution-manager/src/client/grpc/mod.rs
new file mode 100644
index 00000000..9f15ee9a
--- /dev/null
+++ b/components/spider-execution-manager/src/client/grpc/mod.rs
@@ -0,0 +1,5 @@
+//! gRPC-backed implementations of the execution manager's client traits.
+
+pub mod storage;
+
+pub use storage::GrpcStorageClient;
diff --git a/components/spider-execution-manager/src/client/grpc/storage.rs b/components/spider-execution-manager/src/client/grpc/storage.rs
new file mode 100644
index 00000000..c037814a
--- /dev/null
+++ b/components/spider-execution-manager/src/client/grpc/storage.rs
@@ -0,0 +1,225 @@
+//! gRPC-backed [`StorageClient`] implementation.
+//!
+//! Wraps the generated [`TaskInstanceManagementServiceClient`] and adapts its protobuf
+//! request/response types to the transport-agnostic [`StorageClient`] trait.
+
+use async_trait::async_trait;
+use spider_core::types::{
+    id::{ExecutionManagerId, JobId, SessionId, TaskId},
+    io::ExecutionContext,
+};
+use spider_proto_rust::storage::{
+    self,
+    register_task_instance_response,
+    storage_error,
+    storage_operation_response,
+    task_instance_management_service_client::TaskInstanceManagementServiceClient,
+};
+use tonic::transport::{Channel, Endpoint};
+
+use crate::client::storage::{StorageClient, StorageResponseError};
+
+/// [`StorageClient`] implementation that talks to storage over a tonic gRPC [`Channel`].
+#[derive(Debug, Clone)]
+pub struct GrpcStorageClient {
+    client: TaskInstanceManagementServiceClient<Channel>,
+}
+
+impl GrpcStorageClient {
+    /// Opens a gRPC connection to the storage service at `endpoint`.
+    ///
+    /// # Returns
+    ///
+    /// A [`GrpcStorageClient`] backed by the established connection on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`StorageResponseError::Transport`] if connecting to `endpoint` fails.
+    pub async fn connect(endpoint: Endpoint) -> Result<Self, StorageResponseError> {
+        let client = TaskInstanceManagementServiceClient::connect(endpoint)
+            .await
+            .map_err(to_transport_error)?;
+        Ok(Self { client })
+    }
+}
+
+#[async_trait]
+impl StorageClient for GrpcStorageClient {
+    async fn register_task_instance(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+    ) -> Result<ExecutionContext, StorageResponseError> {
+        // The generated client's RPC methods take `&mut self`, so each call uses its own clone.
+        let mut grpc_client = self.client.clone();
+        let reply = grpc_client
+            .register_task_instance(storage::RegisterTaskInstanceRequest {
+                job_id: job_id.get(),
+                task_id: Some(storage::TaskId::from(task_id)),
+                execution_manager_id: em_id.get(),
+                session_id,
+            })
+            .await
+            .map_err(to_transport_error)?
+            .into_inner();
+        let Some(outcome) = reply.result else {
+            return Err(StorageResponseError::Transport(
+                "register task instance response missing result".to_owned(),
+            ));
+        };
+        match outcome {
+            register_task_instance_response::Result::Error(failure) => Err(failure.into()),
+            register_task_instance_response::Result::ExecutionContext(encoded) => {
+                bincode::deserialize(&encoded).map_err(|error| {
+                    StorageResponseError::Transport(format!(
+                        "failed to decode execution context: {error}"
+                    ))
+                })
+            }
+        }
+    }
+
+    async fn report_task_success(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        serialized_outputs: Option<Vec<u8>>,
+    ) -> Result<(), StorageResponseError> {
+        // The generated client's RPC methods take `&mut self`, so each call uses its own clone.
+        let mut grpc_client = self.client.clone();
+        let reply = grpc_client
+            .report_task_success(storage::ReportTaskSuccessRequest {
+                job_id: job_id.get(),
+                task_id: Some(storage::TaskId::from(task_id)),
+                execution_manager_id: em_id.get(),
+                session_id,
+                // Absent outputs are sent as an empty byte buffer.
+                serialized_outputs: serialized_outputs.unwrap_or_default(),
+            })
+            .await
+            .map_err(to_transport_error)?
+            .into_inner();
+
+        storage_operation_response_to_result(reply)
+    }
+
+    async fn report_task_failure(
+        &self,
+        job_id: JobId,
+        task_id: TaskId,
+        em_id: ExecutionManagerId,
+        session_id: SessionId,
+        error_message: String,
+    ) -> Result<(), StorageResponseError> {
+        // The generated client's RPC methods take `&mut self`, so each call uses its own clone.
+        let mut grpc_client = self.client.clone();
+        let reply = grpc_client
+            .report_task_failure(storage::ReportTaskFailureRequest {
+                job_id: job_id.get(),
+                task_id: Some(storage::TaskId::from(task_id)),
+                execution_manager_id: em_id.get(),
+                session_id,
+                error_message,
+            })
+            .await
+            .map_err(to_transport_error)?
+            .into_inner();
+
+        // A completed RPC can still carry a storage-level error inside the response body.
+        storage_operation_response_to_result(reply)
+    }
+}
+
+impl From<storage::StorageError> for StorageResponseError {
+    fn from(error: storage::StorageError) -> Self {
+        use storage_error::ErrCode;
+
+        let storage::StorageError { err_code, message, storage_session } = error;
+        // `err_code` is a raw `i32` on the wire, so it may not name a known `ErrCode`.
+        match ErrCode::try_from(err_code) {
+            Err(unknown) => Self::Transport(format!("unknown storage error kind: {unknown}")),
+            Ok(ErrCode::StaleSession) => Self::StaleSession { storage_session },
+            Ok(ErrCode::CacheStale) => Self::CacheStale(message),
+            Ok(ErrCode::Transport) => Self::Transport(message),
+            Ok(ErrCode::InvalidInput) => Self::InvalidInput(message),
+            Ok(ErrCode::Server | ErrCode::Unspecified) => Self::Server(message),
+        }
+    }
+}
+
+/// Converts a [`storage::StorageOperationResponse`] into [`Result<(), StorageResponseError>`]; an
+/// unset `result` oneof is reported as [`StorageResponseError::Transport`].
+fn storage_operation_response_to_result(
+    response: storage::StorageOperationResponse,
+) -> Result<(), StorageResponseError> {
+    let failure: StorageResponseError = match response.result {
+        Some(storage_operation_response::Result::Ok(_)) => return Ok(()),
+        Some(storage_operation_response::Result::Error(error)) => error.into(),
+        None => StorageResponseError::Transport(
+            "storage operation response missing `result` message".to_owned(),
+        ),
+    };
+    Err(failure)
+}
+
+/// Wraps any displayable error as a transport-layer failure.
+///
+/// # Returns
+///
+/// A [`StorageResponseError::Transport`] carrying the text produced by `error.to_string()`.
+fn to_transport_error(error: impl std::fmt::Display) -> StorageResponseError {
+    StorageResponseError::Transport(error.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn storage_error_maps_stale_session() {
+        // The stale-session variant must carry the session value reported by storage.
+        let converted = StorageResponseError::from(storage::StorageError {
+            err_code: storage_error::ErrCode::StaleSession.into(),
+            message: "stale".to_owned(),
+            storage_session: 7,
+        });
+
+        if let StorageResponseError::StaleSession { storage_session } = &converted {
+            assert_eq!(7, *storage_session);
+        } else {
+            panic!("unexpected storage response error: {converted:?}");
+        }
+    }
+
+    #[test]
+    fn storage_error_maps_unknown_kind_to_transport_error() {
+        // 99 is not a value of `storage_error::ErrCode`.
+        let converted = StorageResponseError::from(storage::StorageError {
+            err_code: 99,
+            message: "unknown".to_owned(),
+            storage_session: 0,
+        });
+
+        if let StorageResponseError::Transport(message) = &converted {
+            assert!(message.contains("unknown storage error kind"));
+        } else {
+            panic!("unexpected storage response error: {converted:?}");
+        }
+    }
+
+    #[test]
+    fn missing_storage_operation_result_is_transport_error() {
+        let empty_response = storage::StorageOperationResponse { result: None };
+        let result = storage_operation_response_to_result(empty_response);
+        assert!(
+            matches!(result, Err(StorageResponseError::Transport(_))),
+            "unexpected storage operation result: {result:?}"
+        );
+    }
+}
diff --git a/components/spider-proto-rust/Cargo.toml b/components/spider-proto-rust/Cargo.toml
new file mode 100644
index 00000000..6a5e53db
--- /dev/null
+++ b/components/spider-proto-rust/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "spider-proto-rust"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+name = "spider_proto_rust"
+path = "src/lib.rs"
+
+[dependencies]
+prost = "0.13.5"
+spider-core = { path = "../spider-core" }
+tonic = "0.12.3"
+
+[build-dependencies]
+tonic-build = "0.12.3"
diff --git a/components/spider-proto-rust/build.rs b/components/spider-proto-rust/build.rs
new file mode 100644
index 00000000..e4e8d880
--- /dev/null
+++ b/components/spider-proto-rust/build.rs
@@ -0,0 +1,81 @@
+use std::{env, fs, path::PathBuf};
+
+/// The environment variable that controls (`ON`/`OFF`) whether the protobuf code is regenerated.
+const SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE: &str = "SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE";
+
+/// The destination directory for generated protobuf code, relative to the crate root.
+const SPIDER_PROTO_RUST_GENERATED_DIR: &str = "src/generated";
+
+/// The directory holding the protobuf sources, relative to the crate root's parent directory.
+const SPIDER_PROTO_ROOT: &str = "spider-proto";
+
+/// The protobuf source files to compile, relative to [`SPIDER_PROTO_ROOT`].
+const SPIDER_PROTO_SOURCE_FILES: &[&str] = &["storage/storage.proto"];
+
+/// Regenerates the protobuf bindings in `src/generated` when explicitly requested.
+///
+/// Generation runs only when `SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE` is `ON`. When it is `OFF` or
+/// unset, the committed generated code is used as-is; any other value panics.
+fn main() {
+    // Rerun the build script whenever the generation gate is toggled or changes value.
+    println!("cargo:rerun-if-env-changed={SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE}");
+
+    let crate_root = PathBuf::from(
+        env::var_os("CARGO_MANIFEST_DIR").expect("`CARGO_MANIFEST_DIR` env var not set"),
+    );
+    let components_root = crate_root
+        .parent()
+        .expect("`CARGO_MANIFEST_DIR` has no parent directory");
+
+    let spider_proto_root = components_root.join(SPIDER_PROTO_ROOT);
+    let spider_proto_sources = SPIDER_PROTO_SOURCE_FILES
+        .iter()
+        .map(|relative_path| {
+            let abs_path = spider_proto_root.join(relative_path);
+            println!("cargo:rerun-if-changed={}", abs_path.display());
+            abs_path
+        })
+        .collect::<Vec<_>>();
+
+    // `is_some_and` yields `false` for an unset variable, so "unset" needs no separate check.
+    let generate_from_source =
+        env::var_os(SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE).is_some_and(|val| {
+            const ON: &str = "ON";
+            const OFF: &str = "OFF";
+            match val.to_str() {
+                Some(ON) => true,
+                Some(OFF) => false,
+                _ => panic!(
+                    "invalid value for {SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE}: expected '{ON}' \
+                     or '{OFF}'"
+                ),
+            }
+        });
+
+    if !generate_from_source {
+        // The committed generated code is used as-is.
+        return;
+    }
+
+    let out_dir = crate_root.join(SPIDER_PROTO_RUST_GENERATED_DIR);
+    if out_dir.exists() {
+        fs::remove_dir_all(&out_dir).expect("failed to remove existing generated code");
+    }
+    fs::create_dir_all(&out_dir).expect("failed to create output dir for generated code");
+
+    tonic_build::configure()
+        .build_client(true)
+        .build_server(true)
+        .out_dir(&out_dir)
+        .compile_protos(
+            spider_proto_sources.as_ref(),
+            &[spider_proto_root.as_path()],
+        )
+        .inspect_err(|e| eprintln!("Failed to compile `spider-proto`: {e:?}"))
+        .expect("proto compilation failed");
+
+    // NOTE: The generated outputs are deliberately NOT tracked with `cargo:rerun-if-changed`. Cargo
+    // compares the tracked paths' mtimes against the build script's recorded output file, whose
+    // mtime is not guaranteed to postdate files written by this script in the same run, so tracking
+    // our own outputs would make every subsequent build appear dirty.
+}
diff --git a/components/spider-proto-rust/src/generated/storage.rs b/components/spider-proto-rust/src/generated/storage.rs
new file mode 100644
index 00000000..4a4cd353
--- /dev/null
+++ b/components/spider-proto-rust/src/generated/storage.rs
@@ -0,0 +1,632 @@
+// This file is @generated by prost-build.
+#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+pub struct RegisterTaskInstanceRequest {
+    #[prost(uint64, tag = "1")]
+    pub job_id: u64,
+    #[prost(message, optional, tag = "2")]
+    pub task_id: ::core::option::Option<TaskId>,
+    #[prost(uint64, tag = "3")]
+    pub execution_manager_id: u64,
+    #[prost(uint64, tag = "4")]
+    pub session_id: u64,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct RegisterTaskInstanceResponse {
+    #[prost(oneof = "register_task_instance_response::Result", tags = "1, 2")]
+    pub result: ::core::option::Option<register_task_instance_response::Result>,
+}
+/// Nested message and enum types in `RegisterTaskInstanceResponse`.
+pub mod register_task_instance_response {
+    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    pub enum Result {
+        #[prost(bytes, tag = "1")]
+        ExecutionContext(::prost::alloc::vec::Vec<u8>),
+        #[prost(message, tag = "2")]
+        Error(super::StorageError),
+    }
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ReportTaskSuccessRequest {
+    #[prost(uint64, tag = "1")]
+    pub job_id: u64,
+    #[prost(message, optional, tag = "2")]
+    pub task_id: ::core::option::Option<TaskId>,
+    #[prost(uint64, tag = "3")]
+    pub execution_manager_id: u64,
+    #[prost(uint64, tag = "4")]
+    pub session_id: u64,
+    #[prost(bytes = "vec", tag = "5")]
+    pub serialized_outputs: ::prost::alloc::vec::Vec<u8>,
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ReportTaskFailureRequest {
+    #[prost(uint64, tag = "1")]
+    pub job_id: u64,
+    #[prost(message, optional, tag = "2")]
+    pub task_id: ::core::option::Option<TaskId>,
+    #[prost(uint64, tag = "3")]
+    pub execution_manager_id: u64,
+    #[prost(uint64, tag = "4")]
+    pub session_id: u64,
+    #[prost(string, tag = "5")]
+    pub error_message: ::prost::alloc::string::String,
+}
+#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+pub struct TaskId {
+    #[prost(oneof = "task_id::Kind", tags = "1, 2, 3")]
+    pub kind: ::core::option::Option<task_id::Kind>,
+}
+/// Nested message and enum types in `TaskId`.
+pub mod task_id {
+    #[derive(Clone, Copy, PartialEq, ::prost::Oneof)]
+    pub enum Kind {
+        #[prost(uint64, tag = "1")]
+        Index(u64),
+        #[prost(message, tag = "2")]
+        Commit(super::Void),
+        #[prost(message, tag = "3")]
+        Cleanup(super::Void),
+    }
+}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct StorageOperationResponse {
+    #[prost(oneof = "storage_operation_response::Result", tags = "1, 2")]
+    pub result: ::core::option::Option<storage_operation_response::Result>,
+}
+/// Nested message and enum types in `StorageOperationResponse`.
+pub mod storage_operation_response {
+    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    pub enum Result {
+        #[prost(message, tag = "1")]
+        Ok(super::Void),
+        #[prost(message, tag = "2")]
+        Error(super::StorageError),
+    }
+}
+#[derive(Clone, Copy, PartialEq, ::prost::Message)]
+pub struct Void {}
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct StorageError {
+    #[prost(enumeration = "storage_error::ErrCode", tag = "1")]
+    pub err_code: i32,
+    #[prost(string, tag = "2")]
+    pub message: ::prost::alloc::string::String,
+    #[prost(uint64, tag = "3")]
+    pub storage_session: u64,
+}
+/// Nested message and enum types in `StorageError`.
+pub mod storage_error {
+    #[derive(
+        Clone,
+        Copy,
+        Debug,
+        PartialEq,
+        Eq,
+        Hash,
+        PartialOrd,
+        Ord,
+        ::prost::Enumeration
+    )]
+    #[repr(i32)]
+    pub enum ErrCode {
+        Unspecified = 0,
+        StaleSession = 1,
+        CacheStale = 2,
+        Transport = 3,
+        Server = 4,
+        InvalidInput = 5,
+    }
+    impl ErrCode {
+        /// String value of the enum field names used in the ProtoBuf definition.
+        ///
+        /// The values are not transformed in any way and thus are considered stable
+        /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+        pub fn as_str_name(&self) -> &'static str {
+            match self {
+                Self::Unspecified => "ERR_CODE_UNSPECIFIED",
+                Self::StaleSession => "STALE_SESSION",
+                Self::CacheStale => "CACHE_STALE",
+                Self::Transport => "TRANSPORT",
+                Self::Server => "SERVER",
+                Self::InvalidInput => "INVALID_INPUT",
+            }
+        }
+        /// Creates an enum from field names used in the ProtoBuf definition.
+        pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+            match value {
+                "ERR_CODE_UNSPECIFIED" => Some(Self::Unspecified),
+                "STALE_SESSION" => Some(Self::StaleSession),
+                "CACHE_STALE" => Some(Self::CacheStale),
+                "TRANSPORT" => Some(Self::Transport),
+                "SERVER" => Some(Self::Server),
+                "INVALID_INPUT" => Some(Self::InvalidInput),
+                _ => None,
+            }
+        }
+    }
+}
+/// Generated client implementations.
+pub mod task_instance_management_service_client {
+    #![allow(
+        unused_variables,
+        dead_code,
+        missing_docs,
+        clippy::wildcard_imports,
+        clippy::let_unit_value,
+    )]
+    use tonic::codegen::*;
+    use tonic::codegen::http::Uri;
+    #[derive(Debug, Clone)]
+    pub struct TaskInstanceManagementServiceClient<T> {
+        inner: tonic::client::Grpc<T>,
+    }
+    impl TaskInstanceManagementServiceClient<tonic::transport::Channel> {
+        /// Attempt to create a new client by connecting to a given endpoint.
+        pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
+        where
+            D: TryInto<tonic::transport::Endpoint>,
+            D::Error: Into<StdError>,
+        {
+            let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
+            Ok(Self::new(conn))
+        }
+    }
+    impl<T> TaskInstanceManagementServiceClient<T>
+    where
+        T: tonic::client::GrpcService<tonic::body::BoxBody>,
+        T::Error: Into<StdError>,
+        T::ResponseBody: Body<Data = Bytes> + std::marker::Send + 'static,
+        <T::ResponseBody as Body>::Error: Into<StdError> + std::marker::Send,
+    {
+        pub fn new(inner: T) -> Self {
+            let inner = tonic::client::Grpc::new(inner);
+            Self { inner }
+        }
+        pub fn with_origin(inner: T, origin: Uri) -> Self {
+            let inner = tonic::client::Grpc::with_origin(inner, origin);
+            Self { inner }
+        }
+        pub fn with_interceptor<F>(
+            inner: T,
+            interceptor: F,
+        ) -> TaskInstanceManagementServiceClient<InterceptedService<T, F>>
+        where
+            F: tonic::service::Interceptor,
+            T::ResponseBody: Default,
+            T: tonic::codegen::Service<
+                http::Request<tonic::body::BoxBody>,
+                Response = http::Response<
+                    <T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
+                >,
+            >,
+            <T as tonic::codegen::Service<
+                http::Request<tonic::body::BoxBody>,
+            >>::Error: Into<StdError> + std::marker::Send + std::marker::Sync,
+        {
+            TaskInstanceManagementServiceClient::new(
+                InterceptedService::new(inner, interceptor),
+            )
+        }
+        /// Compress requests with the given encoding.
+        ///
+        /// This requires the server to support it otherwise it might respond with an
+        /// error.
+        #[must_use]
+        pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
+            self.inner = self.inner.send_compressed(encoding);
+            self
+        }
+        /// Enable decompressing responses.
+        #[must_use]
+        pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
+            self.inner = self.inner.accept_compressed(encoding);
+            self
+        }
+        /// Limits the maximum size of a decoded message.
+        ///
+        /// Default: `4MB`
+        #[must_use]
+        pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
+            self.inner = self.inner.max_decoding_message_size(limit);
+            self
+        }
+        /// Limits the maximum size of an encoded message.
+        ///
+        /// Default: `usize::MAX`
+        #[must_use]
+        pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
+            self.inner = self.inner.max_encoding_message_size(limit);
+            self
+        }
+        pub async fn register_task_instance(
+            &mut self,
+            request: impl tonic::IntoRequest<super::RegisterTaskInstanceRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::RegisterTaskInstanceResponse>,
+            tonic::Status,
+        > {
+            self.inner
+                .ready()
+                .await
+                .map_err(|e| {
+                    tonic::Status::unknown(
+                        format!("Service was not ready: {}", e.into()),
+                    )
+                })?;
+            let codec = tonic::codec::ProstCodec::default();
+            let path = http::uri::PathAndQuery::from_static(
+                "/storage.TaskInstanceManagementService/RegisterTaskInstance",
+            );
+            let mut req = request.into_request();
+            req.extensions_mut()
+                .insert(
+                    GrpcMethod::new(
+                        "storage.TaskInstanceManagementService",
+                        "RegisterTaskInstance",
+                    ),
+                );
+            self.inner.unary(req, path, codec).await
+        }
+        pub async fn report_task_success(
+            &mut self,
+            request: impl tonic::IntoRequest<super::ReportTaskSuccessRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::StorageOperationResponse>,
+            tonic::Status,
+        > {
+            self.inner
+                .ready()
+                .await
+                .map_err(|e| {
+                    tonic::Status::unknown(
+                        format!("Service was not ready: {}", e.into()),
+                    )
+                })?;
+            let codec = tonic::codec::ProstCodec::default();
+            let path = http::uri::PathAndQuery::from_static(
+                "/storage.TaskInstanceManagementService/ReportTaskSuccess",
+            );
+            let mut req = request.into_request();
+            req.extensions_mut()
+                .insert(
+                    GrpcMethod::new(
+                        "storage.TaskInstanceManagementService",
+                        "ReportTaskSuccess",
+                    ),
+                );
+            self.inner.unary(req, path, codec).await
+        }
+        pub async fn report_task_failure(
+            &mut self,
+            request: impl tonic::IntoRequest<super::ReportTaskFailureRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::StorageOperationResponse>,
+            tonic::Status,
+        > {
+            self.inner
+                .ready()
+                .await
+                .map_err(|e| {
+                    tonic::Status::unknown(
+                        format!("Service was not ready: {}", e.into()),
+                    )
+                })?;
+            let codec = tonic::codec::ProstCodec::default();
+            let path = http::uri::PathAndQuery::from_static(
+                "/storage.TaskInstanceManagementService/ReportTaskFailure",
+            );
+            let mut req = request.into_request();
+            req.extensions_mut()
+                .insert(
+                    GrpcMethod::new(
+                        "storage.TaskInstanceManagementService",
+                        "ReportTaskFailure",
+                    ),
+                );
+            self.inner.unary(req, path, codec).await
+        }
+    }
+}
+/// Generated server implementations.
+pub mod task_instance_management_service_server {
+    #![allow(
+        unused_variables,
+        dead_code,
+        missing_docs,
+        clippy::wildcard_imports,
+        clippy::let_unit_value,
+    )]
+    use tonic::codegen::*;
+    /// Generated trait containing gRPC methods that should be implemented for use with TaskInstanceManagementServiceServer.
+    #[async_trait]
+    pub trait TaskInstanceManagementService: std::marker::Send + std::marker::Sync + 'static {
+        async fn register_task_instance(
+            &self,
+            request: tonic::Request<super::RegisterTaskInstanceRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::RegisterTaskInstanceResponse>,
+            tonic::Status,
+        >;
+        async fn report_task_success(
+            &self,
+            request: tonic::Request<super::ReportTaskSuccessRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::StorageOperationResponse>,
+            tonic::Status,
+        >;
+        async fn report_task_failure(
+            &self,
+            request: tonic::Request<super::ReportTaskFailureRequest>,
+        ) -> std::result::Result<
+            tonic::Response<super::StorageOperationResponse>,
+            tonic::Status,
+        >;
+    }
+    #[derive(Debug)]
+    pub struct TaskInstanceManagementServiceServer<T> {
+        inner: Arc<T>,
+        accept_compression_encodings: EnabledCompressionEncodings,
+        send_compression_encodings: EnabledCompressionEncodings,
+        max_decoding_message_size: Option<usize>,
+        max_encoding_message_size: Option<usize>,
+    }
+    impl<T> TaskInstanceManagementServiceServer<T> {
+        pub fn new(inner: T) -> Self {
+            Self::from_arc(Arc::new(inner))
+        }
+        pub fn from_arc(inner: Arc<T>) -> Self {
+            Self {
+                inner,
+                accept_compression_encodings: Default::default(),
+                send_compression_encodings: Default::default(),
+                max_decoding_message_size: None,
+                max_encoding_message_size: None,
+            }
+        }
+        pub fn with_interceptor<F>(
+            inner: T,
+            interceptor: F,
+        ) -> InterceptedService<Self, F>
+        where
+            F: tonic::service::Interceptor,
+        {
+            InterceptedService::new(Self::new(inner), interceptor)
+        }
+        /// Enable decompressing requests with the given encoding.
+        #[must_use]
+        pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
+            self.accept_compression_encodings.enable(encoding);
+            self
+        }
+        /// Compress responses with the given encoding, if the client supports it.
+        #[must_use]
+        pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
+            self.send_compression_encodings.enable(encoding);
+            self
+        }
+        /// Limits the maximum size of a decoded message.
+        ///
+        /// Default: `4MB`
+        #[must_use]
+        pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
+            self.max_decoding_message_size = Some(limit);
+            self
+        }
+        /// Limits the maximum size of an encoded message.
+        ///
+        /// Default: `usize::MAX`
+        #[must_use]
+        pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
+            self.max_encoding_message_size = Some(limit);
+            self
+        }
+    }
+    impl<T, B> tonic::codegen::Service<http::Request<B>>
+    for TaskInstanceManagementServiceServer<T>
+    where
+        T: TaskInstanceManagementService,
+        B: Body + std::marker::Send + 'static,
+        B::Error: Into<StdError> + std::marker::Send + 'static,
+    {
+        type Response = http::Response<tonic::body::BoxBody>;
+        type Error = std::convert::Infallible;
+        type Future = BoxFuture<Self::Response, Self::Error>;
+        fn poll_ready(
+            &mut self,
+            _cx: &mut Context<'_>,
+        ) -> Poll<std::result::Result<(), Self::Error>> {
+            Poll::Ready(Ok(()))
+        }
+        fn call(&mut self, req: http::Request<B>) -> Self::Future {
+            match req.uri().path() {
+                "/storage.TaskInstanceManagementService/RegisterTaskInstance" => {
+                    #[allow(non_camel_case_types)]
+                    struct RegisterTaskInstanceSvc<T: TaskInstanceManagementService>(
+                        pub Arc<T>,
+                    );
+                    impl<
+                        T: TaskInstanceManagementService,
+                    > tonic::server::UnaryService<super::RegisterTaskInstanceRequest>
+                    for RegisterTaskInstanceSvc<T> {
+                        type Response = super::RegisterTaskInstanceResponse;
+                        type Future = BoxFuture<
+                            tonic::Response<Self::Response>,
+                            tonic::Status,
+                        >;
+                        fn call(
+                            &mut self,
+                            request: tonic::Request<super::RegisterTaskInstanceRequest>,
+                        ) -> Self::Future {
+                            let inner = Arc::clone(&self.0);
+                            let fut = async move {
+                                <T as TaskInstanceManagementService>::register_task_instance(
+                                        &inner,
+                                        request,
+                                    )
+                                    .await
+                            };
+                            Box::pin(fut)
+                        }
+                    }
+                    let accept_compression_encodings = self.accept_compression_encodings;
+                    let send_compression_encodings = self.send_compression_encodings;
+                    let max_decoding_message_size = self.max_decoding_message_size;
+                    let max_encoding_message_size = self.max_encoding_message_size;
+                    let inner = self.inner.clone();
+                    let fut = async move {
+                        let method = RegisterTaskInstanceSvc(inner);
+                        let codec = tonic::codec::ProstCodec::default();
+                        let mut grpc = tonic::server::Grpc::new(codec)
+                            .apply_compression_config(
+                                accept_compression_encodings,
+                                send_compression_encodings,
+                            )
+                            .apply_max_message_size_config(
+                                max_decoding_message_size,
+                                max_encoding_message_size,
+                            );
+                        let res = grpc.unary(method, req).await;
+                        Ok(res)
+                    };
+                    Box::pin(fut)
+                }
+                "/storage.TaskInstanceManagementService/ReportTaskSuccess" => {
+                    #[allow(non_camel_case_types)]
+                    struct ReportTaskSuccessSvc<T: TaskInstanceManagementService>(
+                        pub Arc<T>,
+                    );
+                    impl<
+                        T: TaskInstanceManagementService,
+                    > tonic::server::UnaryService<super::ReportTaskSuccessRequest>
+                    for ReportTaskSuccessSvc<T> {
+                        type Response = super::StorageOperationResponse;
+                        type Future = BoxFuture<
+                            tonic::Response<Self::Response>,
+                            tonic::Status,
+                        >;
+                        fn call(
+                            &mut self,
+                            request: tonic::Request<super::ReportTaskSuccessRequest>,
+                        ) -> Self::Future {
+                            let inner = Arc::clone(&self.0);
+                            let fut = async move {
+                                <T as TaskInstanceManagementService>::report_task_success(
+                                        &inner,
+                                        request,
+                                    )
+                                    .await
+                            };
+                            Box::pin(fut)
+                        }
+                    }
+                    let accept_compression_encodings = self.accept_compression_encodings;
+                    let send_compression_encodings = self.send_compression_encodings;
+                    let max_decoding_message_size = self.max_decoding_message_size;
+                    let max_encoding_message_size = self.max_encoding_message_size;
+                    let inner = self.inner.clone();
+                    let fut = async move {
+                        let method = ReportTaskSuccessSvc(inner);
+                        let codec = tonic::codec::ProstCodec::default();
+                        let mut grpc = tonic::server::Grpc::new(codec)
+                            .apply_compression_config(
+                                accept_compression_encodings,
+                                send_compression_encodings,
+                            )
+                            .apply_max_message_size_config(
+                                max_decoding_message_size,
+                                max_encoding_message_size,
+                            );
+                        let res = grpc.unary(method, req).await;
+                        Ok(res)
+                    };
+                    Box::pin(fut)
+                }
+                "/storage.TaskInstanceManagementService/ReportTaskFailure" => {
+                    #[allow(non_camel_case_types)]
+                    struct ReportTaskFailureSvc<T: TaskInstanceManagementService>(
+                        pub Arc<T>,
+                    );
+                    impl<
+                        T: TaskInstanceManagementService,
+                    > tonic::server::UnaryService<super::ReportTaskFailureRequest>
+                    for ReportTaskFailureSvc<T> {
+                        type Response = super::StorageOperationResponse;
+                        type Future = BoxFuture<
+                            tonic::Response<Self::Response>,
+                            tonic::Status,
+                        >;
+                        fn call(
+                            &mut self,
+                            request: tonic::Request<super::ReportTaskFailureRequest>,
+                        ) -> Self::Future {
+                            let inner = Arc::clone(&self.0);
+                            let fut = async move {
+                                <T as TaskInstanceManagementService>::report_task_failure(
+                                        &inner,
+                                        request,
+                                    )
+                                    .await
+                            };
+                            Box::pin(fut)
+                        }
+                    }
+                    let accept_compression_encodings = self.accept_compression_encodings;
+                    let send_compression_encodings = self.send_compression_encodings;
+                    let max_decoding_message_size = self.max_decoding_message_size;
+                    let max_encoding_message_size = self.max_encoding_message_size;
+                    let inner = self.inner.clone();
+                    let fut = async move {
+                        let method = ReportTaskFailureSvc(inner);
+                        let codec = tonic::codec::ProstCodec::default();
+                        let mut grpc = tonic::server::Grpc::new(codec)
+                            .apply_compression_config(
+                                accept_compression_encodings,
+                                send_compression_encodings,
+                            )
+                            .apply_max_message_size_config(
+                                max_decoding_message_size,
+                                max_encoding_message_size,
+                            );
+                        let res = grpc.unary(method, req).await;
+                        Ok(res)
+                    };
+                    Box::pin(fut)
+                }
+                _ => {
+                    Box::pin(async move {
+                        let mut response = http::Response::new(empty_body());
+                        let headers = response.headers_mut();
+                        headers
+                            .insert(
+                                tonic::Status::GRPC_STATUS,
+                                (tonic::Code::Unimplemented as i32).into(),
+                            );
+                        headers
+                            .insert(
+                                http::header::CONTENT_TYPE,
+                                tonic::metadata::GRPC_CONTENT_TYPE,
+                            );
+                        Ok(response)
+                    })
+                }
+            }
+        }
+    }
+    impl<T> Clone for TaskInstanceManagementServiceServer<T> {
+        fn clone(&self) -> Self {
+            let inner = self.inner.clone();
+            Self {
+                inner,
+                accept_compression_encodings: self.accept_compression_encodings,
+                send_compression_encodings: self.send_compression_encodings,
+                max_decoding_message_size: self.max_decoding_message_size,
+                max_encoding_message_size: self.max_encoding_message_size,
+            }
+        }
+    }
+    /// Generated gRPC service name
+    pub const SERVICE_NAME: &str = "storage.TaskInstanceManagementService";
+    impl<T> tonic::server::NamedService for TaskInstanceManagementServiceServer<T> {
+        const NAME: &'static str = SERVICE_NAME;
+    }
+}
diff --git a/components/spider-proto-rust/src/id.rs b/components/spider-proto-rust/src/id.rs
new file mode 100644
index 00000000..ef21bcd8
--- /dev/null
+++ b/components/spider-proto-rust/src/id.rs
@@ -0,0 +1,44 @@
+//! Helpers for converting Spider IDs to protobuf fields.
+
+use spider_core::types::id::TaskId;
+
+use crate::storage::{self, task_id};
+
+impl From<TaskId> for storage::TaskId {
+    fn from(id: TaskId) -> Self {
+        let kind = Some(match id {
+            TaskId::Index(index) => task_id::Kind::Index(
+                u64::try_from(index).expect("task index does not fit in u64"),
+            ),
+            TaskId::Commit => task_id::Kind::Commit(storage::Void {}),
+            TaskId::Cleanup => task_id::Kind::Cleanup(storage::Void {}),
+        });
+        Self { kind } // never `None`: every variant maps to a `kind` case
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn task_id_to_protocol_converts_index_task() {
+        let task_id = storage::TaskId::from(TaskId::Index(7));
+
+        assert!(matches!(task_id.kind, Some(task_id::Kind::Index(7))));
+    }
+
+    #[test]
+    fn task_id_to_protocol_converts_commit_task() {
+        let task_id = storage::TaskId::from(TaskId::Commit);
+
+        assert!(matches!(task_id.kind, Some(task_id::Kind::Commit(_))));
+    }
+
+    #[test]
+    fn task_id_to_protocol_converts_cleanup_task() {
+        let task_id = storage::TaskId::from(TaskId::Cleanup);
+
+        assert!(matches!(task_id.kind, Some(task_id::Kind::Cleanup(_))));
+    }
+}
diff --git a/components/spider-proto-rust/src/lib.rs b/components/spider-proto-rust/src/lib.rs
new file mode 100644
index 00000000..d78e8f0d
--- /dev/null
+++ b/components/spider-proto-rust/src/lib.rs
@@ -0,0 +1,8 @@
+//! Rust gRPC protocol definitions generated from Spider protobuf files.
+
+pub mod id;
+
+#[allow(clippy::all, clippy::nursery, clippy::pedantic)] // Generated code is exempt from lints.
+pub mod storage {
+    include!("generated/storage.rs");
+}
diff --git a/components/spider-proto/storage/storage.proto b/components/spider-proto/storage/storage.proto
new file mode 100644
index 00000000..3d6f5483
--- /dev/null
+++ b/components/spider-proto/storage/storage.proto
@@ -0,0 +1,71 @@
+syntax = "proto3";
+
+package storage;
+
+service TaskInstanceManagementService {
+  rpc RegisterTaskInstance(RegisterTaskInstanceRequest) returns (RegisterTaskInstanceResponse);
+  rpc ReportTaskSuccess(ReportTaskSuccessRequest) returns (StorageOperationResponse);
+  rpc ReportTaskFailure(ReportTaskFailureRequest) returns (StorageOperationResponse);
+}
+
+message RegisterTaskInstanceRequest {
+  uint64 job_id = 1;
+  TaskId task_id = 2;
+  uint64 execution_manager_id = 3;
+  uint64 session_id = 4;
+}
+
+message RegisterTaskInstanceResponse {
+  oneof result {
+    bytes execution_context = 1;
+    StorageError error = 2;
+  }
+}
+
+message ReportTaskSuccessRequest {
+  uint64 job_id = 1;
+  TaskId task_id = 2;
+  uint64 execution_manager_id = 3;
+  uint64 session_id = 4;
+  bytes serialized_outputs = 5;
+}
+
+message ReportTaskFailureRequest {
+  uint64 job_id = 1;
+  TaskId task_id = 2;
+  uint64 execution_manager_id = 3;
+  uint64 session_id = 4;
+  string error_message = 5;
+}
+
+message TaskId {
+  oneof kind {
+    uint64 index = 1;
+    Void commit = 2;
+    Void cleanup = 3;
+  }
+}
+
+message StorageOperationResponse {
+  oneof result {
+    Void ok = 1;
+    StorageError error = 2;
+  }
+}
+
+message Void {}
+
+message StorageError {
+  enum ErrCode {
+    ERR_CODE_UNSPECIFIED = 0;
+    STALE_SESSION = 1;
+    CACHE_STALE = 2;
+    TRANSPORT = 3;
+    SERVER = 4;
+    INVALID_INPUT = 5;
+  }
+
+  ErrCode err_code = 1;
+  string message = 2;
+  uint64 storage_session = 3;
+}
diff --git a/taskfiles/build.yaml b/taskfiles/build.yaml
index cfdf3c48..cf605838 100644
--- a/taskfiles/build.yaml
+++ b/taskfiles/build.yaml
@@ -1,5 +1,8 @@
 version: "3"
 
+includes:
+  toolchains: "toolchains.yaml"
+
 tasks:
   cpp-target:
     internal: true
@@ -24,6 +27,23 @@ tasks:
     cmds:
       - "uv build --directory {{.G_SRC_PYTHON_DIR}} -o {{.G_BUILD_PYTHON_DIR}}"
 
+  spider-proto-rust-codegen:
+    env:
+      SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE: "ON"
+    dir: "{{.ROOT_DIR}}"
+    deps: ["toolchains:rust"]
+    cmd: |-
+      . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}"
+      cargo build --release --package spider-proto-rust
+
+  rust:
+    dir: "{{.ROOT_DIR}}"
+    deps: ["toolchains:rust"]
+    cmd: |-
+      . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}"
+      # Build every workspace member, matching the lint and test tasks.
+      cargo build --release --workspace --all-features
+
   tdl-generate-parsers:
     vars:
       CHECKSUM_FILE: "{{.G_BUILD_DIR}}/{{.TASK}}.md5"
diff --git a/taskfiles/lint.yaml b/taskfiles/lint.yaml
index 99f27cea..19547345 100644
--- a/taskfiles/lint.yaml
+++ b/taskfiles/lint.yaml
@@ -314,4 +314,4 @@ tasks:
     deps: ["toolchains:rust"]
     cmd: |-
       . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}"
-      cargo +nightly clippy --all-targets --all-features {{.CARGO_CLIPPY_FLAGS}}
+      cargo +nightly clippy --workspace --all-targets --all-features {{.CARGO_CLIPPY_FLAGS}}
diff --git a/taskfiles/test.yaml b/taskfiles/test.yaml
index 7d79bfdb..a7b19749 100644
--- a/taskfiles/test.yaml
+++ b/taskfiles/test.yaml
@@ -33,6 +33,8 @@ tasks:
           STORAGE_TASK: "spider-py-unit-tests-executor"
 
   rust-unit-tests:
+    deps:
+      - ":build:rust"
     cmds:
       - task: "huntsman-mariadb-storage-task-executor"
         vars:
@@ -234,12 +236,7 @@ tasks:
       - defer: "rm -rf ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}"
       - |-
         . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}"
-        # `--bin` is a workspace-wide target filter; combining it with cdylib packages in the
-        # same `cargo build` would silently exclude the `.so` artifacts. Use one invocation per
-        # artifact to keep the target selection unambiguous.
-        cargo build --release --package huntsman-complex
-        cargo build --release --package integration-test-tasks
-        cargo build --release --package spider-task-executor --bin spider-task-executor
+        cargo build --release --workspace --all-features
         mkdir -p "{{.G_TDL_PACKAGES_DIR}}/complex" \
                  "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks"
         cp "{{.G_RUST_RELEASE_DIR}}/libhuntsman_complex.so" \
diff --git a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
index 73b64eab..6143e75e 100755
--- a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
+++ b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh
@@ -23,4 +23,5 @@ fi
 DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \
 apt-get install --no-install-recommends -y \
     gcc \
-    libc6-dev
+    libc6-dev \
+    protobuf-compiler

From 80152948c142ec2ab3f5912dd0eb1d896cbff1e8 Mon Sep 17 00:00:00 2001
From: Sitao Wang <sitaowang1998@outlook.com>
Date: Mon, 8 Jun 2026 10:54:14 -0400
Subject: [PATCH 10/14] Add get recoverable jobs in db

---
 components/spider-storage/src/db.rs          |  1 +
 components/spider-storage/src/db/error.rs    |  9 +++
 components/spider-storage/src/db/mariadb.rs  | 61 +++++++++++++++++++-
 components/spider-storage/src/db/protocol.rs | 33 +++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/components/spider-storage/src/db.rs b/components/spider-storage/src/db.rs
index c5152e2f..8ea9a5f0 100644
--- a/components/spider-storage/src/db.rs
+++ b/components/spider-storage/src/db.rs
@@ -9,6 +9,7 @@ pub use protocol::{
     ExecutionManagerLivenessManagement,
     ExternalJobOrchestration,
     InternalJobOrchestration,
+    RecoverableJob,
     ResourceGroupManagement,
     SessionManagement,
 };
diff --git a/components/spider-storage/src/db/error.rs b/components/spider-storage/src/db/error.rs
index 62b6434b..3bce5386 100644
--- a/components/spider-storage/src/db/error.rs
+++ b/components/spider-storage/src/db/error.rs
@@ -40,6 +40,9 @@ pub enum DbError {
     #[error("Task graph serialization failure: {0}")]
     TaskGraphSerializationFailure(#[source] Box<dyn std::error::Error + Send + Sync>),
 
+    #[error("Task graph deserialization failure: {0}")]
+    TaskGraphDeserializationFailure(#[source] Box<dyn std::error::Error + Send + Sync>),
+
     #[error("Value serialization failure: {0}")]
     ValueSerializationFailure(#[source] Box<dyn std::error::Error + Send + Sync>),
 
@@ -57,6 +60,12 @@ impl DbError {
         Self::TaskGraphSerializationFailure(Box::new(e))
     }
 
+    pub fn task_graph_de<DeserializationError: std::error::Error + Send + Sync + 'static>(
+        e: DeserializationError,
+    ) -> Self {
+        Self::TaskGraphDeserializationFailure(Box::new(e))
+    }
+
     pub fn value_ser<SerializationError: serde::ser::Error + Send + Sync + 'static>(
         e: SerializationError,
     ) -> Self {
diff --git a/components/spider-storage/src/db/mariadb.rs b/components/spider-storage/src/db/mariadb.rs
index 6bd7017c..4cb11320 100644
--- a/components/spider-storage/src/db/mariadb.rs
+++ b/components/spider-storage/src/db/mariadb.rs
@@ -5,9 +5,10 @@ use const_format::formatcp;
 use secrecy::ExposeSecret;
 use spider_core::{
     job::JobState,
+    task::TaskGraph,
     types::{
         id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId},
-        io::TaskOutput,
+        io::{TaskInput, TaskOutput},
     },
 };
 use spider_derive::MySqlEnum;
@@ -22,6 +23,7 @@ use crate::{
         ExecutionManagerLivenessManagement,
         ExternalJobOrchestration,
         InternalJobOrchestration,
+        RecoverableJob,
         ResourceGroupManagement,
         SessionManagement,
         error::ExpectedStates,
@@ -380,6 +382,52 @@ impl InternalJobOrchestration for MariaDbStorageConnector {
         tx.commit().await?;
         Ok(deleted_job_ids)
     }
+
+    async fn get_recoverable_jobs(&self) -> Result<Vec<RecoverableJob>, DbError> {
+        // Column layout of one row produced by `SELECT_QUERY`.
+        type JobRow = (JobId, ResourceGroupId, JobState, String, Vec<u8>, Option<Vec<u8>>);
+
+        const SELECT_QUERY: &str = formatcp!(
+            "SELECT `id`, `resource_group_id`, `state`, `serialized_task_graph`, \
+             `serialized_job_inputs`, `serialized_job_outputs` FROM `{table}` WHERE `state` IN \
+             ('{running_state}','{commit_ready_state}','{cleanup_ready_state}');",
+            table = JOBS_TABLE_NAME,
+            running_state = JobState::Running.as_str(),
+            commit_ready_state = JobState::CommitReady.as_str(),
+            cleanup_ready_state = JobState::CleanupReady.as_str(),
+        );
+
+        let job_rows = sqlx::query_as::<_, JobRow>(SELECT_QUERY)
+            .fetch_all(&self.pool)
+            .await?;
+
+        // Decode row by row; the first row that fails to decode ends the call with its error.
+        let mut recoverable_jobs = Vec::with_capacity(job_rows.len());
+        for row in job_rows {
+            let (id, resource_group_id, state, graph_json, inputs_blob, outputs_blob) = row;
+            let task_graph =
+                TaskGraph::from_json(&graph_json).map_err(DbError::task_graph_de)?;
+            let job_inputs: Vec<TaskInput> =
+                rmp_serde::from_slice(&inputs_blob).map_err(DbError::value_de)?;
+            // A graph/input pair that no longer validates is reported as corrupted DB state.
+            let job_submission = ValidatedJobSubmission::create(task_graph, job_inputs)
+                .map_err(|e| DbError::CorruptedDbState(e.to_string()))?;
+            // The outputs column may be NULL, in which case there is nothing to decode.
+            let job_outputs = match outputs_blob {
+                Some(blob) => Some(rmp_serde::from_slice(&blob).map_err(DbError::value_de)?),
+                None => None,
+            };
+
+            recoverable_jobs.push(RecoverableJob {
+                id,
+                resource_group_id,
+                state,
+                job_submission,
+                job_outputs,
+            });
+        }
+        Ok(recoverable_jobs)
+    }
 }
 
 #[async_trait]
diff --git a/components/spider-storage/src/db/protocol.rs b/components/spider-storage/src/db/protocol.rs
index 0b9e297f..2f9be3fd 100644
--- a/components/spider-storage/src/db/protocol.rs
+++ b/components/spider-storage/src/db/protocol.rs
@@ -11,6 +11,23 @@ use spider_core::{
 
 use crate::{cache::job_submission::ValidatedJobSubmission, db::error::DbError};
 
+/// A job persisted in the database that should be rebuilt in the storage cache on startup.
+///
+/// Only jobs that have already started execution are recoverable. [`JobState::Ready`] jobs remain
+/// database-only until a client starts them.
+pub struct RecoverableJob {
+    /// The persisted job ID.
+    pub id: JobId,
+    /// The owning resource group.
+    pub resource_group_id: ResourceGroupId,
+    /// The source-of-truth database state.
+    pub state: JobState,
+    /// The original job submission.
+    pub job_submission: ValidatedJobSubmission,
+    /// The committed job outputs, if the job has reached the commit phase.
+    pub job_outputs: Option<Vec<TaskOutput>>,
+}
+
 /// The database storage interface. A database storage must implement the following traits:
 ///
 /// * [`ExternalJobOrchestration`]
@@ -244,6 +261,23 @@ pub trait InternalJobOrchestration: Clone + Send + Sync {
         &self,
         expire_after_sec: u64,
     ) -> Result<Vec<JobId>, DbError>;
+
+    /// Gets all jobs that should be recovered into the cache.
+    ///
+    /// # Returns
+    ///
+    /// All persisted jobs in [`JobState::Running`], [`JobState::CommitReady`], or
+    /// [`JobState::CleanupReady`] on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`DbError::TaskGraphDeserializationFailure`] if a persisted task graph is invalid.
+    /// * [`DbError::ValueDeserializationFailure`] if persisted inputs or outputs are invalid.
+    /// * [`DbError::CorruptedDbState`] if a persisted task graph and its inputs fail validation.
+    /// * Forwards [`sqlx::error::Error`] on DB operation failure.
+    async fn get_recoverable_jobs(&self) -> Result<Vec<RecoverableJob>, DbError>;
 }
 
 /// Defines the storage interface for resource group management in the database.

From a667a7984f24f28c2f4d6392a3e2e54ab6c942b6 Mon Sep 17 00:00:00 2001
From: Sitao Wang <sitaowang1998@outlook.com>
Date: Mon, 8 Jun 2026 12:20:27 -0400
Subject: [PATCH 11/14] Add runtime recovery

---
 components/spider-storage/src/cache/job.rs    | 95 +++++++++++++++++++
 components/spider-storage/src/cache/sync.rs   |  7 ++
 components/spider-storage/src/cache/task.rs   | 24 +++++
 .../spider-storage/src/state/runtime.rs       | 73 +++++++++++++-
 .../spider-storage/src/state/service.rs       |  8 ++
 .../spider-storage/src/state/test_utils.rs    |  5 +
 6 files changed, 208 insertions(+), 4 deletions(-)

diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs
index 21997b06..146b015f 100644
--- a/components/spider-storage/src/cache/job.rs
+++ b/components/spider-storage/src/cache/job.rs
@@ -47,6 +47,20 @@ pub struct SharedJobControlBlock<
         Arc<JobControlBlock<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>>,
 }
 
+/// Persistent job state used to recover a job control block; mirrors `db::RecoverableJob`.
+pub struct JobRecoveryContext {
+    /// The persisted job ID.
+    pub id: JobId,
+    /// The owning resource group.
+    pub owner_id: ResourceGroupId,
+    /// The source-of-truth database state.
+    pub state: JobState,
+    /// The original job submission.
+    pub job_submission: ValidatedJobSubmission,
+    /// The committed job outputs, if the job has reached the commit phase.
+    pub job_outputs: Option<Vec<TaskOutput>>,
+}
+
 impl<
     ReadyQueueSenderType: ReadyQueueSender,
     DbConnectorType: InternalJobOrchestration,
@@ -93,6 +107,87 @@ impl<
         })
     }
 
+    /// Recovers a job control block from persistent database state.
+    ///
+    /// This constructor does not mutate the database. It rebuilds enough cache state to resume
+    /// scheduling:
+    ///
+    /// * [`JobState::Running`] jobs enqueue their initially-ready regular tasks.
+    /// * [`JobState::CommitReady`] jobs enqueue the commit task.
+    /// * [`JobState::CleanupReady`] jobs enqueue the cleanup task.
+    ///
+    /// # Returns
+    ///
+    /// The recovered [`SharedJobControlBlock`] on success.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`InternalError::UnexpectedJobState`] if `state` is not one of the three states above.
+    /// * Forwards [`TaskGraph::create`]'s return values on failure.
+    /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure.
+    /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure.
+    pub async fn recover(
+        recovery_context: JobRecoveryContext,
+        ready_queue_sender: ReadyQueueSenderType,
+        db_connector: DbConnectorType,
+        task_instance_pool_connector: TaskInstancePoolConnectorType,
+    ) -> Result<Self, CacheError> {
+        let JobRecoveryContext {
+            id,
+            owner_id,
+            state,
+            job_submission,
+            job_outputs,
+        } = recovery_context;
+        if !matches!(
+            state,
+            JobState::Running | JobState::CommitReady | JobState::CleanupReady
+        ) {
+            return Err(UnexpectedJobState {
+                current: state,
+                expected: JobState::Running, // also accepts `CommitReady`/`CleanupReady`
+            }
+            .into());
+        }
+
+        let num_tasks = job_submission.task_graph().get_num_tasks();
+        let mut task_graph = TaskGraph::create(job_submission).await?;
+        if let Some(outputs) = job_outputs {
+            task_graph.restore_outputs(outputs).await?;
+        }
+        let num_incomplete_tasks = if matches!(state, JobState::CommitReady) {
+            0 // presumably every regular task has already finished; TODO confirm
+        } else {
+            num_tasks // the recovery context carries no per-task progress
+        };
+
+        if matches!(state, JobState::CleanupReady) {
+            task_graph.cancel_non_terminal().await;
+        }
+
+        let job_execution_state = JobExecutionState {
+            state,
+            task_graph,
+            num_incomplete_tasks: AtomicUsize::new(num_incomplete_tasks),
+            ready_queue_sender,
+            db_connector,
+            task_instance_pool_connector,
+        };
+        let recovered = Self {
+            inner: Arc::new(JobControlBlock {
+                id,
+                owner_id,
+                job_execution_state: JobExecutionStateHandle {
+                    inner: tokio::sync::RwLock::new(job_execution_state),
+                },
+            }),
+        };
+        recovered.resend_ready_tasks().await?;
+        Ok(recovered)
+    }
+
     /// Returns the job ID.
     #[must_use]
     pub fn id(&self) -> JobId {
diff --git a/components/spider-storage/src/cache/sync.rs b/components/spider-storage/src/cache/sync.rs
index 0fc03448..4d1847a9 100644
--- a/components/spider-storage/src/cache/sync.rs
+++ b/components/spider-storage/src/cache/sync.rs
@@ -17,6 +17,13 @@ impl<Type: Send + Sync> Reader<Type> {
         Self { inner }
     }
 
+    /// # Returns
+    ///
+    /// A writer for the same shared data.
+    pub(crate) fn writer(&self) -> Writer<Type> {
+        Writer::new(self.inner.clone())
+    }
+
     /// # Returns
     ///
     /// A guard that allows read access to the shared data. The guard will be released when it goes
diff --git a/components/spider-storage/src/cache/task.rs b/components/spider-storage/src/cache/task.rs
index 5ce7ff30..75e1bba3 100644
--- a/components/spider-storage/src/cache/task.rs
+++ b/components/spider-storage/src/cache/task.rs
@@ -172,6 +172,30 @@ impl TaskGraph {
         &self.outputs
     }
 
+    /// Overwrites every graph output with its persisted counterpart.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    ///
+    /// * [`InternalError::TaskOutputsLengthMismatch`] if `persisted_outputs` and the graph outputs
+    ///   differ in length.
+    pub async fn restore_outputs(
+        &self,
+        persisted_outputs: Vec<TaskOutput>,
+    ) -> Result<(), InternalError> {
+        let num_expected = self.outputs.len();
+        let num_persisted = persisted_outputs.len();
+        if num_expected != num_persisted {
+            return Err(InternalError::TaskOutputsLengthMismatch(num_expected, num_persisted));
+        }
+        // The lengths match, so `zip` pairs every output slot with exactly one persisted value.
+        for (slot, persisted) in self.outputs.iter().zip(persisted_outputs) {
+            *slot.writer().write().await = Some(persisted);
+        }
+        Ok(())
+    }
+
     #[must_use]
     pub const fn has_commit_task(&self) -> bool {
         self.commit_task.is_some()
diff --git a/components/spider-storage/src/state/runtime.rs b/components/spider-storage/src/state/runtime.rs
index 5bda0d7a..5d4e77a5 100644
--- a/components/spider-storage/src/state/runtime.rs
+++ b/components/spider-storage/src/state/runtime.rs
@@ -4,9 +4,12 @@ use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 
 use crate::{
-    cache::error::{CacheError, InternalError},
+    cache::{
+        error::{CacheError, InternalError},
+        job::{JobRecoveryContext, SharedJobControlBlock},
+    },
     config::DatabaseConfig,
-    db::{DbStorage, MariaDbStorageConnector, SessionManagement},
+    db::{DbStorage, MariaDbStorageConnector, RecoverableJob, SessionManagement},
     ready_queue::{ReadyQueueConfig, ReadyQueueSender, ReadyQueueSenderHandle, create_ready_queue},
     state::{JobCache, ServiceState, StorageServerError},
     task_instance_pool::{
@@ -121,11 +124,16 @@ pub async fn create_runtime(
     )
     .map_err(CacheError::from)?;
 
-    // TODO: Recover jobs from the database.
+    let job_cache = recover_job_cache(
+        &db,
+        ready_queue_sender.clone(),
+        task_instance_pool_connector.clone(),
+    )
+    .await?;
     let service_state = ServiceState::new(
         db,
         session_id,
-        JobCache::new(),
+        job_cache,
         ready_queue_sender,
         ready_queue_receiver,
         task_instance_pool_connector,
@@ -144,6 +152,63 @@ pub async fn create_runtime(
 
 const STOP_BACKGROUND_TASKS_TIMEOUT_SEC: u64 = 30;
 
+/// Recovers jobs from persistent storage into the cache.
+///
+/// # Returns
+///
+/// A [`JobCache`] containing all recoverable jobs on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`DbStorage::get_recoverable_jobs`]'s return values on failure.
+/// * Forwards [`SharedJobControlBlock::recover`]'s return values on failure.
+/// * Forwards [`JobCache::insert`]'s return values on failure.
+async fn recover_job_cache<
+    ReadyQueueSenderType: ReadyQueueSender,
+    DbConnectorType: DbStorage,
+    TaskInstancePoolConnectorType: TaskInstancePoolConnector,
+>(
+    db: &DbConnectorType,
+    ready_queue_sender: ReadyQueueSenderType,
+    task_instance_pool_connector: TaskInstancePoolConnectorType,
+) -> Result<
+    JobCache<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>,
+    StorageServerError,
+> {
+    let job_cache = JobCache::new();
+    for recoverable_job in db.get_recoverable_jobs().await? {
+        let RecoverableJob {
+            id,
+            resource_group_id,
+            state,
+            job_submission,
+            job_outputs,
+        } = recoverable_job;
+        let jcb = SharedJobControlBlock::recover(
+            JobRecoveryContext {
+                id,
+                owner_id: resource_group_id,
+                state,
+                job_submission,
+                job_outputs,
+            },
+            ready_queue_sender.clone(),
+            db.clone(),
+            task_instance_pool_connector.clone(),
+        )
+        .await?;
+        job_cache.insert(jcb).await?;
+        tracing::info!(
+            job_id = ? id,
+            job_state = ? state,
+            "Job recovered into cache.",
+        );
+    }
+    Ok(job_cache)
+}
+
 #[cfg(test)]
 mod tests {
     use std::time::Duration;
diff --git a/components/spider-storage/src/state/service.rs b/components/spider-storage/src/state/service.rs
index ac257e77..a198fb3c 100644
--- a/components/spider-storage/src/state/service.rs
+++ b/components/spider-storage/src/state/service.rs
@@ -83,6 +83,14 @@ impl<
         }
     }
 
+    /// # Returns
+    ///
+    /// The storage session ID owned by this service state.
+    #[must_use]
+    pub fn session_id(&self) -> SessionId {
+        self.inner.session_id
+    }
+
     /// Registers a job in the database and inserts its control block into the cache.
     ///
     /// # Returns
diff --git a/components/spider-storage/src/state/test_utils.rs b/components/spider-storage/src/state/test_utils.rs
index a2536d6c..52dcf383 100644
--- a/components/spider-storage/src/state/test_utils.rs
+++ b/components/spider-storage/src/state/test_utils.rs
@@ -27,6 +27,7 @@ use crate::{
         ExecutionManagerLivenessManagement,
         ExternalJobOrchestration,
         InternalJobOrchestration,
+        RecoverableJob,
         ResourceGroupManagement,
         SessionManagement,
     },
@@ -166,6 +167,10 @@ impl InternalJobOrchestration for MockDbConnector {
     ) -> Result<Vec<JobId>, DbError> {
         Ok(Vec::new())
     }
+
+    async fn get_recoverable_jobs(&self) -> Result<Vec<RecoverableJob>, DbError> {
+        Ok(Vec::new())
+    }
 }
 
 #[async_trait::async_trait]

From da87450c42e31b66b089716b7b47ee4517899fee Mon Sep 17 00:00:00 2001
From: Sitao Wang <sitaowang1998@outlook.com>
Date: Mon, 8 Jun 2026 13:01:37 -0400
Subject: [PATCH 12/14] Add unit tests

---
 .../spider-storage/tests/mariadb_infra.rs     |  24 +-
 .../spider-storage/tests/mariadb_test.rs      |   4 +-
 .../spider-storage/tests/recovery_test.rs     | 481 ++++++++++++++++++
 .../spider-storage/tests/scheduling_infra.rs  |  12 +-
 .../tests/test_spider_storage.rs              |   1 +
 5 files changed, 514 insertions(+), 8 deletions(-)
 create mode 100644 components/spider-storage/tests/recovery_test.rs

diff --git a/components/spider-storage/tests/mariadb_infra.rs b/components/spider-storage/tests/mariadb_infra.rs
index 0772ec04..299ec1fb 100644
--- a/components/spider-storage/tests/mariadb_infra.rs
+++ b/components/spider-storage/tests/mariadb_infra.rs
@@ -16,6 +16,23 @@ use spider_storage::{
 /// Panics if any required environment variable (`MARIADB_PORT`, `MARIADB_DATABASE`,
 /// `MARIADB_USERNAME`, `MARIADB_PASSWORD`) is missing or if the connection fails.
 pub async fn create_mariadb_connector() -> MariaDbStorageConnector {
+    MariaDbStorageConnector::connect(&create_mariadb_config())
+        .await
+        .expect("connect failed")
+}
+
+/// Creates a [`DatabaseConfig`] from environment variables.
+///
+/// # Returns
+///
+/// A [`DatabaseConfig`] configured from environment variables.
+///
+/// # Panics
+///
+/// Panics if any required environment variable (`MARIADB_PORT`, `MARIADB_DATABASE`,
+/// `MARIADB_USERNAME`, `MARIADB_PASSWORD`) is missing or if `MARIADB_PORT` is invalid.
+#[must_use]
+pub fn create_mariadb_config() -> DatabaseConfig {
     let port: u16 = std::env::var("MARIADB_PORT")
         .expect("MARIADB_PORT")
         .parse()
@@ -24,17 +41,14 @@ pub async fn create_mariadb_connector() -> MariaDbStorageConnector {
     let username = std::env::var("MARIADB_USERNAME").expect("MARIADB_USERNAME");
     let password = std::env::var("MARIADB_PASSWORD").expect("MARIADB_PASSWORD");
 
-    let config = DatabaseConfig {
+    DatabaseConfig {
         host: "localhost".to_string(),
         port,
         name: database,
         username,
         password: SecretString::from(password),
         max_connections: 5,
-    };
-    MariaDbStorageConnector::connect(&config)
-        .await
-        .expect("connect failed")
+    }
 }
 
 /// Registers a new resource group with a random external ID and a fixed test password.
diff --git a/components/spider-storage/tests/mariadb_test.rs b/components/spider-storage/tests/mariadb_test.rs
index 88343c82..f58a020f 100644
--- a/components/spider-storage/tests/mariadb_test.rs
+++ b/components/spider-storage/tests/mariadb_test.rs
@@ -269,7 +269,7 @@ async fn test_get_error_wrong_state() {
 async fn test_cancel_job_with_cleanup_transitions_to_cleanup_ready() {
     let storage = create_mariadb_connector().await;
     let rg_id = create_test_resource_group(&storage).await;
-    let (graph, inputs) = single_task_graph();
+    let (graph, inputs) = build_flat_task_graph(1, TEST_INPUT_PAYLOAD_SIZE, false, true);
     let job_submission =
         ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid");
 
@@ -403,7 +403,7 @@ async fn test_commit_outputs_without_commit_task() {
 async fn test_commit_outputs_with_commit_task() {
     let storage = create_mariadb_connector().await;
     let rg_id = create_test_resource_group(&storage).await;
-    let (graph, inputs) = single_task_graph();
+    let (graph, inputs) = build_flat_task_graph(1, TEST_INPUT_PAYLOAD_SIZE, true, false);
     let job_submission =
         ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid");
 
diff --git a/components/spider-storage/tests/recovery_test.rs b/components/spider-storage/tests/recovery_test.rs
new file mode 100644
index 00000000..84860644
--- /dev/null
+++ b/components/spider-storage/tests/recovery_test.rs
@@ -0,0 +1,481 @@
+use std::{net::IpAddr, time::Duration};
+
+use spider_core::{
+    job::JobState,
+    task::TaskIndex,
+    types::{
+        id::{JobId, TaskInstanceId},
+        io::TaskInput,
+    },
+};
+use spider_storage::{
+    db::ExternalJobOrchestration,
+    ready_queue::{ReadyQueueConfig, ReadyQueueEntry},
+    state::{Runtime, ServiceState, StorageServerError, create_runtime},
+    task_instance_pool::TaskInstancePoolConfig,
+};
+use spider_tdl::wire::{TaskInputsSerializer, TaskOutputsSerializer};
+
+use crate::{
+    mariadb_infra::{create_mariadb_config, create_mariadb_connector},
+    task_graph_builder::build_flat_task_graph,
+};
+
+#[tokio::test]
+async fn restarted_storage_cache_does_not_recover_ready_job() -> anyhow::Result<()> {
+    let db_config = create_mariadb_config();
+    let (runtime, _) = create_runtime(
+        &db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let service = runtime.get_service_state();
+    let job_id = create_registered_job(&service, false, false).await?;
+    assert_eq!(service.get_job_state(job_id).await?, JobState::Ready);
+    runtime.stop().await?;
+
+    let (recovered_runtime, _) = create_runtime(
+        &db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let recovered_service = recovered_runtime.get_service_state();
+    let start_result = recovered_service.start_job(job_id).await;
+    assert!(
+        matches!(start_result, Err(StorageServerError::JobNotFound(id)) if id == job_id),
+        "ready job should not be recovered into cache"
+    );
+    assert_eq!(
+        recovered_service.get_job_state(job_id).await?,
+        JobState::Ready
+    );
+    recovered_runtime.stop().await?;
+    Ok(())
+}
+
+#[tokio::test]
+async fn restarted_storage_cache_recovers_running_job_from_start() -> anyhow::Result<()> {
+    let db_config = create_mariadb_config();
+    let (job_id, recovered_service, recovered_runtime) =
+        restart_after_starting_job(&db_config, false, false).await?;
+
+    let ready_entries = recovered_service
+        .poll_ready_tasks(32, Duration::from_secs(1))
+        .await?;
+    let ready_entry = find_entry_for_job(ready_entries, job_id);
+
+    let task_instance_id =
+        run_recovered_regular_task(&recovered_service, job_id, ready_entry.task_kind).await?;
+    let state = recovered_service
+        .succeed_task_instance(
+            recovered_service.session_id(),
+            job_id,
+            task_instance_id,
+            ready_entry.task_kind,
+            serialized_single_output()?,
+        )
+        .await?;
+    assert_eq!(state, JobState::Succeeded);
+
+    assert_eq!(
+        create_mariadb_connector().await.get_state(job_id).await?,
+        JobState::Succeeded
+    );
+    recovered_runtime.stop().await?;
+    Ok(())
+}
+
+#[tokio::test]
+async fn restarted_storage_cache_recovers_commit_ready_job() -> anyhow::Result<()> {
+    let db_config = create_mariadb_config();
+    let (job_id, recovered_service, recovered_runtime) =
+        restart_after_commit_ready(&db_config).await?;
+
+    let ready_entries = recovered_service
+        .poll_commit_ready_tasks(32, Duration::from_secs(1))
+        .await?;
+    let _ready_entry = find_entry_for_job(ready_entries, job_id);
+
+    let execution_manager_id = recovered_service
+        .register_execution_manager(IpAddr::from([127, 0, 0, 1]))
+        .await?;
+    let execution_context = recovered_service
+        .create_task_instance(
+            recovered_service.session_id(),
+            job_id,
+            spider_core::types::id::TaskId::Commit,
+            execution_manager_id,
+        )
+        .await?;
+    let state = recovered_service
+        .succeed_commit_task_instance(
+            recovered_service.session_id(),
+            job_id,
+            execution_context.task_instance_id,
+        )
+        .await?;
+    assert_eq!(state, JobState::Succeeded);
+    let expected_outputs = TaskOutputsSerializer::deserialize(&serialized_single_output()?)?;
+    assert_eq!(
+        recovered_service.get_job_outputs(job_id).await?,
+        expected_outputs
+    );
+
+    assert_eq!(
+        create_mariadb_connector().await.get_state(job_id).await?,
+        JobState::Succeeded
+    );
+    recovered_runtime.stop().await?;
+    Ok(())
+}
+
+#[tokio::test]
+async fn restarted_storage_cache_recovers_cleanup_ready_job() -> anyhow::Result<()> {
+    let db_config = create_mariadb_config();
+    let (job_id, recovered_service, recovered_runtime) =
+        restart_after_cleanup_ready(&db_config).await?;
+
+    let ready_entries = recovered_service
+        .poll_cleanup_ready_tasks(32, Duration::from_secs(1))
+        .await?;
+    let _ready_entry = find_entry_for_job(ready_entries, job_id);
+
+    let execution_manager_id = recovered_service
+        .register_execution_manager(IpAddr::from([127, 0, 0, 1]))
+        .await?;
+    let execution_context = recovered_service
+        .create_task_instance(
+            recovered_service.session_id(),
+            job_id,
+            spider_core::types::id::TaskId::Cleanup,
+            execution_manager_id,
+        )
+        .await?;
+    let state = recovered_service
+        .succeed_cleanup_task_instance(
+            recovered_service.session_id(),
+            job_id,
+            execution_context.task_instance_id,
+        )
+        .await?;
+    assert_eq!(state, JobState::Cancelled);
+
+    assert_eq!(
+        create_mariadb_connector().await.get_state(job_id).await?,
+        JobState::Cancelled
+    );
+    recovered_runtime.stop().await?;
+    Ok(())
+}
+
+/// Starts a job, stops the runtime, and creates a replacement runtime over the same database.
+///
+/// # Returns
+///
+/// The job ID, recovered service state, and recovered runtime on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`create_runtime`]'s return values on failure.
+/// * Forwards [`create_and_start_job`]'s return values on failure.
+/// * Forwards [`Runtime::stop`]'s return values on failure.
+async fn restart_after_starting_job(
+    db_config: &spider_storage::DatabaseConfig,
+    with_commit: bool,
+    with_cleanup: bool,
+) -> anyhow::Result<(
+    JobId,
+    ServiceState<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+    Runtime<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+)> {
+    let (runtime, _) = create_runtime(
+        db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let service = runtime.get_service_state();
+    let job_id = create_and_start_job(&service, with_commit, with_cleanup).await?;
+    runtime.stop().await?;
+
+    let (recovered_runtime, _) = create_runtime(
+        db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let recovered_service = recovered_runtime.get_service_state();
+    Ok((job_id, recovered_service, recovered_runtime))
+}
+
+/// Drives a job to [`JobState::CommitReady`], stops the runtime, and creates a replacement runtime.
+///
+/// # Returns
+///
+/// The job ID, recovered service state, and recovered runtime on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`restart_after_starting_job`]'s return values on failure.
+/// * Forwards [`ServiceState::poll_ready_tasks`]'s return values on failure.
+/// * Forwards [`run_recovered_regular_task`]'s return values on failure.
+/// * Forwards [`serialized_single_output`]'s return values on failure.
+/// * Forwards [`ServiceState::succeed_task_instance`]'s return values on failure.
+/// * Forwards [`Runtime::stop`]'s return values on failure.
+/// * Forwards [`create_runtime`]'s return values on failure.
+async fn restart_after_commit_ready(
+    db_config: &spider_storage::DatabaseConfig,
+) -> anyhow::Result<(
+    JobId,
+    ServiceState<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+    Runtime<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+)> {
+    let (job_id, service, runtime) = restart_after_starting_job(db_config, true, false).await?;
+    let ready_entries = service.poll_ready_tasks(32, Duration::from_secs(1)).await?;
+    let ready_entry = find_entry_for_job(ready_entries, job_id);
+    let task_instance_id =
+        run_recovered_regular_task(&service, job_id, ready_entry.task_kind).await?;
+    let state = service
+        .succeed_task_instance(
+            service.session_id(),
+            job_id,
+            task_instance_id,
+            ready_entry.task_kind,
+            serialized_single_output()?,
+        )
+        .await?;
+    assert_eq!(state, JobState::CommitReady);
+    runtime.stop().await?;
+
+    let (recovered_runtime, _) = create_runtime(
+        db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let recovered_service = recovered_runtime.get_service_state();
+    Ok((job_id, recovered_service, recovered_runtime))
+}
+
+/// Drives a job to [`JobState::CleanupReady`], stops the runtime, and creates a replacement
+/// runtime.
+///
+/// # Returns
+///
+/// The job ID, recovered service state, and recovered runtime on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`create_runtime`]'s return values on failure.
+/// * Forwards [`create_and_start_job`]'s return values on failure.
+/// * Forwards [`ServiceState::cancel_job`]'s return values on failure.
+/// * Forwards [`Runtime::stop`]'s return values on failure.
+async fn restart_after_cleanup_ready(
+    db_config: &spider_storage::DatabaseConfig,
+) -> anyhow::Result<(
+    JobId,
+    ServiceState<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+    Runtime<
+        spider_storage::ready_queue::ReadyQueueSenderHandle,
+        spider_storage::db::MariaDbStorageConnector,
+        spider_storage::task_instance_pool::TaskInstancePoolHandle,
+    >,
+)> {
+    let (runtime, _) = create_runtime(
+        db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let service = runtime.get_service_state();
+    let job_id = create_and_start_job(&service, false, true).await?;
+    let state = service.cancel_job(job_id).await?;
+    assert_eq!(state, JobState::CleanupReady);
+    runtime.stop().await?;
+
+    let (recovered_runtime, _) = create_runtime(
+        db_config,
+        &ReadyQueueConfig::default(),
+        &TaskInstancePoolConfig::default(),
+    )
+    .await?;
+    let recovered_service = recovered_runtime.get_service_state();
+    Ok((job_id, recovered_service, recovered_runtime))
+}
+
+/// Registers and starts a flat recovery-test job.
+///
+/// # Returns
+///
+/// The registered job ID on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`create_registered_job`]'s return values on failure.
+/// * Forwards [`ServiceState::start_job`]'s return values on failure.
+async fn create_and_start_job<
+    ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender,
+    DbConnectorType: spider_storage::db::DbStorage,
+    TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector,
+>(
+    service: &ServiceState<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>,
+    with_commit: bool,
+    with_cleanup: bool,
+) -> anyhow::Result<JobId> {
+    let job_id = create_registered_job(service, with_commit, with_cleanup).await?;
+    service.start_job(job_id).await?;
+    Ok(job_id)
+}
+
+/// Registers a flat recovery-test job without starting it.
+///
+/// # Returns
+///
+/// The registered job ID on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`ServiceState::add_resource_group`]'s return values on failure.
+/// * Forwards [`spider_core::task::TaskGraph::to_json`]'s return values on failure.
+/// * Forwards [`serialize_inputs`]'s return values on failure.
+/// * Forwards [`ServiceState::register_job`]'s return values on failure.
+async fn create_registered_job<
+    ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender,
+    DbConnectorType: spider_storage::db::DbStorage,
+    TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector,
+>(
+    service: &ServiceState<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>,
+    with_commit: bool,
+    with_cleanup: bool,
+) -> anyhow::Result<JobId> {
+    let rg_id = service
+        .add_resource_group(
+            format!("recovery-test-{}", rand::random::<u64>()),
+            b"test-password".to_vec(),
+        )
+        .await?;
+    let (task_graph, inputs) = build_flat_task_graph(1, 4, with_commit, with_cleanup);
+    Ok(service
+        .register_job(rg_id, task_graph.to_json()?, serialize_inputs(inputs)?)
+        .await?)
+}
+
+/// Registers an execution manager and creates a task instance for a recovered regular task.
+///
+/// # Returns
+///
+/// The created task instance ID on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`ServiceState::register_execution_manager`]'s return values on failure.
+/// * Forwards [`ServiceState::create_task_instance`]'s return values on failure.
+async fn run_recovered_regular_task<
+    ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender,
+    DbConnectorType: spider_storage::db::DbStorage,
+    TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector,
+>(
+    service: &ServiceState<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>,
+    job_id: JobId,
+    task_index: TaskIndex,
+) -> anyhow::Result<TaskInstanceId> {
+    let execution_manager_id = service
+        .register_execution_manager(IpAddr::from([127, 0, 0, 1]))
+        .await?;
+    let execution_context = service
+        .create_task_instance(
+            service.session_id(),
+            job_id,
+            spider_core::types::id::TaskId::Index(task_index),
+            execution_manager_id,
+        )
+        .await?;
+    Ok(execution_context.task_instance_id)
+}
+
+/// Finds the ready-queue entry for a job.
+///
+/// # Returns
+///
+/// The matching ready-queue entry.
+///
+/// # Panics
+///
+/// Panics if no matching entry exists.
+fn find_entry_for_job<TaskKind>(
+    entries: Vec<ReadyQueueEntry<TaskKind>>,
+    job_id: JobId,
+) -> ReadyQueueEntry<TaskKind> {
+    entries
+        .into_iter()
+        .find(|entry| entry.job_id == job_id)
+        .expect("recovered job should be enqueued")
+}
+
+/// Serializes task inputs into the storage service wire format.
+///
+/// # Returns
+///
+/// The serialized task inputs on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`TaskInputsSerializer::append`]'s return values on failure.
+fn serialize_inputs(inputs: Vec<TaskInput>) -> anyhow::Result<Vec<u8>> {
+    let mut serializer = TaskInputsSerializer::new();
+    for input in inputs {
+        serializer.append(input)?;
+    }
+    Ok(serializer.release())
+}
+
+/// Serializes the single output payload used by recovery tests.
+///
+/// # Returns
+///
+/// The serialized task output on success.
+///
+/// # Errors
+///
+/// Returns an error if:
+///
+/// * Forwards [`TaskOutputsSerializer::from_tuple`]'s return values on failure.
+fn serialized_single_output() -> anyhow::Result<Vec<u8>> {
+    Ok(TaskOutputsSerializer::from_tuple(&(vec![1u8; 4],))?)
+}
diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs
index a089d66f..d4fa4878 100644
--- a/components/spider-storage/tests/scheduling_infra.rs
+++ b/components/spider-storage/tests/scheduling_infra.rs
@@ -98,7 +98,13 @@ use spider_storage::{
         job_submission::ValidatedJobSubmission,
         task::{SharedTaskControlBlock, SharedTerminationTaskControlBlock},
     },
-    db::{DbError, ExternalJobOrchestration, InternalJobOrchestration, MariaDbStorageConnector},
+    db::{
+        DbError,
+        ExternalJobOrchestration,
+        InternalJobOrchestration,
+        MariaDbStorageConnector,
+        RecoverableJob,
+    },
     ready_queue::ReadyQueueSender,
     task_instance_pool::{TaskInstanceMetadata, TaskInstancePoolConnector},
 };
@@ -176,6 +182,10 @@ impl InternalJobOrchestration for NoopDbConnector {
     ) -> Result<Vec<JobId>, DbError> {
         Ok(Vec::new())
     }
+
+    async fn get_recoverable_jobs(&self) -> Result<Vec<RecoverableJob>, DbError> {
+        Ok(Vec::new())
+    }
 }
 
 /// The result of running a workload to completion.
diff --git a/components/spider-storage/tests/test_spider_storage.rs b/components/spider-storage/tests/test_spider_storage.rs
index 78520dd4..6e69cc13 100644
--- a/components/spider-storage/tests/test_spider_storage.rs
+++ b/components/spider-storage/tests/test_spider_storage.rs
@@ -4,3 +4,4 @@ mod task_graph_builder;
 
 mod jcb_test;
 mod mariadb_test;
+mod recovery_test;

From e3db65f0a39bf7f27bcd7e379747f4ebab612089 Mon Sep 17 00:00:00 2001
From: Sitao Wang <sitaowang1998@outlook.com>
Date: Mon, 8 Jun 2026 15:55:29 -0400
Subject: [PATCH 13/14] Address comment

---
 components/spider-storage/src/cache/job.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs
index 146b015f..15c8ed5a 100644
--- a/components/spider-storage/src/cache/job.rs
+++ b/components/spider-storage/src/cache/job.rs
@@ -125,6 +125,7 @@ impl<
     /// Returns an error if:
     ///
     /// * [`InternalError::UnexpectedJobState`] if `state` is not recoverable.
+    /// * [`InternalError::TaskGraphCorrupted`] if a commit-ready job has no persisted outputs.
     /// * Forwards [`TaskGraph::create`]'s return values on failure.
     /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure.
     /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure.
@@ -154,6 +155,12 @@ impl<
 
         let num_tasks = job_submission.task_graph().get_num_tasks();
         let mut task_graph = TaskGraph::create(job_submission).await?;
+        if matches!(state, JobState::CommitReady) && job_outputs.is_none() {
+            return Err(InternalError::TaskGraphCorrupted(
+                "commit-ready job has no persisted outputs".to_owned(),
+            )
+            .into());
+        }
         if let Some(outputs) = job_outputs {
             task_graph.restore_outputs(outputs).await?;
         }

From 022223f8a930d02b46695e41aef484d881f74726 Mon Sep 17 00:00:00 2001
From: Sitao Wang <sitaowang1998@outlook.com>
Date: Tue, 9 Jun 2026 16:07:20 -0400
Subject: [PATCH 14/14] Use RecoverableJob

---
 components/spider-storage/src/cache/job.rs    | 26 +++++--------------
 .../spider-storage/src/state/runtime.rs       | 21 ++++-----------
 2 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs
index 15c8ed5a..a6e3bf98 100644
--- a/components/spider-storage/src/cache/job.rs
+++ b/components/spider-storage/src/cache/job.rs
@@ -22,7 +22,7 @@ use crate::{
         job_submission::ValidatedJobSubmission,
         task::TaskGraph,
     },
-    db::InternalJobOrchestration,
+    db::{InternalJobOrchestration, RecoverableJob},
     ready_queue::ReadyQueueSender,
     task_instance_pool::{TaskInstanceMetadata, TaskInstancePoolConnector},
 };
@@ -47,20 +47,6 @@ pub struct SharedJobControlBlock<
         Arc<JobControlBlock<ReadyQueueSenderType, DbConnectorType, TaskInstancePoolConnectorType>>,
 }
 
-/// Persistent job state used to recover a job control block.
-pub struct JobRecoveryContext {
-    /// The persisted job ID.
-    pub id: JobId,
-    /// The owning resource group.
-    pub owner_id: ResourceGroupId,
-    /// The source-of-truth database state.
-    pub state: JobState,
-    /// The original job submission.
-    pub job_submission: ValidatedJobSubmission,
-    /// The committed job outputs, if the job has reached the commit phase.
-    pub job_outputs: Option<Vec<TaskOutput>>,
-}
-
 impl<
     ReadyQueueSenderType: ReadyQueueSender,
     DbConnectorType: InternalJobOrchestration,
@@ -130,18 +116,18 @@ impl<
     /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure.
     /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure.
     pub async fn recover(
-        recovery_context: JobRecoveryContext,
+        recoverable_job: RecoverableJob,
         ready_queue_sender: ReadyQueueSenderType,
         db_connector: DbConnectorType,
         task_instance_pool_connector: TaskInstancePoolConnectorType,
     ) -> Result<Self, CacheError> {
-        let JobRecoveryContext {
+        let RecoverableJob {
             id,
-            owner_id,
+            resource_group_id,
             state,
             job_submission,
             job_outputs,
-        } = recovery_context;
+        } = recoverable_job;
         if !matches!(
             state,
             JobState::Running | JobState::CommitReady | JobState::CleanupReady
@@ -185,7 +171,7 @@ impl<
         let recovered = Self {
             inner: Arc::new(JobControlBlock {
                 id,
-                owner_id,
+                owner_id: resource_group_id,
                 job_execution_state: JobExecutionStateHandle {
                     inner: tokio::sync::RwLock::new(job_execution_state),
                 },
diff --git a/components/spider-storage/src/state/runtime.rs b/components/spider-storage/src/state/runtime.rs
index 5d4e77a5..9350bce1 100644
--- a/components/spider-storage/src/state/runtime.rs
+++ b/components/spider-storage/src/state/runtime.rs
@@ -6,10 +6,10 @@ use tokio_util::sync::CancellationToken;
 use crate::{
     cache::{
         error::{CacheError, InternalError},
-        job::{JobRecoveryContext, SharedJobControlBlock},
+        job::SharedJobControlBlock,
     },
     config::DatabaseConfig,
-    db::{DbStorage, MariaDbStorageConnector, RecoverableJob, SessionManagement},
+    db::{DbStorage, MariaDbStorageConnector, SessionManagement},
     ready_queue::{ReadyQueueConfig, ReadyQueueSender, ReadyQueueSenderHandle, create_ready_queue},
     state::{JobCache, ServiceState, StorageServerError},
     task_instance_pool::{
@@ -179,21 +179,10 @@ async fn recover_job_cache<
 > {
     let job_cache = JobCache::new();
     for recoverable_job in db.get_recoverable_jobs().await? {
-        let RecoverableJob {
-            id,
-            resource_group_id,
-            state,
-            job_submission,
-            job_outputs,
-        } = recoverable_job;
+        let id = recoverable_job.id;
+        let state = recoverable_job.state;
         let jcb = SharedJobControlBlock::recover(
-            JobRecoveryContext {
-                id,
-                owner_id: resource_group_id,
-                state,
-                job_submission,
-                job_outputs,
-            },
+            recoverable_job,
             ready_queue_sender.clone(),
             db.clone(),
             task_instance_pool_connector.clone(),