From 6e812ccf70d59ad79205cbba886e1fbc8fcf1871 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sun, 24 May 2026 18:46:19 -0400 Subject: [PATCH 01/14] feat(spider-task-executor): Add executor binary with bincode wire protocol and integration tests. (#325) --- Cargo.lock | 153 ++++++++++ Cargo.toml | 2 + components/spider-task-executor/Cargo.toml | 31 +- .../src/bin/spider_task_executor.rs | 146 ++++++++++ components/spider-task-executor/src/error.rs | 26 +- components/spider-task-executor/src/lib.rs | 1 + .../spider-task-executor/src/manager.rs | 7 +- .../spider-task-executor/src/protocol.rs | 49 ++++ taskfiles/test.yaml | 25 +- .../integration-test-tasks/Cargo.toml | 16 + .../integration-test-tasks/src/lib.rs | 75 +++++ tests/huntsman/task-executor/Cargo.toml | 42 +++ tests/huntsman/task-executor/src/lib.rs | 275 ++++++++++++++++++ .../tests/overhead_instrument.rs | 228 +++++++++++++++ .../task-executor/tests/test_executor.rs | 90 ++++++ .../huntsman/tdl-integration/tests/complex.rs | 4 +- 16 files changed, 1155 insertions(+), 15 deletions(-) create mode 100644 components/spider-task-executor/src/bin/spider_task_executor.rs create mode 100644 components/spider-task-executor/src/protocol.rs create mode 100644 tests/huntsman/integration-test-tasks/Cargo.toml create mode 100644 tests/huntsman/integration-test-tasks/src/lib.rs create mode 100644 tests/huntsman/task-executor/Cargo.toml create mode 100644 tests/huntsman/task-executor/src/lib.rs create mode 100644 tests/huntsman/task-executor/tests/overhead_instrument.rs create mode 100644 tests/huntsman/task-executor/tests/test_executor.rs diff --git a/Cargo.lock b/Cargo.lock index 516efac4..1c7f0093 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -64,6 +73,15 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "2.11.1" @@ -283,6 +301,16 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "etcetera" version = "0.8.0" @@ -695,6 +723,14 @@ dependencies = [ "serde_core", ] +[[package]] +name = "integration-test-tasks" +version = "0.1.0" +dependencies = [ + "serde", + "spider-tdl", +] + [[package]] name = "itoa" version = "1.0.18" @@ -808,6 +844,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "md-5" version = "0.10.6" @@ -1148,6 +1193,23 @@ dependencies = [ "bitflags", ] +[[package]] +name = 
"regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + [[package]] name = "rmp" version = "0.8.15" @@ -1340,6 +1402,25 @@ dependencies = [ "digest", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + [[package]] name = "signature" version = "2.2.0" @@ -1431,10 +1512,19 @@ name = "spider-task-executor" version = "0.1.0" dependencies = [ "anyhow", + "bincode", + "bytes", + "futures-util", "libloading", "rmp-serde", + "serde", + "spider-core", "spider-tdl", "thiserror", + "tokio", + "tokio-util", + "tracing", + "tracing-subscriber", ] [[package]] @@ -1771,6 +1861,24 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "task-executor-tests" +version = "0.1.0" +dependencies = [ + "bincode", + "bytes", + "futures-util", + "integration-test-tasks", + "rmp-serde", + "serde", + "spider-core", + "spider-task-executor", + "spider-tdl", + "tabled", + "tokio", + "tokio-util", +] + [[package]] name = "tdl-integration" version = "0.1.0" @@ -1812,6 +1920,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "tinystr" version = "0.8.3" @@ -1847,6 +1964,7 @@ dependencies = [ "libc", "mio", "pin-project-lite", + "signal-hook-registry", "socket2", "tokio-macros", "windows-sys 0.61.2", @@ -1918,6 +2036,35 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "thread_local", + "tracing", + "tracing-core", + "tracing-serde", ] [[package]] @@ -1995,6 +2142,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/Cargo.toml b/Cargo.toml index 30796143..67362f87 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,5 +9,7 @@ members = [ "components/spider-tdl-derive", "examples/huntsman/complex/tasks", "examples/huntsman/complex/types", + "tests/huntsman/integration-test-tasks", + "tests/huntsman/task-executor", "tests/huntsman/tdl-integration", ] diff --git a/components/spider-task-executor/Cargo.toml b/components/spider-task-executor/Cargo.toml index c51c09b2..789308ca 100644 --- a/components/spider-task-executor/Cargo.toml 
+++ b/components/spider-task-executor/Cargo.toml @@ -7,11 +7,36 @@ edition = "2024" name = "spider_task_executor" path = "src/lib.rs" +[[bin]] +name = "spider-task-executor" +path = "src/bin/spider_task_executor.rs" + [dependencies] +anyhow = "1.0.98" +bincode = "1.3.3" +bytes = "1.10" +futures-util = { version = "0.3.31", default-features = false, features = [ + "sink", + "std" +] } libloading = "0.8.5" rmp-serde = "1.3.1" +serde = { version = "1.0.228", features = ["derive"] } +spider-core = { path = "../spider-core" } spider-tdl = { path = "../spider-tdl" } thiserror = "2.0.18" - -[dev-dependencies] -anyhow = "1.0.98" +tokio = { version = "1.50.0", features = [ + "io-std", + "io-util", + "macros", + "rt", + "sync", + "time" +] } +tokio-util = { version = "0.7", features = ["codec"] } +tracing = { version = "0.1.41", default-features = false, features = ["std"] } +tracing-subscriber = { version = "0.3.19", default-features = false, features = [ + "env-filter", + "fmt", + "json" +] } diff --git a/components/spider-task-executor/src/bin/spider_task_executor.rs b/components/spider-task-executor/src/bin/spider_task_executor.rs new file mode 100644 index 00000000..5be95bf0 --- /dev/null +++ b/components/spider-task-executor/src/bin/spider_task_executor.rs @@ -0,0 +1,146 @@ +//! Spider task-executor binary. +//! +//! Reads bincode-framed [`Request`](spider_task_executor::protocol::Request)s from `stdin`, +//! dispatches them through a [`TdlPackageManager`], and writes +//! [`Response`](spider_task_executor::protocol::Response)s to `stdout`. The execution manager +//! spawns this process per slot and supervises it. +//! +//! Package resolution: each `Execute` request names a TDL package; the executor looks for +//! `${SPIDER_TDL_PACKAGE_DIR}/${package}/lib${package}.so` and caches the loaded library by name. +//! +//! Execution model: requests are processed strictly sequentially on a single-threaded tokio +//! runtime. 
Tokio is used only to match the async I/O surface on the execution manager side; +//! the executor itself has no concurrency requirements, and exactly one task runs for the +//! lifetime of the process. + +use std::{ + path::{Path, PathBuf}, + time::Instant, +}; + +use anyhow::{Result, anyhow}; +use bytes::Bytes; +use futures_util::{SinkExt, StreamExt}; +use spider_task_executor::{ + ExecutorError, + TdlPackageManager, + protocol::{ExecutorOutcome, Request, Response}, +}; +use tokio::io::{stdin, stdout}; +use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec}; + +/// Env var that points to the directory where compiled TDL packages live. +const SPIDER_TDL_PACKAGE_DIR: &str = "SPIDER_TDL_PACKAGE_DIR"; + +/// Initializes tracing logging. +fn init_tracing() { + // Send tracing output to stderr so it doesn't pollute the framed-stdout protocol channel. + tracing_subscriber::fmt() + .event_format( + tracing_subscriber::fmt::format() + .with_level(true) + .with_target(false) + .with_file(true) + .with_line_number(true) + .json(), + ) + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .with_ansi(false) + .with_writer(std::io::stderr) + .init(); +} + +/// Runs a task from the given TDL context and inputs. +/// +/// # Returns +/// +/// Forwards [`spider_task_executor::TdlPackage::execute_task`]'s return values on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`TdlPackageManager::load`]'s return values on failure. +/// * Forwards [`spider_task_executor::TdlPackage::execute_task`]'s return values on failure. +fn run_task( + manager: &mut TdlPackageManager, + pkg_dir: &Path, + package: &str, + task_func: &str, + raw_ctx: &[u8], + raw_inputs: &[u8], +) -> Result, ExecutorError> { + let pkg = if let Some(pkg) = manager.get(package) { + pkg + } else { + let path = pkg_dir.join(package).join(format!("lib{package}.so")); + manager.load(&path)? 
+ }; + pkg.execute_task(task_func, raw_ctx, raw_inputs) +} + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<()> { + init_tracing(); + + let pkg_dir: PathBuf = std::env::var_os(SPIDER_TDL_PACKAGE_DIR) + .map(PathBuf::from) + .ok_or_else(|| anyhow!("{SPIDER_TDL_PACKAGE_DIR} env var not set"))?; + + let mut requests = FramedRead::new(stdin(), LengthDelimitedCodec::new()); + let mut responses = FramedWrite::new(stdout(), LengthDelimitedCodec::new()); + + let mut manager = TdlPackageManager::new(); + + tracing::info!("Executor starts."); + + while let Some(frame) = requests.next().await { + let frame = frame + .inspect_err(|e| tracing::error!(err = ? e, "Failed to receive request frame."))?; + let req: Request = bincode::deserialize(&frame) + .inspect_err(|e| tracing::error!(err = ? e, "Failed to deserialize request."))?; + match req { + Request::Execute { + tdl_context, + raw_ctx, + raw_inputs, + } => { + let started = Instant::now(); + let outcome = match run_task( + &mut manager, + &pkg_dir, + &tdl_context.package, + &tdl_context.task_func, + &raw_ctx, + &raw_inputs, + ) { + Ok(outputs) => ExecutorOutcome::Success { outputs }, + Err(e) => ExecutorOutcome::Failure { + error: rmp_serde::to_vec(&e).inspect_err( + |e| tracing::error!(err = ? e, "Failed to serialize execution result."), + )?, + }, + }; + let elapsed_us = u64::try_from(started.elapsed().as_micros()).unwrap_or(u64::MAX); + + let resp = Response::Result { + outcome, + elapsed_us, + }; + let bytes = bincode::serialize(&resp) + .inspect_err(|e| tracing::error!(err = ? e, "Failed to serialize response."))?; + responses + .send(Bytes::from(bytes)) + .await + .inspect_err(|e| tracing::error!(err = ? 
e, "Failed to send response."))?; + } + Request::Shutdown => { + tracing::info!("Received shutdown request."); + break; + } + } + } + + tracing::info!("Executor exits."); + Ok(()) +} diff --git a/components/spider-task-executor/src/error.rs b/components/spider-task-executor/src/error.rs index da582342..c8da04ef 100644 --- a/components/spider-task-executor/src/error.rs +++ b/components/spider-task-executor/src/error.rs @@ -6,11 +6,11 @@ use spider_tdl::{TdlError, Version}; /// /// [`TdlError`] (failure inside a user task) is wrapped via [`Self::TaskError`] so callers can /// distinguish executor-internal failures from in-task failures. -#[derive(Debug, thiserror::Error)] +#[derive(Debug, thiserror::Error, serde::Serialize, serde::Deserialize)] pub enum ExecutorError { /// `dlopen` failed or a required FFI symbol was missing. #[error("failed to load TDL package library: {0}")] - InvalidLibrary(#[from] libloading::Error), + InvalidLibrary(String), /// The package's declared `spider-tdl` ABI version is not compatible with the executor's. #[error( @@ -33,7 +33,7 @@ pub enum ExecutorError { /// The byte buffer contains invalid UTF-8 patterns. #[error("invalid UTF-8: {0}")] - InvalidUtf8(#[from] std::str::Utf8Error), + InvalidUtf8(String), /// A user task returned a [`TdlError`] across the FFI boundary. #[error("task execution failed: {0}")] @@ -42,7 +42,25 @@ pub enum ExecutorError { /// The msgpack-encoded error payload returned by a failing task could not be decoded back into /// a [`TdlError`]. 
#[error("failed to deserialize error payload: {0}")] - ErrorPayloadDeserializationFailure(#[from] rmp_serde::decode::Error), + ErrorPayloadDeserializationFailure(String), +} + +impl From for ExecutorError { + fn from(value: libloading::Error) -> Self { + Self::InvalidLibrary(value.to_string()) + } +} + +impl From for ExecutorError { + fn from(value: std::str::Utf8Error) -> Self { + Self::InvalidUtf8(value.to_string()) + } +} + +impl From for ExecutorError { + fn from(value: rmp_serde::decode::Error) -> Self { + Self::ErrorPayloadDeserializationFailure(value.to_string()) + } } impl ExecutorError { diff --git a/components/spider-task-executor/src/lib.rs b/components/spider-task-executor/src/lib.rs index b5b05076..3afb0484 100644 --- a/components/spider-task-executor/src/lib.rs +++ b/components/spider-task-executor/src/lib.rs @@ -2,6 +2,7 @@ pub mod error; pub mod manager; +pub mod protocol; pub use error::ExecutorError; pub use manager::{TdlPackage, TdlPackageManager}; diff --git a/components/spider-task-executor/src/manager.rs b/components/spider-task-executor/src/manager.rs index 49fca52b..61060055 100644 --- a/components/spider-task-executor/src/manager.rs +++ b/components/spider-task-executor/src/manager.rs @@ -21,6 +21,7 @@ use crate::error::ExecutorError; /// avoid repeating the FFI round trip on every call. The execute fn pointer is also resolved once /// at load time and cached so each [`Self::execute_task`] call doesn't require `dlsym` per /// dispatch. +#[derive(Debug)] pub struct TdlPackage { /// The name of the package. name: String, @@ -190,7 +191,7 @@ impl TdlPackageManager { /// /// # Returns /// - /// The newly loaded package's name on success. + /// The newly loaded package on success. /// /// # Errors /// @@ -199,14 +200,14 @@ impl TdlPackageManager { /// * [`ExecutorError::DuplicatePackage`] if a package with the same name is already loaded. The /// freshly loaded library will be dropped (unloaded). 
/// * Forwards [`TdlPackage::load`]'s return values on failure. - pub fn load(&mut self, path: &Path) -> Result { + pub fn load(&mut self, path: &Path) -> Result<&TdlPackage, ExecutorError> { let package = TdlPackage::load(path)?; if self.packages.contains_key(package.name()) { return Err(ExecutorError::DuplicatePackage(package.name().to_owned())); } let name_key = package.name().to_owned(); let inserted = self.packages.entry(name_key).or_insert(package); - Ok(inserted.name().to_owned()) + Ok(inserted) } /// # Returns diff --git a/components/spider-task-executor/src/protocol.rs b/components/spider-task-executor/src/protocol.rs new file mode 100644 index 00000000..935d60d7 --- /dev/null +++ b/components/spider-task-executor/src/protocol.rs @@ -0,0 +1,49 @@ +//! Wire protocol between the execution manager and a `spider-task-executor` subprocess. +//! +//! The parent encodes each [`Request`] with `bincode` and writes it as one length-delimited frame +//! over the executor's `stdin`; the executor reads frames, dispatches to the TDL package manager, +//! and writes one [`Response`] frame back over `stdout`. +//! +//! `stderr` is **not** carried over this protocol. The executor writes diagnostics to its own +//! stderr; how those bytes are disposed of (inherited, piped, redirected to a log file) is a choice +//! made by whoever spawned the process. + +use serde::{Deserialize, Serialize}; +use spider_core::task::TdlContext; + +/// Request from the parent process (execution manager) to the executor. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Request { + Execute { + /// TDL information for identifying which task to execute. + tdl_context: TdlContext, + + /// Serialized task context. + raw_ctx: Vec, + + /// Serialized task inputs. + raw_inputs: Vec, + }, + + Shutdown, +} + +/// Reply from the executor to the parent process. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Response { + Result { + outcome: ExecutorOutcome, + /// Wall-clock time of package lookup/load plus the FFI call, measured by the executor. + elapsed_us: u64, + }, +} + +/// Outcome of a task execution. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ExecutorOutcome { + /// Task outputs serialized in wire-format. + Success { outputs: Vec }, + + /// [`crate::ExecutorError`] serialized in msgpack. + Failure { error: Vec }, +} diff --git a/taskfiles/test.yaml b/taskfiles/test.yaml index 83807015..7d79bfdb 100644 --- a/taskfiles/test.yaml +++ b/taskfiles/test.yaml @@ -209,12 +209,20 @@ tasks: # @param {string} SPIDER_STORAGE_URL An URL pointing to the MariaDB instance. spider-huntsman-unit-tests-executor: internal: true + vars: + # TDL packages are staged under `${G_TDL_PACKAGES_DIR}/<package>/lib<package>.so` + # so that the `spider-task-executor` binary can resolve them via the on-disk layout it + # documents. + G_TDL_PACKAGES_DIR: "{{.G_BUILD_DIR}}/tdl_packages" + G_RUST_RELEASE_DIR: "{{.G_RUST_BUILD_DIR}}/release" env: MARIADB_PORT: "{{.MARIADB_PORT}}" MARIADB_DATABASE: "{{.MARIADB_DATABASE}}" MARIADB_USERNAME: "{{.MARIADB_USERNAME}}" MARIADB_PASSWORD: "{{.MARIADB_PASSWORD}}" - SPIDER_TDL_PACKAGE_COMPLEX: "{{.G_RUST_BUILD_DIR}}/release/libhuntsman_complex.so" + SPIDER_TDL_PACKAGE_COMPLEX: "{{.G_TDL_PACKAGES_DIR}}/complex/libcomplex.so" + SPIDER_TDL_PACKAGE_DIR: "{{.G_TDL_PACKAGES_DIR}}" + SPIDER_TASK_EXECUTOR_BIN: "{{.G_RUST_RELEASE_DIR}}/spider-task-executor" SPIDER_TEST_INSTRUMENT_OUTPUT_DIR: sh: "echo {{.G_BUILD_DIR}}/spider-instrument-$(uuidgen)" requires: @@ -222,11 +230,22 @@ tasks: dir: "{{.ROOT_DIR}}" deps: ["toolchains:rust"] cmds: - - "mkdir ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}" + - "mkdir -p ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}" - defer: "rm -rf ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}" - |- . 
"{{.G_RUST_TOOLCHAIN_ENV_FILE}}" - cargo build --package huntsman-complex --release + # `--bin` is a workspace-wide target filter; combining it with cdylib packages in the + # same `cargo build` would silently exclude the `.so` artifacts. Use one invocation per + # artifact to keep the target selection unambiguous. + cargo build --release --package huntsman-complex + cargo build --release --package integration-test-tasks + cargo build --release --package spider-task-executor --bin spider-task-executor + mkdir -p "{{.G_TDL_PACKAGES_DIR}}/complex" \ + "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks" + cp "{{.G_RUST_RELEASE_DIR}}/libhuntsman_complex.so" \ + "{{.G_TDL_PACKAGES_DIR}}/complex/libcomplex.so" + cp "{{.G_RUST_RELEASE_DIR}}/libintegration_test_tasks.so" \ + "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks/libintegration_test_tasks.so" cargo nextest run --all --all-features --run-ignored all --release - |- for f in ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}/*; do diff --git a/tests/huntsman/integration-test-tasks/Cargo.toml b/tests/huntsman/integration-test-tasks/Cargo.toml new file mode 100644 index 00000000..0c77122e --- /dev/null +++ b/tests/huntsman/integration-test-tasks/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "integration-test-tasks" +version = "0.1.0" +edition = "2024" +publish = false + +[lib] +# `cdylib` is what the task-executor dlopens; `rlib` lets other Rust crates (the integration +# tests) `use` constants like `INSTRUMENT_SLEEP_US`. +crate-type = ["cdylib", "rlib"] +name = "integration_test_tasks" +path = "src/lib.rs" + +[dependencies] +serde = { version = "1.0.228", features = ["derive"] } +spider-tdl = { path = "../../../components/spider-tdl", features = ["derive"] } diff --git a/tests/huntsman/integration-test-tasks/src/lib.rs b/tests/huntsman/integration-test-tasks/src/lib.rs new file mode 100644 index 00000000..1f6bc731 --- /dev/null +++ b/tests/huntsman/integration-test-tasks/src/lib.rs @@ -0,0 +1,75 @@ +//! 
Test TDL package used by the `task-executor` integration tests. +//! +//! Exposes four tasks that exercise distinct executor code paths: +//! +//! * [`task_decl::fibonacci`] — basic compute + correctness. +//! * [`task_decl::always_fail`] — in-task error reporting. +//! * [`task_decl::always_panic`] — process-level crash handling. +//! * [`task_decl::sleep_and_echo`] — fixed-cost task: sleeps for a known [`INSTRUMENT_SLEEP_US`] +//! duration then echoes its `Vec` payload back. Used by the overhead bench so the +//! non-sleep portion of the executor's reported FFI time isolates the in-executor input/output +//! serde cost, while the parent-side delta isolates IPC framing cost. + +/// The constant sleep duration used by [`task_decl::sleep_and_echo`]. +/// +/// Exposed at crate scope so the overhead bench (linked dynamically, so it can't read the value +/// through the cdylib) can reference the same number to keep them in sync if changed. +pub const INSTRUMENT_SLEEP_US: u64 = 50; + +mod task_decl { + use std::{thread::sleep, time::Duration}; + + use spider_tdl::{TaskContext, TdlError, task}; + + use crate::INSTRUMENT_SLEEP_US; + + /// Computes the `index`-th Fibonacci number with a deliberately naive recursive + /// implementation so the call has measurable CPU cost for the overhead benchmark. + #[task(name = "fibonacci")] + pub fn fibonacci(_ctx: TaskContext, index: u64) -> Result { + Ok(fib(index)) + } + + fn fib(index: u64) -> u64 { + if index < 2 { + index + } else { + fib(index - 1) + fib(index - 2) + } + } + + /// Always returns a [`TdlError::ExecutionError`]. + #[task(name = "always_fail")] + pub fn always_fail(_ctx: TaskContext) -> Result { + Err(TdlError::ExecutionError( + "always_fail: intentional failure".to_owned(), + )) + } + + /// Always panics. The panic crosses the `extern "C"` FFI boundary, which aborts the executor + /// process — the test asserts the parent observes that crash. 
+ #[task(name = "always_panic")] + pub fn always_panic(_ctx: TaskContext) -> Result { + panic!("always_panic: intentional panic") + } + + /// Sleeps for a fixed [`INSTRUMENT_SLEEP_US`] microseconds, then echoes the input back. + /// + /// The fixed-cost body lets the overhead bench subtract the known sleep from the executor's + /// reported FFI duration, isolating the in-executor input/output serde overhead. + #[task(name = "sleep_and_echo")] + pub fn sleep_and_echo(_ctx: TaskContext, items: Vec) -> Result, TdlError> { + sleep(Duration::from_micros(INSTRUMENT_SLEEP_US)); + Ok(items) + } +} + +spider_tdl::register_tdl_package! { + package_name: "integration_test_tasks", + tasks: [ + task_decl::fibonacci, + task_decl::always_fail, + task_decl::always_panic, + task_decl::sleep_and_echo, + ], +} diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml new file mode 100644 index 00000000..0d237bef --- /dev/null +++ b/tests/huntsman/task-executor/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "task-executor-tests" +version = "0.1.0" +edition = "2024" +publish = false + +[lib] +name = "task_executor_tests" +path = "src/lib.rs" + +[[test]] +name = "executor" +path = "tests/test_executor.rs" + +[[test]] +name = "overhead_instrument" +path = "tests/overhead_instrument.rs" + +[dependencies] +bincode = "1.3.3" +bytes = "1.10" +futures-util = { version = "0.3.31", default-features = false, features = [ + "sink", + "std" +] } +rmp-serde = "1.3.1" +serde = { version = "1.0.228", features = ["derive"] } +spider-core = { path = "../../../components/spider-core" } +spider-task-executor = { path = "../../../components/spider-task-executor" } +spider-tdl = { path = "../../../components/spider-tdl" } +tokio = { version = "1.50.0", features = [ + "io-util", + "macros", + "process", + "rt", + "time" +] } +tokio-util = { version = "0.7", features = ["codec"] } + +[dev-dependencies] +integration-test-tasks = { path = "../integration-test-tasks" } 
+tabled = "0.20.0" diff --git a/tests/huntsman/task-executor/src/lib.rs b/tests/huntsman/task-executor/src/lib.rs new file mode 100644 index 00000000..c42a20f4 --- /dev/null +++ b/tests/huntsman/task-executor/src/lib.rs @@ -0,0 +1,275 @@ +//! Test harness shared by the `task-executor-tests` integration tests. +//! +//! Spawns the `spider-task-executor` binary as a child process, frames bincode requests on its +//! stdin and reads bincode responses from its stdout — the exact wire protocol of +//! [`spider_task_executor::protocol`]. +//! +//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are +//! infrastructure, not production code, and the panic message + backtrace is more useful at the +//! failure site than threading an error type through every helper. +//! +//! Environment: +//! +//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary. +//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to +//! the child verbatim. + +use std::{path::PathBuf, process::Stdio}; + +use bytes::Bytes; +use futures_util::{SinkExt, StreamExt}; +use spider_core::{ + task::TdlContext, + types::{ + id::{JobId, ResourceGroupId, TaskId}, + io::TaskInput, + }, +}; +use spider_task_executor::protocol::{Request, Response}; +use spider_tdl::{ + TaskContext, + wire::{TaskInputsSerializer, TaskOutputsSerializer}, +}; +use tokio::process::{Child, ChildStdin, ChildStdout, Command}; +use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec}; + +/// The TDL package name registered by `integration-test-tasks`. +pub const PACKAGE_NAME: &str = "integration_test_tasks"; + +/// One running executor subprocess plus framed handles to its stdin / stdout. +/// +/// The subprocess will be killed when the handle is dropped. 
+pub struct ExecutorHandle { + child: Child, + requests: FramedWrite, + responses: FramedRead, +} + +impl ExecutorHandle { + /// Spawns the executor binary with `SPIDER_TDL_PACKAGE_DIR` set; the child inherits the + /// parent's stderr so panic / abort messages surface in the test log. + /// + /// # Returns + /// + /// A handle owning the running subprocess and framed I/O. + /// + /// # Panics + /// + /// Panics if the binary cannot be spawned or its stdio handles cannot be claimed. + #[must_use] + pub fn spawn() -> Self { + let mut child = Command::new(task_executor_bin()) + .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir()) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .kill_on_drop(true) + .spawn() + .expect("spawn executor binary"); + let stdin = child.stdin.take().expect("stdin must be piped"); + let stdout = child.stdout.take().expect("stdout must be piped"); + Self { + child, + requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()), + responses: FramedRead::new(stdout, LengthDelimitedCodec::new()), + } + } + + /// Bincode-serializes `req` and writes one length-delimited frame to the executor's stdin. + /// + /// # Panics + /// + /// Panics if encoding fails or the stdin pipe cannot be written. + pub async fn send(&mut self, req: &Request) { + let bytes = bincode::serialize(req).expect("bincode encode Request"); + self.requests + .send(Bytes::from(bytes)) + .await + .expect("write request frame"); + } + + /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it. + /// + /// # Returns + /// + /// The next [`Response`] from the executor. + /// + /// # Panics + /// + /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails. 
+ pub async fn recv(&mut self) -> Response { + let frame = self + .responses + .next() + .await + .expect("executor closed stdout before reply") + .expect("read response frame"); + bincode::deserialize(&frame).expect("bincode decode Response") + } + + /// Reads at most one length-delimited frame, tolerating a clean EOF (which crash-path tests + /// rely on to detect that the executor died). + /// + /// # Returns + /// + /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first. + /// + /// # Panics + /// + /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails. + pub async fn try_recv(&mut self) -> Option { + let frame = self.responses.next().await?; + let bytes = frame.expect("read response frame"); + Some(bincode::deserialize(&bytes).expect("bincode decode Response")) + } + + /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly. + /// + /// # Panics + /// + /// Panics if waiting on the child fails or the child exits non-zero. + pub async fn shutdown_clean(mut self) { + self.send(&Request::Shutdown).await; + // Close the stdin pipe so the child sees EOF after `Shutdown` is drained. + drop(self.requests); + let status = self.child.wait().await.expect("wait for executor"); + assert!(status.success(), "executor exited with status {status:?}"); + } + + /// Closes stdin and waits for the child to exit. Used by crash-path tests that don't expect + /// a clean shutdown. + /// + /// # Returns + /// + /// The child's [`ExitStatus`](std::process::ExitStatus). + /// + /// # Panics + /// + /// Panics if waiting on the child fails. + pub async fn wait_for_exit(mut self) -> std::process::ExitStatus { + drop(self.requests); + self.child.wait().await.expect("wait for executor") + } +} + +/// # Returns +/// +/// The absolute path of the `spider-task-executor` binary, read from `SPIDER_TASK_EXECUTOR_BIN`. +/// +/// # Panics +/// +/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset. 
+#[must_use] +pub fn task_executor_bin() -> PathBuf { + std::env::var_os("SPIDER_TASK_EXECUTOR_BIN") + .map(PathBuf::from) + .expect("SPIDER_TASK_EXECUTOR_BIN env var not set") +} + +/// # Returns +/// +/// The TDL package staging directory, read from `SPIDER_TDL_PACKAGE_DIR`. Forwarded verbatim +/// into the executor child's environment so it resolves +/// `${SPIDER_TDL_PACKAGE_DIR}//lib.so`. +/// +/// # Panics +/// +/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset. +#[must_use] +pub fn tdl_package_dir() -> PathBuf { + std::env::var_os("SPIDER_TDL_PACKAGE_DIR") + .map(PathBuf::from) + .expect("SPIDER_TDL_PACKAGE_DIR env var not set") +} + +/// # Returns +/// +/// A placeholder msgpack-encoded [`TaskContext`] suitable for a one-shot test invocation. The id +/// fields are fresh per call but the executor doesn't inspect them. +/// +/// # Panics +/// +/// Panics if msgpack encoding fails (the test ids serialize trivially). +#[must_use] +pub fn build_ctx() -> Vec { + let ctx = TaskContext { + job_id: JobId::new(), + task_id: TaskId::new(), + task_instance_id: 1, + resource_group_id: ResourceGroupId::new(), + }; + rmp_serde::to_vec(&ctx).expect("serialize TaskContext") +} + +/// # Type Parameters +/// +/// * `T` - The Serde-serializable value type passed as the task's single input. +/// +/// # Returns +/// +/// A wire-format buffer carrying one [`TaskInput::ValuePayload`] holding the msgpack-encoded +/// `value` — i.e. the same shape the parent ships for a single-argument task. +/// +/// # Panics +/// +/// Panics if msgpack encoding or wire-format append fails. +#[must_use] +pub fn encode_single_input(value: &T) -> Vec { + let mut inputs = TaskInputsSerializer::new(); + inputs + .append(TaskInput::ValuePayload( + rmp_serde::to_vec(value).expect("msgpack encode input"), + )) + .expect("append wire-format input"); + inputs.release() +} + +/// # Returns +/// +/// A wire-format buffer carrying zero inputs — for nullary tasks like `always_fail` and +/// `always_panic`. 
+#[must_use] +pub fn encode_no_inputs() -> Vec { + TaskInputsSerializer::new().release() +} + +/// # Type Parameters +/// +/// * `T` - The Serde-deserializable type the output payload should decode into. +/// +/// # Returns +/// +/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `T`. +/// +/// # Panics +/// +/// Panics if the outputs buffer doesn't contain exactly one value, or if the msgpack decode +/// fails. +#[must_use] +pub fn decode_single_output(output_bytes: &[u8]) -> T { + let outputs = + TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs"); + assert_eq!( + outputs.len(), + 1, + "expected exactly one output payload, got {}", + outputs.len(), + ); + rmp_serde::from_slice(&outputs[0]).expect("msgpack decode output") +} + +/// # Returns +/// +/// A [`Request::Execute`] targeting `task_func` in the integration package, with a fresh test +/// `TaskContext` and the caller-supplied wire-format `raw_inputs`. +#[must_use] +pub fn execute_request(task_func: &str, raw_inputs: Vec) -> Request { + Request::Execute { + tdl_context: TdlContext { + package: PACKAGE_NAME.to_owned(), + task_func: task_func.to_owned(), + }, + raw_ctx: build_ctx(), + raw_inputs, + } +} diff --git a/tests/huntsman/task-executor/tests/overhead_instrument.rs b/tests/huntsman/task-executor/tests/overhead_instrument.rs new file mode 100644 index 00000000..fc4e146e --- /dev/null +++ b/tests/huntsman/task-executor/tests/overhead_instrument.rs @@ -0,0 +1,228 @@ +//! Measures the round-trip overhead of one task execution through the `spider-task-executor` +//! binary. +//! +//! Drives the `sleep_and_echo` task — which sleeps for a known constant +//! [`INSTRUMENT_SLEEP_US`](integration_test_tasks::INSTRUMENT_SLEEP_US) and then echoes its +//! `Vec` payload — against a *long-lived* executor subprocess (the FFI library is +//! cached after the first call, so subsequent dispatches measure steady-state overhead, not +//! one-time dlopen cost). 
With the work portion held constant we can split the cost into: +//! +//! * `e2e`: parent's wall-clock around `send(Execute)` → `recv(Response::Result)`. +//! * `executor`: the in-executor FFI duration, taken straight from +//! [`Response::Result::elapsed_us`]. This is `INSTRUMENT_SLEEP_US` + the executor's in-FFI +//! input/output serde. +//! * `executor_internal`: `executor - INSTRUMENT_SLEEP_US`. Approximates the in-executor +//! input/output serde cost alone. +//! * `ipc_overhead`: `e2e - executor`. The parent-side framing + bincode + pipe traversal. +//! +//! Aggregates (avg, p50, p95, p99) for each metric land in a markdown table at +//! `${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}/task_executor_overhead.md`. + +use std::{ + fs::File, + io::Write, + path::PathBuf, + time::{Duration, Instant}, +}; + +use integration_test_tasks::INSTRUMENT_SLEEP_US; +use spider_task_executor::protocol::{ExecutorOutcome, Response}; +use tabled::{Table, Tabled}; +use task_executor_tests::{ + ExecutorHandle, + decode_single_output, + encode_single_input, + execute_request, +}; + +const PAYLOAD_LEN: usize = 100; +const ITERATIONS: usize = 10; +const OUTPUT_FILE: &str = "task_executor_overhead.md"; +const INSTRUMENT_OUTPUT_DIR_ENV: &str = "SPIDER_TEST_INSTRUMENT_OUTPUT_DIR"; + +/// One row in the markdown table: a metric and its aggregate latency statistics. +#[derive(Tabled)] +struct LatencyRow { + #[tabled(rename = "Metric")] + metric: &'static str, + #[tabled(rename = "Count")] + count: usize, + #[tabled(rename = "Avg (µs)")] + avg_us: String, + #[tabled(rename = "P50 (µs)")] + p50_us: String, + #[tabled(rename = "P95 (µs)")] + p95_us: String, + #[tabled(rename = "P99 (µs)")] + p99_us: String, +} + +impl LatencyRow { + /// Sorts `samples` in place and computes `count`, `avg`, `p50`, `p95`, `p99` in microseconds. + /// + /// # Returns + /// + /// A populated [`LatencyRow`], or a row with `"N/A"` aggregates when `samples` is empty. 
+ fn from_samples(metric: &'static str, samples: &mut [Duration]) -> Self { + if samples.is_empty() { + return Self { + metric, + count: 0, + avg_us: "N/A".to_owned(), + p50_us: "N/A".to_owned(), + p95_us: "N/A".to_owned(), + p99_us: "N/A".to_owned(), + }; + } + samples.sort(); + let count = samples.len(); + let sum: Duration = samples.iter().sum(); + #[allow(clippy::cast_precision_loss)] + let avg = sum.as_secs_f64() * 1_000_000.0 / count as f64; + let last = count - 1; + let p50 = samples[(count / 2).min(last)].as_secs_f64() * 1_000_000.0; + let p95 = samples[(count * 95 / 100).min(last)].as_secs_f64() * 1_000_000.0; + let p99 = samples[(count * 99 / 100).min(last)].as_secs_f64() * 1_000_000.0; + Self { + metric, + count, + avg_us: format!("{avg:.2}"), + p50_us: format!("{p50:.2}"), + p95_us: format!("{p95:.2}"), + p99_us: format!("{p99:.2}"), + } + } +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib, `spider-task-executor` binary, and \ + SPIDER_TEST_INSTRUMENT_OUTPUT_DIR"] +async fn instrument_overhead() { + let output_dir = std::env::var_os(INSTRUMENT_OUTPUT_DIR_ENV).map_or_else( + || panic!("{INSTRUMENT_OUTPUT_DIR_ENV} env var not set"), + PathBuf::from, + ); + + let mut handle = ExecutorHandle::spawn(); + + let payload = path_like_payload(PAYLOAD_LEN); + let raw_inputs = encode_single_input(&payload); + let sleep_floor = Duration::from_micros(INSTRUMENT_SLEEP_US); + + // Warm-up: first call dlopens the package. Assert correctness; discard timing. 
+ handle + .send(&execute_request("sleep_and_echo", raw_inputs.clone())) + .await; + expect_echo(&handle.recv().await, &payload); + + let mut e2e_samples = Vec::with_capacity(ITERATIONS); + let mut executor_samples = Vec::with_capacity(ITERATIONS); + let mut executor_internal_samples = Vec::with_capacity(ITERATIONS); + let mut ipc_overhead_samples = Vec::with_capacity(ITERATIONS); + + for _ in 0..ITERATIONS { + let started = Instant::now(); + handle + .send(&execute_request("sleep_and_echo", raw_inputs.clone())) + .await; + let response = handle.recv().await; + let e2e = started.elapsed(); + + let Response::Result { + outcome, + elapsed_us, + } = response; + let ExecutorOutcome::Success { outputs } = outcome else { + panic!("sleep_and_echo task unexpectedly failed in overhead loop"); + }; + let got: Vec = decode_single_output(&outputs); + assert_eq!(got, payload); + + let executor = Duration::from_micros(elapsed_us); + // Defensive: a coarse system clock could in principle report e2e < executor, or executor < + // sleep_floor (the sleep can return slightly early on some platforms). Treat both as zero + // overhead and keep the sample for visibility. 
+ let executor_internal = executor.checked_sub(sleep_floor).unwrap_or(Duration::ZERO); + let ipc_overhead = e2e.checked_sub(executor).unwrap_or(Duration::ZERO); + + e2e_samples.push(e2e); + executor_samples.push(executor); + executor_internal_samples.push(executor_internal); + ipc_overhead_samples.push(ipc_overhead); + } + + handle.shutdown_clean().await; + + let rows = vec![ + LatencyRow::from_samples("E2E (parent)", &mut e2e_samples.clone()), + LatencyRow::from_samples("Executor FFI", &mut executor_samples.clone()), + LatencyRow::from_samples( + "Executor internal (FFI - sleep)", + &mut executor_internal_samples.clone(), + ), + LatencyRow::from_samples( + "IPC overhead (E2E - FFI)", + &mut ipc_overhead_samples.clone(), + ), + ]; + let table = Table::new(rows).to_string(); + + let preamble = format!( + "# Task-executor overhead\n\nInputs: `sleep_and_echo` task with {PAYLOAD_LEN} path-like \ + strings echoed after a {INSTRUMENT_SLEEP_US}µs sleep, {ITERATIONS} samples (excluding \ + warm-up).\n\n* `Executor internal` ≈ in-executor input/output serde cost.\n* `IPC \ + overhead` ≈ parent-side framing + bincode + pipe traversal.\n\n" + ); + + let path = output_dir.join(OUTPUT_FILE); + let mut file = + File::create(&path).unwrap_or_else(|err| panic!("create {} failed: {err}", path.display())); + file.write_all(preamble.as_bytes()).expect("write preamble"); + file.write_all(table.as_bytes()).expect("write table"); + file.write_all(b"\n").expect("write trailing newline"); +} + +/// Builds `len` deterministic path-like strings. Mixing prefixes and suffixes keeps the payload +/// representative of a realistic input without depending on `rand`. +/// +/// # Returns +/// +/// A `Vec` of length `len`. 
+fn path_like_payload(len: usize) -> Vec { + const PREFIXES: &[&str] = &[ + "/var/log", + "/usr/local/bin", + "/etc/spider", + "/home/user/projects", + "/opt/data/cache", + ]; + const SUFFIXES: &[&str] = &["log", "txt", "bin", "json", "tmp"]; + (0..len) + .map(|idx| { + let prefix = PREFIXES[idx % PREFIXES.len()]; + let suffix = SUFFIXES[(idx / PREFIXES.len()) % SUFFIXES.len()]; + format!("{prefix}/file_{:04}_{idx:05}.{suffix}", (idx * 31) % 10_000) + }) + .collect() +} + +/// Asserts that `response` is a `Success` whose decoded payload equals `expected`. +/// +/// # Panics +/// +/// Panics if the response is a `Failure` (the decoded +/// [`ExecutorError`](spider_task_executor::ExecutorError) is included in the panic message), or if +/// the decoded payload doesn't match `expected`. +fn expect_echo(response: &Response, expected: &[String]) { + let Response::Result { outcome, .. } = response; + let outputs = match outcome { + ExecutorOutcome::Success { outputs } => outputs, + ExecutorOutcome::Failure { error } => { + let err: spider_task_executor::ExecutorError = + rmp_serde::from_slice(error).expect("decode ExecutorError payload"); + panic!("sleep_and_echo warm-up returned Failure: {err:?}"); + } + }; + let got: Vec = decode_single_output(outputs); + assert_eq!(got, expected, "warm-up output mismatch"); +} diff --git a/tests/huntsman/task-executor/tests/test_executor.rs b/tests/huntsman/task-executor/tests/test_executor.rs new file mode 100644 index 00000000..e2eb8ec4 --- /dev/null +++ b/tests/huntsman/task-executor/tests/test_executor.rs @@ -0,0 +1,90 @@ +//! End-to-end correctness tests against the `spider-task-executor` binary. +//! +//! Each test spawns a fresh executor subprocess via [`ExecutorHandle::spawn`], exchanges one framed +//! bincode request/response over the binary's stdin/stdout, and asserts on the result. 
+ +use spider_task_executor::{ + ExecutorError, + protocol::{ExecutorOutcome, Response}, +}; +use spider_tdl::TdlError; +use task_executor_tests::{ + ExecutorHandle, + decode_single_output, + encode_no_inputs, + encode_single_input, + execute_request, +}; + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn fibonacci_returns_correct_value() { + let mut handle = ExecutorHandle::spawn(); + let input: u64 = 10; + handle + .send(&execute_request("fibonacci", encode_single_input(&input))) + .await; + let Response::Result { outcome, .. } = handle.recv().await; + match outcome { + ExecutorOutcome::Success { outputs } => { + let got: u64 = decode_single_output(&outputs); + // Fib(10) = 55 + assert_eq!(got, 55); + } + ExecutorOutcome::Failure { error } => { + let err: ExecutorError = + rmp_serde::from_slice(&error).expect("decode ExecutorError payload"); + panic!("expected Success for fibonacci(10), got Failure: {err:?}"); + } + } + handle.shutdown_clean().await; +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn always_fail_reports_task_error() { + let mut handle = ExecutorHandle::spawn(); + handle + .send(&execute_request("always_fail", encode_no_inputs())) + .await; + let Response::Result { outcome, .. 
} = handle.recv().await; + match outcome { + ExecutorOutcome::Success { outputs } => { + panic!("expected Failure, got Success with {} bytes", outputs.len()); + } + ExecutorOutcome::Failure { error } => { + let err: ExecutorError = + rmp_serde::from_slice(&error).expect("decode ExecutorError payload"); + let ExecutorError::TaskError(TdlError::ExecutionError(message)) = &err else { + panic!("expected TaskError(ExecutionError), got {err:?}"); + }; + assert!( + message.contains("always_fail"), + "unexpected error message: {message}", + ); + } + } + handle.shutdown_clean().await; +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn always_panic_crashes_the_process() { + let mut handle = ExecutorHandle::spawn(); + handle + .send(&execute_request("always_panic", encode_no_inputs())) + .await; + + // A panic across the `extern "C"` boundary aborts the executor process. The parent must + // observe stdout EOF (no further frames) and a non-zero exit status. 
+ let frame = handle.try_recv().await; + assert!( + frame.is_none(), + "expected stdout EOF after panic, got a response frame: {frame:?}", + ); + let status = handle.wait_for_exit().await; + assert!( + !status.success(), + "expected non-zero exit after panic, got {status:?}", + ); +} diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs index 007cb557..513e7d75 100644 --- a/tests/huntsman/tdl-integration/tests/complex.rs +++ b/tests/huntsman/tdl-integration/tests/complex.rs @@ -88,8 +88,8 @@ fn decode_complex_vec(output_bytes: &[u8]) -> anyhow::Result { fn load_and_query_name() -> anyhow::Result<()> { let path = lib_path(); let mut manager = TdlPackageManager::new(); - let name = manager.load(&path)?; - assert_eq!(name, PACKAGE_NAME); + let pkg = manager.load(&path)?; + assert_eq!(pkg.name(), PACKAGE_NAME); let pkg = manager .get(PACKAGE_NAME) .expect("just-loaded package should be retrievable"); From 86c7bea4c1542745d8e1b36699ba6e674f8bf6d9 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Tue, 26 May 2026 16:07:06 -0400 Subject: [PATCH 02/14] feat(spider-execution-manager): Add single-process supervisor pool for the task executor. 
(#326) --- Cargo.lock | 18 + Cargo.toml | 1 + .../spider-execution-manager/Cargo.toml | 28 ++ .../spider-execution-manager/src/lib.rs | 4 + .../src/process_pool.rs | 385 ++++++++++++++++++ components/spider-storage/Cargo.toml | 10 +- components/spider-task-executor/Cargo.toml | 31 +- examples/huntsman/complex/tasks/Cargo.toml | 5 +- tests/huntsman/task-executor/Cargo.toml | 27 +- .../task-executor/tests/test_process_pool.rs | 208 ++++++++++ 10 files changed, 682 insertions(+), 35 deletions(-) create mode 100644 components/spider-execution-manager/Cargo.toml create mode 100644 components/spider-execution-manager/src/lib.rs create mode 100644 components/spider-execution-manager/src/process_pool.rs create mode 100644 tests/huntsman/task-executor/tests/test_process_pool.rs diff --git a/Cargo.lock b/Cargo.lock index 1c7f0093..2888d5e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1482,6 +1482,23 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "spider-execution-manager" +version = "0.1.0" +dependencies = [ + "bincode", + "bytes", + "futures-util", + "rmp-serde", + "spider-core", + "spider-task-executor", + "spider-tdl", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "spider-storage" version = "0.1.0" @@ -1872,6 +1889,7 @@ dependencies = [ "rmp-serde", "serde", "spider-core", + "spider-execution-manager", "spider-task-executor", "spider-tdl", "tabled", diff --git a/Cargo.toml b/Cargo.toml index 67362f87..ea9992cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ resolver = "3" members = [ "components/spider-core", "components/spider-derive", + "components/spider-execution-manager", "components/spider-storage", "components/spider-task-executor", "components/spider-tdl", diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml new file mode 100644 index 00000000..ed8e74db --- /dev/null +++ b/components/spider-execution-manager/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = 
"spider-execution-manager" +version = "0.1.0" +edition = "2024" + +[lib] +name = "spider_execution_manager" +path = "src/lib.rs" + +[dependencies] +bincode = "1.3.3" +bytes = "1.10" +futures-util = { + version = "0.3.31", + default-features = false, + features = ["sink", "std"] +} +rmp-serde = "1.3.1" +spider-core = { path = "../spider-core" } +spider-task-executor = { path = "../spider-task-executor" } +spider-tdl = { path = "../spider-tdl" } +thiserror = "2.0.18" +tokio = { + version = "1.50.0", + features = ["io-util", "macros", "process", "rt", "sync", "time"] +} +tokio-util = { version = "0.7", features = ["codec"] } +tracing = { version = "0.1.41", default-features = false, features = ["std"] } diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs new file mode 100644 index 00000000..2d7171a9 --- /dev/null +++ b/components/spider-execution-manager/src/lib.rs @@ -0,0 +1,4 @@ +//! Execution manager — the per-node service that drives Spider task execution against a +//! `spider-task-executor` subprocess. + +pub mod process_pool; diff --git a/components/spider-execution-manager/src/process_pool.rs b/components/spider-execution-manager/src/process_pool.rs new file mode 100644 index 00000000..fab51d53 --- /dev/null +++ b/components/spider-execution-manager/src/process_pool.rs @@ -0,0 +1,385 @@ +//! Process supervisor for `spider-task-executor` subprocesses. 
+ +use std::{ + fs::File, + path::PathBuf, + process::Stdio, + sync::atomic::{AtomicU64, Ordering}, + time::Duration, +}; + +use bytes::Bytes; +use futures_util::{SinkExt, StreamExt}; +use spider_core::types::{ + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId}, + io::ExecutionContext, +}; +use spider_task_executor::protocol::{ExecutorOutcome, Request, Response}; +use spider_tdl::{ + TaskContext, + wire::{TaskInputsSerializer, WireError}, +}; +use tokio::{ + process::{Child, ChildStdin, ChildStdout, Command}, + sync::Mutex, +}; +use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec}; + +/// Pool configuration. Supplied once at construction time and never mutated. +#[derive(Debug, Clone)] +pub struct ProcessPoolConfig { + /// Identity of the owning execution manager. + pub em_id: ExecutionManagerId, + + /// Absolute path to the `spider-task-executor` binary the pool will spawn. + pub executor_binary_path: PathBuf, + + /// Directory exposed to the child via `SPIDER_TDL_PACKAGE_DIR`. The executor resolves + /// `${dir}//lib.so` for each package it dispatches. + pub package_dir: PathBuf, + + /// Directory the pool writes per-executor stderr log files into. Each spawn opens + /// `/-.log` in create-or-append mode and routes the child's + /// stderr there. + /// + /// Per-spawn filenames mean each respawn naturally rotates onto a fresh file; a long-lived + /// healthy executor accumulates into one file. + pub log_dir: PathBuf, +} + +/// Request to execute a task inside the spawned task executor. +#[derive(Debug)] +pub struct ExecuteRequest { + pub job_id: JobId, + pub task_id: TaskId, + pub resource_group_id: ResourceGroupId, + pub ctx: ExecutionContext, +} + +/// Outcome of a single [`ProcessPool::execute`] call. +#[derive(Debug)] +pub enum Outcome { + /// Task ran to completion. `outputs` is the wire-format + /// [`spider_tdl::wire::TaskOutputsSerializer`] buffer ready to forward to storage as + /// `serialized_outputs`. 
`elapsed_us` is the in-FFI duration measured by the executor. + Success { outputs: Vec, elapsed_us: u64 }, + + /// Task ran to completion but returned an error. `error` is the msgpack-encoded + /// [`spider_task_executor::ExecutorError`]. + InTaskFailure { error: Vec, elapsed_us: u64 }, + + /// `hard_timeout` elapsed before the executor replied. The pool has `SIGKILL`-ed the process. + Timeout { hard_timeout: Duration }, + + /// The executor process exited (or closed stdout) before replying. + ExecutorCrash { exit_status: Option }, +} + +/// Internal failure of the pool itself, distinct from a task-execution [`Outcome`]. These indicate +/// the pool can't serve the current request (and possibly any future request). +/// +/// This error may indicate a non-recoverable failure. The upper-level caller may need to close the +/// entire process pool and restart the execution manager service from the ground. +#[derive(Debug, thiserror::Error)] +pub enum InternalError { + /// The pool was entered with no running executor. + #[error("task executor process is not running")] + NotRunning, + + /// Failed to spawn the executor (any I/O step during spawn — `create_dir_all`, log-file open, + /// [`Command::spawn`], or claiming the piped stdio handles). + #[error("failed to create an executor process: {0}")] + ExecutorCreationFailure(#[from] std::io::Error), + + /// Failed to msgpack-encode the [`TaskContext`] when building the executor request. + #[error("failed to encode task context: {0}")] + EncodeTaskContext(#[from] rmp_serde::encode::Error), + + /// Failed to wire-format-encode the task inputs when building the executor request. + #[error("failed to encode task inputs: {0}")] + EncodeTaskInputs(#[from] WireError), +} + +/// The process pool of pre-forked task executor subprocesses ready for task execution. +pub struct ProcessPool { + config: ProcessPoolConfig, + next_executor_id: AtomicU64, + /// Lock-serializes concurrent [`Self::execute`] callers. 
The single executor means each caller + /// takes the lock for the whole call, so the mutex is the entire concurrency gate. + handle: Mutex>, +} + +impl ProcessPool { + /// Factory function. + /// + /// Spawns the initial executor process and returns a ready-to-use pool. + /// + /// # Returns + /// + /// A pool whose handle already holds a freshly spawned executor on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::spawn_executor`]'s return values on failure. + pub fn new(config: ProcessPoolConfig) -> Result { + let mut this = Self { + config, + handle: Mutex::new(None), + next_executor_id: AtomicU64::new(0), + }; + let handle = this.spawn_executor().inspect_err(|err| { + tracing::error!(err = ? err, "Failed to spawn executor process on construction."); + })?; + *this.handle.get_mut() = Some(handle); + Ok(this) + } + + /// Runs one task on the pooled executor. + /// + /// Locks the handle so concurrent callers queue. Once inside, the request is bincode-framed + /// onto the child's stdin and the parent races a deadline against the response frame. On + /// timeout or crash the process is killed and respawned before the call returns; subsequent + /// calls see a fresh executor. + /// + /// # Returns + /// + /// Exactly one [`Outcome`] variant describing the dispatch result on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`InternalError::NotRunning`] if the pool's handle was empty at entry — meaning a prior + /// respawn failed and the pool is unrecoverable. The pool should be discarded. + /// * Forwards [`build_request`]'s return values on failure. + /// * Forwards [`Self::spawn_executor`]'s return values on failure. + pub async fn execute( + &self, + request: ExecuteRequest, + hard_timeout: Duration, + ) -> Result { + let mut handle_guard = self.handle.lock().await; + let handle = handle_guard.as_mut().ok_or(InternalError::NotRunning)?; + tracing::info!( + job_id = ? 
request.job_id, + task_id = ? request.task_id, + task_instance_id = ? request.ctx.task_instance_id, + executor_id = handle.executor_id, + "Task executor acquired for execution." + ); + let frame_request = build_request(request)?; + let outcome = handle.run(frame_request, hard_timeout).await; + + if matches!( + outcome, + Outcome::Timeout { .. } | Outcome::ExecutorCrash { .. } + ) { + // Dropping the handle will automatically kill the child process. + drop(handle_guard.take()); + let new_handle = self.spawn_executor().inspect_err(|err| { + tracing::error!( + err = ? err, + "Failed to respawn the executor process after a crash or timeout." + ); + })?; + tracing::info!( + executor_id = new_handle.executor_id, + "Executor respawned successfully." + ); + *handle_guard = Some(new_handle); + } + + drop(handle_guard); + Ok(outcome) + } + + /// Spawns the executor binary, allocates the next monotonic executor-id, opens the per-executor + /// log file, and wraps the child's stdin/stdout in length-delimited codec frames. + /// + /// The child's stderr is redirected to `/-.log` in + /// create-or-append mode. + /// + /// # Returns + /// + /// A fully wired [`ExecutorHandle`] on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`InternalError::ExecutorCreationFailure`] if the piped stdin or stdout handles cannot be + /// claimed after spawn. + /// * Forwards [`std::fs::create_dir_all`]'s return values on failure. + /// * Forwards [`std::fs::OpenOptions::open`]'s return values on failure. + /// * Forwards [`Command::spawn`]'s return values on failure. 
+ fn spawn_executor(&self) -> Result { + let executor_id = self.next_executor_id.fetch_add(1, Ordering::Relaxed); + std::fs::create_dir_all(&self.config.log_dir)?; + let log_path = self.config.log_dir.join(format!( + "{}-{executor_id}.log", + self.config.em_id.as_uuid_ref() + )); + let log_file = File::options().create(true).append(true).open(&log_path)?; + + let mut command = Command::new(&self.config.executor_binary_path); + command + .env("SPIDER_TDL_PACKAGE_DIR", &self.config.package_dir) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::from(log_file)) + .kill_on_drop(true); + let mut child = command.spawn()?; + let stdin = child + .stdin + .take() + .ok_or_else(|| std::io::Error::other("executor stdin not piped"))?; + let stdout = child + .stdout + .take() + .ok_or_else(|| std::io::Error::other("executor stdout not piped"))?; + tracing::info!(executor_id, "Executor spawned."); + Ok(ExecutorHandle { + executor_id, + child, + requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()), + responses: FramedRead::new(stdout, LengthDelimitedCodec::new()), + }) + } +} + +/// One running executor subprocess plus framed handles to its stdin / stdout. +struct ExecutorHandle { + executor_id: u64, + child: Child, + requests: FramedWrite, + responses: FramedRead, +} + +impl ExecutorHandle { + /// Sends `request` and awaits exactly one reply, racing it against `hard_timeout` and against + /// stdout EOF (process death). + /// + /// # Returns + /// + /// Exactly one [`Outcome`] variant: + /// + /// * [`Outcome::Success`] or [`Outcome::InTaskFailure`] from a well-formed reply. + /// * [`Outcome::Timeout`] if `hard_timeout` fires. + /// * [`Outcome::ExecutorCrash`] on any write/read/decode failure (which all imply the child is + /// no longer usable). 
+ /// + /// # Panics + /// + /// Panics if [`bincode::serialize`] fails to encode `request` — the protocol types are + /// `derive(Serialize)` and serialize trivially, so an encoding failure indicates programmer + /// error rather than a runtime condition. + async fn run(&mut self, request: Request, hard_timeout: Duration) -> Outcome { + let bytes = bincode::serialize(&request).expect("bincode encode Request"); + if let Err(err) = self.requests.send(Bytes::from(bytes)).await { + tracing::warn!( + executor_id = self.executor_id, + err = ? err, + "Failed to send request to executor." + ); + return Outcome::ExecutorCrash { + exit_status: self.poll_exit_code(), + }; + } + + tokio::select! { + biased; + frame = self.responses.next() => match frame { + Some(Ok(bytes)) => match bincode::deserialize::(&bytes) { + Ok(Response::Result { outcome, elapsed_us }) => match outcome { + ExecutorOutcome::Success { outputs } => { + Outcome::Success { outputs, elapsed_us } + } + ExecutorOutcome::Failure { error } => { + Outcome::InTaskFailure { error, elapsed_us } + } + }, + Err(err) => { + tracing::error!( + executor_id = self.executor_id, + err = ? err, + "Failed to decode executor's response. Considered as crashed." + ); + Outcome::ExecutorCrash { exit_status: self.poll_exit_code() } + } + }, + Some(Err(err)) => { + tracing::error!( + executor_id = self.executor_id, + err = ? err, + "Failed to receive executor's response." + ); + Outcome::ExecutorCrash { exit_status: self.poll_exit_code() } + } + None => Outcome::ExecutorCrash { exit_status: self.poll_exit_code() }, + }, + () = tokio::time::sleep(hard_timeout) => { + tracing::warn!(executor_id = self.executor_id, "Executor time out triggered."); + Outcome::Timeout { hard_timeout } + } + } + } + + /// Non-blocking peek at the child's exit status. + /// + /// # Returns + /// + /// `Some(code)` if the child has already exited with a code; `None` if it is still running, was + /// terminated by a signal, or `try_wait` itself errored. 
+ fn poll_exit_code(&mut self) -> Option { + self.child + .try_wait() + .ok() + .flatten() + .and_then(|status| status.code()) + } +} + +/// Builds the wire [`Request::Execute`] from caller inputs. +/// +/// # Returns +/// +/// A populated [`Request::Execute`] with `raw_ctx` set to the msgpack-encoded [`TaskContext`] and +/// `raw_inputs` set to the wire-format [`TaskInputsSerializer`] buffer on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`rmp_serde::to_vec`]'s return values on failure. +/// * Forwards [`TaskInputsSerializer::append`]'s return values on failure. +fn build_request(request: ExecuteRequest) -> Result { + let ExecuteRequest { + job_id, + task_id, + resource_group_id, + ctx, + } = request; + let ExecutionContext { + task_instance_id, + tdl_context, + timeout_policy: _, + inputs, + } = ctx; + let raw_ctx = rmp_serde::to_vec(&TaskContext { + job_id, + task_id, + task_instance_id, + resource_group_id, + })?; + let mut inputs_ser = TaskInputsSerializer::new(); + for input in inputs { + inputs_ser.append(input)?; + } + Ok(Request::Execute { + tdl_context, + raw_ctx, + raw_inputs: inputs_ser.release(), + }) +} diff --git a/components/spider-storage/Cargo.toml b/components/spider-storage/Cargo.toml index 2a661e89..f0a39b72 100644 --- a/components/spider-storage/Cargo.toml +++ b/components/spider-storage/Cargo.toml @@ -25,12 +25,10 @@ spider-derive = { path = "../spider-derive" } sqlx = { version = "0.8.6", features = ["mysql", "runtime-tokio"] } subtle = "2.6.1" thiserror = "2.0.18" -tokio = { version = "1.50.0", features = [ - "macros", - "rt-multi-thread", - "sync", - "time" -] } +tokio = { + version = "1.50.0", + features = ["macros", "rt-multi-thread", "sync", "time"] +} uuid = { version = "1.19.0", features = ["serde"] } [dev-dependencies] diff --git a/components/spider-task-executor/Cargo.toml b/components/spider-task-executor/Cargo.toml index 789308ca..450d2567 100644 --- a/components/spider-task-executor/Cargo.toml 
+++ b/components/spider-task-executor/Cargo.toml @@ -15,28 +15,25 @@ path = "src/bin/spider_task_executor.rs" anyhow = "1.0.98" bincode = "1.3.3" bytes = "1.10" -futures-util = { version = "0.3.31", default-features = false, features = [ - "sink", - "std" -] } +futures-util = { + version = "0.3.31", + default-features = false, + features = ["sink", "std"] +} libloading = "0.8.5" rmp-serde = "1.3.1" serde = { version = "1.0.228", features = ["derive"] } spider-core = { path = "../spider-core" } spider-tdl = { path = "../spider-tdl" } thiserror = "2.0.18" -tokio = { version = "1.50.0", features = [ - "io-std", - "io-util", - "macros", - "rt", - "sync", - "time" -] } +tokio = { + version = "1.50.0", + features = ["io-std", "io-util", "macros", "rt", "sync", "time"] +} tokio-util = { version = "0.7", features = ["codec"] } tracing = { version = "0.1.41", default-features = false, features = ["std"] } -tracing-subscriber = { version = "0.3.19", default-features = false, features = [ - "env-filter", - "fmt", - "json" -] } +tracing-subscriber = { + version = "0.3.19", + default-features = false, + features = ["env-filter", "fmt", "json"] +} diff --git a/examples/huntsman/complex/tasks/Cargo.toml b/examples/huntsman/complex/tasks/Cargo.toml index 71a5dfbc..df76713a 100644 --- a/examples/huntsman/complex/tasks/Cargo.toml +++ b/examples/huntsman/complex/tasks/Cargo.toml @@ -12,4 +12,7 @@ path = "src/lib.rs" [dependencies] huntsman-complex-types = { path = "../types" } serde = { version = "1.0.228", features = ["derive"] } -spider-tdl = { path = "../../../../components/spider-tdl", features = ["derive"] } +spider-tdl = { + path = "../../../../components/spider-tdl", + features = ["derive"] +} diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml index 0d237bef..ca86c0ad 100644 --- a/tests/huntsman/task-executor/Cargo.toml +++ b/tests/huntsman/task-executor/Cargo.toml @@ -16,27 +16,32 @@ path = "tests/test_executor.rs" name = 
"overhead_instrument" path = "tests/overhead_instrument.rs" +[[test]] +name = "process_pool" +path = "tests/test_process_pool.rs" + [dependencies] bincode = "1.3.3" bytes = "1.10" -futures-util = { version = "0.3.31", default-features = false, features = [ - "sink", - "std" -] } +futures-util = { + version = "0.3.31", + default-features = false, + features = ["sink", "std"] +} rmp-serde = "1.3.1" serde = { version = "1.0.228", features = ["derive"] } spider-core = { path = "../../../components/spider-core" } spider-task-executor = { path = "../../../components/spider-task-executor" } spider-tdl = { path = "../../../components/spider-tdl" } -tokio = { version = "1.50.0", features = [ - "io-util", - "macros", - "process", - "rt", - "time" -] } +tokio = { + version = "1.50.0", + features = ["io-util", "macros", "process", "rt", "time"] +} tokio-util = { version = "0.7", features = ["codec"] } [dev-dependencies] integration-test-tasks = { path = "../integration-test-tasks" } +spider-execution-manager = { + path = "../../../components/spider-execution-manager" +} tabled = "0.20.0" diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs new file mode 100644 index 00000000..7bc5d332 --- /dev/null +++ b/tests/huntsman/task-executor/tests/test_process_pool.rs @@ -0,0 +1,208 @@ +//! End-to-end tests of [`spider_execution_manager::process_pool::ProcessPool`] against the real +//! task-executor binary. +//! +//! Mirrors `tests/executor.rs` but exercises the pool's `execute` API rather than the raw +//! [`task_executor_tests::ExecutorHandle`]. Adds coverage for the two paths that respawn the +//! executor: +//! +//! * Hard timeout — a long-running task is force-killed when the parent's timer fires. +//! * Crash — a panicking task aborts the executor process. +//! +//! Each of those paths is followed by a second `execute` that asserts the pool transparently +//! 
respawned the child and is ready to serve again. + +use std::time::Duration; + +use spider_core::{ + task::{TdlContext, TimeoutPolicy}, + types::{ + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId}, + io::{ExecutionContext, TaskInput}, + }, +}; +use spider_execution_manager::process_pool::{ + ExecuteRequest, + Outcome, + ProcessPool, + ProcessPoolConfig, +}; +use spider_task_executor::ExecutorError; +use spider_tdl::TdlError; +use task_executor_tests::{PACKAGE_NAME, decode_single_output, task_executor_bin, tdl_package_dir}; + +/// Generous timeout for tasks expected to finish quickly. +const NORMAL_TIMEOUT: Duration = Duration::from_secs(5); + +/// Hard timeout chosen to fire well before [`SLOW_FIB_INDEX`] can complete even on a fast host. +/// Tokio's sleep granularity is comfortably below this value. +const SHORT_TIMEOUT: Duration = Duration::from_millis(200); + +/// Fibonacci index whose naive-recursive execution takes well over [`SHORT_TIMEOUT`] on any +/// realistic host (`fib(45)` ~= 1.1×10^9 recursive calls — about a second in release mode). +const SLOW_FIB_INDEX: u64 = 45; + +/// Builds a fresh [`ProcessPool`] wired to the test-harness env (executor binary + package dir) +/// with a unique temp log directory. +/// +/// # Returns +/// +/// A ready-to-use pool whose handle already holds a spawned executor. +/// +/// # Panics +/// +/// Panics if [`ProcessPool::new`] fails — i.e., the task-executor binary cannot be spawned. +fn build_pool() -> ProcessPool { + let em_id = ExecutionManagerId::new(); + let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{}", em_id.as_uuid_ref())); + let config = ProcessPoolConfig { + em_id, + executor_binary_path: task_executor_bin(), + package_dir: tdl_package_dir(), + log_dir, + }; + ProcessPool::new(config).expect("construct pool") +} + +/// Builds an [`ExecuteRequest`] targeting `task_func` in the integration package. 
+/// +/// # Returns +/// +/// A request with fresh IDs, a placeholder [`TimeoutPolicy`] (which the pool ignores — the caller +/// supplies `hard_timeout` directly to [`ProcessPool::execute`]), and the supplied `inputs`. +fn make_request(task_func: &str, inputs: Vec<TaskInput>) -> ExecuteRequest { + ExecuteRequest { + job_id: JobId::new(), + task_id: TaskId::new(), + resource_group_id: ResourceGroupId::new(), + ctx: ExecutionContext { + task_instance_id: 1, + tdl_context: TdlContext { + package: PACKAGE_NAME.to_owned(), + task_func: task_func.to_owned(), + }, + timeout_policy: TimeoutPolicy { + soft_timeout_ms: 100, + hard_timeout_ms: 1000, + }, + inputs, + }, + } +} + +/// Wraps `value` into a single-payload input list. +/// +/// # Type Parameters +/// +/// * `T` - The Serde-serializable value type carried as the task's single input. +/// +/// # Returns +/// +/// A `Vec<TaskInput>` of length 1 carrying the msgpack-encoded `value`. +/// +/// # Panics +/// +/// Panics if msgpack encoding fails. +fn single_input<T: serde::Serialize>(value: &T) -> Vec<TaskInput> { + vec![TaskInput::ValuePayload( + rmp_serde::to_vec(value).expect("msgpack encode input"), + )] +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn fibonacci_succeeds() { + let pool = build_pool(); + let outcome = pool + .execute( + make_request("fibonacci", single_input(&10_u64)), + NORMAL_TIMEOUT, + ) + .await + .expect("execute"); + let Outcome::Success { outputs, .. } = outcome else { + panic!("expected Success, got {outcome:?}"); + }; + assert_eq!(decode_single_output::<u64>(&outputs), 55); +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn always_fail_reports_task_error() { + let pool = build_pool(); + let outcome = pool + .execute(make_request("always_fail", vec![]), NORMAL_TIMEOUT) + .await + .expect("execute"); + let Outcome::InTaskFailure { error, .. 
} = outcome else { + panic!("expected InTaskFailure, got {outcome:?}"); + }; + let err: ExecutorError = rmp_serde::from_slice(&error).expect("decode ExecutorError"); + let ExecutorError::TaskError(TdlError::ExecutionError(message)) = err else { + panic!("expected TaskError(ExecutionError), got {err:?}"); + }; + assert!( + message.contains("always_fail"), + "unexpected message: {message}" + ); +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn always_panic_returns_crash_then_respawns() { + let pool = build_pool(); + + let outcome = pool + .execute(make_request("always_panic", vec![]), NORMAL_TIMEOUT) + .await + .expect("execute (crash)"); + assert!( + matches!(outcome, Outcome::ExecutorCrash { .. }), + "expected ExecutorCrash, got {outcome:?}", + ); + + // The pool must have respawned the executor before returning. A follow-up call must succeed + // against the fresh process. + let outcome = pool + .execute( + make_request("fibonacci", single_input(&7_u64)), + NORMAL_TIMEOUT, + ) + .await + .expect("execute (after respawn)"); + let Outcome::Success { outputs, .. } = outcome else { + panic!("expected Success after respawn, got {outcome:?}"); + }; + assert_eq!(decode_single_output::<u64>(&outputs), 13); +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn hard_timeout_kills_then_respawns() { + let pool = build_pool(); + + let outcome = pool + .execute( + make_request("fibonacci", single_input(&SLOW_FIB_INDEX)), + SHORT_TIMEOUT, + ) + .await + .expect("execute (timeout)"); + let Outcome::Timeout { hard_timeout } = outcome else { + panic!("expected Timeout, got {outcome:?}"); + }; + assert_eq!(hard_timeout, SHORT_TIMEOUT); + + // The pool must have respawned the executor before returning. A follow-up call must succeed + // against the fresh process. 
+ let outcome = pool + .execute( + make_request("fibonacci", single_input(&7_u64)), + NORMAL_TIMEOUT, + ) + .await + .expect("execute (after respawn)"); + let Outcome::Success { outputs, .. } = outcome else { + panic!("expected Success after respawn, got {outcome:?}"); + }; + assert_eq!(decode_single_output::<u64>(&outputs), 13); +} From 27091f06f5384816b0be1a0c1e419399f05785a2 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Thu, 28 May 2026 22:43:54 -0400 Subject: [PATCH 03/14] feat(spider-execution-manager): Add scheduler, storage, and liveness client traits. (#327) --- Cargo.lock | 1 + .../spider-execution-manager/Cargo.toml | 1 + .../spider-execution-manager/src/client.rs | 15 ++ .../src/client/liveness.rs | 79 +++++++++++ .../src/client/scheduler.rs | 59 ++++++++ .../src/client/storage.rs | 134 ++++++++++++++++++ .../spider-execution-manager/src/lib.rs | 1 + 7 files changed, 290 insertions(+) create mode 100644 components/spider-execution-manager/src/client.rs create mode 100644 components/spider-execution-manager/src/client/liveness.rs create mode 100644 components/spider-execution-manager/src/client/scheduler.rs create mode 100644 components/spider-execution-manager/src/client/storage.rs diff --git a/Cargo.lock b/Cargo.lock index 2888d5e8..e862f5dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1486,6 +1486,7 @@ dependencies = [ name = "spider-execution-manager" version = "0.1.0" dependencies = [ + "async-trait", "bincode", "bytes", "futures-util", diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml index ed8e74db..6b687212 100644 --- a/components/spider-execution-manager/Cargo.toml +++ b/components/spider-execution-manager/Cargo.toml @@ -8,6 +8,7 @@ name = "spider_execution_manager" path = "src/lib.rs" [dependencies] +async-trait = "0.1.89" bincode = "1.3.3" bytes = "1.10" futures-util = { diff --git a/components/spider-execution-manager/src/client.rs 
b/components/spider-execution-manager/src/client.rs new file mode 100644 index 00000000..4f335f6e --- /dev/null +++ b/components/spider-execution-manager/src/client.rs @@ -0,0 +1,15 @@ +//! Network client traits used by the execution manager. +//! +//! Three traits cover the EM's outbound traffic: +//! +//! * [`scheduler::SchedulerClient`] — pulls task assignments from the scheduler. +//! * [`storage::StorageClient`] — registers task instances and reports their outcome. +//! * [`liveness::LivenessClient`] — registers the EM at boot and ticks the heartbeat thereafter. + +pub mod liveness; +pub mod scheduler; +pub mod storage; + +pub use liveness::{LivenessClient, LivenessResponseError, RegistrationResponse}; +pub use scheduler::{SchedulerClient, SchedulerError, SchedulerResponse}; +pub use storage::{StorageClient, StorageResponseError}; diff --git a/components/spider-execution-manager/src/client/liveness.rs b/components/spider-execution-manager/src/client/liveness.rs new file mode 100644 index 00000000..3261c9d8 --- /dev/null +++ b/components/spider-execution-manager/src/client/liveness.rs @@ -0,0 +1,79 @@ +//! Liveness client trait. +//! +//! The execution manager registers itself with storage at boot, then sends a periodic heartbeat. +//! Each heartbeat both keeps the EM marked alive and returns storage's current session id. + +use std::net::IpAddr; + +use async_trait::async_trait; +use spider_core::types::id::{ExecutionManagerId, SessionId}; + +/// The execution manager's identity and the storage session at registration time. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RegistrationResponse { + pub em_id: ExecutionManagerId, + pub session_id: SessionId, +} + +/// Errors returned by [`LivenessClient`] operations. +#[derive(Debug, thiserror::Error)] +pub enum LivenessResponseError { + /// Storage has reaped this execution manager. 
+ #[error("execution manager already marked dead")] + MarkedDead, + + /// Connection lost, request timeout, or wire-format serialization failure. Callers may back off + /// and retry. + #[error("transport error: {0}")] + Transport(String), + + /// The execution manager id was rejected by storage (e.g. unknown id). + #[error("execution manager id rejected: {0}")] + IllegalId(String), +} + +/// Client interface to the storage server's execution-manager liveness endpoint. +#[async_trait] +pub trait LivenessClient: Send + Sync { + /// Registers the execution manager with storage and obtains its id. + /// + /// Called once at boot. + /// + /// # Parameters + /// + /// * `ip` - The advertised IP address of the execution manager process. + /// + /// # Returns + /// + /// The freshly assigned execution manager id and the current storage session id on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`LivenessResponseError::Transport`] if the connection was lost or timed out. + async fn register(&self, ip: IpAddr) -> Result<RegistrationResponse, LivenessResponseError>; + + /// Sends one heartbeat for `em_id` and returns the storage's current session id. + /// + /// # Parameters + /// + /// * `em_id` - The execution manager id being heartbeated. + /// + /// # Returns + /// + /// The storage's current session id on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`LivenessResponseError::MarkedDead`] if storage has already reaped this execution + /// manager. + /// * [`LivenessResponseError::Transport`] if the connection was lost or timed out. + /// * [`LivenessResponseError::IllegalId`] if storage rejected the id. 
+ async fn heartbeat( + &self, + em_id: ExecutionManagerId, + ) -> Result<SessionId, LivenessResponseError>; +} diff --git a/components/spider-execution-manager/src/client/scheduler.rs b/components/spider-execution-manager/src/client/scheduler.rs new file mode 100644 index 00000000..cf13687a --- /dev/null +++ b/components/spider-execution-manager/src/client/scheduler.rs @@ -0,0 +1,59 @@ +//! Scheduler client trait. +//! +//! The execution manager acquires tasks from the scheduler through [`SchedulerClient`]. + +use async_trait::async_trait; +use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId}; + +/// A task assignment handed to the execution manager by the scheduler. +/// +/// `session_id` is the scheduler's view of storage's session at the moment the assignment was +/// produced. The execution manager pins this exact value on every subsequent storage call for the +/// attempt. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SchedulerResponse { + pub job_id: JobId, + pub task_id: TaskId, + pub session_id: SessionId, +} + +/// Errors returned by [`SchedulerClient::next_task`]. +#[derive(Debug, thiserror::Error)] +pub enum SchedulerError { + /// Connection to the scheduler was lost or the request timed out. Callers may back off and + /// retry. + #[error("transport error: {0}")] + Transport(String), + + /// The scheduler returned a malformed reply. + #[error("protocol error: {0}")] + Protocol(String), +} + +/// Client interface to the scheduler service. +#[async_trait] +pub trait SchedulerClient: Send + Sync { + /// Blocks until a task is assigned to this execution manager. + /// + /// Implementations may long-poll the scheduler; callers should treat this call as a + /// cancellation point. + /// + /// # Parameters + /// + /// * `em_id` - The identity of the calling execution manager. + /// + /// # Returns + /// + /// A [`SchedulerResponse`] describing the assigned task on success. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::Transport`] if the connection was lost or the request timed out. + /// * [`SchedulerError::Protocol`] if the scheduler returned a malformed reply. + async fn next_task( + &self, + em_id: ExecutionManagerId, + ) -> Result<SchedulerResponse, SchedulerError>; +} diff --git a/components/spider-execution-manager/src/client/storage.rs b/components/spider-execution-manager/src/client/storage.rs new file mode 100644 index 00000000..89732c8c --- /dev/null +++ b/components/spider-execution-manager/src/client/storage.rs @@ -0,0 +1,134 @@ +//! Storage client trait. +//! +//! The execution manager interacts with the storage server through this trait to register a task +//! instance, fetch its [`ExecutionContext`], and report success or failure. + +use async_trait::async_trait; +use spider_core::types::{ + id::{ExecutionManagerId, JobId, SessionId, TaskId}, + io::ExecutionContext, +}; + +/// Errors returned by [`StorageClient`] operations. +/// +/// The variants intentionally mirror the storage server's externally visible failure modes (see +/// `spider_storage::state::error::StorageServerError`) plus a transport bucket for connection / +/// serialization failures. +#[derive(Debug, thiserror::Error)] +pub enum StorageResponseError { + /// The `session_id` carried with the request does not match storage's current session. + #[error("stale session (storage now at {storage_session})")] + StaleSession { storage_session: SessionId }, + + /// Storage's job cache rejected the operation as stale (e.g. the task or its job has already + /// terminated). + #[error("cache stale: {0}")] + CacheStale(String), + + /// Connection lost, request timeout, or wire-format serialization failure. Callers may back off + /// and retry. + #[error("transport error: {0}")] + Transport(String), + + /// The storage server returned an otherwise-uncategorized error. 
+ #[error("storage server: {0}")] + Server(String), + + /// The input to the operation is invalid. + #[error("invalid input: {0}")] + InvalidInput(String), +} + +/// Client interface to the storage server. +#[async_trait] +pub trait StorageClient: Send + Sync { + /// Registers a task instance and fetches its execution context. + /// + /// # Parameters + /// + /// * `job_id` - The owning job. + /// * `task_id` - The task being instantiated. + /// * `em_id` - The identity of the calling execution manager. + /// * `session_id` - The session id captured from the scheduler assignment, pinned for the + /// lifetime of the attempt. + /// + /// # Returns + /// + /// The [`ExecutionContext`] for the task instance on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current + /// session. + /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the registration. + /// * [`StorageResponseError::Transport`] if the connection was lost or timed out. + /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error. + async fn register_task_instance( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + ) -> Result<ExecutionContext, StorageResponseError>; + + /// Reports successful execution of a task instance. + /// + /// # Parameters + /// + /// * `job_id` - The owning job. + /// * `task_id` - The task that ran. + /// * `em_id` - The identity of the calling execution manager. + /// * `session_id` - The session id captured from the scheduler assignment. + /// * `serialized_outputs` - The wire-format encoded task outputs buffer, forwarded verbatim to + /// storage. For commit tasks and cleanup tasks, this must be `None`. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current + /// session. 
+ /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the report. + /// * [`StorageResponseError::Transport`] if the connection was lost or timed out. + /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error. + /// * [`StorageResponseError::InvalidInput`] if `serialized_outputs` is `Some` for a commit or + /// cleanup task. + async fn report_task_success( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + serialized_outputs: Option<Vec<u8>>, + ) -> Result<(), StorageResponseError>; + + /// Reports failed execution of a task instance. + /// + /// # Parameters + /// + /// * `job_id` - The owning job. + /// * `task_id` - The task that ran. + /// * `em_id` - The identity of the calling execution manager. + /// * `session_id` - The session id captured from the scheduler assignment. + /// * `error_message` - The formatted error message. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageResponseError::StaleSession`] if `session_id` no longer matches storage's current + /// session. + /// * [`StorageResponseError::CacheStale`] if storage's job cache rejected the report. + /// * [`StorageResponseError::Transport`] if the connection was lost or timed out. + /// * [`StorageResponseError::Server`] if storage returned an otherwise-uncategorized error. + async fn report_task_failure( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + error_message: String, + ) -> Result<(), StorageResponseError>; +} diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs index 2d7171a9..84a2b6b2 100644 --- a/components/spider-execution-manager/src/lib.rs +++ b/components/spider-execution-manager/src/lib.rs @@ -1,4 +1,5 @@ //! Execution manager — the per-node service that drives Spider task execution against a //! `spider-task-executor` subprocess. 
+pub mod client; pub mod process_pool; From 85a5130cb77af8e2611bb7536e8b511174bb63a5 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 1 Jun 2026 22:10:03 -0400 Subject: [PATCH 04/14] feat(spider-execution-manager): Add liveness actor with session ID tracker; Refactor integration tests to extract common helpers into `test-utils`. (#328) --- Cargo.lock | 26 +- Cargo.toml | 1 + components/spider-core/Cargo.toml | 4 + components/spider-core/src/lib.rs | 1 + components/spider-core/src/session.rs | 107 +++++ .../spider-execution-manager/Cargo.toml | 2 +- .../spider-execution-manager/src/lib.rs | 1 + .../spider-execution-manager/src/liveness.rs | 398 ++++++++++++++++++ tests/huntsman/task-executor/Cargo.toml | 26 +- tests/huntsman/task-executor/src/lib.rs | 278 +----------- .../tests/overhead_instrument.rs | 7 +- .../task-executor/tests/test_executor.rs | 2 +- .../task-executor/tests/test_process_pool.rs | 27 +- tests/huntsman/test-utils/Cargo.toml | 32 ++ tests/huntsman/test-utils/src/executor.rs | 297 +++++++++++++ tests/huntsman/test-utils/src/lib.rs | 16 + tests/huntsman/test-utils/src/mock.rs | 195 +++++++++ 17 files changed, 1095 insertions(+), 325 deletions(-) create mode 100644 components/spider-core/src/session.rs create mode 100644 components/spider-execution-manager/src/liveness.rs create mode 100644 tests/huntsman/test-utils/Cargo.toml create mode 100644 tests/huntsman/test-utils/src/executor.rs create mode 100644 tests/huntsman/test-utils/src/lib.rs create mode 100644 tests/huntsman/test-utils/src/mock.rs diff --git a/Cargo.lock b/Cargo.lock index e862f5dc..bb6a1c31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1469,6 +1469,8 @@ dependencies = [ "sqlx", "strum", "thiserror", + "tokio", + "tokio-util", "uuid", ] @@ -1883,19 +1885,15 @@ dependencies = [ name = "task-executor-tests" version = "0.1.0" dependencies = [ - "bincode", - "bytes", - "futures-util", "integration-test-tasks", "rmp-serde", - "serde", 
"spider-core", "spider-execution-manager", "spider-task-executor", "spider-tdl", "tabled", + "test-utils", "tokio", - "tokio-util", ] [[package]] @@ -1910,6 +1908,24 @@ dependencies = [ "spider-tdl", ] +[[package]] +name = "test-utils" +version = "0.1.0" +dependencies = [ + "async-trait", + "bincode", + "bytes", + "futures-util", + "rmp-serde", + "serde", + "spider-core", + "spider-execution-manager", + "spider-task-executor", + "spider-tdl", + "tokio", + "tokio-util", +] + [[package]] name = "testing_table" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index ea9992cf..5eb18596 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,4 +13,5 @@ members = [ "tests/huntsman/integration-test-tasks", "tests/huntsman/task-executor", "tests/huntsman/tdl-integration", + "tests/huntsman/test-utils", ] diff --git a/components/spider-core/Cargo.toml b/components/spider-core/Cargo.toml index c75bc4e3..87531aaa 100644 --- a/components/spider-core/Cargo.toml +++ b/components/spider-core/Cargo.toml @@ -18,3 +18,7 @@ sqlx = { version = "0.8.6", features = ["mysql", "uuid"] } strum = { version = "0.28.0", features = ["derive"] } thiserror = "2.0.18" uuid = { version = "1.19.0", features = ["serde", "v4"] } + +[dev-dependencies] +tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread"] } +tokio-util = { version = "0.7", features = ["rt"] } diff --git a/components/spider-core/src/lib.rs b/components/spider-core/src/lib.rs index 66ed84f0..7e546853 100644 --- a/components/spider-core/src/lib.rs +++ b/components/spider-core/src/lib.rs @@ -1,3 +1,4 @@ pub mod job; +pub mod session; pub mod task; pub mod types; diff --git a/components/spider-core/src/session.rs b/components/spider-core/src/session.rs new file mode 100644 index 00000000..a428e001 --- /dev/null +++ b/components/spider-core/src/session.rs @@ -0,0 +1,107 @@ +//! Monotonically increasing session tracker shared across services. +//! +//! 
Wraps an [`AtomicU64`] in [`Arc`] so multiple tasks (and multiple consumers such as the +//! execution manager and the scheduler) can observe and advance a shared view of storage's current +//! session id. + +use std::sync::{ + Arc, + atomic::{AtomicU64, Ordering}, +}; + +use crate::types::id::SessionId; + +/// Monotonically increasing counter holding a service's view of the current storage session id. +/// +/// Cloneable; clones share the same underlying counter so writers in different tasks stay coherent. +#[derive(Clone, Debug, Default)] +pub struct SessionTracker { + inner: Arc<AtomicU64>, +} + +impl SessionTracker { + /// Builds a tracker pre-loaded with `initial`. + /// + /// # Returns + /// + /// A newly created [`SessionTracker`] on success. + #[must_use] + pub fn new(initial: SessionId) -> Self { + Self { + inner: Arc::new(AtomicU64::new(initial)), + } + } + + /// # Returns + /// + /// The currently stored session id. + #[must_use] + pub fn current(&self) -> SessionId { + self.inner.load(Ordering::Acquire) + } + + /// Attempts to advance the stored session id to `new_sid`. + /// + /// CAS-loop: if the stored value is already `>= new_sid`, the call no-ops. Otherwise the + /// stored value is bumped to `new_sid`. Coherent under concurrent writers. + /// + /// # Returns + /// + /// Whether `new_sid` strictly advanced the stored value. 
+ #[must_use] + pub fn try_advance(&self, new_sid: SessionId) -> bool { + let mut cur = self.inner.load(Ordering::Acquire); + loop { + if new_sid <= cur { + return false; + } + match self.inner.compare_exchange_weak( + cur, + new_sid, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => return true, + Err(actual) => cur = actual, + } + } + } +} + +#[cfg(test)] +mod tests { + use tokio_util::task::TaskTracker; + + use super::SessionTracker; + + #[test] + fn try_advance_forward() { + let tracker = SessionTracker::new(1); + assert!(tracker.try_advance(5)); + assert_eq!(tracker.current(), 5); + } + + #[test] + fn try_advance_stale_or_equal() { + let tracker = SessionTracker::new(10); + assert!(!tracker.try_advance(10)); + assert!(!tracker.try_advance(7)); + assert_eq!(tracker.current(), 10); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn concurrent_advance_converges_to_max() { + const MAX_TARGET: u64 = 1_000; + let tracker = SessionTracker::new(0); + let task_tracker = TaskTracker::new(); + for i in 1..=MAX_TARGET { + let t = tracker.clone(); + task_tracker.spawn(async move { + let _ = t.try_advance(i); + }); + } + task_tracker.close(); + task_tracker.wait().await; + assert_eq!(tracker.current(), MAX_TARGET); + } +} diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml index 6b687212..10f0e3ac 100644 --- a/components/spider-execution-manager/Cargo.toml +++ b/components/spider-execution-manager/Cargo.toml @@ -25,5 +25,5 @@ tokio = { version = "1.50.0", features = ["io-util", "macros", "process", "rt", "sync", "time"] } -tokio-util = { version = "0.7", features = ["codec"] } +tokio-util = { version = "0.7", features = ["codec", "rt"] } tracing = { version = "0.1.41", default-features = false, features = ["std"] } diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs index 84a2b6b2..259fc8a9 100644 --- 
a/components/spider-execution-manager/src/lib.rs +++ b/components/spider-execution-manager/src/lib.rs @@ -2,4 +2,5 @@ //! `spider-task-executor` subprocess. pub mod client; +pub mod liveness; pub mod process_pool; diff --git a/components/spider-execution-manager/src/liveness.rs b/components/spider-execution-manager/src/liveness.rs new file mode 100644 index 00000000..411931cd --- /dev/null +++ b/components/spider-execution-manager/src/liveness.rs @@ -0,0 +1,398 @@ +//! Liveness actor — owns the periodic heartbeat to storage and the runtime's view of the current +//! storage session id. +//! +//! The actor runs as a dedicated tokio task driven by [`tokio::select!`] over three sources: +//! +//! 1. A [`tokio::time::interval`] driving periodic heartbeat ticks. +//! 2. An [`mpsc`] command channel from the rest of the runtime. +//! 3. A [`CancellationToken`] that the runtime flips on shutdown. + +use std::{sync::Arc, time::Duration}; + +use spider_core::{session::SessionTracker, types::id::ExecutionManagerId}; +use tokio::{ + sync::mpsc, + task::JoinHandle, + time::{Interval, MissedTickBehavior}, +}; +use tokio_util::sync::CancellationToken; + +use crate::client::{LivenessClient, LivenessResponseError}; + +/// Commands the runtime sends to the actor. +#[derive(Debug)] +pub enum LivenessCommand { + /// Asks the actor to send an immediate heartbeat to storage instead of waiting for the next + /// interval tick. + /// + /// Sent by the main loop when it suspects its session view is stale (e.g. after storage replies + /// with a stale-session error). Storage's heartbeat response is the authoritative source of + /// truth for the current session id, so the actor always re-checks rather than blindly trusting + /// the caller's observation. + Refresh, +} + +/// Cloneable handle for sending commands into the running actor. 
+#[derive(Clone)] +pub struct LivenessHandle { + cmd_sender: mpsc::Sender, +} + +impl LivenessHandle { + /// Asks the actor to send an immediate heartbeat to storage in a fire-and-forget manner. + pub async fn refresh(&self) { + let _ = self.cmd_sender.send(LivenessCommand::Refresh).await; + } +} + +/// Spawns the liveness actor on the current tokio runtime. +/// +/// The first heartbeat fires immediately when the spawned task is polled for the first time; from +/// there it ticks every `heartbeat_interval`. Missed ticks are skipped rather than burst-replayed. +/// +/// # Returns +/// +/// A pair containing: +/// +/// * A handle for sending commands to the actor. +/// * The spawned task's [`JoinHandle`]. +pub fn spawn( + em_id: ExecutionManagerId, + client: Arc, + session_tracker: SessionTracker, + cancellation_token: CancellationToken, + heartbeat_interval: Duration, +) -> (LivenessHandle, JoinHandle<()>) { + let (tx, rx) = mpsc::channel(COMMAND_CHANNEL_CAP); + let mut interval = tokio::time::interval(heartbeat_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let actor = LivenessActor { + em_id, + client, + session_tracker, + cmd_receiver: rx, + cancellation_token, + interval, + }; + let join = tokio::spawn(actor.run()); + (LivenessHandle { cmd_sender: tx }, join) +} + +/// Capacity of the command channel between the runtime and the actor. +const COMMAND_CHANNEL_CAP: usize = 16; + +/// The actor's owned state. Lives entirely inside the spawned task. +struct LivenessActor { + em_id: ExecutionManagerId, + client: Arc, + session_tracker: SessionTracker, + cmd_receiver: mpsc::Receiver, + cancellation_token: CancellationToken, + interval: Interval, +} + +impl LivenessActor { + /// Drives the actor until cancellation or the command channel closes. + async fn run(mut self) { + loop { + tokio::select! { + () = self.cancellation_token.cancelled() => { + tracing::info!("Cancellation token received. 
Liveness actor shutting down."); + break; + }, + cmd = self.cmd_receiver.recv() => if let Some(cmd) = cmd { + self.on_command(&cmd).await; + } else { + tracing::info!("Command channel closed. Liveness actor shutting down."); + break; + }, + _ = self.interval.tick() => self.send_heartbeat().await, + } + } + } + + /// Handles one command popped from the channel. + async fn on_command(&mut self, cmd: &LivenessCommand) { + match cmd { + LivenessCommand::Refresh => { + self.send_heartbeat().await; + } + } + } + + /// Sends one heartbeat to storage, processes the response, and resets the interval so the next + /// scheduled tick fires one period from now. + /// + /// Resetting the interval spaces scheduled ticks away from off-schedule heartbeats: a call + /// driven by [`LivenessCommand::Refresh`] postpones the next scheduled tick by a full period. + /// Refresh-driven heartbeats themselves are sent immediately and are not rate-limited. + async fn send_heartbeat(&mut self) { + match self.client.heartbeat(self.em_id).await { + Ok(session_id) => { + let previous = self.session_tracker.current(); + if previous != session_id { + if self.session_tracker.try_advance(session_id) { + tracing::info!( + from = previous, + to = session_id, + "Session advanced by heartbeat." + ); + } else { + tracing::error!( + from = previous, + to = session_id, + "Session update rejected. This is unexpected since there should be no \ + concurrent session updates in the current implementation. Cancelling \ + the runtime." + ); + self.cancellation_token.cancel(); + } + } + } + Err(LivenessResponseError::MarkedDead) => { + tracing::error!( + "Liveness reports execution manager marked dead. Cancelling the runtime." + ); + self.cancellation_token.cancel(); + } + Err(LivenessResponseError::IllegalId(msg)) => { + tracing::error!( + err = %msg, + "Liveness rejected the execution manager ID. Cancelling the runtime." 
+ ); + self.cancellation_token.cancel(); + } + Err(LivenessResponseError::Transport(msg)) => { + tracing::warn!(err = %msg, "Heartbeat transport error; retrying next tick."); + } + } + self.interval.reset(); + } +} + +#[cfg(test)] +mod tests { + use std::{ + collections::VecDeque, + net::IpAddr, + sync::{Arc, Mutex}, + time::Duration, + }; + + use async_trait::async_trait; + use spider_core::{ + session::SessionTracker, + types::id::{ExecutionManagerId, SessionId}, + }; + use tokio::{sync::Notify, task::JoinHandle}; + use tokio_util::sync::CancellationToken; + + use super::{LivenessHandle, spawn}; + use crate::client::{LivenessClient, LivenessResponseError, RegistrationResponse}; + + struct MockState { + responses: VecDeque>, + call_count: u64, + } + + /// Mock [`LivenessClient`] that returns scripted heartbeat responses and notifies the test + /// once per call. + struct MockLivenessClient { + state: Mutex, + notify: Notify, + } + + impl MockLivenessClient { + /// Builds an empty mock. Tests prime the response queue via [`Self::push_response`] before + /// spawning the actor. + /// + /// # Returns + /// + /// A newly created [`MockLivenessClient`] with an empty response queue. + fn new() -> Self { + Self { + state: Mutex::new(MockState { + responses: VecDeque::new(), + call_count: 0, + }), + notify: Notify::new(), + } + } + + /// Pushes one scripted heartbeat response onto the queue. + /// + /// Responses are returned in FIFO order, one per [`LivenessClient::heartbeat`] call. If the + /// queue is exhausted, the mock returns a synthetic [`LivenessResponseError::Transport`] so + /// a misconfigured test fails loudly rather than hanging. + fn push_response(&self, response: Result) { + self.state + .lock() + .expect("mock state lock poisoned") + .responses + .push_back(response); + } + + /// # Returns + /// + /// The total number of [`LivenessClient::heartbeat`] invocations observed so far. 
+ fn call_count(&self) -> u64 { + self.state + .lock() + .expect("mock state lock poisoned") + .call_count + } + + /// Awaits the next [`LivenessClient::heartbeat`] invocation. + /// + /// Backed by a [`Notify`] permit, so an invocation that fires before this future is polled + /// can still be observed. + async fn wait_for_call(&self) { + self.notify.notified().await; + } + } + + #[async_trait] + impl LivenessClient for MockLivenessClient { + async fn register( + &self, + _ip: IpAddr, + ) -> Result { + unimplemented!("`LivenessClient::register` is not exercised by actor tests") + } + + async fn heartbeat( + &self, + _em_id: ExecutionManagerId, + ) -> Result { + let response = { + let mut state = self.state.lock().expect("mock state lock poisoned"); + state.call_count += 1; + state.responses.pop_front().unwrap_or_else(|| { + Err(LivenessResponseError::Transport( + "MockLivenessClient: response queue exhausted".to_owned(), + )) + }) + }; + self.notify.notify_one(); + response + } + } + + /// Spawns the actor with a long heartbeat interval so only the initial tick and explicit + /// `Refresh`-driven heartbeats fire during the test. + /// + /// # Returns + /// + /// Forwards [`spawn`]'s return values. + fn spawn_actor( + client: Arc, + tracker: SessionTracker, + cancellation_token: CancellationToken, + ) -> (LivenessHandle, JoinHandle<()>) { + spawn( + ExecutionManagerId::new(), + client, + tracker, + cancellation_token, + Duration::from_mins(1), + ) + } + + /// Joins the actor with a short upper bound so a stuck task surfaces as a test failure + /// instead of an infinite hang. 
+ async fn join_actor(join: JoinHandle<()>) { + tokio::time::timeout(Duration::from_secs(1), join) + .await + .expect("actor did not exit within 1s") + .expect("actor task panicked"); + } + + #[tokio::test] + async fn heartbeat_advances_tracker_on_success() { + let client = Arc::new(MockLivenessClient::new()); + client.push_response(Ok(7)); + let tracker = SessionTracker::new(5); + let cancellation_token = CancellationToken::new(); + + let (_handle, join) = spawn_actor( + Arc::clone(&client), + tracker.clone(), + cancellation_token.clone(), + ); + + client.wait_for_call().await; + assert_eq!(tracker.current(), 7); + assert!(!cancellation_token.is_cancelled()); + + cancellation_token.cancel(); + join_actor(join).await; + } + + #[tokio::test] + async fn marked_dead_cancels_runtime() { + let client = Arc::new(MockLivenessClient::new()); + client.push_response(Err(LivenessResponseError::MarkedDead)); + let cancellation_token = CancellationToken::new(); + + let (_handle, join) = spawn_actor( + Arc::clone(&client), + SessionTracker::new(0), + cancellation_token.clone(), + ); + + tokio::time::timeout(Duration::from_secs(1), cancellation_token.cancelled()) + .await + .expect("token was not cancelled within 1s"); + join_actor(join).await; + } + + #[tokio::test] + async fn transport_error_does_not_cancel_runtime() { + let client = Arc::new(MockLivenessClient::new()); + client.push_response(Err(LivenessResponseError::Transport( + "simulated".to_owned(), + ))); + let tracker = SessionTracker::new(5); + let cancellation_token = CancellationToken::new(); + + let (_handle, join) = spawn_actor( + Arc::clone(&client), + tracker.clone(), + cancellation_token.clone(), + ); + + client.wait_for_call().await; + assert!(!cancellation_token.is_cancelled()); + assert_eq!(tracker.current(), 5); + + cancellation_token.cancel(); + join_actor(join).await; + } + + #[tokio::test] + async fn refresh_triggers_immediate_heartbeat() { + let client = Arc::new(MockLivenessClient::new()); + 
client.push_response(Ok(5)); + client.push_response(Ok(7)); + let tracker = SessionTracker::new(0); + let cancellation_token = CancellationToken::new(); + + let (handle, join) = spawn_actor( + Arc::clone(&client), + tracker.clone(), + cancellation_token.clone(), + ); + + client.wait_for_call().await; + assert_eq!(tracker.current(), 5); + assert_eq!(client.call_count(), 1); + + handle.refresh().await; + client.wait_for_call().await; + assert_eq!(tracker.current(), 7); + assert_eq!(client.call_count(), 2); + + cancellation_token.cancel(); + join_actor(join).await; + } +} diff --git a/tests/huntsman/task-executor/Cargo.toml b/tests/huntsman/task-executor/Cargo.toml index ca86c0ad..94909303 100644 --- a/tests/huntsman/task-executor/Cargo.toml +++ b/tests/huntsman/task-executor/Cargo.toml @@ -20,28 +20,18 @@ path = "tests/overhead_instrument.rs" name = "process_pool" path = "tests/test_process_pool.rs" -[dependencies] -bincode = "1.3.3" -bytes = "1.10" -futures-util = { - version = "0.3.31", - default-features = false, - features = ["sink", "std"] -} +[dev-dependencies] +integration-test-tasks = { path = "../integration-test-tasks" } rmp-serde = "1.3.1" -serde = { version = "1.0.228", features = ["derive"] } spider-core = { path = "../../../components/spider-core" } +spider-execution-manager = { + path = "../../../components/spider-execution-manager" +} spider-task-executor = { path = "../../../components/spider-task-executor" } spider-tdl = { path = "../../../components/spider-tdl" } +tabled = "0.20.0" +test-utils = { path = "../test-utils" } tokio = { version = "1.50.0", - features = ["io-util", "macros", "process", "rt", "time"] + features = ["macros", "rt", "rt-multi-thread", "time"] } -tokio-util = { version = "0.7", features = ["codec"] } - -[dev-dependencies] -integration-test-tasks = { path = "../integration-test-tasks" } -spider-execution-manager = { - path = "../../../components/spider-execution-manager" -} -tabled = "0.20.0" diff --git 
a/tests/huntsman/task-executor/src/lib.rs b/tests/huntsman/task-executor/src/lib.rs index c42a20f4..c69ee050 100644 --- a/tests/huntsman/task-executor/src/lib.rs +++ b/tests/huntsman/task-executor/src/lib.rs @@ -1,275 +1,5 @@ -//! Test harness shared by the `task-executor-tests` integration tests. +//! Workspace member that hosts cross-crate integration tests for the `spider-task-executor` +//! binary and the execution manager's process pool. //! -//! Spawns the `spider-task-executor` binary as a child process, frames bincode requests on its -//! stdin and reads bincode responses from its stdout — the exact wire protocol of -//! [`spider_task_executor::protocol`]. -//! -//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are -//! infrastructure, not production code, and the panic message + backtrace is more useful at the -//! failure site than threading an error type through every helper. -//! -//! Environment: -//! -//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary. -//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to -//! the child verbatim. - -use std::{path::PathBuf, process::Stdio}; - -use bytes::Bytes; -use futures_util::{SinkExt, StreamExt}; -use spider_core::{ - task::TdlContext, - types::{ - id::{JobId, ResourceGroupId, TaskId}, - io::TaskInput, - }, -}; -use spider_task_executor::protocol::{Request, Response}; -use spider_tdl::{ - TaskContext, - wire::{TaskInputsSerializer, TaskOutputsSerializer}, -}; -use tokio::process::{Child, ChildStdin, ChildStdout, Command}; -use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec}; - -/// The TDL package name registered by `integration-test-tasks`. -pub const PACKAGE_NAME: &str = "integration_test_tasks"; - -/// One running executor subprocess plus framed handles to its stdin / stdout. -/// -/// The subprocess will be killed when the handle is dropped. 
-pub struct ExecutorHandle { - child: Child, - requests: FramedWrite, - responses: FramedRead, -} - -impl ExecutorHandle { - /// Spawns the executor binary with `SPIDER_TDL_PACKAGE_DIR` set; the child inherits the - /// parent's stderr so panic / abort messages surface in the test log. - /// - /// # Returns - /// - /// A handle owning the running subprocess and framed I/O. - /// - /// # Panics - /// - /// Panics if the binary cannot be spawned or its stdio handles cannot be claimed. - #[must_use] - pub fn spawn() -> Self { - let mut child = Command::new(task_executor_bin()) - .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir()) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::inherit()) - .kill_on_drop(true) - .spawn() - .expect("spawn executor binary"); - let stdin = child.stdin.take().expect("stdin must be piped"); - let stdout = child.stdout.take().expect("stdout must be piped"); - Self { - child, - requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()), - responses: FramedRead::new(stdout, LengthDelimitedCodec::new()), - } - } - - /// Bincode-serializes `req` and writes one length-delimited frame to the executor's stdin. - /// - /// # Panics - /// - /// Panics if encoding fails or the stdin pipe cannot be written. - pub async fn send(&mut self, req: &Request) { - let bytes = bincode::serialize(req).expect("bincode encode Request"); - self.requests - .send(Bytes::from(bytes)) - .await - .expect("write request frame"); - } - - /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it. - /// - /// # Returns - /// - /// The next [`Response`] from the executor. - /// - /// # Panics - /// - /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails. 
- pub async fn recv(&mut self) -> Response { - let frame = self - .responses - .next() - .await - .expect("executor closed stdout before reply") - .expect("read response frame"); - bincode::deserialize(&frame).expect("bincode decode Response") - } - - /// Reads at most one length-delimited frame, tolerating a clean EOF (which crash-path tests - /// rely on to detect that the executor died). - /// - /// # Returns - /// - /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first. - /// - /// # Panics - /// - /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails. - pub async fn try_recv(&mut self) -> Option { - let frame = self.responses.next().await?; - let bytes = frame.expect("read response frame"); - Some(bincode::deserialize(&bytes).expect("bincode decode Response")) - } - - /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly. - /// - /// # Panics - /// - /// Panics if waiting on the child fails or the child exits non-zero. - pub async fn shutdown_clean(mut self) { - self.send(&Request::Shutdown).await; - // Close the stdin pipe so the child sees EOF after `Shutdown` is drained. - drop(self.requests); - let status = self.child.wait().await.expect("wait for executor"); - assert!(status.success(), "executor exited with status {status:?}"); - } - - /// Closes stdin and waits for the child to exit. Used by crash-path tests that don't expect - /// a clean shutdown. - /// - /// # Returns - /// - /// The child's [`ExitStatus`](std::process::ExitStatus). - /// - /// # Panics - /// - /// Panics if waiting on the child fails. - pub async fn wait_for_exit(mut self) -> std::process::ExitStatus { - drop(self.requests); - self.child.wait().await.expect("wait for executor") - } -} - -/// # Returns -/// -/// The absolute path of the `spider-task-executor` binary, read from `SPIDER_TASK_EXECUTOR_BIN`. -/// -/// # Panics -/// -/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset. 
-#[must_use] -pub fn task_executor_bin() -> PathBuf { - std::env::var_os("SPIDER_TASK_EXECUTOR_BIN") - .map(PathBuf::from) - .expect("SPIDER_TASK_EXECUTOR_BIN env var not set") -} - -/// # Returns -/// -/// The TDL package staging directory, read from `SPIDER_TDL_PACKAGE_DIR`. Forwarded verbatim -/// into the executor child's environment so it resolves -/// `${SPIDER_TDL_PACKAGE_DIR}//lib.so`. -/// -/// # Panics -/// -/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset. -#[must_use] -pub fn tdl_package_dir() -> PathBuf { - std::env::var_os("SPIDER_TDL_PACKAGE_DIR") - .map(PathBuf::from) - .expect("SPIDER_TDL_PACKAGE_DIR env var not set") -} - -/// # Returns -/// -/// A placeholder msgpack-encoded [`TaskContext`] suitable for a one-shot test invocation. The id -/// fields are fresh per call but the executor doesn't inspect them. -/// -/// # Panics -/// -/// Panics if msgpack encoding fails (the test ids serialize trivially). -#[must_use] -pub fn build_ctx() -> Vec { - let ctx = TaskContext { - job_id: JobId::new(), - task_id: TaskId::new(), - task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), - }; - rmp_serde::to_vec(&ctx).expect("serialize TaskContext") -} - -/// # Type Parameters -/// -/// * `T` - The Serde-serializable value type passed as the task's single input. -/// -/// # Returns -/// -/// A wire-format buffer carrying one [`TaskInput::ValuePayload`] holding the msgpack-encoded -/// `value` — i.e. the same shape the parent ships for a single-argument task. -/// -/// # Panics -/// -/// Panics if msgpack encoding or wire-format append fails. -#[must_use] -pub fn encode_single_input(value: &T) -> Vec { - let mut inputs = TaskInputsSerializer::new(); - inputs - .append(TaskInput::ValuePayload( - rmp_serde::to_vec(value).expect("msgpack encode input"), - )) - .expect("append wire-format input"); - inputs.release() -} - -/// # Returns -/// -/// A wire-format buffer carrying zero inputs — for nullary tasks like `always_fail` and -/// `always_panic`. 
-#[must_use] -pub fn encode_no_inputs() -> Vec { - TaskInputsSerializer::new().release() -} - -/// # Type Parameters -/// -/// * `T` - The Serde-deserializable type the output payload should decode into. -/// -/// # Returns -/// -/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `T`. -/// -/// # Panics -/// -/// Panics if the outputs buffer doesn't contain exactly one value, or if the msgpack decode -/// fails. -#[must_use] -pub fn decode_single_output(output_bytes: &[u8]) -> T { - let outputs = - TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs"); - assert_eq!( - outputs.len(), - 1, - "expected exactly one output payload, got {}", - outputs.len(), - ); - rmp_serde::from_slice(&outputs[0]).expect("msgpack decode output") -} - -/// # Returns -/// -/// A [`Request::Execute`] targeting `task_func` in the integration package, with a fresh test -/// `TaskContext` and the caller-supplied wire-format `raw_inputs`. -#[must_use] -pub fn execute_request(task_func: &str, raw_inputs: Vec) -> Request { - Request::Execute { - tdl_context: TdlContext { - package: PACKAGE_NAME.to_owned(), - task_func: task_func.to_owned(), - }, - raw_ctx: build_ctx(), - raw_inputs, - } -} +//! Tests live under `tests/`; the shared harness and helpers live in the `test-utils` crate. The +//! library itself is intentionally empty. 
diff --git a/tests/huntsman/task-executor/tests/overhead_instrument.rs b/tests/huntsman/task-executor/tests/overhead_instrument.rs index fc4e146e..64bba93e 100644 --- a/tests/huntsman/task-executor/tests/overhead_instrument.rs +++ b/tests/huntsman/task-executor/tests/overhead_instrument.rs @@ -28,12 +28,7 @@ use std::{ use integration_test_tasks::INSTRUMENT_SLEEP_US; use spider_task_executor::protocol::{ExecutorOutcome, Response}; use tabled::{Table, Tabled}; -use task_executor_tests::{ - ExecutorHandle, - decode_single_output, - encode_single_input, - execute_request, -}; +use test_utils::{ExecutorHandle, decode_single_output, encode_single_input, execute_request}; const PAYLOAD_LEN: usize = 100; const ITERATIONS: usize = 10; diff --git a/tests/huntsman/task-executor/tests/test_executor.rs b/tests/huntsman/task-executor/tests/test_executor.rs index e2eb8ec4..cd91c1d6 100644 --- a/tests/huntsman/task-executor/tests/test_executor.rs +++ b/tests/huntsman/task-executor/tests/test_executor.rs @@ -8,7 +8,7 @@ use spider_task_executor::{ protocol::{ExecutorOutcome, Response}, }; use spider_tdl::TdlError; -use task_executor_tests::{ +use test_utils::{ ExecutorHandle, decode_single_output, encode_no_inputs, diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs index 7bc5d332..7983285b 100644 --- a/tests/huntsman/task-executor/tests/test_process_pool.rs +++ b/tests/huntsman/task-executor/tests/test_process_pool.rs @@ -28,7 +28,13 @@ use spider_execution_manager::process_pool::{ }; use spider_task_executor::ExecutorError; use spider_tdl::TdlError; -use task_executor_tests::{PACKAGE_NAME, decode_single_output, task_executor_bin, tdl_package_dir}; +use test_utils::{ + PACKAGE_NAME, + decode_single_output, + single_input, + task_executor_bin, + tdl_package_dir, +}; /// Generous timeout for tasks expected to finish quickly. 
const NORMAL_TIMEOUT: Duration = Duration::from_secs(5); @@ -89,25 +95,6 @@ fn make_request(task_func: &str, inputs: Vec) -> ExecuteRequest { } } -/// Wraps `value` into a single-payload input list. -/// -/// # Type Parameters -/// -/// * `T` - The Serde-serializable value type carried as the task's single input. -/// -/// # Returns -/// -/// A `Vec` of length 1 carrying the msgpack-encoded `value`. -/// -/// # Panics -/// -/// Panics if msgpack encoding fails. -fn single_input(value: &T) -> Vec { - vec![TaskInput::ValuePayload( - rmp_serde::to_vec(value).expect("msgpack encode input"), - )] -} - #[tokio::test] #[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] async fn fibonacci_succeeds() { diff --git a/tests/huntsman/test-utils/Cargo.toml b/tests/huntsman/test-utils/Cargo.toml new file mode 100644 index 00000000..cd477a15 --- /dev/null +++ b/tests/huntsman/test-utils/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "test-utils" +version = "0.1.0" +edition = "2024" +publish = false + +[lib] +name = "test_utils" +path = "src/lib.rs" + +[dependencies] +async-trait = "0.1.89" +bincode = "1.3.3" +bytes = "1.10" +futures-util = { + version = "0.3.31", + default-features = false, + features = ["sink", "std"] +} +rmp-serde = "1.3.1" +serde = "1.0.228" +spider-core = { path = "../../../components/spider-core" } +spider-execution-manager = { + path = "../../../components/spider-execution-manager" +} +spider-task-executor = { path = "../../../components/spider-task-executor" } +spider-tdl = { path = "../../../components/spider-tdl" } +tokio = { + version = "1.50.0", + features = ["io-util", "macros", "process", "rt", "sync", "time"] +} +tokio-util = { version = "0.7", features = ["codec"] } diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs new file mode 100644 index 00000000..43ae646f --- /dev/null +++ b/tests/huntsman/test-utils/src/executor.rs @@ -0,0 +1,297 @@ +//! 
Executor subprocess harness plus the TDL wire-payload helpers the integration suites share. +//! +//! [`ExecutorHandle`] spawns the `spider-task-executor` binary as a child process, frames bincode +//! requests on its stdin and reads bincode responses from its stdout — the exact wire protocol of +//! [`spider_task_executor::protocol`]. +//! +//! Every fallible operation in this harness panics with `.expect(...)` on failure; the tests are +//! infrastructure, not production code, and the panic message + backtrace is more useful at the +//! failure site than threading an error type through every helper. +//! +//! Environment: +//! +//! * `SPIDER_TASK_EXECUTOR_BIN` — absolute path to the executor binary. +//! * `SPIDER_TDL_PACKAGE_DIR` — directory the binary searches for TDL packages; gets forwarded to +//! the child verbatim. + +use std::{path::PathBuf, process::Stdio}; + +use bytes::Bytes; +use futures_util::{SinkExt, StreamExt}; +use spider_core::{ + task::TdlContext, + types::{ + id::{JobId, ResourceGroupId, TaskId}, + io::TaskInput, + }, +}; +use spider_task_executor::protocol::{Request, Response}; +use spider_tdl::{ + TaskContext, + wire::{TaskInputsSerializer, TaskOutputsSerializer}, +}; +use tokio::process::{Child, ChildStdin, ChildStdout, Command}; +use tokio_util::codec::{FramedRead, FramedWrite, LengthDelimitedCodec}; + +/// The TDL package name registered by `integration-test-tasks`. +pub const PACKAGE_NAME: &str = "integration_test_tasks"; + +/// One running executor subprocess plus framed handles to its stdin / stdout. +/// +/// The subprocess will be killed when the handle is dropped. +pub struct ExecutorHandle { + child: Child, + requests: FramedWrite, + responses: FramedRead, +} + +impl ExecutorHandle { + /// Spawns the executor binary with `SPIDER_TDL_PACKAGE_DIR` set; the child inherits the + /// parent's stderr so panic / abort messages surface in the test log. + /// + /// # Returns + /// + /// A handle owning the running subprocess and framed I/O. 
+ /// + /// # Panics + /// + /// Panics if the binary cannot be spawned or its stdio handles cannot be claimed. + #[must_use] + pub fn spawn() -> Self { + let mut child = Command::new(task_executor_bin()) + .env("SPIDER_TDL_PACKAGE_DIR", tdl_package_dir()) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .kill_on_drop(true) + .spawn() + .expect("spawn executor binary"); + let stdin = child.stdin.take().expect("stdin must be piped"); + let stdout = child.stdout.take().expect("stdout must be piped"); + Self { + child, + requests: FramedWrite::new(stdin, LengthDelimitedCodec::new()), + responses: FramedRead::new(stdout, LengthDelimitedCodec::new()), + } + } + + /// Bincode-serializes `req` and writes one length-delimited frame to the executor's stdin. + /// + /// # Panics + /// + /// Panics if encoding fails or the stdin pipe cannot be written. + pub async fn send(&mut self, req: &Request) { + let bytes = bincode::serialize(req).expect("bincode encode Request"); + self.requests + .send(Bytes::from(bytes)) + .await + .expect("write request frame"); + } + + /// Reads exactly one length-delimited frame from the executor's stdout and bincode-decodes it. + /// + /// # Returns + /// + /// The next [`Response`] from the executor. + /// + /// # Panics + /// + /// Panics if stdout closes before a frame arrives, the frame I/O fails, or decoding fails. + pub async fn recv(&mut self) -> Response { + let frame = self + .responses + .next() + .await + .expect("executor closed stdout before reply") + .expect("read response frame"); + bincode::deserialize(&frame).expect("bincode decode Response") + } + + /// Reads at most one length-delimited frame, tolerating a clean EOF (which crash-path tests + /// rely on to detect that the executor died). + /// + /// # Returns + /// + /// `Some(response)` if a frame was received, `None` if stdout closed cleanly first. 
+ /// + /// # Panics + /// + /// Panics if the frame I/O fails for a reason other than EOF or if decoding fails. + pub async fn try_recv(&mut self) -> Option { + let frame = self.responses.next().await?; + let bytes = frame.expect("read response frame"); + Some(bincode::deserialize(&bytes).expect("bincode decode Response")) + } + + /// Sends [`Request::Shutdown`], closes stdin, and waits for the child to exit cleanly. + /// + /// # Panics + /// + /// Panics if waiting on the child fails or the child exits non-zero. + pub async fn shutdown_clean(mut self) { + self.send(&Request::Shutdown).await; + // Close the stdin pipe so the child sees EOF after `Shutdown` is drained. + drop(self.requests); + let status = self.child.wait().await.expect("wait for executor"); + assert!(status.success(), "executor exited with status {status:?}"); + } + + /// Closes stdin and waits for the child to exit. Used by crash-path tests that don't expect + /// a clean shutdown. + /// + /// # Returns + /// + /// The child's [`ExitStatus`](std::process::ExitStatus). + /// + /// # Panics + /// + /// Panics if waiting on the child fails. + pub async fn wait_for_exit(mut self) -> std::process::ExitStatus { + drop(self.requests); + self.child.wait().await.expect("wait for executor") + } +} + +/// # Returns +/// +/// The absolute path of the `spider-task-executor` binary, read from `SPIDER_TASK_EXECUTOR_BIN`. +/// +/// # Panics +/// +/// Panics if `SPIDER_TASK_EXECUTOR_BIN` is unset. +#[must_use] +pub fn task_executor_bin() -> PathBuf { + std::env::var_os("SPIDER_TASK_EXECUTOR_BIN") + .map(PathBuf::from) + .expect("SPIDER_TASK_EXECUTOR_BIN env var not set") +} + +/// # Returns +/// +/// The TDL package staging directory, read from `SPIDER_TDL_PACKAGE_DIR`. +/// +/// # Panics +/// +/// Panics if `SPIDER_TDL_PACKAGE_DIR` is unset. 
+#[must_use] +pub fn tdl_package_dir() -> PathBuf { + std::env::var_os("SPIDER_TDL_PACKAGE_DIR") + .map(PathBuf::from) + .expect("SPIDER_TDL_PACKAGE_DIR env var not set") +} + +/// # Returns +/// +/// A placeholder msgpack-encoded [`TaskContext`] suitable for a one-shot test invocation. The id +/// fields are fresh per call but the executor doesn't inspect them. +/// +/// # Panics +/// +/// Panics if msgpack encoding fails. +#[must_use] +pub fn build_ctx() -> Vec { + let ctx = TaskContext { + job_id: JobId::new(), + task_id: TaskId::new(), + task_instance_id: 1, + resource_group_id: ResourceGroupId::new(), + }; + rmp_serde::to_vec(&ctx).expect("serialize TaskContext") +} + +/// Wraps `value` into a single-payload [`TaskInput`] list — the shape carried in +/// [`spider_core::types::io::ExecutionContext::inputs`] for a single-argument task. +/// +/// # Type Parameters +/// +/// * `ValueType` - The Serde-serializable value type carried as the task's single input. +/// +/// # Returns +/// +/// A [`Vec`] of length 1 holding the msgpack-encoded `value`. +/// +/// # Panics +/// +/// Panics if msgpack encoding fails. +#[must_use] +pub fn single_input(value: &ValueType) -> Vec { + vec![TaskInput::ValuePayload( + rmp_serde::to_vec(value).expect("msgpack encode input"), + )] +} + +/// # Type Parameters +/// +/// * `ValueType` - The Serde-serializable value type passed as the task's single input. +/// +/// # Returns +/// +/// A wire-format buffer carrying one [`TaskInput::ValuePayload`] holding the msgpack-encoded +/// `value` — i.e. the same shape the parent ships for a single-argument task. +/// +/// # Panics +/// +/// Panics if msgpack encoding or wire-format append fails. 
+#[must_use] +pub fn encode_single_input(value: &ValueType) -> Vec { + let mut inputs = TaskInputsSerializer::new(); + inputs + .append(TaskInput::ValuePayload( + rmp_serde::to_vec(value).expect("msgpack encode input"), + )) + .expect("append wire-format input"); + inputs.release() +} + +/// # Returns +/// +/// A wire-format buffer carrying zero inputs — for nullary tasks like `always_fail` and +/// `always_panic`. +#[must_use] +pub fn encode_no_inputs() -> Vec { + TaskInputsSerializer::new().release() +} + +/// # Type Parameters +/// +/// * `OutputType` - The Serde-deserializable type the output payload should decode into. +/// +/// # Returns +/// +/// The single msgpack-encoded value carried in `output_bytes`, deserialized as `OutputType`. +/// +/// # Panics +/// +/// Panics if: +/// +/// * The output buffer doesn't contain exactly one value. +/// * The msgpack decoding fails. +#[must_use] +pub fn decode_single_output( + output_bytes: &[u8], +) -> OutputType { + let outputs = + TaskOutputsSerializer::deserialize(output_bytes).expect("decode wire-format outputs"); + assert_eq!( + outputs.len(), + 1, + "expected exactly one output payload, got {}", + outputs.len(), + ); + rmp_serde::from_slice(&outputs[0]).expect("msgpack decode output") +} + +/// # Returns +/// +/// A [`Request::Execute`] targeting `task_func` in the integration package. +#[must_use] +pub fn execute_request(task_func: &str, raw_inputs: Vec) -> Request { + Request::Execute { + tdl_context: TdlContext { + package: PACKAGE_NAME.to_owned(), + task_func: task_func.to_owned(), + }, + raw_ctx: build_ctx(), + raw_inputs, + } +} diff --git a/tests/huntsman/test-utils/src/lib.rs b/tests/huntsman/test-utils/src/lib.rs new file mode 100644 index 00000000..825f3628 --- /dev/null +++ b/tests/huntsman/test-utils/src/lib.rs @@ -0,0 +1,16 @@ +//! Shared test utilities for the huntsman integration suites. +//! +//! Two concern areas: +//! +//! 
* [`executor`] — the `spider-task-executor` subprocess harness ([`ExecutorHandle`]) plus the TDL +//! wire-payload helpers and environment readers the suites share. +//! * [`mock`] — in-process mock implementations of the execution manager's client traits. +//! +//! Both modules' items are re-exported at the crate level, so tests can `use test_utils::*`-style +//! imports without naming the submodule. + +mod executor; +mod mock; + +pub use executor::*; +pub use mock::*; diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs new file mode 100644 index 00000000..19122cbe --- /dev/null +++ b/tests/huntsman/test-utils/src/mock.rs @@ -0,0 +1,195 @@ +//! In-process mock implementations of the execution manager's client traits. +//! +//! Each mock is `Clone` (internally `Arc`-backed) so the test body retains an inspection handle +//! while the runtime owns a clone. Response queues let the test drive deterministic call sequences; +//! inboxes record every call so assertions can be made. + +use std::{ + collections::VecDeque, + net::IpAddr, + sync::{ + Arc, + Mutex, + MutexGuard, + PoisonError, + atomic::{AtomicU64, Ordering}, + }, + time::Duration, +}; + +use async_trait::async_trait; +use spider_core::types::id::{ExecutionManagerId, SessionId}; +use spider_execution_manager::client::{ + LivenessClient, + LivenessResponseError, + RegistrationResponse, +}; +use tokio::sync::Notify; + +/// Mock [`LivenessClient`]. +#[derive(Clone)] +pub struct MockLiveness { + inner: Arc, +} + +impl MockLiveness { + /// Factory function. + /// + /// # Returns + /// + /// A fresh liveness mock with a freshly generated `em_id`, initial session 1, and Ok(1) + /// heartbeats by default. + #[must_use] + pub fn new() -> Self { + Self::with_initial_session(1) + } + + /// Factory function. + /// + /// # Returns + /// + /// A fresh liveness mock with the given initial session id (used both for the registration + /// response and as the default heartbeat reply). 
+ #[must_use] + pub fn with_initial_session(initial_session: SessionId) -> Self { + Self { + inner: Arc::new(LivenessInner { + em_id: ExecutionManagerId::new(), + initial_session: AtomicU64::new(initial_session), + register_response: Mutex::new(None), + heartbeat_responses: Mutex::new(VecDeque::new()), + default_session: AtomicU64::new(initial_session), + register_calls: Mutex::new(Vec::new()), + heartbeat_count: AtomicU64::new(0), + heartbeat_notify: Notify::new(), + }), + } + } + + /// Overrides the registration response. By default `register` returns + /// `Ok(RegistrationResponse { em_id, session_id: initial_session })`. + pub fn set_register_response( + &self, + response: Result, + ) { + *lock(&self.inner.register_response) = Some(response); + } + + /// Updates the fallback session id returned by `heartbeat` when the response queue is empty. + pub fn set_default_heartbeat_session(&self, session: SessionId) { + self.inner.default_session.store(session, Ordering::Relaxed); + } + + /// Queues `response` for the next `heartbeat` call (takes priority over the default session). + pub fn push_heartbeat_response(&self, response: Result) { + lock(&self.inner.heartbeat_responses).push_back(response); + } + + /// # Returns + /// + /// The `em_id` baked into this mock — the same value the runtime sees through + /// [`LivenessClient::register`]. + #[must_use] + pub fn em_id(&self) -> ExecutionManagerId { + self.inner.em_id + } + + /// # Returns + /// + /// The number of `heartbeat` calls observed. + #[must_use] + pub fn heartbeat_count(&self) -> u64 { + self.inner.heartbeat_count.load(Ordering::Relaxed) + } + + /// # Returns + /// + /// The list of IPs passed to `register`. + #[must_use] + pub fn register_calls(&self) -> Vec { + lock(&self.inner.register_calls).clone() + } + + /// Waits until at least `target` heartbeats have been observed, bounded by `timeout`. + /// + /// # Returns + /// + /// `true` if the threshold was reached, `false` if `timeout` elapsed first. 
+ pub async fn wait_for_heartbeats(&self, target: u64, timeout: Duration) -> bool { + let deadline = tokio::time::Instant::now() + timeout; + loop { + if self.heartbeat_count() >= target { + return true; + } + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + return false; + } + let notified = self.inner.heartbeat_notify.notified(); + tokio::select! { + () = notified => {} + () = tokio::time::sleep(remaining.min(POLL_INTERVAL)) => {} + } + } + } +} + +impl Default for MockLiveness { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl LivenessClient for MockLiveness { + async fn register(&self, ip: IpAddr) -> Result { + lock(&self.inner.register_calls).push(ip); + let programmed = lock(&self.inner.register_response).take(); + if let Some(response) = programmed { + return response; + } + Ok(RegistrationResponse { + em_id: self.inner.em_id, + session_id: self.inner.initial_session.load(Ordering::Relaxed), + }) + } + + async fn heartbeat( + &self, + _em_id: ExecutionManagerId, + ) -> Result { + self.inner.heartbeat_count.fetch_add(1, Ordering::Relaxed); + self.inner.heartbeat_notify.notify_waiters(); + let queued = lock(&self.inner.heartbeat_responses).pop_front(); + queued.unwrap_or_else(|| Ok(self.inner.default_session.load(Ordering::Relaxed))) + } +} + +/// Default polling interval for `wait_until_*` helpers. Short enough to keep tests snappy. +const POLL_INTERVAL: Duration = Duration::from_millis(5); + +/// Shared state behind [`MockLiveness`]. +struct LivenessInner { + em_id: ExecutionManagerId, + initial_session: AtomicU64, + register_response: Mutex>>, + heartbeat_responses: Mutex>>, + default_session: AtomicU64, + register_calls: Mutex>, + heartbeat_count: AtomicU64, + heartbeat_notify: Notify, +} + +/// Acquires `mutex`, silently recovering from poisoning so the helpers never panic from a peer +/// test's failure. 
+/// +/// # Type Parameters +/// +/// * `InnerType` - The type wrapped by `mutex`. +/// +/// # Returns +/// +/// A [`MutexGuard`] over `mutex`'s contents. +fn lock(mutex: &Mutex) -> MutexGuard<'_, InnerType> { + mutex.lock().unwrap_or_else(PoisonError::into_inner) +} From 100129e6c0a3e7f54287f005ee60ec1ee7211950 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:19:20 -0400 Subject: [PATCH 05/14] refactor(huntsman): Unify `TaskId` by replacing `spider-core`'s definition with `spider-storage`'s. (#331) --- components/spider-core/src/types/id.rs | 47 ++++++------------- components/spider-storage/src/cache.rs | 15 ------ components/spider-storage/src/cache/job.rs | 3 +- .../spider-storage/src/task_instance_pool.rs | 3 +- .../spider-storage/tests/scheduling_infra.rs | 3 +- components/spider-tdl/src/task.rs | 2 +- components/spider-tdl/src/task_context.rs | 2 +- .../spider-tdl/tests/test_task_macro.rs | 4 +- .../task-executor/tests/test_process_pool.rs | 2 +- .../huntsman/tdl-integration/tests/complex.rs | 2 +- tests/huntsman/test-utils/src/executor.rs | 2 +- 11 files changed, 24 insertions(+), 61 deletions(-) diff --git a/components/spider-core/src/types/id.rs b/components/spider-core/src/types/id.rs index 21821e7e..4735f798 100644 --- a/components/spider-core/src/types/id.rs +++ b/components/spider-core/src/types/id.rs @@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize}; use sqlx::{Database, encode::IsNull}; use uuid::Uuid; +use crate::task::TaskIndex; + /// A generic identifier type that wraps a UUID and a type marker. /// /// # Type Parameters: @@ -96,9 +98,18 @@ pub type UuidBytes = uuid::Bytes; pub enum ResourceGroupIdMarker {} pub type ResourceGroupId = Id; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum TaskIdMarker {} -pub type TaskId = Id; +/// Identifier of a task inside a job. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum TaskId { + /// The index of the task in the job's task graph. + Index(TaskIndex), + + /// The commit task. + Commit, + + /// The cleanup task. + Cleanup, +} #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum JobIdMarker {} @@ -169,33 +180,3 @@ where } pub type SignedJobId = SignedId; - -pub type SignedTaskId = SignedId; - -#[cfg(test)] -mod tests { - use std::any::TypeId; - - use super::*; - - #[test] - fn test_id_basic() { - let id = TaskId::new(); - let underlying_uuid = id.as_uuid_ref().to_owned(); - assert_eq!(id, TaskId::from(underlying_uuid)); - - assert_ne!(TypeId::of::(), TypeId::of::()); - } - - #[test] - fn task_id_json_roundtrip() { - let id = TaskId::new(); - let deserialized_id: TaskId = serde_json::from_str( - serde_json::to_string(&id) - .expect("JSON serialization failure") - .as_str(), - ) - .expect("JSON deserialization failure"); - assert_eq!(id, deserialized_id); - } -} diff --git a/components/spider-storage/src/cache.rs b/components/spider-storage/src/cache.rs index d520f519..89a5e13d 100644 --- a/components/spider-storage/src/cache.rs +++ b/components/spider-storage/src/cache.rs @@ -1,21 +1,6 @@ -use spider_core::task::TaskIndex; - pub mod error; pub mod io; pub mod job; pub mod job_submission; mod sync; pub mod task; - -/// Identifier of a task inside a job. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum TaskId { - /// The index of the task in the job's task graph. - Index(TaskIndex), - - /// The commit task. - Commit, - - /// The cleanup task. 
- Cleanup, -} diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs index 5c575e8e..c5a06ccb 100644 --- a/components/spider-storage/src/cache/job.rs +++ b/components/spider-storage/src/cache/job.rs @@ -10,7 +10,7 @@ use spider_core::{ job::JobState, task::{TaskIndex, TaskState}, types::{ - id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}, + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}, io::{ExecutionContext, TaskOutput}, }, }; @@ -18,7 +18,6 @@ use tokio::sync::{RwLockReadGuard, RwLockWriteGuard}; use crate::{ cache::{ - TaskId, error::{CacheError, InternalError, InternalError::UnexpectedJobState, StaleStateError}, job_submission::ValidatedJobSubmission, task::TaskGraph, diff --git a/components/spider-storage/src/task_instance_pool.rs b/components/spider-storage/src/task_instance_pool.rs index ace45ce6..bba0cf77 100644 --- a/components/spider-storage/src/task_instance_pool.rs +++ b/components/spider-storage/src/task_instance_pool.rs @@ -23,12 +23,11 @@ use std::{ }; use async_trait::async_trait; -use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}; +use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}; use tokio::sync::mpsc; use crate::{ cache::{ - TaskId, error::InternalError, task::{SharedTaskControlBlock, SharedTerminationTaskControlBlock}, }, diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs index d3e5eb98..046a35eb 100644 --- a/components/spider-storage/tests/scheduling_infra.rs +++ b/components/spider-storage/tests/scheduling_infra.rs @@ -87,13 +87,12 @@ use spider_core::{ job::JobState, task::TaskIndex, types::{ - id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}, + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}, io::{ExecutionContext, TaskOutput}, }, }; use spider_storage::{ 
cache::{ - TaskId, error::{CacheError, InternalError}, job::SharedJobControlBlock, job_submission::ValidatedJobSubmission, diff --git a/components/spider-tdl/src/task.rs b/components/spider-tdl/src/task.rs index 99ca904d..d4015e0c 100644 --- a/components/spider-tdl/src/task.rs +++ b/components/spider-tdl/src/task.rs @@ -254,7 +254,7 @@ mod tests { fn make_encoded_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/components/spider-tdl/src/task_context.rs b/components/spider-tdl/src/task_context.rs index 60348315..d412bdb4 100644 --- a/components/spider-tdl/src/task_context.rs +++ b/components/spider-tdl/src/task_context.rs @@ -31,7 +31,7 @@ mod tests { fn round_trip_msgpack() -> anyhow::Result<()> { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 13, resource_group_id: ResourceGroupId::new(), }; diff --git a/components/spider-tdl/tests/test_task_macro.rs b/components/spider-tdl/tests/test_task_macro.rs index e2a070fe..9a891f19 100644 --- a/components/spider-tdl/tests/test_task_macro.rs +++ b/components/spider-tdl/tests/test_task_macro.rs @@ -81,7 +81,7 @@ fn translate(_ctx: TaskContext, p: Point, dx: int32, dy: int32) -> Result<(Point fn make_encoded_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; @@ -303,7 +303,7 @@ fn direct_execute_call_round_trips() -> anyhow::Result<()> { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs index 7983285b..e646352f 100644 --- 
a/tests/huntsman/task-executor/tests/test_process_pool.rs +++ b/tests/huntsman/task-executor/tests/test_process_pool.rs @@ -78,7 +78,7 @@ fn build_pool() -> ProcessPool { fn make_request(task_func: &str, inputs: Vec) -> ExecuteRequest { ExecuteRequest { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), resource_group_id: ResourceGroupId::new(), ctx: ExecutionContext { task_instance_id: 1, diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs index 513e7d75..0e2bc7d5 100644 --- a/tests/huntsman/tdl-integration/tests/complex.rs +++ b/tests/huntsman/tdl-integration/tests/complex.rs @@ -33,7 +33,7 @@ fn lib_path() -> std::path::PathBuf { fn encode_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs index 43ae646f..24f8db5f 100644 --- a/tests/huntsman/test-utils/src/executor.rs +++ b/tests/huntsman/test-utils/src/executor.rs @@ -192,7 +192,7 @@ pub fn tdl_package_dir() -> PathBuf { pub fn build_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; From d95057fc5fc4d2e12d4616d4969df1c5cb697f92 Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Sat, 6 Jun 2026 14:50:50 -0400 Subject: [PATCH 06/14] refactor(spider-huntsman): Use auto-incrementing u64 IDs instead of UUIDv7 for database-generated IDs. 
(#337) Co-authored-by: LinZhihao-723 --- Cargo.lock | 266 +----------------- components/spider-core/Cargo.toml | 4 +- components/spider-core/src/types/id.rs | 126 ++++++--- .../spider-execution-manager/src/liveness.rs | 2 +- .../src/process_pool.rs | 8 +- components/spider-storage/Cargo.toml | 2 - components/spider-storage/src/db/mariadb.rs | 29 +- .../spider-storage/src/state/job_cache.rs | 16 +- .../spider-storage/src/task_instance_pool.rs | 20 +- .../spider-storage/tests/mariadb_infra.rs | 2 +- .../spider-storage/tests/mariadb_test.rs | 30 +- .../spider-storage/tests/scheduling_infra.rs | 4 +- components/spider-tdl/src/task.rs | 4 +- components/spider-tdl/src/task_context.rs | 4 +- .../spider-tdl/tests/test_task_macro.rs | 8 +- .../task-executor/tests/test_process_pool.rs | 8 +- .../huntsman/tdl-integration/tests/complex.rs | 4 +- tests/huntsman/test-utils/src/executor.rs | 4 +- tests/huntsman/test-utils/src/mock.rs | 2 +- 19 files changed, 166 insertions(+), 377 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb6a1c31..d6ac6cd1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,12 +100,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "bumpalo" -version = "3.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" - [[package]] name = "bytecount" version = "0.6.9" @@ -497,21 +491,8 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi 5.3.0", - "wasip2", -] - -[[package]] -name = "getrandom" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" -dependencies = [ - "cfg-if", - "libc", - "r-efi 6.0.0", + "r-efi", "wasip2", - "wasip3", ] [[package]] @@ -684,12 +665,6 @@ dependencies = [ "zerovec", ] -[[package]] -name = "id-arena" -version = "2.3.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "idna" version = "1.1.0" @@ -719,8 +694,6 @@ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown 0.17.0", - "serde", - "serde_core", ] [[package]] @@ -737,18 +710,6 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" -[[package]] -name = "js-sys" -version = "0.3.97" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf" -dependencies = [ - "cfg-if", - "futures-util", - "once_cell", - "wasm-bindgen", -] - [[package]] name = "konst" version = "0.2.20" @@ -773,12 +734,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "libc" version = "0.2.186" @@ -1054,16 +1009,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn 2.0.117", -] - [[package]] name = "proc-macro-error-attr2" version = "2.0.0" @@ -1110,12 +1055,6 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "r-efi" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" - [[package]] name = "rand" version = "0.8.6" @@ -1249,12 +1188,6 @@ 
dependencies = [ "zeroize", ] -[[package]] -name = "rustversion" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - [[package]] name = "ryu" version = "1.0.23" @@ -1461,6 +1394,7 @@ name = "spider-core" version = "0.1.0" dependencies = [ "non-empty-string", + "rand 0.9.4", "rmp-serde", "semver", "serde", @@ -1471,7 +1405,6 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "uuid", ] [[package]] @@ -1524,7 +1457,6 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "uuid", ] [[package]] @@ -1633,7 +1565,6 @@ dependencies = [ "tokio-stream", "tracing", "url", - "uuid", ] [[package]] @@ -1713,7 +1644,6 @@ dependencies = [ "stringprep", "thiserror", "tracing", - "uuid", "whoami", ] @@ -1751,7 +1681,6 @@ dependencies = [ "stringprep", "thiserror", "tracing", - "uuid", "whoami", ] @@ -1777,7 +1706,6 @@ dependencies = [ "thiserror", "tracing", "url", - "uuid", ] [[package]] @@ -2165,18 +2093,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" -[[package]] -name = "uuid" -version = "1.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" -dependencies = [ - "getrandom 0.4.2", - "js-sys", - "serde_core", - "wasm-bindgen", -] - [[package]] name = "valuable" version = "0.1.1" @@ -2207,16 +2123,7 @@ version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = 
[ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -2225,85 +2132,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" -[[package]] -name = "wasm-bindgen" -version = "0.2.120" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.120" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.120" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41" -dependencies = [ - "bumpalo", - "proc-macro2", - "quote", - "syn 2.0.117", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.120" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap", - "wasm-encoder", - "wasmparser", -] - -[[package]] -name = "wasmparser" -version = 
"0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap", - "semver", -] - [[package]] name = "whoami" version = "1.6.1" @@ -2417,100 +2245,12 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "writeable" version = "0.6.3" diff --git a/components/spider-core/Cargo.toml b/components/spider-core/Cargo.toml index 87531aaa..7167cfde 100644 --- a/components/spider-core/Cargo.toml +++ b/components/spider-core/Cargo.toml @@ -9,15 +9,15 @@ path = "src/lib.rs" [dependencies] non-empty-string = { version = "0.2.6", features = ["serde"] } +rand = "0.9.1" rmp-serde = "1.3.1" semver = "1.0.27" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" spider-derive = { path = "../spider-derive" } -sqlx = { version = "0.8.6", features = ["mysql", "uuid"] } +sqlx = { version = "0.8.6", features = ["mysql"] } strum = { version = "0.28.0", features = ["derive"] } thiserror = "2.0.18" -uuid = { version = "1.19.0", features = ["serde", "v4"] } [dev-dependencies] tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread"] } diff --git a/components/spider-core/src/types/id.rs b/components/spider-core/src/types/id.rs index 4735f798..e205d27e 100644 --- a/components/spider-core/src/types/id.rs +++ b/components/spider-core/src/types/id.rs @@ -1,12 +1,14 @@ -use std::{fmt::Debug, marker::PhantomData}; +use std::{ + fmt::{Debug, Display}, + marker::PhantomData, +}; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize, 
Serializer}; use sqlx::{Database, encode::IsNull}; -use uuid::Uuid; use crate::task::TaskIndex; -/// A generic identifier type that wraps a UUID and a type marker. +/// A generic identifier type that wraps a numeric ID and a type marker. /// /// # Type Parameters: /// @@ -15,84 +17,109 @@ use crate::task::TaskIndex; /// # Examples /// /// ```rust +/// use spider_core::types::id::Id; +/// /// #[derive(Debug, PartialEq, Eq)] /// enum SomeTypeIdMarker {} /// type SomeTypeId = Id; /// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct Id(Uuid, PhantomData); +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct Id { + raw: u64, + _marker: PhantomData, +} impl Default for Id { fn default() -> Self { - Self::new() + Self::from(0) } } impl Id { + /// Creates a random ID for tests. + /// + /// Production IDs should be assigned by persistent storage instead. #[must_use] - pub fn new() -> Self { - Self(Uuid::new_v4(), PhantomData) - } - - #[must_use] - pub const fn from(uid: Uuid) -> Self { - Self(uid, PhantomData) + pub fn random() -> Self { + Self::from(rand::random()) } #[must_use] - pub const fn as_uuid_ref(&self) -> &Uuid { - &self.0 + pub const fn from(id: u64) -> Self { + Self { + raw: id, + _marker: PhantomData, + } } #[must_use] - pub const fn as_bytes(&self) -> &UuidBytes { - self.0.as_bytes() + pub const fn get(&self) -> u64 { + self.raw } } -impl sqlx::Type for Id +impl sqlx::Type for Id where - TypeMarker: Debug + PartialEq + Eq, - Db: Database, - Uuid: sqlx::Type, + u64: sqlx::Type, { fn type_info() -> ::TypeInfo { - >::type_info() + >::type_info() } fn compatible(ty: &::TypeInfo) -> bool { - >::compatible(ty) + >::compatible(ty) } } -impl<'encode, TypeMarker, Db> sqlx::Encode<'encode, Db> for Id +impl<'encode, TypeMarker: Debug + PartialEq + Eq, Db: Database> sqlx::Encode<'encode, Db> + for Id where - TypeMarker: Debug + PartialEq + Eq, - Db: Database, - Uuid: sqlx::Encode<'encode, Db>, + u64: 
sqlx::Encode<'encode, Db>, { fn encode_by_ref( &self, buf: &mut ::ArgumentBuffer<'encode>, ) -> Result { - self.0.encode_by_ref(buf) + self.get().encode_by_ref(buf) } } -impl<'decode, TypeMarker, Db> sqlx::Decode<'decode, Db> for Id +impl<'decode, TypeMarker: Debug + PartialEq + Eq, Db: Database> sqlx::Decode<'decode, Db> + for Id where - TypeMarker: Debug + PartialEq + Eq, - Db: Database, - Uuid: sqlx::Decode<'decode, Db>, + u64: sqlx::Decode<'decode, Db>, { fn decode( value: ::ValueRef<'decode>, ) -> Result { - Uuid::decode(value).map(|uuid| Self(uuid, PhantomData)) + u64::decode(value).map(|id| Self::from(id)) + } +} + +impl Display for Id { + fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.get(), formatter) + } +} + +impl Serialize for Id { + fn serialize( + &self, + serializer: SerializerImpl, + ) -> Result { + self.get().serialize(serializer) } } -pub type UuidBytes = uuid::Bytes; +impl<'deserializer_lifetime, TypeMarker: Debug + PartialEq + Eq> Deserialize<'deserializer_lifetime> + for Id +{ + fn deserialize>( + deserializer: DeserializerImpl, + ) -> Result { + u64::deserialize(deserializer).map(Self::from) + } +} #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum ResourceGroupIdMarker {} @@ -180,3 +207,32 @@ where } pub type SignedJobId = SignedId; + +#[cfg(test)] +mod tests { + use super::{JobId, ResourceGroupId}; + + #[test] + fn id_serializes_as_u64() { + let job_id = JobId::from(42); + let serialized = + serde_json::to_string(&job_id).expect("job id serialization should succeed"); + + assert_eq!(serialized, "42"); + } + + #[test] + fn distinct_id_markers_can_share_numeric_values() { + let job_id = JobId::from(7); + let resource_group_id = ResourceGroupId::from(7); + + assert_eq!(job_id.get(), resource_group_id.get()); + } + + #[test] + fn default_id_is_zero() { + let job_id = JobId::default(); + + assert_eq!(job_id.get(), 0); + } +} diff --git 
a/components/spider-execution-manager/src/liveness.rs b/components/spider-execution-manager/src/liveness.rs index 411931cd..7662ae6a 100644 --- a/components/spider-execution-manager/src/liveness.rs +++ b/components/spider-execution-manager/src/liveness.rs @@ -290,7 +290,7 @@ mod tests { cancellation_token: CancellationToken, ) -> (LivenessHandle, JoinHandle<()>) { spawn( - ExecutionManagerId::new(), + ExecutionManagerId::random(), client, tracker, cancellation_token, diff --git a/components/spider-execution-manager/src/process_pool.rs b/components/spider-execution-manager/src/process_pool.rs index fab51d53..f3703153 100644 --- a/components/spider-execution-manager/src/process_pool.rs +++ b/components/spider-execution-manager/src/process_pool.rs @@ -217,10 +217,10 @@ impl ProcessPool { fn spawn_executor(&self) -> Result { let executor_id = self.next_executor_id.fetch_add(1, Ordering::Relaxed); std::fs::create_dir_all(&self.config.log_dir)?; - let log_path = self.config.log_dir.join(format!( - "{}-{executor_id}.log", - self.config.em_id.as_uuid_ref() - )); + let log_path = self + .config + .log_dir + .join(format!("{}-{executor_id}.log", self.config.em_id)); let log_file = File::options().create(true).append(true).open(&log_path)?; let mut command = Command::new(&self.config.executor_binary_path); diff --git a/components/spider-storage/Cargo.toml b/components/spider-storage/Cargo.toml index f0a39b72..d57b856f 100644 --- a/components/spider-storage/Cargo.toml +++ b/components/spider-storage/Cargo.toml @@ -29,7 +29,6 @@ tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread", "sync", "time"] } -uuid = { version = "1.19.0", features = ["serde"] } [dev-dependencies] anyhow = "1.0.98" @@ -38,4 +37,3 @@ serial_test = { version = "3.2.0", features = ["file_locks"] } tabled = "0.20.0" tokio = { version = "1.50.0", features = ["macros", "rt-multi-thread", "sync"] } tokio-util = { version = "0.7", features = ["rt"] } -uuid = { version = "1.19.0", features = 
["v4"] } diff --git a/components/spider-storage/src/db/mariadb.rs b/components/spider-storage/src/db/mariadb.rs index faeda2a6..6bd7017c 100644 --- a/components/spider-storage/src/db/mariadb.rs +++ b/components/spider-storage/src/db/mariadb.rs @@ -102,7 +102,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector { ) -> Result { const INSERT_QUERY: &str = formatcp!( "INSERT INTO `{table}` (`resource_group_id`, `serialized_task_graph`, \ - `serialized_job_inputs`) VALUES (?, ?, ?) RETURNING CAST(`id` AS BINARY(16)) AS `id`;", + `serialized_job_inputs`) VALUES (?, ?, ?) RETURNING `id`;", table = JOBS_TABLE_NAME, ); @@ -170,8 +170,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector { let outputs_bytes = serialized_outputs.ok_or_else(|| { DbError::CorruptedDbState(format!( - "job `{}` succeeded but has no serialized outputs", - job_id.as_uuid_ref() + "job `{job_id}` succeeded but has no serialized outputs" )) })?; let outputs: Vec = @@ -201,10 +200,7 @@ impl ExternalJobOrchestration for MariaDbStorageConnector { } let message = error_message.ok_or_else(|| { - DbError::CorruptedDbState(format!( - "job `{}` failed but has no error message", - job_id.as_uuid_ref() - )) + DbError::CorruptedDbState(format!("job `{job_id}` failed but has no error message")) })?; Ok(message) } @@ -344,7 +340,7 @@ impl InternalJobOrchestration for MariaDbStorageConnector { const DELETE_BATCH_SIZE: usize = 1000; const SELECT_QUERY: &str = formatcp!( - "SELECT CAST(`id` AS BINARY(16)) FROM `{table}` WHERE `state` IN \ + "SELECT `id` FROM `{table}` WHERE `state` IN \ ('{succeeded_state}','{failed_state}','{cancelled_state}') AND `ended_at` < NOW() - \ INTERVAL ? SECOND LIMIT {DELETE_BATCH_SIZE} FOR UPDATE;", table = JOBS_TABLE_NAME, @@ -394,8 +390,7 @@ impl ResourceGroupManagement for MariaDbStorageConnector { password: Vec, ) -> Result { const QUERY: &str = formatcp!( - "INSERT INTO `{table}` (`external_id`, `password`) VALUES (?, ?) 
RETURNING CAST(`id` \ - AS BINARY(16)) AS `id`;", + "INSERT INTO `{table}` (`external_id`, `password`) VALUES (?, ?) RETURNING `id`;", table = RESOURCE_GROUPS_TABLE_NAME, ); @@ -462,7 +457,7 @@ impl ExecutionManagerLivenessManagement for MariaDbStorageConnector { ip_address: IpAddr, ) -> Result { const INSERT_QUERY: &str = formatcp!( - "INSERT INTO `{table}` (`ip_address`) VALUES (?) RETURNING CAST(`id` AS BINARY(16));", + "INSERT INTO `{table}` (`ip_address`) VALUES (?) RETURNING `id`;", table = EXECUTION_MANAGERS_TABLE_NAME, ); @@ -539,8 +534,8 @@ impl ExecutionManagerLivenessManagement for MariaDbStorageConnector { const UPDATE_BATCH_SIZE: usize = 1000; const SELECT_QUERY: &str = formatcp!( - "SELECT CAST(`id` AS BINARY(16)) FROM `{table}` WHERE `state` = '{alive_state}' AND \ - `last_heartbeat_at` < CURRENT_TIMESTAMP - INTERVAL ? SECOND FOR UPDATE;", + "SELECT `id` FROM `{table}` WHERE `state` = '{alive_state}' AND `last_heartbeat_at` < \ + CURRENT_TIMESTAMP - INTERVAL ? SECOND FOR UPDATE;", table = EXECUTION_MANAGERS_TABLE_NAME, alive_state = ExecutionManagerState::Alive.as_str(), ); @@ -601,7 +596,7 @@ const fn resource_groups_creation_query() -> &'static str { formatcp!( r" CREATE TABLE IF NOT EXISTS `{RESOURCE_GROUPS_TABLE_NAME}` ( - `id` UUID NOT NULL DEFAULT UUID_v7(), + `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `external_id` VARCHAR(256) NOT NULL, `password` VARBINARY(2048) NOT NULL, PRIMARY KEY (`id`), @@ -615,8 +610,8 @@ const fn jobs_creation_query() -> &'static str { formatcp!( r" CREATE TABLE IF NOT EXISTS `{JOBS_TABLE_NAME}` ( - `id` UUID NOT NULL DEFAULT UUID_v7(), - `resource_group_id` UUID NOT NULL, + `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `resource_group_id` BIGINT UNSIGNED NOT NULL, `state` {state_enum} NOT NULL DEFAULT {default_state}, `serialized_task_graph` LONGTEXT NOT NULL, `serialized_job_inputs` LONGBLOB NOT NULL, @@ -642,7 +637,7 @@ const fn execution_managers_creation_query() -> &'static str { formatcp!( r" CREATE TABLE 
IF NOT EXISTS `{EXECUTION_MANAGERS_TABLE_NAME}` ( - `id` UUID NOT NULL DEFAULT UUID_v7(), + `id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `ip_address` VARCHAR(45) NOT NULL, `state` {state_enum} NOT NULL DEFAULT {default_state}, `last_heartbeat_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, diff --git a/components/spider-storage/src/state/job_cache.rs b/components/spider-storage/src/state/job_cache.rs index 6ad3c7ce..5ae5e40a 100644 --- a/components/spider-storage/src/state/job_cache.rs +++ b/components/spider-storage/src/state/job_cache.rs @@ -282,7 +282,7 @@ mod tests { .expect("job submission should be valid"); SharedJobControlBlock::create( job_id, - spider_core::types::id::ResourceGroupId::new(), + spider_core::types::id::ResourceGroupId::random(), job_submission, MockReadyQueueSender, MockDbConnector, @@ -296,7 +296,7 @@ mod tests { async fn job_cache_insert_and_get() -> anyhow::Result<()> { let cache: JobCache = JobCache::new(); - let job_id = JobId::new(); + let job_id = JobId::random(); let jcb = create_test_jcb(job_id).await; cache.insert(jcb)?; @@ -310,7 +310,7 @@ mod tests { async fn job_cache_remove_returns_inserted_jcb() -> anyhow::Result<()> { let cache: JobCache = JobCache::new(); - let job_id = JobId::new(); + let job_id = JobId::random(); let jcb = create_test_jcb(job_id).await; cache.insert(jcb)?; @@ -327,7 +327,7 @@ mod tests { async fn job_cache_get_returns_none_for_nonexistent_job() -> anyhow::Result<()> { let cache: JobCache = JobCache::new(); - let job_id = JobId::new(); + let job_id = JobId::random(); let result = cache.get(job_id); assert!( @@ -341,7 +341,7 @@ mod tests { async fn job_cache_insert_duplicate_returns_error() -> anyhow::Result<()> { let cache: JobCache = JobCache::new(); - let job_id = JobId::new(); + let job_id = JobId::random(); let jcb1 = create_test_jcb(job_id).await; cache.insert(jcb1)?; @@ -372,7 +372,7 @@ mod tests { for i in 0..num_tasks { let cache = Arc::clone(&cache); tracker.spawn(async move { - let job_id = 
JobId::new(); + let job_id = JobId::random(); let jcb = create_test_jcb(job_id).await; cache .insert(jcb) @@ -456,13 +456,13 @@ mod tests { }) .expect("task insertion should succeed"); - let job_id = JobId::new(); + let job_id = JobId::random(); let job_submission = ValidatedJobSubmission::create(submitted, vec![TaskInput::ValuePayload(vec![0u8; 4])]) .expect("job submission should be valid"); let jcb = SharedJobControlBlock::create( job_id, - spider_core::types::id::ResourceGroupId::new(), + spider_core::types::id::ResourceGroupId::random(), job_submission, sender, MockDbConnector, diff --git a/components/spider-storage/src/task_instance_pool.rs b/components/spider-storage/src/task_instance_pool.rs index bba0cf77..930271a2 100644 --- a/components/spider-storage/src/task_instance_pool.rs +++ b/components/spider-storage/src/task_instance_pool.rs @@ -683,8 +683,8 @@ mod tests { ) -> TaskInstanceMetadata { const SOFT_TIMEOUT_MS: Duration = Duration::from_millis(100); TaskInstanceMetadata { - resource_group_id: ResourceGroupId::new(), - job_id: JobId::new(), + resource_group_id: ResourceGroupId::random(), + job_id: JobId::random(), task_id, task_instance_id, execution_manager_id, @@ -767,7 +767,7 @@ mod tests { let metadata = make_task_instance_metadata( TaskId::Index(0), task_instance_id, - ExecutionManagerId::new(), + ExecutionManagerId::random(), SystemTime::now(), ); let job_id = metadata.job_id; @@ -797,7 +797,7 @@ mod tests { Duration::from_mins(1), DEFAULT_CHANNEL_SIZE, ); - let execution_manager_id = ExecutionManagerId::new(); + let execution_manager_id = ExecutionManagerId::random(); let tcb1 = build_single_task_tcb().await; let metadata1 = make_task_instance_metadata( @@ -840,7 +840,7 @@ mod tests { liveness_store, Duration::from_mins(1), ); - let em_id = ExecutionManagerId::new(); + let em_id = ExecutionManagerId::random(); // Create a few tasks and terminate them immediately. 
for i in 0..NUM_TASKS { @@ -884,7 +884,7 @@ mod tests { liveness_store, Duration::from_mins(1), ); - let em_id = ExecutionManagerId::new(); + let em_id = ExecutionManagerId::random(); let gc_starting_time = SystemTime::now(); // soft_timeout_ddl = registered_at + 100ms // deadline = now - 900ms @@ -942,7 +942,7 @@ mod tests { liveness_store.clone(), Duration::from_mins(1), ); - let em_id = ExecutionManagerId::new(); + let em_id = ExecutionManagerId::random(); let now = SystemTime::now(); let mut expected_messages: Vec = Vec::new(); @@ -1000,7 +1000,7 @@ mod tests { liveness_store.clone(), Duration::from_mins(1), ); - let em_id = ExecutionManagerId::new(); + let em_id = ExecutionManagerId::random(); let now = SystemTime::now(); for i in 0..NUM_TASKS { @@ -1058,8 +1058,8 @@ mod tests { liveness_store.clone(), Duration::from_mins(1), ); - let alive_em = ExecutionManagerId::new(); - let dead_em = ExecutionManagerId::new(); + let alive_em = ExecutionManagerId::random(); + let dead_em = ExecutionManagerId::random(); let now = SystemTime::now(); // soft timeout deadline = now - 900ms let elapsed_registration = now - Duration::from_secs(1); diff --git a/components/spider-storage/tests/mariadb_infra.rs b/components/spider-storage/tests/mariadb_infra.rs index ef26198c..0772ec04 100644 --- a/components/spider-storage/tests/mariadb_infra.rs +++ b/components/spider-storage/tests/mariadb_infra.rs @@ -47,7 +47,7 @@ pub async fn create_mariadb_connector() -> MariaDbStorageConnector { /// /// Panics if the resource group creation fails. 
pub async fn create_test_resource_group(storage: &MariaDbStorageConnector) -> ResourceGroupId { - let external_id = uuid::Uuid::new_v4().to_string(); + let external_id = format!("test-resource-group-{}", rand::random::()); storage .add(external_id, b"test-password".to_vec()) .await diff --git a/components/spider-storage/tests/mariadb_test.rs b/components/spider-storage/tests/mariadb_test.rs index 3b90ab07..88343c82 100644 --- a/components/spider-storage/tests/mariadb_test.rs +++ b/components/spider-storage/tests/mariadb_test.rs @@ -80,7 +80,7 @@ async fn test_register_job() { #[ignore = "requires MariaDB"] async fn test_register_job_invalid_resource_group() { let storage = create_mariadb_connector().await; - let fake_rg_id = ResourceGroupId::new(); + let fake_rg_id = ResourceGroupId::random(); let (graph, inputs) = single_task_graph(); let job_submission = ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid"); @@ -555,7 +555,7 @@ async fn test_delete_expired_terminated_jobs() { #[ignore = "requires MariaDB"] async fn test_add_duplicate_resource_group() { let storage = create_mariadb_connector().await; - let external_id = uuid::Uuid::new_v4().to_string(); + let external_id = format!("test-resource-group-{}", rand::random::()); storage .add(external_id.clone(), b"password".to_vec()) @@ -576,7 +576,7 @@ async fn test_verify_correct_password() { let rg_id = storage .add( - uuid::Uuid::new_v4().to_string(), + format!("test-resource-group-{}", rand::random::()), b"correct-password".to_vec(), ) .await @@ -595,7 +595,7 @@ async fn test_verify_wrong_password() { let rg_id = storage .add( - uuid::Uuid::new_v4().to_string(), + format!("test-resource-group-{}", rand::random::()), b"correct-password".to_vec(), ) .await @@ -612,7 +612,7 @@ async fn test_verify_wrong_password() { #[ignore = "requires MariaDB"] async fn test_verify_nonexistent_resource_group() { let storage = create_mariadb_connector().await; - let fake_rg_id = 
ResourceGroupId::new(); + let fake_rg_id = ResourceGroupId::random(); let result = storage.verify(fake_rg_id, b"password").await; assert!( @@ -625,7 +625,7 @@ async fn test_verify_nonexistent_resource_group() { #[ignore = "requires MariaDB"] async fn test_start_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = storage.start(fake_job_id).await; assert!( @@ -638,7 +638,7 @@ async fn test_start_job_not_found() { #[ignore = "requires MariaDB"] async fn test_set_state_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = InternalJobOrchestration::set_state(&storage, fake_job_id, JobState::Running).await; @@ -652,7 +652,7 @@ async fn test_set_state_job_not_found() { #[ignore = "requires MariaDB"] async fn test_get_state_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = storage.get_state(fake_job_id).await; assert!( @@ -665,7 +665,7 @@ async fn test_get_state_job_not_found() { #[ignore = "requires MariaDB"] async fn test_get_outputs_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = storage.get_outputs(fake_job_id).await; assert!( @@ -678,7 +678,7 @@ async fn test_get_outputs_job_not_found() { #[ignore = "requires MariaDB"] async fn test_get_error_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = storage.get_error(fake_job_id).await; assert!( @@ -691,7 +691,7 @@ async fn test_get_error_job_not_found() { #[ignore = "requires MariaDB"] async fn test_commit_outputs_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = 
JobId::random(); let result = InternalJobOrchestration::commit_outputs(&storage, fake_job_id, vec![vec![]], false).await; @@ -705,7 +705,7 @@ async fn test_commit_outputs_job_not_found() { #[ignore = "requires MariaDB"] async fn test_cancel_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = InternalJobOrchestration::cancel(&storage, fake_job_id, false).await; assert!( @@ -718,7 +718,7 @@ async fn test_cancel_job_not_found() { #[ignore = "requires MariaDB"] async fn test_fail_job_not_found() { let storage = create_mariadb_connector().await; - let fake_job_id = JobId::new(); + let fake_job_id = JobId::random(); let result = InternalJobOrchestration::fail(&storage, fake_job_id, "error".to_string()).await; assert!( @@ -822,7 +822,7 @@ async fn test_update_execution_manager_heartbeat() { #[ignore = "requires MariaDB"] async fn test_update_execution_manager_heartbeat_not_found() { let storage = create_mariadb_connector().await; - let fake_em_id = ExecutionManagerId::new(); + let fake_em_id = ExecutionManagerId::random(); let result = storage.update_execution_manager_heartbeat(fake_em_id).await; assert!( @@ -873,7 +873,7 @@ async fn test_is_execution_manager_alive_em_alive() { #[ignore = "requires MariaDB"] async fn test_is_execution_manager_alive_em_not_found() { let storage = create_mariadb_connector().await; - let fake_em_id = ExecutionManagerId::new(); + let fake_em_id = ExecutionManagerId::random(); let result = storage.is_execution_manager_alive(fake_em_id).await; assert!( diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs index 046a35eb..a089d66f 100644 --- a/components/spider-storage/tests/scheduling_infra.rs +++ b/components/spider-storage/tests/scheduling_infra.rs @@ -359,7 +359,7 @@ pub async fn run_workload( let ctx = EmContext { receiver: ready_receiver, jcb: jcb.clone(), - execution_manager_id: 
ExecutionManagerId::new(), + execution_manager_id: ExecutionManagerId::random(), terminal_state_sender: terminal_state_sender.clone(), done_receiver: done_receiver.clone(), seen_tasks: Arc::new(DashMap::new()), @@ -374,7 +374,7 @@ pub async fn run_workload( let mut join_set = tokio::task::JoinSet::new(); for _ in 0..NUM_EXECUTION_MANAGERS { let mut em_ctx = ctx.clone(); - em_ctx.execution_manager_id = ExecutionManagerId::new(); + em_ctx.execution_manager_id = ExecutionManagerId::random(); join_set.spawn(async move { run_execution_manager(em_ctx).await }); } diff --git a/components/spider-tdl/src/task.rs b/components/spider-tdl/src/task.rs index d4015e0c..7968f4b0 100644 --- a/components/spider-tdl/src/task.rs +++ b/components/spider-tdl/src/task.rs @@ -253,10 +253,10 @@ mod tests { fn make_encoded_ctx() -> Vec { let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`") } diff --git a/components/spider-tdl/src/task_context.rs b/components/spider-tdl/src/task_context.rs index d412bdb4..d79dea6d 100644 --- a/components/spider-tdl/src/task_context.rs +++ b/components/spider-tdl/src/task_context.rs @@ -30,10 +30,10 @@ mod tests { #[test] fn round_trip_msgpack() -> anyhow::Result<()> { let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 13, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; let encoded = rmp_serde::to_vec(&ctx)?; let decoded: TaskContext = rmp_serde::from_slice(&encoded)?; diff --git a/components/spider-tdl/tests/test_task_macro.rs b/components/spider-tdl/tests/test_task_macro.rs index 9a891f19..59eb05ea 100644 --- a/components/spider-tdl/tests/test_task_macro.rs +++ b/components/spider-tdl/tests/test_task_macro.rs @@ 
-80,10 +80,10 @@ fn translate(_ctx: TaskContext, p: Point, dx: int32, dy: int32) -> Result<(Point /// A mocked encoded task context for testing. fn make_encoded_ctx() -> Vec { let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`") } @@ -302,10 +302,10 @@ fn direct_execute_call_round_trips() -> anyhow::Result<()> { const EXPECTED_SUM: int32 = OPERAND_A + OPERAND_B; let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; let mut inputs = TaskInputsSerializer::new(); diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs index e646352f..367c0c3b 100644 --- a/tests/huntsman/task-executor/tests/test_process_pool.rs +++ b/tests/huntsman/task-executor/tests/test_process_pool.rs @@ -58,8 +58,8 @@ const SLOW_FIB_INDEX: u64 = 45; /// /// Panics if [`ProcessPool::new`] fails — i.e., the task-executor binary cannot be spawned. fn build_pool() -> ProcessPool { - let em_id = ExecutionManagerId::new(); - let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{}", em_id.as_uuid_ref())); + let em_id = ExecutionManagerId::random(); + let log_dir = std::env::temp_dir().join(format!("spider-em-pool-test-{em_id}")); let config = ProcessPoolConfig { em_id, executor_binary_path: task_executor_bin(), @@ -77,9 +77,9 @@ fn build_pool() -> ProcessPool { /// supplies `hard_timeout` directly to [`ProcessPool::execute`]), and the supplied `inputs`. 
fn make_request(task_func: &str, inputs: Vec) -> ExecuteRequest { ExecuteRequest { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), ctx: ExecutionContext { task_instance_id: 1, tdl_context: TdlContext { diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs index 0e2bc7d5..09c90020 100644 --- a/tests/huntsman/tdl-integration/tests/complex.rs +++ b/tests/huntsman/tdl-integration/tests/complex.rs @@ -32,10 +32,10 @@ fn lib_path() -> std::path::PathBuf { /// An encoded task context for testing. fn encode_ctx() -> Vec { let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; rmp_serde::to_vec(&ctx).expect("failed to serialize `TaskContext`") } diff --git a/tests/huntsman/test-utils/src/executor.rs b/tests/huntsman/test-utils/src/executor.rs index 24f8db5f..37133bea 100644 --- a/tests/huntsman/test-utils/src/executor.rs +++ b/tests/huntsman/test-utils/src/executor.rs @@ -191,10 +191,10 @@ pub fn tdl_package_dir() -> PathBuf { #[must_use] pub fn build_ctx() -> Vec { let ctx = TaskContext { - job_id: JobId::new(), + job_id: JobId::random(), task_id: TaskId::Index(0), task_instance_id: 1, - resource_group_id: ResourceGroupId::new(), + resource_group_id: ResourceGroupId::random(), }; rmp_serde::to_vec(&ctx).expect("serialize TaskContext") } diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs index 19122cbe..e9115759 100644 --- a/tests/huntsman/test-utils/src/mock.rs +++ b/tests/huntsman/test-utils/src/mock.rs @@ -54,7 +54,7 @@ impl MockLiveness { pub fn with_initial_session(initial_session: SessionId) -> Self { Self { inner: Arc::new(LivenessInner { - em_id: ExecutionManagerId::new(), + em_id: 
ExecutionManagerId::random(), initial_session: AtomicU64::new(initial_session), register_response: Mutex::new(None), heartbeat_responses: Mutex::new(VecDeque::new()), From e030af8387b74aa028acdf4f9f12dfa923736cb2 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sun, 7 Jun 2026 15:07:09 -0400 Subject: [PATCH 07/14] feat(spider-execution-manager): Add the runtime that drives the main task-dispatch loop. (#329) --- Cargo.lock | 11 + Cargo.toml | 1 + .../src/client/scheduler.rs | 3 +- .../spider-execution-manager/src/lib.rs | 1 + .../spider-execution-manager/src/runtime.rs | 493 +++++++++++++++ tests/huntsman/em-runtime/Cargo.toml | 25 + tests/huntsman/em-runtime/src/lib.rs | 4 + .../huntsman/em-runtime/tests/test_runtime.rs | 591 ++++++++++++++++++ tests/huntsman/test-utils/src/mock.rs | 280 ++++++++- 9 files changed, 1407 insertions(+), 2 deletions(-) create mode 100644 components/spider-execution-manager/src/runtime.rs create mode 100644 tests/huntsman/em-runtime/Cargo.toml create mode 100644 tests/huntsman/em-runtime/src/lib.rs create mode 100644 tests/huntsman/em-runtime/tests/test_runtime.rs diff --git a/Cargo.lock b/Cargo.lock index d6ac6cd1..a6653baa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -289,6 +289,17 @@ dependencies = [ "serde", ] +[[package]] +name = "em-runtime-tests" +version = "0.1.0" +dependencies = [ + "anyhow", + "spider-core", + "spider-execution-manager", + "test-utils", + "tokio", +] + [[package]] name = "equivalent" version = "1.0.2" diff --git a/Cargo.toml b/Cargo.toml index 5eb18596..08d6f85b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "components/spider-tdl-derive", "examples/huntsman/complex/tasks", "examples/huntsman/complex/types", + "tests/huntsman/em-runtime", "tests/huntsman/integration-test-tasks", "tests/huntsman/task-executor", "tests/huntsman/tdl-integration", diff --git a/components/spider-execution-manager/src/client/scheduler.rs 
b/components/spider-execution-manager/src/client/scheduler.rs index cf13687a..c25312b6 100644 --- a/components/spider-execution-manager/src/client/scheduler.rs +++ b/components/spider-execution-manager/src/client/scheduler.rs @@ -3,7 +3,7 @@ //! The execution manager acquires tasks from the scheduler through [`SchedulerClient`]. use async_trait::async_trait; -use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId}; +use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId, TaskId}; /// A task assignment handed to the execution manager by the scheduler. /// @@ -14,6 +14,7 @@ use spider_core::types::id::{ExecutionManagerId, JobId, SessionId, TaskId}; pub struct SchedulerResponse { pub job_id: JobId, pub task_id: TaskId, + pub resource_group_id: ResourceGroupId, pub session_id: SessionId, } diff --git a/components/spider-execution-manager/src/lib.rs b/components/spider-execution-manager/src/lib.rs index 259fc8a9..20fffe17 100644 --- a/components/spider-execution-manager/src/lib.rs +++ b/components/spider-execution-manager/src/lib.rs @@ -4,3 +4,4 @@ pub mod client; pub mod liveness; pub mod process_pool; +pub mod runtime; diff --git a/components/spider-execution-manager/src/runtime.rs b/components/spider-execution-manager/src/runtime.rs new file mode 100644 index 00000000..71dd95ca --- /dev/null +++ b/components/spider-execution-manager/src/runtime.rs @@ -0,0 +1,493 @@ +//! Runtime — the execution manager's main loop. 
+ +use std::{net::IpAddr, path::PathBuf, sync::Arc, time::Duration}; + +use spider_core::{ + session::SessionTracker, + types::{ + id::{ExecutionManagerId, JobId, SessionId, TaskId}, + io::ExecutionContext, + }, +}; +use tokio::task::JoinHandle; +use tokio_util::sync::{CancellationToken, DropGuard}; + +use crate::{ + client::{ + LivenessClient, + LivenessResponseError, + SchedulerClient, + SchedulerResponse, + StorageClient, + StorageResponseError, + }, + liveness::{self, LivenessHandle}, + process_pool::{self, ExecuteRequest, Outcome, ProcessPool, ProcessPoolConfig}, +}; + +/// Static configuration for a [`Runtime`]. Supplied once at bootstrap and never mutated. +#[derive(Debug, Clone)] +pub struct RuntimeConfig { + /// IP address advertised to storage at registration. + pub em_ip: IpAddr, + + /// Interval between liveness heartbeats. Handed verbatim to the liveness actor. + pub heartbeat_interval: Duration, + + /// Absolute path to the `spider-task-executor` binary the process pool spawns. + pub executor_binary_path: PathBuf, + + /// Directory of TDL packages exposed to executors via `SPIDER_TDL_PACKAGE_DIR`. + pub package_dir: PathBuf, + + /// Directory the process pool writes per-executor stderr logs into. + pub log_dir: PathBuf, +} + +/// Errors returned by [`Runtime`] during bootstrap or the main loop. +#[derive(Debug, thiserror::Error)] +pub enum RuntimeError { + /// Boot-time registration with storage failed. + #[error("failed to register with storage: {0}")] + Registration(#[from] LivenessResponseError), + + /// The initial process pool could not be created. + #[error("failed to create the process pool: {0}")] + ProcessPool(#[from] process_pool::InternalError), + + /// Storage rejected a request as malformed. Indicates a contract bug in the runtime, not a + /// transient condition, so the runtime treats it as fatal. 
+ #[error("storage rejected request as invalid: {0}")] + StorageInvalidInput(String), +} + +/// The execution manager runtime: the main loop plus all the state it owns. +/// +/// # Type Parameters +/// +/// * `SchedulerClientType` - Concrete [`SchedulerClient`] the main loop pulls task assignments +/// from. +/// * `StorageClientType` - Concrete [`StorageClient`] used to register task instances and report +/// their outcome. +pub struct Runtime< + SchedulerClientType: SchedulerClient + Clone, + StorageClientType: StorageClient + Clone + 'static, +> { + em_id: ExecutionManagerId, + scheduler_client: SchedulerClientType, + storage_client: StorageClientType, + process_pool: ProcessPool, + session_tracker: SessionTracker, + liveness_handle: LivenessHandle, + liveness_join: JoinHandle<()>, + cancellation_token: CancellationToken, + _cancel_guard: DropGuard, +} + +impl< + SchedulerClientType: SchedulerClient + Clone, + StorageClientType: StorageClient + Clone + 'static, +> Runtime +{ + /// Factory function. + /// + /// Registers the execution manager with storage, seeds the [`SessionTracker`] with the session + /// ID returned by registration, spawns the initial executor [`ProcessPool`] and the liveness + /// actor, then assembles a ready-to-run runtime. The liveness actor sends the first heartbeat + /// by the time this returns. + /// + /// # Type Parameters + /// + /// * `LivenessClientType` - Concrete [`LivenessClient`] used to register at boot and, through + /// the spawned liveness actor, heartbeat thereafter. + /// + /// # Returns + /// + /// A tuple on success, containing: + /// + /// * The created [`Runtime`] instance, ready to run. + /// * The [`CancellationToken`] that the caller can use to request shutdown. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`LivenessClient::register`]'s return values on failure. + /// * Forwards [`ProcessPool::new`]'s return values on failure. 
+ pub async fn create( + scheduler_client: SchedulerClientType, + storage_client: StorageClientType, + liveness_client: Arc, + config: RuntimeConfig, + ) -> Result<(Self, CancellationToken), RuntimeError> { + let registration = liveness_client.register(config.em_ip).await?; + let em_id = registration.em_id; + let session_tracker = SessionTracker::new(registration.session_id); + tracing::info!( + em_id = ? em_id, + session_id = registration.session_id, + "Execution manager registered with storage." + ); + + let process_pool = ProcessPool::new(ProcessPoolConfig { + em_id, + executor_binary_path: config.executor_binary_path, + package_dir: config.package_dir, + log_dir: config.log_dir, + })?; + + let cancellation_token = CancellationToken::new(); + let (liveness_handle, liveness_join) = liveness::spawn( + em_id, + liveness_client, + session_tracker.clone(), + cancellation_token.clone(), + config.heartbeat_interval, + ); + + let cancel_guard = cancellation_token.clone().drop_guard(); + let runtime = Self { + em_id, + scheduler_client, + storage_client, + process_pool, + session_tracker, + liveness_handle, + liveness_join, + cancellation_token: cancellation_token.clone(), + _cancel_guard: cancel_guard, + }; + Ok((runtime, cancellation_token)) + } + + /// Runs the main loop until the runtime is cancelled, then tears it down. + /// + /// # Returns + /// + /// `Ok(())` after a clean shutdown triggered by cancellation. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::main_loop`]'s return values on failure. + pub async fn run(self) -> Result<(), RuntimeError> { + tracing::info!(em_id = ? self.em_id, "Runtime main loop starting."); + let result = self.main_loop().await; + tracing::info!(em_id = ? self.em_id, "Runtime main loop exited. Shutting down."); + self.cancellation_token.cancel(); + if let Err(err) = self.liveness_join.await { + tracing::warn!(err = ? 
err, "Liveness actor task did not exit cleanly."); + } + result + } + + /// Iterates the main loop. Each iteration pulls a task assignment from the scheduler and runs + /// it through the local pipeline. Returns when the runtime is cancelled or a fatal error + /// occurs. + /// + /// # Returns + /// + /// `Ok(())` when the loop exits cleanly because the runtime was cancelled. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::register_task_instance`]'s return values on failure. + /// * Forwards [`ProcessPool::execute`]'s return values on failure. + async fn main_loop(&self) -> Result<(), RuntimeError> { + loop { + let assignment = tokio::select! { + biased; + () = self.cancellation_token.cancelled() => return Ok(()), + result = self.scheduler_client.next_task(self.em_id) => { + match result { + Ok(assignment) => assignment, + Err(e) => { + tracing::warn!(err = ? e, "Scheduler returned an error. Retrying."); + continue; + } + } + } + }; + + tracing::info!( + bundle_session = assignment.session_id, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Received a new task assignment from the scheduler." + ); + + let current_session = self.session_tracker.current(); + if assignment.session_id < current_session { + tracing::warn!( + bundle_session = assignment.session_id, + current_session, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Dropping stale task assignment from the scheduler." + ); + continue; + } + if assignment.session_id > current_session { + tracing::info!( + new_session = assignment.session_id, + "Observed a newer session via the scheduler. Refreshing liveness." + ); + self.liveness_handle.refresh().await; + } + + let Some(execution_context) = self.register_task_instance(assignment).await? 
else { + continue; + }; + + let hard_timeout = + Duration::from_millis(execution_context.timeout_policy.hard_timeout_ms); + let request = ExecuteRequest { + job_id: assignment.job_id, + task_id: assignment.task_id, + resource_group_id: assignment.resource_group_id, + ctx: execution_context, + }; + let outcome = self + .process_pool + .execute(request, hard_timeout) + .await + .inspect_err(|err| { + tracing::error!( + err = ? err, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Process pool failed to dispatch task. Bailing out." + ); + })?; + + let current_session = self.session_tracker.current(); + if assignment.session_id < current_session { + tracing::warn!( + bundle_session = assignment.session_id, + current_session, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Dropping stale task assignment's outcome." + ); + continue; + } + + // Fire-and-forget the outcome report so the main loop can dispatch the next task + // without waiting on storage. Errors are logged inside `report_outcome`. + tokio::spawn(report_outcome( + self.storage_client.clone(), + ReportTarget { + em: self.em_id, + job: assignment.job_id, + task: assignment.task_id, + session: assignment.session_id, + }, + outcome, + )); + } + } + + /// Registers a task instance with storage. + /// + /// Races the storage call against [`Self::cancellation_token`]: when it fires, the method + /// returns `Ok(None)` and the next [`Self::main_loop`] iteration observes the token via its + /// top-level [`tokio::select!`] and exits. + /// + /// # Returns + /// + /// * `Ok(Some(execution_context))` if storage accepted the registration. + /// * `Ok(None)` if the assignment should be skipped (stale session, transport failure, any + /// other recoverable storage error, or cancellation mid-call). 
+ /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`RuntimeError::StorageInvalidInput`] if storage rejects the request as malformed, which + /// the runtime treats as fatal. + async fn register_task_instance( + &self, + assignment: SchedulerResponse, + ) -> Result, RuntimeError> { + let register_result = tokio::select! { + biased; + () = self.cancellation_token.cancelled() => return Ok(None), + result = self.storage_client.register_task_instance( + assignment.job_id, + assignment.task_id, + self.em_id, + assignment.session_id, + ) => result, + }; + + match register_result { + Ok(execution_context) => Ok(Some(execution_context)), + Err(StorageResponseError::StaleSession { storage_session }) => { + tracing::warn!( + bundle_session = assignment.session_id, + storage_session = storage_session, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Storage rejected task registration as stale. Dropping the assignment." + ); + self.liveness_handle.refresh().await; + Ok(None) + } + Err(StorageResponseError::InvalidInput(err)) => { + tracing::error!( + err = % err, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Storage rejected task registration as malformed. Bailing out." + ); + Err(RuntimeError::StorageInvalidInput(err)) + } + Err(err) => { + tracing::warn!( + err = ? err, + job_id = ? assignment.job_id, + task_id = ? assignment.task_id, + "Storage rejected task registration. Dropping the assignment." + ); + Ok(None) + } + } + } +} + +/// Identifies a single task-instance attempt that an outcome report belongs to. +#[derive(Debug, Clone, Copy)] +struct ReportTarget { + em: ExecutionManagerId, + job: JobId, + task: TaskId, + session: SessionId, +} + +/// A task outcome prepared for transmission to storage. Splits the storage API's two reporting +/// endpoints (success / failure) and carries their payloads. 
+enum Report { + Success(Option>), + Failure(String), +} + +impl Report { + /// # Returns + /// + /// The constructed report from the task executor's outcome. + fn from_outcome(outcome: Outcome, target: ReportTarget) -> Self { + match outcome { + Outcome::Success { + outputs, + elapsed_us, + } => { + tracing::info!( + job_id = ? target.job, + task_id = ? target.task, + elapsed_us, + "Task completed successfully." + ); + Self::Success(Some(outputs)) + } + Outcome::InTaskFailure { error, elapsed_us } => { + tracing::info!( + job_id = ? target.job, + task_id = ? target.task, + elapsed_us, + "Task reported an in-task failure." + ); + Self::Failure(format!( + "in-task failure: {}", + String::from_utf8_lossy(&error) + )) + } + Outcome::Timeout { hard_timeout } => { + tracing::warn!( + job_id = ? target.job, + task_id = ? target.task, + hard_timeout_ms = ?hard_timeout.as_millis(), + "Task hit the hard timeout." + ); + Self::Failure(format!( + "hard timeout ({} ms) exceeded", + hard_timeout.as_millis() + )) + } + Outcome::ExecutorCrash { exit_status } => { + tracing::warn!( + job_id = ? target.job, + task_id = ? target.task, + exit_status = ?exit_status, + "Task executor crashed." + ); + Self::Failure(format!("executor crashed (exit_status = {exit_status:?})")) + } + } + } + + /// Consumes `self` and sends it to storage via the matching reporting endpoint. + /// + /// # Type Parameters + /// + /// * `StorageClientType` - Concrete [`StorageClient`] the report is sent through. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`StorageClient::report_task_success`]'s return values on failure. + /// * Forwards [`StorageClient::report_task_failure`]'s return values on failure. 
+ async fn send( + self, + storage_client: &StorageClientType, + target: ReportTarget, + ) -> Result<(), StorageResponseError> { + let ReportTarget { + em, + job, + task, + session, + } = target; + match self { + Self::Success(outputs) => { + storage_client + .report_task_success(job, task, em, session, outputs) + .await + } + Self::Failure(message) => { + storage_client + .report_task_failure(job, task, em, session, message) + .await + } + } + } +} + +/// Reports a single task outcome to storage. Designed to run as a detached background task spawned +/// by [`Runtime::main_loop`] so reporting overlaps with the next round of task dispatching; errors +/// are logged rather than propagated. +/// +/// # Type Parameters +/// +/// * `StorageClientType` - Concrete [`StorageClient`] the report is sent through. +async fn report_outcome( + storage_client: StorageClientType, + target: ReportTarget, + outcome: Outcome, +) { + let report = Report::from_outcome(outcome, target); + let _ = report + .send(&storage_client, target) + .await + .inspect_err(|err| { + tracing::error!( + err = ? err, + job_id = ? target.job, + task_id = ? target.task, + "Failed to report task outcome to storage. Dropping the report." 
+ ); + }); +} diff --git a/tests/huntsman/em-runtime/Cargo.toml b/tests/huntsman/em-runtime/Cargo.toml new file mode 100644 index 00000000..55489dc3 --- /dev/null +++ b/tests/huntsman/em-runtime/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "em-runtime-tests" +version = "0.1.0" +edition = "2024" +publish = false + +[lib] +name = "em_runtime_tests" +path = "src/lib.rs" + +[[test]] +name = "runtime" +path = "tests/test_runtime.rs" + +[dev-dependencies] +anyhow = "1.0.98" +spider-core = { path = "../../../components/spider-core" } +spider-execution-manager = { + path = "../../../components/spider-execution-manager" +} +test-utils = { path = "../test-utils" } +tokio = { + version = "1.50.0", + features = ["macros", "rt", "rt-multi-thread", "time"] +} diff --git a/tests/huntsman/em-runtime/src/lib.rs b/tests/huntsman/em-runtime/src/lib.rs new file mode 100644 index 00000000..f0a43541 --- /dev/null +++ b/tests/huntsman/em-runtime/src/lib.rs @@ -0,0 +1,4 @@ +//! Workspace member that hosts cross-crate integration tests for the execution manager runtime. +//! +//! Tests live under `tests/`; the shared mocks and helpers live in the `test-utils` crate. The +//! library itself is intentionally empty. diff --git a/tests/huntsman/em-runtime/tests/test_runtime.rs b/tests/huntsman/em-runtime/tests/test_runtime.rs new file mode 100644 index 00000000..89e9229f --- /dev/null +++ b/tests/huntsman/em-runtime/tests/test_runtime.rs @@ -0,0 +1,591 @@ +//! Integration tests for [`spider_execution_manager::runtime::Runtime`]. +//! +//! Each test wires up the runtime with the in-process mocks from `em_runtime_tests` plus a real +//! `spider-task-executor` binary spawned by the runtime's owned process pool. The binary path and +//! the TDL package staging directory are read from the same env vars the rest of the huntsman +//! integration suite uses (`SPIDER_TASK_EXECUTOR_BIN`, `SPIDER_TDL_PACKAGE_DIR`). +//! +//! 
All tests are `#[ignore]` so the workspace's plain `cargo test` doesn't run them. + +use std::{path::PathBuf, sync::Arc, time::Duration}; + +use anyhow::Context; +use spider_core::{ + task::{TdlContext, TimeoutPolicy}, + types::{ + id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId, TaskId}, + io::{ExecutionContext, TaskInput}, + }, +}; +use spider_execution_manager::{ + client::{SchedulerError, SchedulerResponse, StorageResponseError}, + runtime::{Runtime, RuntimeConfig, RuntimeError}, +}; +use test_utils::{ + MockLiveness, + MockScheduler, + MockStorage, + PACKAGE_NAME, + decode_single_output, + single_input, + task_executor_bin, + tdl_package_dir, +}; + +const HEARTBEAT_INTERVAL: Duration = Duration::from_millis(100); +const SLOW_HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); +const BOUNDED_WAIT: Duration = Duration::from_secs(2); +const TIGHT_WAIT: Duration = Duration::from_millis(500); + +/// Builds a [`SchedulerResponse`] tagged with `session_id` and fresh ids for the rest. +/// +/// # Returns +/// +/// A scheduler assignment carrying freshly generated `job_id`, `task_id`, and `resource_group_id` +/// alongside the requested `session_id`. +fn assignment_with_session(session_id: u64) -> SchedulerResponse { + SchedulerResponse { + job_id: JobId::random(), + task_id: TaskId::Index(0), + resource_group_id: ResourceGroupId::random(), + session_id, + } +} + +/// Builds an [`ExecutionContext`] pointing at `task_func` in the integration package with the +/// given inputs. Uses a generous hard timeout so well-behaved tasks always finish before the +/// process pool kills them. +/// +/// # Returns +/// +/// A populated [`ExecutionContext`] suitable for handing to the runtime via +/// [`MockStorage::push_register_response`]. 
+fn execution_context(task_func: &str, inputs: Vec<TaskInput>) -> ExecutionContext {
+    ExecutionContext {
+        task_instance_id: 1,
+        tdl_context: TdlContext {
+            package: PACKAGE_NAME.to_owned(),
+            task_func: task_func.to_owned(),
+        },
+        timeout_policy: TimeoutPolicy {
+            soft_timeout_ms: 1_000,
+            hard_timeout_ms: 5_000,
+        },
+        inputs,
+    }
+}
+
+/// Polls `predicate` every 5 ms until it returns `true` or `timeout` elapses.
+///
+/// # Returns
+///
+/// Whether `predicate` returned `true` before the deadline.
+async fn wait_until(predicate: impl Fn() -> bool, timeout: Duration) -> bool {
+    let deadline = tokio::time::Instant::now() + timeout;
+    while !predicate() {
+        if tokio::time::Instant::now() >= deadline {
+            return false;
+        }
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+    true
+}
+
+/// Builds a fresh [`RuntimeConfig`] pointing at the real executor binary, with a unique per-test
+/// log directory and the requested `heartbeat_interval`.
+///
+/// # Returns
+///
+/// A [`RuntimeConfig`] ready to hand to [`Runtime::create`].
+///
+/// # Panics
+///
+/// Panics if the hard-coded loopback ip fails to parse — never in practice.
+fn runtime_config(heartbeat_interval: Duration) -> RuntimeConfig { + let unique = ExecutionManagerId::random(); + let log_dir = std::env::temp_dir().join(format!("spider-em-runtime-test-{unique}")); + RuntimeConfig { + em_ip: "127.0.0.1".parse().expect("parse loopback"), + heartbeat_interval, + executor_binary_path: task_executor_bin(), + package_dir: tdl_package_dir(), + log_dir, + } +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn create_registers_and_starts_heartbeats() -> anyhow::Result<()> { + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::new(); + + let (_runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness.clone()), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + assert_eq!(liveness.register_calls().len(), 1); + assert!( + liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await, + "liveness actor should send at least one heartbeat after create returns; observed {} so \ + far", + liveness.heartbeat_count() + ); + + token.cancel(); + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn create_propagates_pool_init_error() { + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::new(); + let bad_config = RuntimeConfig { + executor_binary_path: PathBuf::from("/nonexistent/spider-task-executor"), + ..runtime_config(HEARTBEAT_INTERVAL) + }; + + let result = Runtime::create(scheduler, storage, Arc::new(liveness), bad_config).await; + match result { + Err(RuntimeError::ProcessPool(_)) => {} + Err(other) => panic!("expected ProcessPool error, got {other:?}"), + Ok(_) => panic!("expected ProcessPool error, got Ok"), + } +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn 
external_cancellation_returns_ok() -> anyhow::Result<()> { + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::new(); + + let (runtime, token) = Runtime::create( + scheduler, + storage, + Arc::new(liveness.clone()), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + let join = tokio::spawn(runtime.run()); + // Let at least one heartbeat happen so we know the loop is alive before cancelling. + assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await); + + token.cancel(); + let result = tokio::time::timeout(BOUNDED_WAIT, join) + .await + .context("run did not return within bounded time")? + .context("run task panicked")?; + assert!(matches!(result, Ok(())), "expected Ok(()), got {result:?}"); + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn scheduler_error_is_retried() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + // The first poll errors; the loop should log it and poll again rather than bail. The second + // poll returns a real assignment, which we drop on the storage side to keep the test focused. + scheduler.push(Err(SchedulerError::Transport("boom".to_owned()))); + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned()))); + + let (runtime, token) = Runtime::create( + scheduler, + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + let join = tokio::spawn(runtime.run()); + + // Reaching register proves the loop retried past the scheduler error onto the next poll. 
+ assert!( + wait_until(|| !storage.register_calls().is_empty(), BOUNDED_WAIT).await, + "expected the loop to retry past the scheduler error and register the next assignment" + ); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn stale_bundle_is_dropped_without_register() -> anyhow::Result<()> { + const CURRENT_SESSION: SessionId = 10; + const STALE_SESSION: SessionId = 5; + const { assert!(CURRENT_SESSION > STALE_SESSION) }; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(CURRENT_SESSION); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + scheduler.push(Ok(assignment_with_session(STALE_SESSION))); + let join = tokio::spawn(runtime.run()); + + assert!( + wait_until(|| scheduler.call_count() >= 2, BOUNDED_WAIT).await, + "expected scheduler to be polled again after dropping stale bundle; call_count = {}", + scheduler.call_count() + ); + assert!( + storage.register_calls().is_empty(), + "storage should not be touched for a stale bundle" + ); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn newer_bundle_triggers_liveness_refresh() -> anyhow::Result<()> { + const CURRENT_SESSION: SessionId = 5; + const LATEST_SESSION: SessionId = 10; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(CURRENT_SESSION); + + // Slow interval so we can be sure the second observed heartbeat is the refresh-induced one + // (the periodic tick is 5 s away). 
+ let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness.clone()), + runtime_config(SLOW_HEARTBEAT_INTERVAL), + ) + .await?; + + // Wait for the periodic-interval's leading tick to settle so the count is a clean baseline. + assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await); + let baseline = liveness.heartbeat_count(); + + // The newer-session bundle: the runtime should call `LivenessHandle::refresh` before + // registering. Drop the bundle on the storage side to keep the test focused on the refresh. + scheduler.push(Ok(assignment_with_session(LATEST_SESSION))); + storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned()))); + let join = tokio::spawn(runtime.run()); + + assert!( + liveness.wait_for_heartbeats(baseline + 1, TIGHT_WAIT).await, + "expected an extra heartbeat (refresh) within {TIGHT_WAIT:?}; heartbeats = {}", + liveness.heartbeat_count() + ); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn equal_session_passes_through_to_register() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + // Bundle session matches the tracker exactly — runtime should skip triage and call register. + // Drop on the storage side so we don't need a real execution. 
+ scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Err(StorageResponseError::Server("test drop".to_owned()))); + let join = tokio::spawn(runtime.run()); + + assert!( + wait_until(|| !storage.register_calls().is_empty(), BOUNDED_WAIT).await, + "expected register_task_instance to be called with the bundle's session id" + ); + let calls = storage.register_calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].session_id, SESSION_ID); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn stale_session_drops_assignment_and_refreshes() -> anyhow::Result<()> { + const CURRENT_SESSION: SessionId = 10; + const STALE_SESSION: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(STALE_SESSION); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness.clone()), + runtime_config(SLOW_HEARTBEAT_INTERVAL), + ) + .await?; + + assert!(liveness.wait_for_heartbeats(1, BOUNDED_WAIT).await); + let baseline = liveness.heartbeat_count(); + + scheduler.push(Ok(assignment_with_session(STALE_SESSION))); + storage.push_register_response(Err(StorageResponseError::StaleSession { + storage_session: CURRENT_SESSION, + })); + let join = tokio::spawn(runtime.run()); + + // Stale-session response triggers liveness refresh and drops the assignment. 
+ assert!( + liveness.wait_for_heartbeats(baseline + 1, TIGHT_WAIT).await, + "expected refresh-induced heartbeat after StaleSession; heartbeats = {}", + liveness.heartbeat_count() + ); + assert!( + wait_until(|| scheduler.call_count() >= 2, BOUNDED_WAIT).await, + "expected scheduler to be polled again after stale assignment was dropped" + ); + assert_eq!(storage.register_calls().len(), 1); + assert!(storage.success_reports().is_empty()); + assert!(storage.failure_reports().is_empty()); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn recoverable_storage_errors_drop_assignment() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + // Three bundles, three recoverable register failures. Each one should cause the loop to drop + // the assignment and poll the scheduler again. + let recoverable_errors = [ + StorageResponseError::Transport("net blip".to_owned()), + StorageResponseError::CacheStale("stale cache".to_owned()), + StorageResponseError::Server("server boom".to_owned()), + ]; + for err in recoverable_errors { + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Err(err)); + } + let join = tokio::spawn(runtime.run()); + + // After all three are drained, the next scheduler call blocks because the queue is empty. 
+ assert!( + wait_until(|| scheduler.call_count() >= 4, BOUNDED_WAIT).await, + "expected 3 drops + 1 idle poll; call_count = {}", + scheduler.call_count() + ); + assert_eq!(storage.register_calls().len(), 3); + assert!(storage.success_reports().is_empty()); + assert!(storage.failure_reports().is_empty()); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn success_outcome_reports_outputs() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness.clone()), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + let em_id = liveness.em_id(); + + let assignment = assignment_with_session(SESSION_ID); + scheduler.push(Ok(assignment)); + storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64)))); + let join = tokio::spawn(runtime.run()); + + assert!(storage.wait_for_any_report(BOUNDED_WAIT).await); + let reports = storage.success_reports(); + assert_eq!(reports.len(), 1); + let report = &reports[0]; + assert_eq!(report.job_id, assignment.job_id); + assert_eq!(report.task_id, assignment.task_id); + assert_eq!(report.em_id, em_id); + assert_eq!(report.session_id, SESSION_ID); + let outputs = report + .serialized_outputs + .as_ref() + .context("success report should carry outputs")?; + assert_eq!(decode_single_output::(outputs), 55); + assert!(storage.failure_reports().is_empty()); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn non_success_outcome_keeps_loop_serving() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + 
let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + // First bundle: always_fail. Second bundle: fibonacci. If the loop bails after a failure + // outcome, the second bundle never reaches register / report. + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Ok(execution_context("always_fail", vec![]))); + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64)))); + let join = tokio::spawn(runtime.run()); + + assert!( + wait_until( + || !storage.failure_reports().is_empty() && !storage.success_reports().is_empty(), + BOUNDED_WAIT, + ) + .await, + "expected one failure (always_fail) and one success (fibonacci) report; got success={} \ + failure={}", + storage.success_reports().len(), + storage.failure_reports().len() + ); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn storage_report_error_does_not_kill_runtime() -> anyhow::Result<()> { + const SESSION_ID: SessionId = 5; + + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::with_initial_session(SESSION_ID); + + let (runtime, token) = Runtime::create( + scheduler.clone(), + storage.clone(), + Arc::new(liveness), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + // The first success report fails, the second succeeds. The runtime should keep serving + // assignments either way. 
+ storage.push_success_response(Err(StorageResponseError::Server("report boom".to_owned()))); + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64)))); + scheduler.push(Ok(assignment_with_session(SESSION_ID))); + storage.push_register_response(Ok(execution_context("fibonacci", single_input(&10_u64)))); + let join = tokio::spawn(runtime.run()); + + assert!( + wait_until(|| storage.success_reports().len() >= 2, BOUNDED_WAIT).await, + "expected two success reports; got {}", + storage.success_reports().len() + ); + + token.cancel(); + join.await??; + Ok(()) +} + +#[tokio::test] +#[ignore = "requires `integration-test-tasks` cdylib and `spider-task-executor` binary"] +async fn drop_guard_cancels_token_when_run_future_dropped() -> anyhow::Result<()> { + let scheduler = MockScheduler::new(); + let storage = MockStorage::new(); + let liveness = MockLiveness::new(); + + let (runtime, _token) = Runtime::create( + scheduler, + storage, + Arc::new(liveness.clone()), + runtime_config(HEARTBEAT_INTERVAL), + ) + .await?; + + // Make sure the actor is actively ticking before we drop the runtime. + assert!(liveness.wait_for_heartbeats(2, BOUNDED_WAIT).await); + + // Dropping the `runtime.run()` future inside a short timeout drops the Runtime itself, which + // fires the `DropGuard` and cancels the token the liveness actor watches. + let timeout_result = tokio::time::timeout(Duration::from_millis(150), runtime.run()).await; + assert!( + timeout_result.is_err(), + "run unexpectedly returned within the timeout window: {timeout_result:?}" + ); + + // Give the actor a moment to observe cancellation and drain any in-flight heartbeat call. + tokio::time::sleep(2 * HEARTBEAT_INTERVAL).await; + let snapshot = liveness.heartbeat_count(); + + // Five heartbeat intervals must elapse without the counter advancing. 
+ tokio::time::sleep(5 * HEARTBEAT_INTERVAL).await; + let current = liveness.heartbeat_count(); + assert_eq!( + current, snapshot, + "liveness actor kept heartbeating after Runtime drop; was {snapshot}, now {current}" + ); + Ok(()) +} diff --git a/tests/huntsman/test-utils/src/mock.rs b/tests/huntsman/test-utils/src/mock.rs index e9115759..36db4137 100644 --- a/tests/huntsman/test-utils/src/mock.rs +++ b/tests/huntsman/test-utils/src/mock.rs @@ -18,14 +18,274 @@ use std::{ }; use async_trait::async_trait; -use spider_core::types::id::{ExecutionManagerId, SessionId}; +use spider_core::types::{ + id::{ExecutionManagerId, JobId, SessionId, TaskId}, + io::ExecutionContext, +}; use spider_execution_manager::client::{ LivenessClient, LivenessResponseError, RegistrationResponse, + SchedulerClient, + SchedulerError, + SchedulerResponse, + StorageClient, + StorageResponseError, }; use tokio::sync::Notify; +/// Mock [`SchedulerClient`]. +#[derive(Clone)] +pub struct MockScheduler { + inner: Arc, +} + +impl MockScheduler { + /// Factory function. + /// + /// # Returns + /// + /// A fresh scheduler mock with an empty response queue. `next_task` blocks until the test + /// pushes a response. + #[must_use] + pub fn new() -> Self { + Self { + inner: Arc::new(SchedulerInner { + responses: Mutex::new(VecDeque::new()), + notify: Notify::new(), + call_count: AtomicU64::new(0), + }), + } + } + + /// Queues `response` for the next pending or future [`SchedulerClient::next_task`] call. + pub fn push(&self, response: Result) { + lock(&self.inner.responses).push_back(response); + self.inner.notify.notify_waiters(); + } + + /// # Returns + /// + /// The number of `next_task` calls the scheduler has served (including ones that are still + /// blocked waiting on the response queue). 
+ #[must_use] + pub fn call_count(&self) -> u64 { + self.inner.call_count.load(Ordering::Relaxed) + } +} + +impl Default for MockScheduler { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl SchedulerClient for MockScheduler { + async fn next_task( + &self, + _em_id: ExecutionManagerId, + ) -> Result { + self.inner.call_count.fetch_add(1, Ordering::Relaxed); + loop { + let notified = self.inner.notify.notified(); + let popped = lock(&self.inner.responses).pop_front(); + if let Some(response) = popped { + return response; + } + notified.await; + } + } +} + +/// Captured arguments of one `register_task_instance` call. +#[derive(Debug, Clone)] +pub struct RegisterCall { + pub job_id: JobId, + pub task_id: TaskId, + pub em_id: ExecutionManagerId, + pub session_id: SessionId, +} + +/// Captured arguments of one `report_task_success` call. +#[derive(Debug, Clone)] +pub struct SuccessReport { + pub job_id: JobId, + pub task_id: TaskId, + pub em_id: ExecutionManagerId, + pub session_id: SessionId, + pub serialized_outputs: Option>, +} + +/// Captured arguments of one `report_task_failure` call. +#[derive(Debug, Clone)] +pub struct FailureReport { + pub job_id: JobId, + pub task_id: TaskId, + pub em_id: ExecutionManagerId, + pub session_id: SessionId, + pub error_message: String, +} + +/// Mock [`StorageClient`]. +#[derive(Clone)] +pub struct MockStorage { + inner: Arc, +} + +impl MockStorage { + /// Factory function. + /// + /// # Returns + /// + /// A storage mock with no programmed responses. Tests must push register responses before + /// they fire; success / failure reports default to `Ok(())`. 
+ #[must_use] + pub fn new() -> Self { + Self { + inner: Arc::new(StorageInner { + register_responses: Mutex::new(VecDeque::new()), + success_responses: Mutex::new(VecDeque::new()), + failure_responses: Mutex::new(VecDeque::new()), + register_calls: Mutex::new(Vec::new()), + success_reports: Mutex::new(Vec::new()), + failure_reports: Mutex::new(Vec::new()), + notify: Notify::new(), + }), + } + } + + /// Queues `response` for the next `register_task_instance` call. + pub fn push_register_response(&self, response: Result) { + lock(&self.inner.register_responses).push_back(response); + } + + /// Queues `response` for the next `report_task_success` call. + pub fn push_success_response(&self, response: Result<(), StorageResponseError>) { + lock(&self.inner.success_responses).push_back(response); + } + + /// Queues `response` for the next `report_task_failure` call. + pub fn push_failure_response(&self, response: Result<(), StorageResponseError>) { + lock(&self.inner.failure_responses).push_back(response); + } + + /// # Returns + /// + /// A snapshot of every `register_task_instance` call recorded so far. + #[must_use] + pub fn register_calls(&self) -> Vec { + lock(&self.inner.register_calls).clone() + } + + /// # Returns + /// + /// A snapshot of every `report_task_success` call recorded so far. + #[must_use] + pub fn success_reports(&self) -> Vec { + lock(&self.inner.success_reports).clone() + } + + /// # Returns + /// + /// A snapshot of every `report_task_failure` call recorded so far. + #[must_use] + pub fn failure_reports(&self) -> Vec { + lock(&self.inner.failure_reports).clone() + } + + /// Waits for at least one `report_*` call to be recorded, with a bounded total wait time. + /// + /// # Returns + /// + /// Whether a report was observed before `timeout` elapsed. 
+ pub async fn wait_for_any_report(&self, timeout: Duration) -> bool { + let deadline = tokio::time::Instant::now() + timeout; + loop { + if !self.success_reports().is_empty() || !self.failure_reports().is_empty() { + return true; + } + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + return false; + } + let notified = self.inner.notify.notified(); + tokio::select! { + () = notified => {} + () = tokio::time::sleep(remaining.min(POLL_INTERVAL)) => {} + } + } + } +} + +impl Default for MockStorage { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl StorageClient for MockStorage { + async fn register_task_instance( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + ) -> Result { + lock(&self.inner.register_calls).push(RegisterCall { + job_id, + task_id, + em_id, + session_id, + }); + let response = lock(&self.inner.register_responses).pop_front(); + response.expect("mock storage exhausted register responses") + } + + async fn report_task_success( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + serialized_outputs: Option>, + ) -> Result<(), StorageResponseError> { + lock(&self.inner.success_reports).push(SuccessReport { + job_id, + task_id, + em_id, + session_id, + serialized_outputs, + }); + self.inner.notify.notify_waiters(); + lock(&self.inner.success_responses) + .pop_front() + .unwrap_or(Ok(())) + } + + async fn report_task_failure( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + error_message: String, + ) -> Result<(), StorageResponseError> { + lock(&self.inner.failure_reports).push(FailureReport { + job_id, + task_id, + em_id, + session_id, + error_message, + }); + self.inner.notify.notify_waiters(); + lock(&self.inner.failure_responses) + .pop_front() + .unwrap_or(Ok(())) + } +} + /// Mock [`LivenessClient`]. 
#[derive(Clone)] pub struct MockLiveness { @@ -168,6 +428,24 @@ impl LivenessClient for MockLiveness { /// Default polling interval for `wait_until_*` helpers. Short enough to keep tests snappy. const POLL_INTERVAL: Duration = Duration::from_millis(5); +/// Shared state behind [`MockScheduler`]. +struct SchedulerInner { + responses: Mutex>>, + notify: Notify, + call_count: AtomicU64, +} + +/// Shared state behind [`MockStorage`]. +struct StorageInner { + register_responses: Mutex>>, + success_responses: Mutex>>, + failure_responses: Mutex>>, + register_calls: Mutex>, + success_reports: Mutex>, + failure_reports: Mutex>, + notify: Notify, +} + /// Shared state behind [`MockLiveness`]. struct LivenessInner { em_id: ExecutionManagerId, From 0683275d84d7c2e318feb3788f0223792ae90b8d Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Sun, 7 Jun 2026 15:58:48 -0400 Subject: [PATCH 08/14] build: Split the Ubuntu dev-dependency install script into common, huntsman, and wolf variants. 
(#336) Co-authored-by: sitaowang1998 --- .devcontainer/Dockerfile | 3 +- .github/workflows/code-linting-checks.yaml | 8 ++--- .github/workflows/tests.yaml | 4 +-- .../lib_install/ubuntu/install-dev-common.sh | 29 +++++++++++++++++++ .../ubuntu/install-dev-huntsman.sh | 26 +++++++++++++++++ .../install-dev-wolf.sh} | 22 ++++++-------- 6 files changed, 72 insertions(+), 20 deletions(-) create mode 100755 tools/scripts/lib_install/ubuntu/install-dev-common.sh create mode 100755 tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh rename tools/scripts/lib_install/{linux/install-dev.sh => ubuntu/install-dev-wolf.sh} (70%) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a47c1a5e..74610fb6 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -5,7 +5,8 @@ WORKDIR /root RUN mkdir -p ./tools/scripts/lib_install COPY ./tools/scripts/lib_install ./tools/scripts/lib_install -RUN ./tools/scripts/lib_install/linux/install-dev.sh +RUN ./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh +RUN ./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh # NOTE: # `task` doesn't have an apt/yum package so we use its install script. 
diff --git a/.github/workflows/code-linting-checks.yaml b/.github/workflows/code-linting-checks.yaml index 2890ea61..fa37293f 100644 --- a/.github/workflows/code-linting-checks.yaml +++ b/.github/workflows/code-linting-checks.yaml @@ -47,7 +47,7 @@ jobs: - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-common.sh" - run: "task lint:toml-check" @@ -75,7 +75,7 @@ jobs: - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh" - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions" @@ -151,7 +151,7 @@ jobs: - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-common.sh" - run: "task lint:py-check" @@ -174,7 +174,7 @@ jobs: - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh" - run: "task lint:check-rust" diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index e8ee9c47..46870c49 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -59,7 +59,7 @@ jobs: - uses: "./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-wolf.sh" - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions" @@ -97,7 +97,7 @@ jobs: - uses: 
"./tools/yscope-dev-utils/exports/github/actions/install-uv" - name: "Install dev dependencies" - run: "./tools/scripts/lib_install/linux/install-dev.sh" + run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh" - uses: "./tools/yscope-dev-utils/exports/github/actions/print-tool-versions" diff --git a/tools/scripts/lib_install/ubuntu/install-dev-common.sh b/tools/scripts/lib_install/ubuntu/install-dev-common.sh new file mode 100755 index 00000000..4a24c8e4 --- /dev/null +++ b/tools/scripts/lib_install/ubuntu/install-dev-common.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Installs the dev dependencies shared by all versions of Spider. + +# Exit on any error +set -e + +# Error on undefined variable +set -u + +echo "Checking for elevated privileges..." +privileged_command_prefix="" +if [ ${EUID:-$(id -u)} -ne 0 ] ; then + sudo echo "Script can elevate privileges." + privileged_command_prefix="${privileged_command_prefix} sudo" +fi + +${privileged_command_prefix} apt-get update +DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \ +apt-get install --no-install-recommends -y \ + ca-certificates \ + curl \ + git \ + python3 \ + python3-pip \ + python3-venv + +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh diff --git a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh new file mode 100755 index 00000000..73b64eab --- /dev/null +++ b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Installs the dev dependencies for Spider Huntsman. + +# Exit on any error +set -e + +# Error on undefined variable +set -u + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +"$script_dir"/install-dev-common.sh + +echo "Checking for elevated privileges..." +privileged_command_prefix="" +if [ ${EUID:-$(id -u)} -ne 0 ] ; then + sudo echo "Script can elevate privileges." 
+ privileged_command_prefix="${privileged_command_prefix} sudo" +fi + +# `gcc` and `libc6-dev` are required by `rustc`, which invokes the system C compiler driver to +# link binaries against libc. +DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \ +apt-get install --no-install-recommends -y \ + gcc \ + libc6-dev diff --git a/tools/scripts/lib_install/linux/install-dev.sh b/tools/scripts/lib_install/ubuntu/install-dev-wolf.sh similarity index 70% rename from tools/scripts/lib_install/linux/install-dev.sh rename to tools/scripts/lib_install/ubuntu/install-dev-wolf.sh index c1f1b4f7..2c816b0a 100755 --- a/tools/scripts/lib_install/linux/install-dev.sh +++ b/tools/scripts/lib_install/ubuntu/install-dev-wolf.sh @@ -1,25 +1,28 @@ #!/usr/bin/env bash +# Installs the dev dependencies for Spider Wolf. + # Exit on any error set -e # Error on undefined variable set -u +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +"$script_dir"/install-dev-common.sh + echo "Checking for elevated privileges..." privileged_command_prefix="" if [ ${EUID:-$(id -u)} -ne 0 ] ; then sudo echo "Script can elevate privileges." privileged_command_prefix="${privileged_command_prefix} sudo" fi -${privileged_command_prefix} apt-get update -DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} apt-get install --no-install-recommends -y \ - ca-certificates \ + +DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \ +apt-get install --no-install-recommends -y \ checkinstall \ - curl \ g++ \ gcc \ - git \ jq \ libcurl4 \ libcurl4-openssl-dev \ @@ -27,16 +30,9 @@ DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} apt-get install --no libssl-dev \ make \ openjdk-11-jdk \ - pkg-config \ - python3 \ - python3-pip \ - python3-venv + pkg-config -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir="$script_dir/.." 
${privileged_command_prefix} "$lib_install_scripts_dir"/install-cmake.sh 3.23.5 # TODO https://github.com/y-scope/spider/issues/86 "$lib_install_scripts_dir"/check-cmake-version.sh - -# Install uv -curl -LsSf https://astral.sh/uv/install.sh | sh From e39e319b91e7f6623af9022ba12446d8dba637f7 Mon Sep 17 00:00:00 2001 From: sitaowang1998 Date: Sun, 7 Jun 2026 16:37:34 -0400 Subject: [PATCH 09/14] feat(huntsman): Add protobuf scaffolding and gRPC StorageClient for task instance management: (#333) * Add `spider-proto` component for protobuf source files. * Add `spider-proto-rust` component for the generated Rust code. * Add tasks to build the generated Rust code. * Add a GitHub workflow to verify the generated Rust code is up-to-date. * Implement a gRPC-backed `StorageClient` for the execution manager. Co-authored-by: LinZhihao-723 --- .../proto-generated-code-checks.yaml | 38 + Cargo.lock | 889 ++++++++++++++++-- Cargo.toml | 11 + .../spider-execution-manager/Cargo.toml | 2 + .../spider-execution-manager/src/client.rs | 2 + .../src/client/grpc/mod.rs | 5 + .../src/client/grpc/storage.rs | 225 +++++ components/spider-proto-rust/Cargo.toml | 16 + components/spider-proto-rust/build.rs | 81 ++ .../src/generated/storage.rs | 632 +++++++++++++ components/spider-proto-rust/src/id.rs | 44 + components/spider-proto-rust/src/lib.rs | 8 + components/spider-proto/storage/storage.proto | 71 ++ taskfiles/build.yaml | 20 + taskfiles/lint.yaml | 2 +- taskfiles/test.yaml | 9 +- .../ubuntu/install-dev-huntsman.sh | 3 +- 17 files changed, 1978 insertions(+), 80 deletions(-) create mode 100644 .github/workflows/proto-generated-code-checks.yaml create mode 100644 components/spider-execution-manager/src/client/grpc/mod.rs create mode 100644 components/spider-execution-manager/src/client/grpc/storage.rs create mode 100644 components/spider-proto-rust/Cargo.toml create mode 100644 components/spider-proto-rust/build.rs create mode 100644 components/spider-proto-rust/src/generated/storage.rs 
create mode 100644 components/spider-proto-rust/src/id.rs create mode 100644 components/spider-proto-rust/src/lib.rs create mode 100644 components/spider-proto/storage/storage.proto diff --git a/.github/workflows/proto-generated-code-checks.yaml b/.github/workflows/proto-generated-code-checks.yaml new file mode 100644 index 00000000..823639fa --- /dev/null +++ b/.github/workflows/proto-generated-code-checks.yaml @@ -0,0 +1,38 @@ +name: "proto-generated-code-checks" + +on: + pull_request: + push: + schedule: + # Run daily at 00:15 UTC (the 15 is to avoid periods of high load) + - cron: "15 0 * * *" + workflow_dispatch: + +concurrency: + group: "${{github.workflow}}-${{github.ref}}" + # Cancel in-progress jobs for efficiency + cancel-in-progress: true + +jobs: + proto-code-committed: + name: "proto-code-committed" + runs-on: "ubuntu-latest" + steps: + - uses: "actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd" # v6.0.2 + with: + submodules: "recursive" + + - uses: "./tools/yscope-dev-utils/exports/github/actions/install-python" + + - uses: "./tools/yscope-dev-utils/exports/github/actions/install-go-task" + + - name: "Install dev dependencies" + run: "./tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh" + + - name: "`spider-proto-rust` code generation" + shell: "bash" + run: "task build:spider-proto-rust-codegen" + + - name: "Check if the generated proto code is the latest" + shell: "bash" + run: "git diff --exit-code components/spider-proto-rust/src/generated" diff --git a/Cargo.lock b/Cargo.lock index a6653baa..0f3763a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,6 +35,28 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = 
"0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -55,11 +77,64 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] [[package]] name = "base64" @@ -84,9 +159,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" dependencies = [ "serde_core", ] @@ -217,9 +292,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "6.1.0" +version = "6.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -265,9 +340,9 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -282,9 +357,9 @@ checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" [[package]] name = "either" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" dependencies = [ "serde", ] @@ -348,6 +423,18 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "flume" version = "0.11.1" @@ -502,10 +589,48 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 
5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", "wasip2", + "wasip3", +] + +[[package]] +name = "h2" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.14.0", + "slab", + "tokio", + "tokio-util", + "tracing", ] +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" @@ -525,9 +650,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.17.0" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" [[package]] name = "hashlink" @@ -577,6 +702,51 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "http" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "huntsman-complex" version = "0.1.0" @@ -594,6 +764,61 @@ dependencies = [ "spider-tdl", ] +[[package]] +name = "hyper" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "libc", + "pin-project-lite", + "socket2 0.6.4", + "tokio", + "tower-service", + "tracing", +] + [[package]] name = "icu_collections" version = "2.2.0" @@ -676,6 +901,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -697,6 +928,16 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -704,7 +945,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.17.0", + "hashbrown 0.17.1", + "serde", + "serde_core", ] [[package]] @@ -715,6 +958,15 @@ dependencies = [ "spider-tdl", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -745,6 +997,12 @@ dependencies = [ "spin", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.186" @@ -769,14 +1027,14 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" dependencies = [ "bitflags", "libc", "plain", - "redox_syscall 0.7.5", + "redox_syscall 0.8.1", ] [[package]] @@ -789,6 +1047,12 @@ dependencies = [ "vcpkg", ] 
+[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "litemap" version = "0.8.2" @@ -806,9 +1070,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.29" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "matchers" @@ -819,6 +1083,12 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "md-5" version = "0.10.6" @@ -831,21 +1101,33 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mio" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda" dependencies = [ "libc", "wasi", "windows-sys 0.61.2", ] +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + [[package]] name = 
"non-empty-string" version = "0.2.6" @@ -963,6 +1245,36 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap 2.14.0", +] + +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -1020,6 +1332,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + [[package]] name = "proc-macro-error-attr2" version = "2.0.0" @@ -1051,6 +1373,58 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools", + "log", + 
"multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.117", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost", +] + [[package]] name = "quote" version = "1.0.45" @@ -1066,6 +1440,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.8.6" @@ -1136,13 +1516,25 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.5" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4666a1a60d8412eab19d94f6d13dcc9cea0a5ef4fdf6a5db306537413c661b1b" +checksum = "5b44b894f2a6e36457d665d1e08c3866add6ed5e70050c1b4ba8a8ddedb02ce7" dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.14" @@ -1200,31 +1592,35 @@ dependencies = [ ] [[package]] -name = "ryu" -version = "1.0.23" +name = "rustix" +version = "1.1.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] [[package]] -name = "scc" -version = "2.4.0" +name = "rustversion" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc" -dependencies = [ - "sdd", -] +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] -name = "scopeguard" -version = "1.2.0" +name = "ryu" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] -name = "sdd" -version = "3.0.10" +name = "scopeguard" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "secrecy" @@ -1274,9 +1670,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "itoa", "memchr", @@ -1299,9 +1695,9 @@ dependencies = [ [[package]] name = "serial_test" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f" +checksum = 
"699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d" dependencies = [ "fslock", "futures-executor", @@ -1309,15 +1705,14 @@ dependencies = [ "log", "once_cell", "parking_lot", - "scc", "serial_test_derive", ] [[package]] name = "serial_test_derive" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9" +checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" dependencies = [ "proc-macro2", "quote", @@ -1392,9 +1787,19 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.3" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", "windows-sys 0.61.2", @@ -1438,14 +1843,26 @@ dependencies = [ "futures-util", "rmp-serde", "spider-core", + "spider-proto-rust", "spider-task-executor", "spider-tdl", "thiserror", "tokio", "tokio-util", + "tonic", "tracing", ] +[[package]] +name = "spider-proto-rust" +version = "0.1.0" +dependencies = [ + "prost", + "spider-core", + "tonic", + "tonic-build", +] + [[package]] name = "spider-storage" version = "0.1.0" @@ -1562,7 +1979,7 @@ dependencies = [ "futures-util", "hashbrown 0.15.5", "hashlink", - "indexmap", + "indexmap 2.14.0", "log", "memchr", "once_cell", @@ -1785,6 +2202,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + [[package]] name = "synstructure" version = "0.13.2" @@ -1847,6 +2270,19 @@ dependencies = [ "spider-tdl", ] +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "test-utils" version = "0.1.0" @@ -1930,16 +2366,16 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.52.2" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "110a78583f19d5cdb2c5ccf321d1290344e71313c6c37d43520d386027d18386" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", "mio", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.4", "tokio-macros", "windows-sys 0.61.2", ] @@ -1980,6 +2416,96 @@ dependencies = [ "tokio", ] +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost", + "socket2 0.5.10", + "tokio", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn 2.0.117", 
+] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.6", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + [[package]] name = "tracing" version = "0.1.44" @@ -2041,11 +2567,17 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "typenum" -version = "1.20.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" [[package]] name = "unicode-bidi" @@ -2122,6 +2654,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "want" 
+version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -2134,7 +2675,16 @@ version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -2143,6 +2693,40 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.14.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.14.0", + "semver", +] + [[package]] name = "whoami" version = "1.6.1" @@ -2187,7 +2771,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -2205,13 +2798,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -2220,48 +2829,184 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" 
+[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.14.0", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 
2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.14.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.14.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.3" @@ -2270,9 +3015,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -2293,18 +3038,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639" dependencies 
= [ "proc-macro2", "quote", @@ -2313,9 +3058,9 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ "zerofrom-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 08d6f85b..86489204 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "components/spider-core", "components/spider-derive", "components/spider-execution-manager", + "components/spider-proto-rust", "components/spider-storage", "components/spider-task-executor", "components/spider-tdl", @@ -16,3 +17,13 @@ members = [ "tests/huntsman/tdl-integration", "tests/huntsman/test-utils", ] +default-members = [ + "components/spider-core", + "components/spider-derive", + "components/spider-execution-manager", + "components/spider-proto-rust", + "components/spider-storage", + "components/spider-task-executor", + "components/spider-tdl", + "components/spider-tdl-derive", +] diff --git a/components/spider-execution-manager/Cargo.toml b/components/spider-execution-manager/Cargo.toml index 10f0e3ac..13562728 100644 --- a/components/spider-execution-manager/Cargo.toml +++ b/components/spider-execution-manager/Cargo.toml @@ -18,6 +18,7 @@ futures-util = { } rmp-serde = "1.3.1" spider-core = { path = "../spider-core" } +spider-proto-rust = { path = "../spider-proto-rust" } spider-task-executor = { path = "../spider-task-executor" } spider-tdl = { path = "../spider-tdl" } thiserror = "2.0.18" @@ -26,4 +27,5 @@ tokio = { features = ["io-util", "macros", "process", "rt", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "rt"] } +tonic = "0.12.3" tracing = { version = "0.1.41", default-features = false, features = ["std"] } diff --git a/components/spider-execution-manager/src/client.rs 
b/components/spider-execution-manager/src/client.rs index 4f335f6e..63b132ce 100644 --- a/components/spider-execution-manager/src/client.rs +++ b/components/spider-execution-manager/src/client.rs @@ -6,10 +6,12 @@ //! * [`storage::StorageClient`] — registers task instances and reports their outcome. //! * [`liveness::LivenessClient`] — registers the EM at boot and ticks the heartbeat thereafter. +pub mod grpc; pub mod liveness; pub mod scheduler; pub mod storage; +pub use grpc::GrpcStorageClient; pub use liveness::{LivenessClient, LivenessResponseError, RegistrationResponse}; pub use scheduler::{SchedulerClient, SchedulerError, SchedulerResponse}; pub use storage::{StorageClient, StorageResponseError}; diff --git a/components/spider-execution-manager/src/client/grpc/mod.rs b/components/spider-execution-manager/src/client/grpc/mod.rs new file mode 100644 index 00000000..9f15ee9a --- /dev/null +++ b/components/spider-execution-manager/src/client/grpc/mod.rs @@ -0,0 +1,5 @@ +//! gRPC-backed implementations of the execution manager's client traits. + +pub mod storage; + +pub use storage::GrpcStorageClient; diff --git a/components/spider-execution-manager/src/client/grpc/storage.rs b/components/spider-execution-manager/src/client/grpc/storage.rs new file mode 100644 index 00000000..c037814a --- /dev/null +++ b/components/spider-execution-manager/src/client/grpc/storage.rs @@ -0,0 +1,225 @@ +//! gRPC-backed [`StorageClient`] implementation. +//! +//! Wraps the generated [`TaskInstanceManagementServiceClient`] and adapts its protobuf +//! request/response types to the transport-agnostic [`StorageClient`] trait. 
+ +use async_trait::async_trait; +use spider_core::types::{ + id::{ExecutionManagerId, JobId, SessionId, TaskId}, + io::ExecutionContext, +}; +use spider_proto_rust::storage::{ + self, + register_task_instance_response, + storage_error, + storage_operation_response, + task_instance_management_service_client::TaskInstanceManagementServiceClient, +}; +use tonic::transport::{Channel, Endpoint}; + +use crate::client::storage::{StorageClient, StorageResponseError}; + +/// gRPC-backed [`StorageClient`] implementation. +#[derive(Debug, Clone)] +pub struct GrpcStorageClient { + client: TaskInstanceManagementServiceClient, +} + +impl GrpcStorageClient { + /// Connects to the storage gRPC endpoint. + /// + /// # Returns + /// + /// A new [`GrpcStorageClient`] connected to `endpoint` on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageResponseError::Transport`] if tonic cannot create or connect to the endpoint. + pub async fn connect(endpoint: Endpoint) -> Result { + TaskInstanceManagementServiceClient::connect(endpoint) + .await + .map(|inner| Self { client: inner }) + .map_err(to_transport_error) + } +} + +#[async_trait] +impl StorageClient for GrpcStorageClient { + async fn register_task_instance( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + ) -> Result { + let request = storage::RegisterTaskInstanceRequest { + job_id: job_id.get(), + task_id: Some(storage::TaskId::from(task_id)), + execution_manager_id: em_id.get(), + session_id, + }; + let response = self + .client + .clone() + .register_task_instance(request) + .await + .map_err(to_transport_error)? 
+ .into_inner(); + + match response.result { + Some(register_task_instance_response::Result::ExecutionContext(bytes)) => { + bincode::deserialize(&bytes).map_err(|error| { + StorageResponseError::Transport(format!( + "failed to decode execution context: {error}" + )) + }) + } + Some(register_task_instance_response::Result::Error(error)) => Err(error.into()), + None => Err(StorageResponseError::Transport( + "register task instance response missing result".to_owned(), + )), + } + } + + async fn report_task_success( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + serialized_outputs: Option>, + ) -> Result<(), StorageResponseError> { + let request = storage::ReportTaskSuccessRequest { + job_id: job_id.get(), + task_id: Some(storage::TaskId::from(task_id)), + execution_manager_id: em_id.get(), + session_id, + serialized_outputs: serialized_outputs.unwrap_or_default(), + }; + let response = self + .client + .clone() + .report_task_success(request) + .await + .map_err(to_transport_error)? + .into_inner(); + + storage_operation_response_to_result(response) + } + + async fn report_task_failure( + &self, + job_id: JobId, + task_id: TaskId, + em_id: ExecutionManagerId, + session_id: SessionId, + error_message: String, + ) -> Result<(), StorageResponseError> { + let request = storage::ReportTaskFailureRequest { + job_id: job_id.get(), + task_id: Some(storage::TaskId::from(task_id)), + execution_manager_id: em_id.get(), + session_id, + error_message, + }; + let response = self + .client + .clone() + .report_task_failure(request) + .await + .map_err(to_transport_error)? 
+ .into_inner(); + + storage_operation_response_to_result(response) + } +} + +impl From for StorageResponseError { + fn from(error: storage::StorageError) -> Self { + match storage_error::ErrCode::try_from(error.err_code) { + Ok(storage_error::ErrCode::StaleSession) => Self::StaleSession { + storage_session: error.storage_session, + }, + Ok(storage_error::ErrCode::CacheStale) => Self::CacheStale(error.message), + Ok(storage_error::ErrCode::Transport) => Self::Transport(error.message), + Ok(storage_error::ErrCode::Server | storage_error::ErrCode::Unspecified) => { + Self::Server(error.message) + } + Ok(storage_error::ErrCode::InvalidInput) => Self::InvalidInput(error.message), + Err(error) => Self::Transport(format!("unknown storage error kind: {error}")), + } + } +} + +/// # Returns +/// +/// [`storage::StorageOperationResponse`] converted into [`Result<(), StorageResponseError>`]. +fn storage_operation_response_to_result( + response: storage::StorageOperationResponse, +) -> Result<(), StorageResponseError> { + match response.result { + Some(storage_operation_response::Result::Ok(_)) => Ok(()), + Some(storage_operation_response::Result::Error(error)) => Err(error.into()), + None => Err(StorageResponseError::Transport( + "storage operation response missing `result` message".to_owned(), + )), + } +} + +/// Converts a displayable transport-layer error into [`StorageResponseError::Transport`]. +/// +/// # Returns +/// +/// A [`StorageResponseError::Transport`] containing `error`'s display string. 
+fn to_transport_error(error: impl std::fmt::Display) -> StorageResponseError { + StorageResponseError::Transport(error.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn storage_error_maps_stale_session() { + let error = storage::StorageError { + err_code: storage_error::ErrCode::StaleSession.into(), + message: "stale".to_owned(), + storage_session: 7, + }; + + match StorageResponseError::from(error) { + StorageResponseError::StaleSession { storage_session } => { + assert_eq!(7, storage_session); + } + error => panic!("unexpected storage response error: {error:?}"), + } + } + + #[test] + fn storage_error_maps_unknown_kind_to_transport_error() { + let error = storage::StorageError { + err_code: 99, + message: "unknown".to_owned(), + storage_session: 0, + }; + + match StorageResponseError::from(error) { + StorageResponseError::Transport(message) => { + assert!(message.contains("unknown storage error kind")); + } + error => panic!("unexpected storage response error: {error:?}"), + } + } + + #[test] + fn missing_storage_operation_result_is_transport_error() { + match storage_operation_response_to_result(storage::StorageOperationResponse { + result: None, + }) { + Err(StorageResponseError::Transport(_)) => {} + result => panic!("unexpected storage operation result: {result:?}"), + } + } +} diff --git a/components/spider-proto-rust/Cargo.toml b/components/spider-proto-rust/Cargo.toml new file mode 100644 index 00000000..6a5e53db --- /dev/null +++ b/components/spider-proto-rust/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "spider-proto-rust" +version = "0.1.0" +edition = "2024" + +[lib] +name = "spider_proto_rust" +path = "src/lib.rs" + +[dependencies] +prost = "0.13.5" +spider-core = { path = "../spider-core" } +tonic = "0.12.3" + +[build-dependencies] +tonic-build = "0.12.3" diff --git a/components/spider-proto-rust/build.rs b/components/spider-proto-rust/build.rs new file mode 100644 index 00000000..e4e8d880 --- /dev/null +++ 
b/components/spider-proto-rust/build.rs @@ -0,0 +1,81 @@ +use std::{env, fs, path::PathBuf}; + +/// The environment variable that, if set, forces the build script to regenerate the protobuf code. +const SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE: &str = "SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE"; + +/// The default destination directory for generated protobuf code, relative to the crate root. +const SPIDER_PROTO_RUST_GENERATED_DIR: &str = "src/generated"; + +/// The root of the protobuf source files. +const SPIDER_PROTO_ROOT: &str = "spider-proto"; + +/// The protobuf source files to compile, relative to [`SPIDER_PROTO_ROOT`]. +const SPIDER_PROTO_SOURCE_FILES: &[&str] = &["storage/storage.proto"]; + +fn main() { + // Rerun the build script whenever the generation gate is toggled or changes value. + println!("cargo:rerun-if-env-changed={SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE}"); + + let crate_root = PathBuf::from( + env::var_os("CARGO_MANIFEST_DIR").expect("`CARGO_MANIFEST_DIR` env var not set"), + ); + let components_root = crate_root + .parent() + .expect("`CARGO_MANIFEST_DIR` is not a directory"); + + let spider_proto_root = components_root.join(SPIDER_PROTO_ROOT); + let spider_proto_sources = SPIDER_PROTO_SOURCE_FILES + .iter() + .map(|relative_path| { + let abs_path = spider_proto_root.join(relative_path); + println!("cargo:rerun-if-changed={}", abs_path.display()); + abs_path + }) + .collect::>(); + + if env::var_os(SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE).is_none() { + // The committed generated code is used as-is. + return; + } + + let generate_from_source = + env::var_os(SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE).is_some_and(|val| { + const ON: &str = "ON"; + const OFF: &str = "OFF"; + match val.to_str() { + Some(ON) => true, + Some(OFF) => false, + _ => panic!( + "invalid value for {SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE}: expected '{ON}' \ + or '{OFF}'" + ), + } + }); + + if !generate_from_source { + // The committed generated code is used as-is. 
+ return; + } + + let out_dir = crate_root.join(SPIDER_PROTO_RUST_GENERATED_DIR); + if out_dir.exists() { + fs::remove_dir_all(&out_dir).expect("failed to remove existing generated code"); + } + fs::create_dir_all(&out_dir).expect("failed to create output dir for generated code"); + + tonic_build::configure() + .build_client(true) + .build_server(true) + .out_dir(&out_dir) + .compile_protos( + spider_proto_sources.as_ref(), + &[spider_proto_root.as_path()], + ) + .inspect_err(|e| eprintln!("Failed to compile `spider-proto`: {e:?}")) + .expect("proto compilation failed"); + + // NOTE: The generated outputs are deliberately NOT tracked with `cargo:rerun-if-changed`. Cargo + // compares the tracked paths' mtimes against the build script's recorded output file, whose + // mtime is not guaranteed to postdate files written by this script in the same run, so tracking + // our own outputs would make every subsequent build appear dirty. +} diff --git a/components/spider-proto-rust/src/generated/storage.rs b/components/spider-proto-rust/src/generated/storage.rs new file mode 100644 index 00000000..4a4cd353 --- /dev/null +++ b/components/spider-proto-rust/src/generated/storage.rs @@ -0,0 +1,632 @@ +// This file is @generated by prost-build. +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct RegisterTaskInstanceRequest { + #[prost(uint64, tag = "1")] + pub job_id: u64, + #[prost(message, optional, tag = "2")] + pub task_id: ::core::option::Option, + #[prost(uint64, tag = "3")] + pub execution_manager_id: u64, + #[prost(uint64, tag = "4")] + pub session_id: u64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RegisterTaskInstanceResponse { + #[prost(oneof = "register_task_instance_response::Result", tags = "1, 2")] + pub result: ::core::option::Option, +} +/// Nested message and enum types in `RegisterTaskInstanceResponse`. 
+pub mod register_task_instance_response { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Result { + #[prost(bytes, tag = "1")] + ExecutionContext(::prost::alloc::vec::Vec), + #[prost(message, tag = "2")] + Error(super::StorageError), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ReportTaskSuccessRequest { + #[prost(uint64, tag = "1")] + pub job_id: u64, + #[prost(message, optional, tag = "2")] + pub task_id: ::core::option::Option, + #[prost(uint64, tag = "3")] + pub execution_manager_id: u64, + #[prost(uint64, tag = "4")] + pub session_id: u64, + #[prost(bytes = "vec", tag = "5")] + pub serialized_outputs: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ReportTaskFailureRequest { + #[prost(uint64, tag = "1")] + pub job_id: u64, + #[prost(message, optional, tag = "2")] + pub task_id: ::core::option::Option, + #[prost(uint64, tag = "3")] + pub execution_manager_id: u64, + #[prost(uint64, tag = "4")] + pub session_id: u64, + #[prost(string, tag = "5")] + pub error_message: ::prost::alloc::string::String, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct TaskId { + #[prost(oneof = "task_id::Kind", tags = "1, 2, 3")] + pub kind: ::core::option::Option, +} +/// Nested message and enum types in `TaskId`. +pub mod task_id { + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + pub enum Kind { + #[prost(uint64, tag = "1")] + Index(u64), + #[prost(message, tag = "2")] + Commit(super::Void), + #[prost(message, tag = "3")] + Cleanup(super::Void), + } +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StorageOperationResponse { + #[prost(oneof = "storage_operation_response::Result", tags = "1, 2")] + pub result: ::core::option::Option, +} +/// Nested message and enum types in `StorageOperationResponse`. 
+pub mod storage_operation_response { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Result { + #[prost(message, tag = "1")] + Ok(super::Void), + #[prost(message, tag = "2")] + Error(super::StorageError), + } +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Void {} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct StorageError { + #[prost(enumeration = "storage_error::ErrCode", tag = "1")] + pub err_code: i32, + #[prost(string, tag = "2")] + pub message: ::prost::alloc::string::String, + #[prost(uint64, tag = "3")] + pub storage_session: u64, +} +/// Nested message and enum types in `StorageError`. +pub mod storage_error { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum ErrCode { + Unspecified = 0, + StaleSession = 1, + CacheStale = 2, + Transport = 3, + Server = 4, + InvalidInput = 5, + } + impl ErrCode { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::Unspecified => "ERR_CODE_UNSPECIFIED", + Self::StaleSession => "STALE_SESSION", + Self::CacheStale => "CACHE_STALE", + Self::Transport => "TRANSPORT", + Self::Server => "SERVER", + Self::InvalidInput => "INVALID_INPUT", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "ERR_CODE_UNSPECIFIED" => Some(Self::Unspecified), + "STALE_SESSION" => Some(Self::StaleSession), + "CACHE_STALE" => Some(Self::CacheStale), + "TRANSPORT" => Some(Self::Transport), + "SERVER" => Some(Self::Server), + "INVALID_INPUT" => Some(Self::InvalidInput), + _ => None, + } + } + } +} +/// Generated client implementations. 
+pub mod task_instance_management_service_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + #[derive(Debug, Clone)] + pub struct TaskInstanceManagementServiceClient { + inner: tonic::client::Grpc, + } + impl TaskInstanceManagementServiceClient { + /// Attempt to create a new client by connecting to a given endpoint. + pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl TaskInstanceManagementServiceClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> TaskInstanceManagementServiceClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + TaskInstanceManagementServiceClient::new( + InterceptedService::new(inner, interceptor), + ) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. 
+ #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + pub async fn register_task_instance( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/storage.TaskInstanceManagementService/RegisterTaskInstance", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "storage.TaskInstanceManagementService", + "RegisterTaskInstance", + ), + ); + self.inner.unary(req, path, codec).await + } + pub async fn report_task_success( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/storage.TaskInstanceManagementService/ReportTaskSuccess", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "storage.TaskInstanceManagementService", + "ReportTaskSuccess", + ), + ); + self.inner.unary(req, path, codec).await + } + 
pub async fn report_task_failure( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/storage.TaskInstanceManagementService/ReportTaskFailure", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "storage.TaskInstanceManagementService", + "ReportTaskFailure", + ), + ); + self.inner.unary(req, path, codec).await + } + } +} +/// Generated server implementations. +pub mod task_instance_management_service_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with TaskInstanceManagementServiceServer. 
+ #[async_trait] + pub trait TaskInstanceManagementService: std::marker::Send + std::marker::Sync + 'static { + async fn register_task_instance( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + async fn report_task_success( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + async fn report_task_failure( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + } + #[derive(Debug)] + pub struct TaskInstanceManagementServiceServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl TaskInstanceManagementServiceServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. 
+ /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> + for TaskInstanceManagementServiceServer + where + T: TaskInstanceManagementService, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/storage.TaskInstanceManagementService/RegisterTaskInstance" => { + #[allow(non_camel_case_types)] + struct RegisterTaskInstanceSvc( + pub Arc, + ); + impl< + T: TaskInstanceManagementService, + > tonic::server::UnaryService + for RegisterTaskInstanceSvc { + type Response = super::RegisterTaskInstanceResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::register_task_instance( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = RegisterTaskInstanceSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + 
.apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/storage.TaskInstanceManagementService/ReportTaskSuccess" => { + #[allow(non_camel_case_types)] + struct ReportTaskSuccessSvc( + pub Arc, + ); + impl< + T: TaskInstanceManagementService, + > tonic::server::UnaryService + for ReportTaskSuccessSvc { + type Response = super::StorageOperationResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::report_task_success( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = ReportTaskSuccessSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/storage.TaskInstanceManagementService/ReportTaskFailure" => { + #[allow(non_camel_case_types)] + struct ReportTaskFailureSvc( + pub Arc, + ); + impl< + T: TaskInstanceManagementService, + > tonic::server::UnaryService + for ReportTaskFailureSvc { + type Response = super::StorageOperationResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + 
request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::report_task_failure( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = ReportTaskFailureSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new(empty_body()); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for TaskInstanceManagementServiceServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "storage.TaskInstanceManagementService"; + impl tonic::server::NamedService for TaskInstanceManagementServiceServer { + const NAME: &'static str = SERVICE_NAME; + } +} diff --git a/components/spider-proto-rust/src/id.rs 
b/components/spider-proto-rust/src/id.rs new file mode 100644 index 00000000..ef21bcd8 --- /dev/null +++ b/components/spider-proto-rust/src/id.rs @@ -0,0 +1,44 @@ +//! Helpers for converting Spider IDs to protobuf fields. + +use spider_core::types::id::TaskId; + +use crate::storage::{self, task_id}; + +impl From for storage::TaskId { + fn from(task_id: TaskId) -> Self { + let kind = match task_id { + TaskId::Index(task_index) => task_id::Kind::Index( + u64::try_from(task_index).expect("task index does not fit in u64"), + ), + TaskId::Commit => task_id::Kind::Commit(storage::Void {}), + TaskId::Cleanup => task_id::Kind::Cleanup(storage::Void {}), + }; + Self { kind: Some(kind) } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn task_id_to_protocol_u64_converts_index_task() { + let task_id = storage::TaskId::from(TaskId::Index(7)); + + assert!(matches!(task_id.kind, Some(task_id::Kind::Index(7)))); + } + + #[test] + fn task_id_to_protocol_converts_commit_task() { + let task_id = storage::TaskId::from(TaskId::Commit); + + assert!(matches!(task_id.kind, Some(task_id::Kind::Commit(_)))); + } + + #[test] + fn task_id_to_protocol_converts_cleanup_task() { + let task_id = storage::TaskId::from(TaskId::Cleanup); + + assert!(matches!(task_id.kind, Some(task_id::Kind::Cleanup(_)))); + } +} diff --git a/components/spider-proto-rust/src/lib.rs b/components/spider-proto-rust/src/lib.rs new file mode 100644 index 00000000..d78e8f0d --- /dev/null +++ b/components/spider-proto-rust/src/lib.rs @@ -0,0 +1,8 @@ +//! Rust gRPC protocol definitions generated from Spider protobuf files. 
+ +pub mod id; + +#[allow(clippy::all, clippy::nursery, clippy::pedantic)] +pub mod storage { + include!("generated/storage.rs"); +} diff --git a/components/spider-proto/storage/storage.proto b/components/spider-proto/storage/storage.proto new file mode 100644 index 00000000..3d6f5483 --- /dev/null +++ b/components/spider-proto/storage/storage.proto @@ -0,0 +1,71 @@ +syntax = "proto3"; + +package storage; + +service TaskInstanceManagementService { + rpc RegisterTaskInstance(RegisterTaskInstanceRequest) returns (RegisterTaskInstanceResponse); + rpc ReportTaskSuccess(ReportTaskSuccessRequest) returns (StorageOperationResponse); + rpc ReportTaskFailure(ReportTaskFailureRequest) returns (StorageOperationResponse); +} + +message RegisterTaskInstanceRequest { + uint64 job_id = 1; + TaskId task_id = 2; + uint64 execution_manager_id = 3; + uint64 session_id = 4; +} + +message RegisterTaskInstanceResponse { + oneof result { + bytes execution_context = 1; + StorageError error = 2; + } +} + +message ReportTaskSuccessRequest { + uint64 job_id = 1; + TaskId task_id = 2; + uint64 execution_manager_id = 3; + uint64 session_id = 4; + bytes serialized_outputs = 5; +} + +message ReportTaskFailureRequest { + uint64 job_id = 1; + TaskId task_id = 2; + uint64 execution_manager_id = 3; + uint64 session_id = 4; + string error_message = 5; +} + +message TaskId { + oneof kind { + uint64 index = 1; + Void commit = 2; + Void cleanup = 3; + } +} + +message StorageOperationResponse { + oneof result { + Void ok = 1; + StorageError error = 2; + } +} + +message Void {} + +message StorageError { + enum ErrCode { + ERR_CODE_UNSPECIFIED = 0; + STALE_SESSION = 1; + CACHE_STALE = 2; + TRANSPORT = 3; + SERVER = 4; + INVALID_INPUT = 5; + } + + ErrCode err_code = 1; + string message = 2; + uint64 storage_session = 3; +} diff --git a/taskfiles/build.yaml b/taskfiles/build.yaml index cfdf3c48..cf605838 100644 --- a/taskfiles/build.yaml +++ b/taskfiles/build.yaml @@ -1,5 +1,8 @@ version: "3" +includes: + 
toolchains: "toolchains.yaml" + tasks: cpp-target: internal: true @@ -24,6 +27,23 @@ tasks: cmds: - "uv build --directory {{.G_SRC_PYTHON_DIR}} -o {{.G_BUILD_PYTHON_DIR}}" + spider-proto-rust-codegen: + env: + SPIDER_PROTO_RUST_GENERATE_FROM_SOURCE: "ON" + dir: "{{.ROOT_DIR}}" + deps: ["toolchains:rust"] + cmd: |- + . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}" + cargo build --release --package spider-proto-rust + + rust: + dir: "{{.ROOT_DIR}}" + deps: + - "toolchains:rust" + cmd: |- + . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}" + cargo build --release --all-features + tdl-generate-parsers: vars: CHECKSUM_FILE: "{{.G_BUILD_DIR}}/{{.TASK}}.md5" diff --git a/taskfiles/lint.yaml b/taskfiles/lint.yaml index 99f27cea..19547345 100644 --- a/taskfiles/lint.yaml +++ b/taskfiles/lint.yaml @@ -314,4 +314,4 @@ tasks: deps: ["toolchains:rust"] cmd: |- . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}" - cargo +nightly clippy --all-targets --all-features {{.CARGO_CLIPPY_FLAGS}} + cargo +nightly clippy --workspace --all-targets --all-features {{.CARGO_CLIPPY_FLAGS}} diff --git a/taskfiles/test.yaml b/taskfiles/test.yaml index 7d79bfdb..a7b19749 100644 --- a/taskfiles/test.yaml +++ b/taskfiles/test.yaml @@ -33,6 +33,8 @@ tasks: STORAGE_TASK: "spider-py-unit-tests-executor" rust-unit-tests: + deps: + - ":build:rust" cmds: - task: "huntsman-mariadb-storage-task-executor" vars: @@ -234,12 +236,7 @@ tasks: - defer: "rm -rf ${SPIDER_TEST_INSTRUMENT_OUTPUT_DIR}" - |- . "{{.G_RUST_TOOLCHAIN_ENV_FILE}}" - # `--bin` is a workspace-wide target filter; combining it with cdylib packages in the - # same `cargo build` would silently exclude the `.so` artifacts. Use one invocation per - # artifact to keep the target selection unambiguous. 
- cargo build --release --package huntsman-complex - cargo build --release --package integration-test-tasks - cargo build --release --package spider-task-executor --bin spider-task-executor + cargo build --release --workspace --all-features mkdir -p "{{.G_TDL_PACKAGES_DIR}}/complex" \ "{{.G_TDL_PACKAGES_DIR}}/integration_test_tasks" cp "{{.G_RUST_RELEASE_DIR}}/libhuntsman_complex.so" \ diff --git a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh index 73b64eab..6143e75e 100755 --- a/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh +++ b/tools/scripts/lib_install/ubuntu/install-dev-huntsman.sh @@ -23,4 +23,5 @@ fi DEBIAN_FRONTEND=noninteractive ${privileged_command_prefix} \ apt-get install --no-install-recommends -y \ gcc \ - libc6-dev + libc6-dev \ + protobuf-compiler From 80152948c142ec2ab3f5912dd0eb1d896cbff1e8 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Mon, 8 Jun 2026 10:54:14 -0400 Subject: [PATCH 10/14] Add get recoverable jobs in db --- components/spider-storage/src/db.rs | 1 + components/spider-storage/src/db/error.rs | 9 +++ components/spider-storage/src/db/mariadb.rs | 61 +++++++++++++++++++- components/spider-storage/src/db/protocol.rs | 33 +++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) diff --git a/components/spider-storage/src/db.rs b/components/spider-storage/src/db.rs index c5152e2f..8ea9a5f0 100644 --- a/components/spider-storage/src/db.rs +++ b/components/spider-storage/src/db.rs @@ -9,6 +9,7 @@ pub use protocol::{ ExecutionManagerLivenessManagement, ExternalJobOrchestration, InternalJobOrchestration, + RecoverableJob, ResourceGroupManagement, SessionManagement, }; diff --git a/components/spider-storage/src/db/error.rs b/components/spider-storage/src/db/error.rs index 62b6434b..3bce5386 100644 --- a/components/spider-storage/src/db/error.rs +++ b/components/spider-storage/src/db/error.rs @@ -40,6 +40,9 @@ pub enum DbError { #[error("Task graph 
serialization failure: {0}")] TaskGraphSerializationFailure(#[source] Box), + #[error("Task graph deserialization failure: {0}")] + TaskGraphDeserializationFailure(#[source] Box), + #[error("Value serialization failure: {0}")] ValueSerializationFailure(#[source] Box), @@ -57,6 +60,12 @@ impl DbError { Self::TaskGraphSerializationFailure(Box::new(e)) } + pub fn task_graph_de( + e: DeserializationError, + ) -> Self { + Self::TaskGraphDeserializationFailure(Box::new(e)) + } + pub fn value_ser( e: SerializationError, ) -> Self { diff --git a/components/spider-storage/src/db/mariadb.rs b/components/spider-storage/src/db/mariadb.rs index 6bd7017c..4cb11320 100644 --- a/components/spider-storage/src/db/mariadb.rs +++ b/components/spider-storage/src/db/mariadb.rs @@ -5,9 +5,10 @@ use const_format::formatcp; use secrecy::ExposeSecret; use spider_core::{ job::JobState, + task::TaskGraph, types::{ id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId}, - io::TaskOutput, + io::{TaskInput, TaskOutput}, }, }; use spider_derive::MySqlEnum; @@ -22,6 +23,7 @@ use crate::{ ExecutionManagerLivenessManagement, ExternalJobOrchestration, InternalJobOrchestration, + RecoverableJob, ResourceGroupManagement, SessionManagement, error::ExpectedStates, @@ -380,6 +382,63 @@ impl InternalJobOrchestration for MariaDbStorageConnector { tx.commit().await?; Ok(deleted_job_ids) } + + async fn get_recoverable_jobs(&self) -> Result, DbError> { + const SELECT_QUERY: &str = formatcp!( + "SELECT `id`, `resource_group_id`, `state`, `serialized_task_graph`, \ + `serialized_job_inputs`, `serialized_job_outputs` FROM `{table}` WHERE `state` IN \ + ('{running_state}','{commit_ready_state}','{cleanup_ready_state}');", + table = JOBS_TABLE_NAME, + running_state = JobState::Running.as_str(), + commit_ready_state = JobState::CommitReady.as_str(), + cleanup_ready_state = JobState::CleanupReady.as_str(), + ); + + let rows = sqlx::query_as::< + _, + ( + JobId, + ResourceGroupId, + JobState, + String, + Vec, + 
Option>, + ), + >(SELECT_QUERY) + .fetch_all(&self.pool) + .await?; + + rows.into_iter() + .map( + |( + id, + resource_group_id, + state, + serialized_task_graph, + serialized_job_inputs, + serialized_job_outputs, + )| { + let task_graph = TaskGraph::from_json(&serialized_task_graph) + .map_err(DbError::task_graph_de)?; + let job_inputs: Vec = + rmp_serde::from_slice(&serialized_job_inputs).map_err(DbError::value_de)?; + let job_submission = ValidatedJobSubmission::create(task_graph, job_inputs) + .map_err(|e| DbError::CorruptedDbState(e.to_string()))?; + let job_outputs = serialized_job_outputs + .map(|outputs| rmp_serde::from_slice(&outputs).map_err(DbError::value_de)) + .transpose()?; + + Ok(RecoverableJob { + id, + resource_group_id, + state, + job_submission, + job_outputs, + }) + }, + ) + .collect() + } } #[async_trait] diff --git a/components/spider-storage/src/db/protocol.rs b/components/spider-storage/src/db/protocol.rs index 0b9e297f..2f9be3fd 100644 --- a/components/spider-storage/src/db/protocol.rs +++ b/components/spider-storage/src/db/protocol.rs @@ -11,6 +11,23 @@ use spider_core::{ use crate::{cache::job_submission::ValidatedJobSubmission, db::error::DbError}; +/// A job persisted in the database that should be rebuilt in the storage cache on startup. +/// +/// Only jobs that have already started execution are recoverable. [`JobState::Ready`] jobs remain +/// database-only until a client starts them. +pub struct RecoverableJob { + /// The persisted job ID. + pub id: JobId, + /// The owning resource group. + pub resource_group_id: ResourceGroupId, + /// The source-of-truth database state. + pub state: JobState, + /// The original job submission. + pub job_submission: ValidatedJobSubmission, + /// The committed job outputs, if the job has reached the commit phase. + pub job_outputs: Option>, +} + /// The database storage interface. 
A database storage must implement the following traits: /// /// * [`ExternalJobOrchestration`] @@ -244,6 +261,22 @@ pub trait InternalJobOrchestration: Clone + Send + Sync { &self, expire_after_sec: u64, ) -> Result, DbError>; + + /// Gets all jobs that should be recovered into the cache. + /// + /// # Returns + /// + /// All persisted jobs in [`JobState::Running`], [`JobState::CommitReady`], or + /// [`JobState::CleanupReady`] on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`DbError::TaskGraphDeserializationFailure`] if a persisted task graph is invalid. + /// * [`DbError::ValueDeserializationFailure`] if persisted inputs or outputs are invalid. + /// * Forwards [`sqlx::error::Error`] on DB operation failure. + async fn get_recoverable_jobs(&self) -> Result, DbError>; } /// Defines the storage interface for resource group management in the database. From a667a7984f24f28c2f4d6392a3e2e54ab6c942b6 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Mon, 8 Jun 2026 12:20:27 -0400 Subject: [PATCH 11/14] Add runtime recovery --- components/spider-storage/src/cache/job.rs | 95 +++++++++++++++++++ components/spider-storage/src/cache/sync.rs | 7 ++ components/spider-storage/src/cache/task.rs | 24 +++++ .../spider-storage/src/state/runtime.rs | 73 +++++++++++++- .../spider-storage/src/state/service.rs | 8 ++ .../spider-storage/src/state/test_utils.rs | 5 + 6 files changed, 208 insertions(+), 4 deletions(-) diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs index 21997b06..146b015f 100644 --- a/components/spider-storage/src/cache/job.rs +++ b/components/spider-storage/src/cache/job.rs @@ -47,6 +47,20 @@ pub struct SharedJobControlBlock< Arc>, } +/// Persistent job state used to recover a job control block. +pub struct JobRecoveryContext { + /// The persisted job ID. + pub id: JobId, + /// The owning resource group. + pub owner_id: ResourceGroupId, + /// The source-of-truth database state. 
+ pub state: JobState, + /// The original job submission. + pub job_submission: ValidatedJobSubmission, + /// The committed job outputs, if the job has reached the commit phase. + pub job_outputs: Option>, +} + impl< ReadyQueueSenderType: ReadyQueueSender, DbConnectorType: InternalJobOrchestration, @@ -93,6 +107,87 @@ impl< }) } + /// Recovers a job control block from persistent database state. + /// + /// This constructor does not mutate the database. It rebuilds enough cache state to resume + /// scheduling: + /// + /// * [`JobState::Running`] jobs enqueue their initially-ready regular tasks. + /// * [`JobState::CommitReady`] jobs enqueue the commit task. + /// * [`JobState::CleanupReady`] jobs enqueue the cleanup task. + /// + /// # Returns + /// + /// The recovered [`SharedJobControlBlock`] on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`InternalError::UnexpectedJobState`] if `state` is not recoverable. + /// * Forwards [`TaskGraph::create`]'s return values on failure. + /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure. + /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure. 
+ pub async fn recover( + recovery_context: JobRecoveryContext, + ready_queue_sender: ReadyQueueSenderType, + db_connector: DbConnectorType, + task_instance_pool_connector: TaskInstancePoolConnectorType, + ) -> Result { + let JobRecoveryContext { + id, + owner_id, + state, + job_submission, + job_outputs, + } = recovery_context; + if !matches!( + state, + JobState::Running | JobState::CommitReady | JobState::CleanupReady + ) { + return Err(UnexpectedJobState { + current: state, + expected: JobState::Running, + } + .into()); + } + + let num_tasks = job_submission.task_graph().get_num_tasks(); + let mut task_graph = TaskGraph::create(job_submission).await?; + if let Some(outputs) = job_outputs { + task_graph.restore_outputs(outputs).await?; + } + let num_incomplete_tasks = if matches!(state, JobState::CommitReady) { + 0 + } else { + num_tasks + }; + + if matches!(state, JobState::CleanupReady) { + task_graph.cancel_non_terminal().await; + } + + let job_execution_state = JobExecutionState { + state, + task_graph, + num_incomplete_tasks: AtomicUsize::new(num_incomplete_tasks), + ready_queue_sender, + db_connector, + task_instance_pool_connector, + }; + let recovered = Self { + inner: Arc::new(JobControlBlock { + id, + owner_id, + job_execution_state: JobExecutionStateHandle { + inner: tokio::sync::RwLock::new(job_execution_state), + }, + }), + }; + recovered.resend_ready_tasks().await?; + Ok(recovered) + } + /// Returns the job ID. #[must_use] pub fn id(&self) -> JobId { diff --git a/components/spider-storage/src/cache/sync.rs b/components/spider-storage/src/cache/sync.rs index 0fc03448..4d1847a9 100644 --- a/components/spider-storage/src/cache/sync.rs +++ b/components/spider-storage/src/cache/sync.rs @@ -17,6 +17,13 @@ impl Reader { Self { inner } } + /// # Returns + /// + /// A writer for the same shared data. + pub(crate) fn writer(&self) -> Writer { + Writer::new(self.inner.clone()) + } + /// # Returns /// /// A guard that allows read access to the shared data. 
The guard will be released when it goes diff --git a/components/spider-storage/src/cache/task.rs b/components/spider-storage/src/cache/task.rs index 5ce7ff30..75e1bba3 100644 --- a/components/spider-storage/src/cache/task.rs +++ b/components/spider-storage/src/cache/task.rs @@ -172,6 +172,30 @@ impl TaskGraph { &self.outputs } + /// Restores graph outputs from persisted job outputs. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`InternalError::TaskOutputsLengthMismatch`] if the number of persisted outputs does not + /// match the number of graph outputs. + pub async fn restore_outputs( + &self, + persisted_outputs: Vec, + ) -> Result<(), InternalError> { + if persisted_outputs.len() != self.outputs.len() { + return Err(InternalError::TaskOutputsLengthMismatch( + self.outputs.len(), + persisted_outputs.len(), + )); + } + for (output_reader, output) in self.outputs.iter().zip(persisted_outputs) { + *output_reader.writer().write().await = Some(output); + } + Ok(()) + } + #[must_use] pub const fn has_commit_task(&self) -> bool { self.commit_task.is_some() diff --git a/components/spider-storage/src/state/runtime.rs b/components/spider-storage/src/state/runtime.rs index 5bda0d7a..5d4e77a5 100644 --- a/components/spider-storage/src/state/runtime.rs +++ b/components/spider-storage/src/state/runtime.rs @@ -4,9 +4,12 @@ use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use crate::{ - cache::error::{CacheError, InternalError}, + cache::{ + error::{CacheError, InternalError}, + job::{JobRecoveryContext, SharedJobControlBlock}, + }, config::DatabaseConfig, - db::{DbStorage, MariaDbStorageConnector, SessionManagement}, + db::{DbStorage, MariaDbStorageConnector, RecoverableJob, SessionManagement}, ready_queue::{ReadyQueueConfig, ReadyQueueSender, ReadyQueueSenderHandle, create_ready_queue}, state::{JobCache, ServiceState, StorageServerError}, task_instance_pool::{ @@ -121,11 +124,16 @@ pub async fn create_runtime( ) 
.map_err(CacheError::from)?; - // TODO: Recover jobs from the database. + let job_cache = recover_job_cache( + &db, + ready_queue_sender.clone(), + task_instance_pool_connector.clone(), + ) + .await?; let service_state = ServiceState::new( db, session_id, - JobCache::new(), + job_cache, ready_queue_sender, ready_queue_receiver, task_instance_pool_connector, @@ -144,6 +152,63 @@ pub async fn create_runtime( const STOP_BACKGROUND_TASKS_TIMEOUT_SEC: u64 = 30; +/// Recovers jobs from persistent storage into the cache. +/// +/// # Returns +/// +/// A [`JobCache`] containing all recoverable jobs on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`DbStorage::get_recoverable_jobs`]'s return values on failure. +/// * Forwards [`SharedJobControlBlock::recover`]'s return values on failure. +/// * Forwards [`JobCache::insert`]'s return values on failure. +async fn recover_job_cache< + ReadyQueueSenderType: ReadyQueueSender, + DbConnectorType: DbStorage, + TaskInstancePoolConnectorType: TaskInstancePoolConnector, +>( + db: &DbConnectorType, + ready_queue_sender: ReadyQueueSenderType, + task_instance_pool_connector: TaskInstancePoolConnectorType, +) -> Result< + JobCache, + StorageServerError, +> { + let job_cache = JobCache::new(); + for recoverable_job in db.get_recoverable_jobs().await? { + let RecoverableJob { + id, + resource_group_id, + state, + job_submission, + job_outputs, + } = recoverable_job; + let jcb = SharedJobControlBlock::recover( + JobRecoveryContext { + id, + owner_id: resource_group_id, + state, + job_submission, + job_outputs, + }, + ready_queue_sender.clone(), + db.clone(), + task_instance_pool_connector.clone(), + ) + .await?; + job_cache.insert(jcb).await?; + tracing::info!( + job_id = ? id, + job_state = ? 
state, + "Job recovered into cache.", + ); + } + Ok(job_cache) +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/components/spider-storage/src/state/service.rs b/components/spider-storage/src/state/service.rs index ac257e77..a198fb3c 100644 --- a/components/spider-storage/src/state/service.rs +++ b/components/spider-storage/src/state/service.rs @@ -83,6 +83,14 @@ impl< } } + /// # Returns + /// + /// The storage session ID owned by this service state. + #[must_use] + pub fn session_id(&self) -> SessionId { + self.inner.session_id + } + /// Registers a job in the database and inserts its control block into the cache. /// /// # Returns diff --git a/components/spider-storage/src/state/test_utils.rs b/components/spider-storage/src/state/test_utils.rs index a2536d6c..52dcf383 100644 --- a/components/spider-storage/src/state/test_utils.rs +++ b/components/spider-storage/src/state/test_utils.rs @@ -27,6 +27,7 @@ use crate::{ ExecutionManagerLivenessManagement, ExternalJobOrchestration, InternalJobOrchestration, + RecoverableJob, ResourceGroupManagement, SessionManagement, }, @@ -166,6 +167,10 @@ impl InternalJobOrchestration for MockDbConnector { ) -> Result, DbError> { Ok(Vec::new()) } + + async fn get_recoverable_jobs(&self) -> Result, DbError> { + Ok(Vec::new()) + } } #[async_trait::async_trait] From da87450c42e31b66b089716b7b47ee4517899fee Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Mon, 8 Jun 2026 13:01:37 -0400 Subject: [PATCH 12/14] Add unit tests --- .../spider-storage/tests/mariadb_infra.rs | 24 +- .../spider-storage/tests/mariadb_test.rs | 4 +- .../spider-storage/tests/recovery_test.rs | 481 ++++++++++++++++++ .../spider-storage/tests/scheduling_infra.rs | 12 +- .../tests/test_spider_storage.rs | 1 + 5 files changed, 514 insertions(+), 8 deletions(-) create mode 100644 components/spider-storage/tests/recovery_test.rs diff --git a/components/spider-storage/tests/mariadb_infra.rs b/components/spider-storage/tests/mariadb_infra.rs index 
0772ec04..299ec1fb 100644 --- a/components/spider-storage/tests/mariadb_infra.rs +++ b/components/spider-storage/tests/mariadb_infra.rs @@ -16,6 +16,23 @@ use spider_storage::{ /// Panics if any required environment variable (`MARIADB_PORT`, `MARIADB_DATABASE`, /// `MARIADB_USERNAME`, `MARIADB_PASSWORD`) is missing or if the connection fails. pub async fn create_mariadb_connector() -> MariaDbStorageConnector { + MariaDbStorageConnector::connect(&create_mariadb_config()) + .await + .expect("connect failed") +} + +/// Creates a [`DatabaseConfig`] from environment variables. +/// +/// # Returns +/// +/// A [`DatabaseConfig`] configured from environment variables. +/// +/// # Panics +/// +/// Panics if any required environment variable (`MARIADB_PORT`, `MARIADB_DATABASE`, +/// `MARIADB_USERNAME`, `MARIADB_PASSWORD`) is missing or if `MARIADB_PORT` is invalid. +#[must_use] +pub fn create_mariadb_config() -> DatabaseConfig { let port: u16 = std::env::var("MARIADB_PORT") .expect("MARIADB_PORT") .parse() @@ -24,17 +41,14 @@ pub async fn create_mariadb_connector() -> MariaDbStorageConnector { let username = std::env::var("MARIADB_USERNAME").expect("MARIADB_USERNAME"); let password = std::env::var("MARIADB_PASSWORD").expect("MARIADB_PASSWORD"); - let config = DatabaseConfig { + DatabaseConfig { host: "localhost".to_string(), port, name: database, username, password: SecretString::from(password), max_connections: 5, - }; - MariaDbStorageConnector::connect(&config) - .await - .expect("connect failed") + } } /// Registers a new resource group with a random external ID and a fixed test password. 
diff --git a/components/spider-storage/tests/mariadb_test.rs b/components/spider-storage/tests/mariadb_test.rs index 88343c82..f58a020f 100644 --- a/components/spider-storage/tests/mariadb_test.rs +++ b/components/spider-storage/tests/mariadb_test.rs @@ -269,7 +269,7 @@ async fn test_get_error_wrong_state() { async fn test_cancel_job_with_cleanup_transitions_to_cleanup_ready() { let storage = create_mariadb_connector().await; let rg_id = create_test_resource_group(&storage).await; - let (graph, inputs) = single_task_graph(); + let (graph, inputs) = build_flat_task_graph(1, TEST_INPUT_PAYLOAD_SIZE, false, true); let job_submission = ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid"); @@ -403,7 +403,7 @@ async fn test_commit_outputs_without_commit_task() { async fn test_commit_outputs_with_commit_task() { let storage = create_mariadb_connector().await; let rg_id = create_test_resource_group(&storage).await; - let (graph, inputs) = single_task_graph(); + let (graph, inputs) = build_flat_task_graph(1, TEST_INPUT_PAYLOAD_SIZE, true, false); let job_submission = ValidatedJobSubmission::create(graph, inputs).expect("job submission should be valid"); diff --git a/components/spider-storage/tests/recovery_test.rs b/components/spider-storage/tests/recovery_test.rs new file mode 100644 index 00000000..84860644 --- /dev/null +++ b/components/spider-storage/tests/recovery_test.rs @@ -0,0 +1,481 @@ +use std::{net::IpAddr, time::Duration}; + +use spider_core::{ + job::JobState, + task::TaskIndex, + types::{ + id::{JobId, TaskInstanceId}, + io::TaskInput, + }, +}; +use spider_storage::{ + db::ExternalJobOrchestration, + ready_queue::{ReadyQueueConfig, ReadyQueueEntry}, + state::{Runtime, ServiceState, StorageServerError, create_runtime}, + task_instance_pool::TaskInstancePoolConfig, +}; +use spider_tdl::wire::{TaskInputsSerializer, TaskOutputsSerializer}; + +use crate::{ + mariadb_infra::{create_mariadb_config, create_mariadb_connector}, + 
task_graph_builder::build_flat_task_graph, +}; + +#[tokio::test] +async fn restarted_storage_cache_does_not_recover_ready_job() -> anyhow::Result<()> { + let db_config = create_mariadb_config(); + let (runtime, _) = create_runtime( + &db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let service = runtime.get_service_state(); + let job_id = create_registered_job(&service, false, false).await?; + assert_eq!(service.get_job_state(job_id).await?, JobState::Ready); + runtime.stop().await?; + + let (recovered_runtime, _) = create_runtime( + &db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let recovered_service = recovered_runtime.get_service_state(); + let start_result = recovered_service.start_job(job_id).await; + assert!( + matches!(start_result, Err(StorageServerError::JobNotFound(id)) if id == job_id), + "ready job should not be recovered into cache" + ); + assert_eq!( + recovered_service.get_job_state(job_id).await?, + JobState::Ready + ); + recovered_runtime.stop().await?; + Ok(()) +} + +#[tokio::test] +async fn restarted_storage_cache_recovers_running_job_from_start() -> anyhow::Result<()> { + let db_config = create_mariadb_config(); + let (job_id, recovered_service, recovered_runtime) = + restart_after_starting_job(&db_config, false, false).await?; + + let ready_entries = recovered_service + .poll_ready_tasks(32, Duration::from_secs(1)) + .await?; + let ready_entry = find_entry_for_job(ready_entries, job_id); + + let task_instance_id = + run_recovered_regular_task(&recovered_service, job_id, ready_entry.task_kind).await?; + let state = recovered_service + .succeed_task_instance( + recovered_service.session_id(), + job_id, + task_instance_id, + ready_entry.task_kind, + serialized_single_output()?, + ) + .await?; + assert_eq!(state, JobState::Succeeded); + + assert_eq!( + create_mariadb_connector().await.get_state(job_id).await?, + JobState::Succeeded + ); + 
recovered_runtime.stop().await?; + Ok(()) +} + +#[tokio::test] +async fn restarted_storage_cache_recovers_commit_ready_job() -> anyhow::Result<()> { + let db_config = create_mariadb_config(); + let (job_id, recovered_service, recovered_runtime) = + restart_after_commit_ready(&db_config).await?; + + let ready_entries = recovered_service + .poll_commit_ready_tasks(32, Duration::from_secs(1)) + .await?; + let _ready_entry = find_entry_for_job(ready_entries, job_id); + + let execution_manager_id = recovered_service + .register_execution_manager(IpAddr::from([127, 0, 0, 1])) + .await?; + let execution_context = recovered_service + .create_task_instance( + recovered_service.session_id(), + job_id, + spider_core::types::id::TaskId::Commit, + execution_manager_id, + ) + .await?; + let state = recovered_service + .succeed_commit_task_instance( + recovered_service.session_id(), + job_id, + execution_context.task_instance_id, + ) + .await?; + assert_eq!(state, JobState::Succeeded); + let expected_outputs = TaskOutputsSerializer::deserialize(&serialized_single_output()?)?; + assert_eq!( + recovered_service.get_job_outputs(job_id).await?, + expected_outputs + ); + + assert_eq!( + create_mariadb_connector().await.get_state(job_id).await?, + JobState::Succeeded + ); + recovered_runtime.stop().await?; + Ok(()) +} + +#[tokio::test] +async fn restarted_storage_cache_recovers_cleanup_ready_job() -> anyhow::Result<()> { + let db_config = create_mariadb_config(); + let (job_id, recovered_service, recovered_runtime) = + restart_after_cleanup_ready(&db_config).await?; + + let ready_entries = recovered_service + .poll_cleanup_ready_tasks(32, Duration::from_secs(1)) + .await?; + let _ready_entry = find_entry_for_job(ready_entries, job_id); + + let execution_manager_id = recovered_service + .register_execution_manager(IpAddr::from([127, 0, 0, 1])) + .await?; + let execution_context = recovered_service + .create_task_instance( + recovered_service.session_id(), + job_id, + 
spider_core::types::id::TaskId::Cleanup, + execution_manager_id, + ) + .await?; + let state = recovered_service + .succeed_cleanup_task_instance( + recovered_service.session_id(), + job_id, + execution_context.task_instance_id, + ) + .await?; + assert_eq!(state, JobState::Cancelled); + + assert_eq!( + create_mariadb_connector().await.get_state(job_id).await?, + JobState::Cancelled + ); + recovered_runtime.stop().await?; + Ok(()) +} + +/// Starts a job, stops the runtime, and creates a replacement runtime over the same database. +/// +/// # Returns +/// +/// The job ID, recovered service state, and recovered runtime on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`create_runtime`]'s return values on failure. +/// * Forwards [`create_and_start_job`]'s return values on failure. +/// * Forwards [`Runtime::stop`]'s return values on failure. +async fn restart_after_starting_job( + db_config: &spider_storage::DatabaseConfig, + with_commit: bool, + with_cleanup: bool, +) -> anyhow::Result<( + JobId, + ServiceState< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, + Runtime< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, +)> { + let (runtime, _) = create_runtime( + db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let service = runtime.get_service_state(); + let job_id = create_and_start_job(&service, with_commit, with_cleanup).await?; + runtime.stop().await?; + + let (recovered_runtime, _) = create_runtime( + db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let recovered_service = recovered_runtime.get_service_state(); + Ok((job_id, recovered_service, recovered_runtime)) +} + +/// Drives a job to 
[`JobState::CommitReady`], stops the runtime, and creates a replacement runtime. +/// +/// # Returns +/// +/// The job ID, recovered service state, and recovered runtime on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`restart_after_starting_job`]'s return values on failure. +/// * Forwards [`ServiceState::poll_ready_tasks`]'s return values on failure. +/// * Forwards [`run_recovered_regular_task`]'s return values on failure. +/// * Forwards [`serialized_single_output`]'s return values on failure. +/// * Forwards [`ServiceState::succeed_task_instance`]'s return values on failure. +/// * Forwards [`Runtime::stop`]'s return values on failure. +/// * Forwards [`create_runtime`]'s return values on failure. +async fn restart_after_commit_ready( + db_config: &spider_storage::DatabaseConfig, +) -> anyhow::Result<( + JobId, + ServiceState< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, + Runtime< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, +)> { + let (job_id, service, runtime) = restart_after_starting_job(db_config, true, false).await?; + let ready_entries = service.poll_ready_tasks(32, Duration::from_secs(1)).await?; + let ready_entry = find_entry_for_job(ready_entries, job_id); + let task_instance_id = + run_recovered_regular_task(&service, job_id, ready_entry.task_kind).await?; + let state = service + .succeed_task_instance( + service.session_id(), + job_id, + task_instance_id, + 0, + serialized_single_output()?, + ) + .await?; + assert_eq!(state, JobState::CommitReady); + runtime.stop().await?; + + let (recovered_runtime, _) = create_runtime( + db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let recovered_service = 
recovered_runtime.get_service_state(); + Ok((job_id, recovered_service, recovered_runtime)) +} + +/// Drives a job to [`JobState::CleanupReady`], stops the runtime, and creates a replacement +/// runtime. +/// +/// # Returns +/// +/// The job ID, recovered service state, and recovered runtime on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`create_runtime`]'s return values on failure. +/// * Forwards [`create_and_start_job`]'s return values on failure. +/// * Forwards [`ServiceState::cancel_job`]'s return values on failure. +/// * Forwards [`Runtime::stop`]'s return values on failure. +async fn restart_after_cleanup_ready( + db_config: &spider_storage::DatabaseConfig, +) -> anyhow::Result<( + JobId, + ServiceState< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, + Runtime< + spider_storage::ready_queue::ReadyQueueSenderHandle, + spider_storage::db::MariaDbStorageConnector, + spider_storage::task_instance_pool::TaskInstancePoolHandle, + >, +)> { + let (runtime, _) = create_runtime( + db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let service = runtime.get_service_state(); + let job_id = create_and_start_job(&service, false, true).await?; + let state = service.cancel_job(job_id).await?; + assert_eq!(state, JobState::CleanupReady); + runtime.stop().await?; + + let (recovered_runtime, _) = create_runtime( + db_config, + &ReadyQueueConfig::default(), + &TaskInstancePoolConfig::default(), + ) + .await?; + let recovered_service = recovered_runtime.get_service_state(); + Ok((job_id, recovered_service, recovered_runtime)) +} + +/// Registers and starts a flat recovery-test job. +/// +/// # Returns +/// +/// The registered job ID on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`create_registered_job`]'s return values on failure. 
+/// * Forwards [`ServiceState::start_job`]'s return values on failure. +async fn create_and_start_job< + ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender, + DbConnectorType: spider_storage::db::DbStorage, + TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector, +>( + service: &ServiceState, + with_commit: bool, + with_cleanup: bool, +) -> anyhow::Result { + let job_id = create_registered_job(service, with_commit, with_cleanup).await?; + service.start_job(job_id).await?; + Ok(job_id) +} + +/// Registers a flat recovery-test job without starting it. +/// +/// # Returns +/// +/// The registered job ID on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`ServiceState::add_resource_group`]'s return values on failure. +/// * Forwards [`spider_core::task::TaskGraph::to_json`]'s return values on failure. +/// * Forwards [`serialize_inputs`]'s return values on failure. +/// * Forwards [`ServiceState::register_job`]'s return values on failure. +async fn create_registered_job< + ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender, + DbConnectorType: spider_storage::db::DbStorage, + TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector, +>( + service: &ServiceState, + with_commit: bool, + with_cleanup: bool, +) -> anyhow::Result { + let rg_id = service + .add_resource_group( + format!("recovery-test-{}", rand::random::()), + b"test-password".to_vec(), + ) + .await?; + let (task_graph, inputs) = build_flat_task_graph(1, 4, with_commit, with_cleanup); + Ok(service + .register_job(rg_id, task_graph.to_json()?, serialize_inputs(inputs)?) + .await?) +} + +/// Registers an execution manager and creates an instance for a recovered regular task. +/// +/// # Returns +/// +/// The created task instance ID on success. 
+/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`ServiceState::register_execution_manager`]'s return values on failure. +/// * Forwards [`ServiceState::create_task_instance`]'s return values on failure. +async fn run_recovered_regular_task< + ReadyQueueSenderType: spider_storage::ready_queue::ReadyQueueSender, + DbConnectorType: spider_storage::db::DbStorage, + TaskInstancePoolConnectorType: spider_storage::task_instance_pool::TaskInstancePoolConnector, +>( + service: &ServiceState, + job_id: JobId, + task_index: TaskIndex, +) -> anyhow::Result { + let execution_manager_id = service + .register_execution_manager(IpAddr::from([127, 0, 0, 1])) + .await?; + let execution_context = service + .create_task_instance( + service.session_id(), + job_id, + spider_core::types::id::TaskId::Index(task_index), + execution_manager_id, + ) + .await?; + Ok(execution_context.task_instance_id) +} + +/// Finds the ready-queue entry for a job. +/// +/// # Returns +/// +/// The matching ready-queue entry. +/// +/// # Panics +/// +/// Panics if no matching entry exists. +fn find_entry_for_job( + entries: Vec>, + job_id: JobId, +) -> ReadyQueueEntry { + entries + .into_iter() + .find(|entry| entry.job_id == job_id) + .expect("recovered job should be enqueued") +} + +/// Serializes task inputs into the storage service wire format. +/// +/// # Returns +/// +/// The serialized task inputs on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`TaskInputsSerializer::append`]'s return values on failure. +fn serialize_inputs(inputs: Vec) -> anyhow::Result> { + let mut serializer = TaskInputsSerializer::new(); + for input in inputs { + serializer.append(input)?; + } + Ok(serializer.release()) +} + +/// Serializes the single output payload used by recovery tests. +/// +/// # Returns +/// +/// The serialized task output on success. 
+/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`TaskOutputsSerializer::from_tuple`]'s return values on failure. +fn serialized_single_output() -> anyhow::Result> { + Ok(TaskOutputsSerializer::from_tuple(&(vec![1u8; 4],))?) +} diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs index a089d66f..d4fa4878 100644 --- a/components/spider-storage/tests/scheduling_infra.rs +++ b/components/spider-storage/tests/scheduling_infra.rs @@ -98,7 +98,13 @@ use spider_storage::{ job_submission::ValidatedJobSubmission, task::{SharedTaskControlBlock, SharedTerminationTaskControlBlock}, }, - db::{DbError, ExternalJobOrchestration, InternalJobOrchestration, MariaDbStorageConnector}, + db::{ + DbError, + ExternalJobOrchestration, + InternalJobOrchestration, + MariaDbStorageConnector, + RecoverableJob, + }, ready_queue::ReadyQueueSender, task_instance_pool::{TaskInstanceMetadata, TaskInstancePoolConnector}, }; @@ -176,6 +182,10 @@ impl InternalJobOrchestration for NoopDbConnector { ) -> Result, DbError> { Ok(Vec::new()) } + + async fn get_recoverable_jobs(&self) -> Result, DbError> { + Ok(Vec::new()) + } } /// The result of running a workload to completion. 
diff --git a/components/spider-storage/tests/test_spider_storage.rs b/components/spider-storage/tests/test_spider_storage.rs index 78520dd4..6e69cc13 100644 --- a/components/spider-storage/tests/test_spider_storage.rs +++ b/components/spider-storage/tests/test_spider_storage.rs @@ -4,3 +4,4 @@ mod task_graph_builder; mod jcb_test; mod mariadb_test; +mod recovery_test; From e3db65f0a39bf7f27bcd7e379747f4ebab612089 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Mon, 8 Jun 2026 15:55:29 -0400 Subject: [PATCH 13/14] Address comment --- components/spider-storage/src/cache/job.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs index 146b015f..15c8ed5a 100644 --- a/components/spider-storage/src/cache/job.rs +++ b/components/spider-storage/src/cache/job.rs @@ -125,6 +125,7 @@ impl< /// Returns an error if: /// /// * [`InternalError::UnexpectedJobState`] if `state` is not recoverable. + /// * [`InternalError::TaskGraphCorrupted`] if a commit-ready job has no persisted outputs. /// * Forwards [`TaskGraph::create`]'s return values on failure. /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure. /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure. 
@@ -154,6 +155,12 @@ impl< let num_tasks = job_submission.task_graph().get_num_tasks(); let mut task_graph = TaskGraph::create(job_submission).await?; + if matches!(state, JobState::CommitReady) && job_outputs.is_none() { + return Err(InternalError::TaskGraphCorrupted( + "commit-ready job has no persisted outputs".to_owned(), + ) + .into()); + } if let Some(outputs) = job_outputs { task_graph.restore_outputs(outputs).await?; } From 022223f8a930d02b46695e41aef484d881f74726 Mon Sep 17 00:00:00 2001 From: Sitao Wang Date: Tue, 9 Jun 2026 16:07:20 -0400 Subject: [PATCH 14/14] Use RecoverableJob --- components/spider-storage/src/cache/job.rs | 26 +++++-------------- .../spider-storage/src/state/runtime.rs | 21 ++++----------- 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs index 15c8ed5a..a6e3bf98 100644 --- a/components/spider-storage/src/cache/job.rs +++ b/components/spider-storage/src/cache/job.rs @@ -22,7 +22,7 @@ use crate::{ job_submission::ValidatedJobSubmission, task::TaskGraph, }, - db::InternalJobOrchestration, + db::{InternalJobOrchestration, RecoverableJob}, ready_queue::ReadyQueueSender, task_instance_pool::{TaskInstanceMetadata, TaskInstancePoolConnector}, }; @@ -47,20 +47,6 @@ pub struct SharedJobControlBlock< Arc>, } -/// Persistent job state used to recover a job control block. -pub struct JobRecoveryContext { - /// The persisted job ID. - pub id: JobId, - /// The owning resource group. - pub owner_id: ResourceGroupId, - /// The source-of-truth database state. - pub state: JobState, - /// The original job submission. - pub job_submission: ValidatedJobSubmission, - /// The committed job outputs, if the job has reached the commit phase. 
- pub job_outputs: Option>, -} - impl< ReadyQueueSenderType: ReadyQueueSender, DbConnectorType: InternalJobOrchestration, @@ -130,18 +116,18 @@ impl< /// * Forwards [`TaskGraph::restore_outputs`]'s return values on failure. /// * Forwards [`SharedJobControlBlock::resend_ready_tasks`]'s return values on failure. pub async fn recover( - recovery_context: JobRecoveryContext, + recoverable_job: RecoverableJob, ready_queue_sender: ReadyQueueSenderType, db_connector: DbConnectorType, task_instance_pool_connector: TaskInstancePoolConnectorType, ) -> Result { - let JobRecoveryContext { + let RecoverableJob { id, - owner_id, + resource_group_id, state, job_submission, job_outputs, - } = recovery_context; + } = recoverable_job; if !matches!( state, JobState::Running | JobState::CommitReady | JobState::CleanupReady @@ -185,7 +171,7 @@ impl< let recovered = Self { inner: Arc::new(JobControlBlock { id, - owner_id, + owner_id: resource_group_id, job_execution_state: JobExecutionStateHandle { inner: tokio::sync::RwLock::new(job_execution_state), }, diff --git a/components/spider-storage/src/state/runtime.rs b/components/spider-storage/src/state/runtime.rs index 5d4e77a5..9350bce1 100644 --- a/components/spider-storage/src/state/runtime.rs +++ b/components/spider-storage/src/state/runtime.rs @@ -6,10 +6,10 @@ use tokio_util::sync::CancellationToken; use crate::{ cache::{ error::{CacheError, InternalError}, - job::{JobRecoveryContext, SharedJobControlBlock}, + job::SharedJobControlBlock, }, config::DatabaseConfig, - db::{DbStorage, MariaDbStorageConnector, RecoverableJob, SessionManagement}, + db::{DbStorage, MariaDbStorageConnector, SessionManagement}, ready_queue::{ReadyQueueConfig, ReadyQueueSender, ReadyQueueSenderHandle, create_ready_queue}, state::{JobCache, ServiceState, StorageServerError}, task_instance_pool::{ @@ -179,21 +179,10 @@ async fn recover_job_cache< > { let job_cache = JobCache::new(); for recoverable_job in db.get_recoverable_jobs().await? 
{ - let RecoverableJob { - id, - resource_group_id, - state, - job_submission, - job_outputs, - } = recoverable_job; + let id = recoverable_job.id; + let state = recoverable_job.state; let jcb = SharedJobControlBlock::recover( - JobRecoveryContext { - id, - owner_id: resource_group_id, - state, - job_submission, - job_outputs, - }, + recoverable_job, ready_queue_sender.clone(), db.clone(), task_instance_pool_connector.clone(),