From ac0dd7743c460b8d23ad0462fe1e03d78910ceda Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 28 May 2026 22:35:41 -0400 Subject: [PATCH 01/14] WIP --- Cargo.lock | 10 ++++ Cargo.toml | 1 + components/spider-scheduler/Cargo.toml | 14 +++++ components/spider-scheduler/src/core.rs | 56 ++++++++++++++++++ components/spider-scheduler/src/dispatch.rs | 28 +++++++++ components/spider-scheduler/src/error.rs | 25 ++++++++ components/spider-scheduler/src/lib.rs | 21 +++++++ .../spider-scheduler/src/storage_client.rs | 58 +++++++++++++++++++ components/spider-scheduler/src/types.rs | 35 +++++++++++ 9 files changed, 248 insertions(+) create mode 100644 components/spider-scheduler/Cargo.toml create mode 100644 components/spider-scheduler/src/core.rs create mode 100644 components/spider-scheduler/src/dispatch.rs create mode 100644 components/spider-scheduler/src/error.rs create mode 100644 components/spider-scheduler/src/lib.rs create mode 100644 components/spider-scheduler/src/storage_client.rs create mode 100644 components/spider-scheduler/src/types.rs diff --git a/Cargo.lock b/Cargo.lock index 2888d5e8..0262e8d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1499,6 +1499,16 @@ dependencies = [ "tracing", ] +[[package]] +name = "spider-scheduler" +version = "0.1.0" +dependencies = [ + "async-trait", + "spider-core", + "thiserror", + "tokio-util", +] + [[package]] name = "spider-storage" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index ea9992cf..4d8d20e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "components/spider-core", "components/spider-derive", "components/spider-execution-manager", + "components/spider-scheduler", "components/spider-storage", "components/spider-task-executor", "components/spider-tdl", diff --git a/components/spider-scheduler/Cargo.toml b/components/spider-scheduler/Cargo.toml new file mode 100644 index 00000000..59e9b8f7 --- /dev/null +++ b/components/spider-scheduler/Cargo.toml @@ -0,0 +1,14 @@ +[package] 
+name = "spider-scheduler" +version = "0.1.0" +edition = "2024" + +[lib] +name = "spider_scheduler" +path = "src/lib.rs" + +[dependencies] +async-trait = "0.1.89" +spider-core = { path = "../spider-core" } +thiserror = "2.0.18" +tokio-util = "0.7.18" diff --git a/components/spider-scheduler/src/core.rs b/components/spider-scheduler/src/core.rs new file mode 100644 index 00000000..95392f49 --- /dev/null +++ b/components/spider-scheduler/src/core.rs @@ -0,0 +1,56 @@ +use std::sync::Arc; + +use async_trait::async_trait; + +use crate::{ + dispatch::DispatchSink, + error::SchedulerError, + storage_client::SchedulerStorageClient, +}; + +/// A cancellation handle used to signal a running [`SchedulerCore`] to stop. +/// +/// Cancelling the token causes [`SchedulerCore::run`] to break out of its scheduling loop and +/// return. +pub type ShutdownToken = tokio_util::sync::CancellationToken; + +/// A pluggable scheduling algorithm. +/// +/// A core owns its decision loop: it polls the inbound queue through a [`SchedulerStorageClient`], +/// applies its algorithm (reading storage as needed for placement), and writes assignments to a +/// [`DispatchSink`]. Modeling the algorithm as a trait lets different scheduling strategies share +/// the same runtime entry point. +#[async_trait] +pub trait SchedulerCore: Send { + /// The storage client the core polls and reads for placement decisions. + type Storage: SchedulerStorageClient; + + /// The dispatch sink the core writes assignments to. + type Sink: DispatchSink; + + /// Runs the scheduling loop until `shutdown` is triggered. + /// + /// The core polls the inbound queue through `storage`, applies its scheduling algorithm, and + /// writes assignments to `sink`, repeating until `shutdown` is cancelled, at which point it + /// returns. + /// + /// # Parameters + /// + /// * `storage` - The storage client used to poll the inbound queue and read state for + /// placement. 
+ /// * `sink` - The dispatch sink that assignments are written to. + /// * `shutdown` - The token that, once cancelled, signals the loop to stop and return. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError`] if the scheduling loop fails irrecoverably, e.g. the storage client or + /// dispatch sink fails. + async fn run( + &mut self, + storage: Arc, + sink: Arc, + shutdown: ShutdownToken, + ) -> Result<(), SchedulerError>; +} diff --git a/components/spider-scheduler/src/dispatch.rs b/components/spider-scheduler/src/dispatch.rs new file mode 100644 index 00000000..c432d73a --- /dev/null +++ b/components/spider-scheduler/src/dispatch.rs @@ -0,0 +1,28 @@ +use async_trait::async_trait; + +use crate::{error::SchedulerError, types::TaskAssignment}; + +/// The write side of the dispatching queue used by the scheduler core. +/// +/// Modeled as a trait so the scheduler core can be unit-tested against a recording sink without +/// standing up the execution-manager-facing service. The production implementation is backed by a +/// bounded single-producer/multi-consumer queue. +#[async_trait] +pub trait DispatchSink: Send + Sync { + /// Enqueues a task assignment for execution managers to consume. + /// + /// Implementations backed by a bounded queue await while the queue is full, applying + /// back-pressure to the scheduler core. + /// + /// # Parameters + /// + /// * `assignment` - The task assignment to enqueue. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::DispatchClosed`] if the dispatching queue is closed and can no longer + /// accept assignments. 
+ async fn dispatch(&self, assignment: TaskAssignment) -> Result<(), SchedulerError>; +} diff --git a/components/spider-scheduler/src/error.rs b/components/spider-scheduler/src/error.rs new file mode 100644 index 00000000..8e7983d5 --- /dev/null +++ b/components/spider-scheduler/src/error.rs @@ -0,0 +1,25 @@ +use spider_core::types::id::JobId; + +/// Errors returned by [`crate::storage_client::SchedulerStorageClient`] operations. +#[derive(Debug, thiserror::Error)] +pub enum StorageClientError { + /// The inbound queue is closed and can no longer yield ready entries. + #[error("inbound queue is closed")] + InboundClosed, + + /// No job with the requested identifier exists. + #[error("job not found: {0:?}")] + JobNotFound(JobId), +} + +/// Errors returned by the scheduler runtime and its components. +#[derive(Debug, thiserror::Error)] +pub enum SchedulerError { + /// Forwarded from the storage client. + #[error(transparent)] + Storage(#[from] StorageClientError), + + /// The dispatching queue is closed and can no longer accept assignments. + #[error("dispatching queue is closed")] + DispatchClosed, +} diff --git a/components/spider-scheduler/src/lib.rs b/components/spider-scheduler/src/lib.rs new file mode 100644 index 00000000..411b26f0 --- /dev/null +++ b/components/spider-scheduler/src/lib.rs @@ -0,0 +1,21 @@ +//! Scheduler skeleton for the Spider task-execution framework. +//! +//! This crate defines the core type and trait abstractions of the scheduler: the data types +//! exchanged with storage and execution managers ([`InboundEntry`], [`TaskAssignment`]), the +//! storage and dispatch seams ([`SchedulerStorageClient`], [`DispatchSink`]), and the pluggable +//! scheduling algorithm ([`SchedulerCore`]). Concrete implementations (the dispatch queue, the +//! runtime, and scheduling algorithms) build on top of these abstractions. 
+ +pub mod core; +pub mod dispatch; +pub mod error; +pub mod storage_client; +pub mod types; + +pub use crate::{ + core::{SchedulerCore, ShutdownToken}, + dispatch::DispatchSink, + error::{SchedulerError, StorageClientError}, + storage_client::SchedulerStorageClient, + types::{InboundEntry, TaskAssignment}, +}; diff --git a/components/spider-scheduler/src/storage_client.rs b/components/spider-scheduler/src/storage_client.rs new file mode 100644 index 00000000..5b847c63 --- /dev/null +++ b/components/spider-scheduler/src/storage_client.rs @@ -0,0 +1,58 @@ +use std::time::Duration; + +use async_trait::async_trait; +use spider_core::{job::JobState, types::id::JobId}; + +use crate::{error::StorageClientError, types::InboundEntry}; + +/// The scheduler's view of the storage layer. +/// +/// Abstracts the storage-owned inbound queue and the read-only queries a scheduling algorithm +/// needs to make placement decisions. Modeled as a trait so the scheduler runtime can be driven by +/// a real storage client in production or a mock in tests. +#[async_trait] +pub trait SchedulerStorageClient: Send + Sync { + /// Polls the storage-owned inbound (ready) queue for newly-ready tasks. + /// + /// Drains up to `max_items` ready entries across all storage lanes (regular, commit, and + /// cleanup tasks), blocking for at most `wait`. Returns an empty vector if no entry becomes + /// ready within `wait`. + /// + /// # Parameters + /// + /// * `max_items` - The maximum number of entries to return from a single poll. + /// * `wait` - The maximum duration to block waiting for ready entries. + /// + /// # Returns + /// + /// The ready entries drained from the inbound queue on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageClientError::InboundClosed`] if the inbound queue is closed and can no longer + /// yield entries. 
+ async fn poll_ready( + &self, + max_items: usize, + wait: Duration, + ) -> Result, StorageClientError>; + + /// Reads the current state of a job. + /// + /// # Parameters + /// + /// * `job_id` - The identifier of the job to query. + /// + /// # Returns + /// + /// The job's current [`JobState`] on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageClientError::JobNotFound`] if no job with the given identifier exists. + async fn job_state(&self, job_id: JobId) -> Result; +} diff --git a/components/spider-scheduler/src/types.rs b/components/spider-scheduler/src/types.rs new file mode 100644 index 00000000..448a398b --- /dev/null +++ b/components/spider-scheduler/src/types.rs @@ -0,0 +1,35 @@ +use spider_core::types::id::{JobId, ResourceGroupId, TaskId}; + +/// A ready task drained from the storage-owned inbound queue. +/// +/// The storage client flattens storage's three ready lanes (regular, commit, and cleanup tasks) +/// into this uniform entry, resolving each to its [`TaskId`] so the scheduler core can treat every +/// ready task identically. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct InboundEntry { + /// The resource group that owns the job. + pub resource_group_id: ResourceGroupId, + + /// The job the task belongs to. + pub job_id: JobId, + + /// The ready task. + pub task_id: TaskId, +} + +/// A task placement decision written by the scheduler core to the dispatching queue. +/// +/// Assignments are intentionally lightweight: they identify the task but carry no inputs. The +/// consuming execution manager registers the task instance against storage on pull to obtain the +/// execution context (inputs, timeouts, and the TDL context). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TaskAssignment { + /// The resource group that owns the job. + pub resource_group_id: ResourceGroupId, + + /// The job the task belongs to. + pub job_id: JobId, + + /// The task to dispatch. 
+ pub task_id: TaskId, +} From dced92eace056244a39c7dad83deefe263e8d392 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Fri, 29 May 2026 18:43:45 -0400 Subject: [PATCH 02/14] Done. --- components/spider-scheduler/src/core.rs | 45 +++++------ components/spider-scheduler/src/dispatch.rs | 28 ------- .../spider-scheduler/src/dispatch_queue.rs | 59 ++++++++++++++ components/spider-scheduler/src/error.rs | 10 ++- components/spider-scheduler/src/lib.rs | 43 +++++++--- .../spider-scheduler/src/storage_client.rs | 78 ++++++++++++++++--- components/spider-scheduler/src/types.rs | 6 +- 7 files changed, 188 insertions(+), 81 deletions(-) delete mode 100644 components/spider-scheduler/src/dispatch.rs create mode 100644 components/spider-scheduler/src/dispatch_queue.rs diff --git a/components/spider-scheduler/src/core.rs b/components/spider-scheduler/src/core.rs index 95392f49..c6bb661c 100644 --- a/components/spider-scheduler/src/core.rs +++ b/components/spider-scheduler/src/core.rs @@ -1,56 +1,47 @@ -use std::sync::Arc; +//! The abstract core of a Spider scheduler. use async_trait::async_trait; use crate::{ - dispatch::DispatchSink, + dispatch_queue::DispatchQueueSink, error::SchedulerError, storage_client::SchedulerStorageClient, }; -/// A cancellation handle used to signal a running [`SchedulerCore`] to stop. -/// -/// Cancelling the token causes [`SchedulerCore::run`] to break out of its scheduling loop and -/// return. -pub type ShutdownToken = tokio_util::sync::CancellationToken; - -/// A pluggable scheduling algorithm. +/// An abstracted core for a scheduling algorithm. /// /// A core owns its decision loop: it polls the inbound queue through a [`SchedulerStorageClient`], /// applies its algorithm (reading storage as needed for placement), and writes assignments to a -/// [`DispatchSink`]. Modeling the algorithm as a trait lets different scheduling strategies share -/// the same runtime entry point. +/// [`DispatchQueueSink`]. 
Modeling the algorithm as a trait lets different scheduling strategies +/// share the same runtime entry point. #[async_trait] pub trait SchedulerCore: Send { - /// The storage client the core polls and reads for placement decisions. - type Storage: SchedulerStorageClient; + /// The storage client used by the core to poll and read for placement decisions. + type StorageClient: SchedulerStorageClient; /// The dispatch sink the core writes assignments to. - type Sink: DispatchSink; + type Sink: DispatchQueueSink; - /// Runs the scheduling loop until `shutdown` is triggered. + /// Runs the scheduling loop until `cancellation_token` is triggered. /// - /// The core polls the inbound queue through `storage`, applies its scheduling algorithm, and - /// writes assignments to `sink`, repeating until `shutdown` is cancelled, at which point it - /// returns. + /// The core polls the inbound queue through `storage_client`, applies its scheduling algorithm, + /// and writes assignments to `sink`, repeating until `cancellation_token` is fired, at which + /// point it returns. /// /// # Parameters /// - /// * `storage` - The storage client used to poll the inbound queue and read state for + /// * `storage_client` - The storage client used to poll the inbound queue and read state for /// placement. /// * `sink` - The dispatch sink that assignments are written to. - /// * `shutdown` - The token that, once cancelled, signals the loop to stop and return. + /// * `cancellation_token` - The token to signal the scheduling loop to stop. /// /// # Errors /// - /// Returns an error if: - /// - /// * [`SchedulerError`] if the scheduling loop fails irrecoverably, e.g. the storage client or - /// dispatch sink fails. + /// Returns a [`SchedulerError`] instance indicating an irrecoverable error. 
async fn run( &mut self, - storage: Arc, - sink: Arc, - shutdown: ShutdownToken, + storage_client: Self::StorageClient, + sink: Self::Sink, + cancellation_token: tokio_util::sync::CancellationToken, ) -> Result<(), SchedulerError>; } diff --git a/components/spider-scheduler/src/dispatch.rs b/components/spider-scheduler/src/dispatch.rs deleted file mode 100644 index c432d73a..00000000 --- a/components/spider-scheduler/src/dispatch.rs +++ /dev/null @@ -1,28 +0,0 @@ -use async_trait::async_trait; - -use crate::{error::SchedulerError, types::TaskAssignment}; - -/// The write side of the dispatching queue used by the scheduler core. -/// -/// Modeled as a trait so the scheduler core can be unit-tested against a recording sink without -/// standing up the execution-manager-facing service. The production implementation is backed by a -/// bounded single-producer/multi-consumer queue. -#[async_trait] -pub trait DispatchSink: Send + Sync { - /// Enqueues a task assignment for execution managers to consume. - /// - /// Implementations backed by a bounded queue await while the queue is full, applying - /// back-pressure to the scheduler core. - /// - /// # Parameters - /// - /// * `assignment` - The task assignment to enqueue. - /// - /// # Errors - /// - /// Returns an error if: - /// - /// * [`SchedulerError::DispatchClosed`] if the dispatching queue is closed and can no longer - /// accept assignments. - async fn dispatch(&self, assignment: TaskAssignment) -> Result<(), SchedulerError>; -} diff --git a/components/spider-scheduler/src/dispatch_queue.rs b/components/spider-scheduler/src/dispatch_queue.rs new file mode 100644 index 00000000..7ef57fdc --- /dev/null +++ b/components/spider-scheduler/src/dispatch_queue.rs @@ -0,0 +1,59 @@ +//! The dispatching queue that decouples the scheduler core's placement decisions from the +//! execution-manager-facing service. 
+ +use async_trait::async_trait; +use spider_core::types::id::SessionId; + +use crate::{error::SchedulerError, types::TaskAssignment}; + +/// The writer side of the dispatching queue used by the scheduler core. +#[async_trait] +pub trait DispatchQueueSink: Send + Sync + Clone { + /// Enqueues a task assignment for execution managers to consume. + /// + /// # Parameters + /// + /// * `assignment` - The task assignment to enqueue. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. + async fn enqueue(&self, assignment: TaskAssignment) -> Result<(), SchedulerError>; + + /// Bumps the session ID and invalidates all queued task assignments. + /// + /// # Parameters + /// + /// * `new_session_id` - The new session ID. Must be greater than the current session ID. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. + /// * [`SchedulerError::InvalidSessionId`] if the new session ID is not greater than the current + /// session ID. + async fn bump_session_id(&self, new_session_id: SessionId) -> Result<(), SchedulerError>; +} + +/// The reader side of the dispatching queue, drained by the execution-manager-facing service. +#[async_trait] +pub trait DispatchQueueSource: Send + Sync + Clone { + /// Dequeues the next task assignment for an execution manager to execute. + /// + /// # Returns + /// + /// A tuple on success, containing: + /// + /// * The storage session associated with the assignment. + /// * The next task assignment ready to execute. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. 
+ async fn dequeue(&self) -> Result<(SessionId, TaskAssignment), SchedulerError>; +} diff --git a/components/spider-scheduler/src/error.rs b/components/spider-scheduler/src/error.rs index 8e7983d5..6a852c46 100644 --- a/components/spider-scheduler/src/error.rs +++ b/components/spider-scheduler/src/error.rs @@ -1,4 +1,6 @@ -use spider_core::types::id::JobId; +//! The error types used in this crate. + +use spider_core::types::id::{JobId, SessionId}; /// Errors returned by [`crate::storage_client::SchedulerStorageClient`] operations. #[derive(Debug, thiserror::Error)] @@ -21,5 +23,9 @@ pub enum SchedulerError { /// The dispatching queue is closed and can no longer accept assignments. #[error("dispatching queue is closed")] - DispatchClosed, + DispatchQueueClosed, + + /// The session ID is invalid. + #[error("invalid session ID: {0:?}")] + InvalidSessionId(SessionId), } diff --git a/components/spider-scheduler/src/lib.rs b/components/spider-scheduler/src/lib.rs index 411b26f0..bddd0750 100644 --- a/components/spider-scheduler/src/lib.rs +++ b/components/spider-scheduler/src/lib.rs @@ -1,20 +1,45 @@ -//! Scheduler skeleton for the Spider task-execution framework. +//! Trait and type abstractions for the Spider scheduler. //! -//! This crate defines the core type and trait abstractions of the scheduler: the data types -//! exchanged with storage and execution managers ([`InboundEntry`], [`TaskAssignment`]), the -//! storage and dispatch seams ([`SchedulerStorageClient`], [`DispatchSink`]), and the pluggable -//! scheduling algorithm ([`SchedulerCore`]). Concrete implementations (the dispatch queue, the -//! runtime, and scheduling algorithms) build on top of these abstractions. +//! The scheduler is the serial decision maker that turns ready tasks discovered by the storage +//! layer into assignments for execution managers. It owns placement and ordering policy, not +//! dependency resolution: storage decides *what* is ready, and the scheduler decides *in what +//! 
order* and *with what throttling* ready tasks are offered to the fleet. +//! +//! The crate defines three trait seams wired into a single pipeline — a storage client that polls +//! the ready queue, a core that makes serial decisions, and a dispatching queue that fans those +//! decisions out to execution managers: +//! +//! ```text +//! storage ── authoritative ready queue (owned by the storage layer, not this crate) +//! │ +//! │ poll_ready / poll_commit_ready / poll_cleanup_ready (SchedulerStorageClient) +//! ▼ +//! ┌───────────────────┐ +//! │ SchedulerCore │ serial loop: poll → decide → enqueue +//! └───────────────────┘ +//! │ +//! │ enqueue (DispatchQueueSink — writer side) +//! ▼ +//! ┌───────────────────┐ +//! │ dispatch queue │ bounded SPMC; a full queue back-pressures the core +//! └───────────────────┘ +//! │ +//! │ dequeue (DispatchQueueSource — reader side) +//! ▼ +//! ┌───────────────────┐ +//! │ scheduler service │ ──▶ execution managers (concurrent fan-out) +//! └───────────────────┘ +//! ``` pub mod core; -pub mod dispatch; +pub mod dispatch_queue; pub mod error; pub mod storage_client; pub mod types; pub use crate::{ - core::{SchedulerCore, ShutdownToken}, - dispatch::DispatchSink, + core::SchedulerCore, + dispatch_queue::{DispatchQueueSink, DispatchQueueSource}, error::{SchedulerError, StorageClientError}, storage_client::SchedulerStorageClient, types::{InboundEntry, TaskAssignment}, diff --git a/components/spider-scheduler/src/storage_client.rs b/components/spider-scheduler/src/storage_client.rs index 5b847c63..9f7adaf4 100644 --- a/components/spider-scheduler/src/storage_client.rs +++ b/components/spider-scheduler/src/storage_client.rs @@ -1,7 +1,12 @@ +//! The scheduler's view of the storage layer, abstracting inbound polling and placement-time reads. 
+ use std::time::Duration; use async_trait::async_trait; -use spider_core::{job::JobState, types::id::JobId}; +use spider_core::{ + job::JobState, + types::id::{JobId, SessionId}, +}; use crate::{error::StorageClientError, types::InboundEntry}; @@ -11,33 +16,84 @@ use crate::{error::StorageClientError, types::InboundEntry}; /// needs to make placement decisions. Modeled as a trait so the scheduler runtime can be driven by /// a real storage client in production or a mock in tests. #[async_trait] -pub trait SchedulerStorageClient: Send + Sync { - /// Polls the storage-owned inbound (ready) queue for newly-ready tasks. - /// - /// Drains up to `max_items` ready entries across all storage lanes (regular, commit, and - /// cleanup tasks), blocking for at most `wait`. Returns an empty vector if no entry becomes - /// ready within `wait`. +pub trait SchedulerStorageClient: Send + Sync + Clone { + /// Polls the regular-task lane of the storage-owned inbound queue for ready tasks. /// /// # Parameters /// /// * `max_items` - The maximum number of entries to return from a single poll. - /// * `wait` - The maximum duration to block waiting for ready entries. + /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. /// /// # Returns /// - /// The ready entries drained from the inbound queue on success. + /// A tuple on success, containing: + /// + /// * The storage session the poll was served under. + /// * The ready regular tasks drained from the lane. /// /// # Errors /// /// Returns an error if: /// - /// * [`StorageClientError::InboundClosed`] if the inbound queue is closed and can no longer + /// * [`StorageClientError::InboundClosed`] if the regular-task lane is closed and can no longer /// yield entries. 
async fn poll_ready( &self, max_items: usize, wait: Duration, - ) -> Result, StorageClientError>; + ) -> Result<(SessionId, Vec), StorageClientError>; + + /// Polls the commit-task lane of the storage-owned inbound queue for ready tasks. + /// + /// # Parameters + /// + /// * `max_items` - The maximum number of entries to return from a single poll. + /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. + /// + /// # Returns + /// + /// A tuple on success, containing: + /// + /// * The storage session the poll was served under. + /// * The ready commit tasks drained from the lane. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageClientError::InboundClosed`] if the commit-task lane is closed and can no longer + /// yield entries. + async fn poll_commit_ready( + &self, + max_items: usize, + wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError>; + + /// Polls the cleanup-task lane of the storage-owned inbound queue for ready tasks. + /// + /// # Parameters + /// + /// * `max_items` - The maximum number of entries to return from a single poll. + /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. + /// + /// # Returns + /// + /// A tuple on success, containing: + /// + /// * The storage session the poll was served under. + /// * The ready cleanup tasks drained from the lane. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`StorageClientError::InboundClosed`] if the cleanup-task lane is closed and can no longer + /// yield entries. + async fn poll_cleanup_ready( + &self, + max_items: usize, + wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError>; /// Reads the current state of a job. 
/// diff --git a/components/spider-scheduler/src/types.rs b/components/spider-scheduler/src/types.rs index 448a398b..70267b15 100644 --- a/components/spider-scheduler/src/types.rs +++ b/components/spider-scheduler/src/types.rs @@ -1,3 +1,5 @@ +//! The data types the scheduler exchanges with the storage layer and execution managers. + use spider_core::types::id::{JobId, ResourceGroupId, TaskId}; /// A ready task drained from the storage-owned inbound queue. @@ -18,10 +20,6 @@ pub struct InboundEntry { } /// A task placement decision written by the scheduler core to the dispatching queue. -/// -/// Assignments are intentionally lightweight: they identify the task but carry no inputs. The -/// consuming execution manager registers the task instance against storage on pull to obtain the -/// execution context (inputs, timeouts, and the TDL context). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct TaskAssignment { /// The resource group that owns the job. From 9c436bd9d4efbd90ddec53860fbcb58342ed6a73 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Sat, 30 May 2026 14:45:35 -0400 Subject: [PATCH 03/14] Update dispatch queue's trait. --- .../spider-scheduler/src/dispatch_queue.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/components/spider-scheduler/src/dispatch_queue.rs b/components/spider-scheduler/src/dispatch_queue.rs index 7ef57fdc..2305a64d 100644 --- a/components/spider-scheduler/src/dispatch_queue.rs +++ b/components/spider-scheduler/src/dispatch_queue.rs @@ -1,6 +1,8 @@ //! The dispatching queue that decouples the scheduler core's placement decisions from the //! execution-manager-facing service. +use std::time::Duration; + use async_trait::async_trait; use spider_core::types::id::SessionId; @@ -36,6 +38,11 @@ pub trait DispatchQueueSink: Send + Sync + Clone { /// * [`SchedulerError::InvalidSessionId`] if the new session ID is not greater than the current /// session ID. 
async fn bump_session_id(&self, new_session_id: SessionId) -> Result<(), SchedulerError>; + + /// # Returns + /// + /// The current size of the dispatch queue. + fn size(&self) -> usize; } /// The reader side of the dispatching queue, drained by the execution-manager-facing service. @@ -43,9 +50,14 @@ pub trait DispatchQueueSource: Send + Sync + Clone { /// Dequeues the next task assignment for an execution manager to execute. /// + /// # Parameters + /// + /// * `wait_time` - The maximum amount of time to wait for a task assignment. + /// /// # Returns /// - /// A tuple on success, containing: + /// `None` if no task assignment is available within the specified wait time, or a tuple + /// containing: /// /// * The storage session associated with the assignment. /// * The next task assignment ready to execute. @@ -55,5 +67,8 @@ pub trait DispatchQueueSource: Send + Sync + Clone { /// Returns an error if: /// /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. - async fn dequeue(&self) -> Result<(SessionId, TaskAssignment), SchedulerError>; + async fn dequeue( + &self, + wait_time: Duration, + ) -> Result<Option<(SessionId, TaskAssignment)>, SchedulerError>; } From 5650d5a832b3cee4a00d51b40fc40638213c14b9 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Sun, 31 May 2026 17:28:27 -0400 Subject: [PATCH 04/14] Fix done.
--- components/spider-core/src/types/id.rs | 47 ++++++------------- components/spider-storage/src/cache.rs | 15 ------ components/spider-storage/src/cache/job.rs | 3 +- .../spider-storage/src/task_instance_pool.rs | 3 +- .../spider-storage/tests/scheduling_infra.rs | 3 +- components/spider-tdl/src/task.rs | 2 +- components/spider-tdl/src/task_context.rs | 2 +- .../spider-tdl/tests/test_task_macro.rs | 4 +- tests/huntsman/task-executor/src/lib.rs | 2 +- .../task-executor/tests/test_process_pool.rs | 2 +- .../huntsman/tdl-integration/tests/complex.rs | 2 +- 11 files changed, 24 insertions(+), 61 deletions(-) diff --git a/components/spider-core/src/types/id.rs b/components/spider-core/src/types/id.rs index 21821e7e..4735f798 100644 --- a/components/spider-core/src/types/id.rs +++ b/components/spider-core/src/types/id.rs @@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize}; use sqlx::{Database, encode::IsNull}; use uuid::Uuid; +use crate::task::TaskIndex; + /// A generic identifier type that wraps a UUID and a type marker. /// /// # Type Parameters: @@ -96,9 +98,18 @@ pub type UuidBytes = uuid::Bytes; pub enum ResourceGroupIdMarker {} pub type ResourceGroupId = Id; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum TaskIdMarker {} -pub type TaskId = Id; +/// Identifier of a task inside a job. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum TaskId { + /// The index of the task in the job's task graph. + Index(TaskIndex), + + /// The commit task. + Commit, + + /// The cleanup task. 
+ Cleanup, +} #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum JobIdMarker {} @@ -169,33 +180,3 @@ where } pub type SignedJobId = SignedId; - -pub type SignedTaskId = SignedId; - -#[cfg(test)] -mod tests { - use std::any::TypeId; - - use super::*; - - #[test] - fn test_id_basic() { - let id = TaskId::new(); - let underlying_uuid = id.as_uuid_ref().to_owned(); - assert_eq!(id, TaskId::from(underlying_uuid)); - - assert_ne!(TypeId::of::(), TypeId::of::()); - } - - #[test] - fn task_id_json_roundtrip() { - let id = TaskId::new(); - let deserialized_id: TaskId = serde_json::from_str( - serde_json::to_string(&id) - .expect("JSON serialization failure") - .as_str(), - ) - .expect("JSON deserialization failure"); - assert_eq!(id, deserialized_id); - } -} diff --git a/components/spider-storage/src/cache.rs b/components/spider-storage/src/cache.rs index d520f519..89a5e13d 100644 --- a/components/spider-storage/src/cache.rs +++ b/components/spider-storage/src/cache.rs @@ -1,21 +1,6 @@ -use spider_core::task::TaskIndex; - pub mod error; pub mod io; pub mod job; pub mod job_submission; mod sync; pub mod task; - -/// Identifier of a task inside a job. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum TaskId { - /// The index of the task in the job's task graph. - Index(TaskIndex), - - /// The commit task. - Commit, - - /// The cleanup task. 
- Cleanup, -} diff --git a/components/spider-storage/src/cache/job.rs b/components/spider-storage/src/cache/job.rs index 5c575e8e..c5a06ccb 100644 --- a/components/spider-storage/src/cache/job.rs +++ b/components/spider-storage/src/cache/job.rs @@ -10,7 +10,7 @@ use spider_core::{ job::JobState, task::{TaskIndex, TaskState}, types::{ - id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}, + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}, io::{ExecutionContext, TaskOutput}, }, }; @@ -18,7 +18,6 @@ use tokio::sync::{RwLockReadGuard, RwLockWriteGuard}; use crate::{ cache::{ - TaskId, error::{CacheError, InternalError, InternalError::UnexpectedJobState, StaleStateError}, job_submission::ValidatedJobSubmission, task::TaskGraph, diff --git a/components/spider-storage/src/task_instance_pool.rs b/components/spider-storage/src/task_instance_pool.rs index ace45ce6..bba0cf77 100644 --- a/components/spider-storage/src/task_instance_pool.rs +++ b/components/spider-storage/src/task_instance_pool.rs @@ -23,12 +23,11 @@ use std::{ }; use async_trait::async_trait; -use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}; +use spider_core::types::id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}; use tokio::sync::mpsc; use crate::{ cache::{ - TaskId, error::InternalError, task::{SharedTaskControlBlock, SharedTerminationTaskControlBlock}, }, diff --git a/components/spider-storage/tests/scheduling_infra.rs b/components/spider-storage/tests/scheduling_infra.rs index d3e5eb98..046a35eb 100644 --- a/components/spider-storage/tests/scheduling_infra.rs +++ b/components/spider-storage/tests/scheduling_infra.rs @@ -87,13 +87,12 @@ use spider_core::{ job::JobState, task::TaskIndex, types::{ - id::{ExecutionManagerId, JobId, ResourceGroupId, TaskInstanceId}, + id::{ExecutionManagerId, JobId, ResourceGroupId, TaskId, TaskInstanceId}, io::{ExecutionContext, TaskOutput}, }, }; use spider_storage::{ 
cache::{ - TaskId, error::{CacheError, InternalError}, job::SharedJobControlBlock, job_submission::ValidatedJobSubmission, diff --git a/components/spider-tdl/src/task.rs b/components/spider-tdl/src/task.rs index 99ca904d..d4015e0c 100644 --- a/components/spider-tdl/src/task.rs +++ b/components/spider-tdl/src/task.rs @@ -254,7 +254,7 @@ mod tests { fn make_encoded_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/components/spider-tdl/src/task_context.rs b/components/spider-tdl/src/task_context.rs index 60348315..d412bdb4 100644 --- a/components/spider-tdl/src/task_context.rs +++ b/components/spider-tdl/src/task_context.rs @@ -31,7 +31,7 @@ mod tests { fn round_trip_msgpack() -> anyhow::Result<()> { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 13, resource_group_id: ResourceGroupId::new(), }; diff --git a/components/spider-tdl/tests/test_task_macro.rs b/components/spider-tdl/tests/test_task_macro.rs index e2a070fe..9a891f19 100644 --- a/components/spider-tdl/tests/test_task_macro.rs +++ b/components/spider-tdl/tests/test_task_macro.rs @@ -81,7 +81,7 @@ fn translate(_ctx: TaskContext, p: Point, dx: int32, dy: int32) -> Result<(Point fn make_encoded_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; @@ -303,7 +303,7 @@ fn direct_execute_call_round_trips() -> anyhow::Result<()> { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/tests/huntsman/task-executor/src/lib.rs b/tests/huntsman/task-executor/src/lib.rs index c42a20f4..2a7e5ca1 100644 --- a/tests/huntsman/task-executor/src/lib.rs +++ 
b/tests/huntsman/task-executor/src/lib.rs @@ -194,7 +194,7 @@ pub fn tdl_package_dir() -> PathBuf { pub fn build_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; diff --git a/tests/huntsman/task-executor/tests/test_process_pool.rs b/tests/huntsman/task-executor/tests/test_process_pool.rs index 7bc5d332..9e762af1 100644 --- a/tests/huntsman/task-executor/tests/test_process_pool.rs +++ b/tests/huntsman/task-executor/tests/test_process_pool.rs @@ -72,7 +72,7 @@ fn build_pool() -> ProcessPool { fn make_request(task_func: &str, inputs: Vec) -> ExecuteRequest { ExecuteRequest { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), resource_group_id: ResourceGroupId::new(), ctx: ExecutionContext { task_instance_id: 1, diff --git a/tests/huntsman/tdl-integration/tests/complex.rs b/tests/huntsman/tdl-integration/tests/complex.rs index 513e7d75..0e2bc7d5 100644 --- a/tests/huntsman/tdl-integration/tests/complex.rs +++ b/tests/huntsman/tdl-integration/tests/complex.rs @@ -33,7 +33,7 @@ fn lib_path() -> std::path::PathBuf { fn encode_ctx() -> Vec { let ctx = TaskContext { job_id: JobId::new(), - task_id: TaskId::new(), + task_id: TaskId::Index(0), task_instance_id: 1, resource_group_id: ResourceGroupId::new(), }; From 10cb6ad186c3dd2fd3cc135c5e24cdc939b10a90 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Sun, 31 May 2026 20:50:56 -0400 Subject: [PATCH 05/14] Add channel-based dispatch queue implementation. 
--- Cargo.lock | 8 +- components/spider-scheduler/Cargo.toml | 8 + .../spider-scheduler/src/dispatch_queue.rs | 596 +++++++++++++++++- 3 files changed, 609 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f729b53..1742569a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1504,9 +1504,13 @@ dependencies = [ name = "spider-scheduler" version = "0.1.0" dependencies = [ + "anyhow", + "async-channel", "async-trait", + "dashmap", "spider-core", "thiserror", + "tokio", "tokio-util", ] @@ -1985,9 +1989,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.52.2" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "110a78583f19d5cdb2c5ccf321d1290344e71313c6c37d43520d386027d18386" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", diff --git a/components/spider-scheduler/Cargo.toml b/components/spider-scheduler/Cargo.toml index 59e9b8f7..ee803e17 100644 --- a/components/spider-scheduler/Cargo.toml +++ b/components/spider-scheduler/Cargo.toml @@ -8,7 +8,15 @@ name = "spider_scheduler" path = "src/lib.rs" [dependencies] +async-channel = "2.3.1" async-trait = "0.1.89" spider-core = { path = "../spider-core" } thiserror = "2.0.18" +tokio = { version = "1.52.3", features = ["sync", "time"] } tokio-util = "0.7.18" + +[dev-dependencies] +anyhow = "1.0.102" +dashmap = "6.1.0" +tokio = { version = "1.52.3", features = ["macros", "rt-multi-thread"] } +tokio-util = { version = "0.7.18", features = ["rt"] } diff --git a/components/spider-scheduler/src/dispatch_queue.rs b/components/spider-scheduler/src/dispatch_queue.rs index 2305a64d..18435b17 100644 --- a/components/spider-scheduler/src/dispatch_queue.rs +++ b/components/spider-scheduler/src/dispatch_queue.rs @@ -1,10 +1,11 @@ //! The dispatching queue that decouples the scheduler core's placement decisions from the //! 
execution-manager-facing service. -use std::time::Duration; +use std::{sync::Arc, time::Duration}; use async_trait::async_trait; use spider_core::types::id::SessionId; +use tokio::sync::RwLock; use crate::{error::SchedulerError, types::TaskAssignment}; @@ -72,3 +73,596 @@ pub trait DispatchQueueSource: Send + Sync + Clone { wait_time: Duration, ) -> Result<Option<(SessionId, TaskAssignment)>, SchedulerError>; } + +/// A cloneable writer handle for the dispatching queue, implementing [`DispatchQueueSink`] using +/// an async channel. +/// +/// # NOTE +/// +/// The current implementation assumes that `enqueue` and `bump_session_id` will not be called +/// concurrently: `bump_session_id` must be called before subsequent `enqueue` calls to make session +/// ID consistent with the enqueued assignments. +#[derive(Clone)] +pub struct DispatchQueueWriter { + inner: Arc<DispatchQueueWriterInner>, +} + +#[async_trait] +impl DispatchQueueSink for DispatchQueueWriter { + async fn enqueue(&self, assignment: TaskAssignment) -> Result<(), SchedulerError> { + self.inner + .assignment_sender + .send(assignment) + .await + .map_err(|_| SchedulerError::DispatchQueueClosed) + } + + async fn bump_session_id(&self, new_session_id: SessionId) -> Result<(), SchedulerError> { + let mut session_id_guard = self.inner.session_id.write().await; + if new_session_id <= *session_id_guard { + return Err(SchedulerError::InvalidSessionId(new_session_id)); + } + *session_id_guard = new_session_id; + while self.inner.assignment_receiver.try_recv().is_ok() { + // Drain the queue. + } + + // The session ID lock is held for the entire drain to exclude all readers. + drop(session_id_guard); + Ok(()) + } + + fn size(&self) -> usize { + self.inner.assignment_sender.len() + } +} + +/// A cloneable reader handle for the dispatching queue, implementing [`DispatchQueueSource`] using +/// an async channel.
+#[derive(Clone)] +pub struct DispatchQueueReader { + inner: Arc<DispatchQueueReaderInner>, +} + +#[async_trait] +impl DispatchQueueSource for DispatchQueueReader { + async fn dequeue( + &self, + wait_time: Duration, + ) -> Result<Option<(SessionId, TaskAssignment)>, SchedulerError> { + // Lock session ID for the entire duration of the dequeue operation to exclude any + // `bump_session_id` operations. + let session_id_guard = self.inner.session_id.read().await; + + if let Ok(assignment) = self.inner.assignment_receiver.try_recv() { + return Ok(Some((*session_id_guard, assignment))); + } + + if wait_time.is_zero() { + return Ok(None); + } + + match tokio::time::timeout(wait_time, self.inner.assignment_receiver.recv()).await { + Ok(Ok(assignment)) => Ok(Some((*session_id_guard, assignment))), + Ok(Err(_)) => Err(SchedulerError::DispatchQueueClosed), + Err(_) => Ok(None), + } + } +} + +/// Dispatch queue factory. +/// +/// # Returns +/// +/// A tuple containing: +/// +/// * The writer for the scheduler core to enqueue task assignments. +/// * The reader for the execution-manager-facing service to dequeue task assignments.
+#[must_use] +pub fn create_dispatch_queue( + capacity: usize, + init_session_id: SessionId, +) -> (DispatchQueueWriter, DispatchQueueReader) { + let (assignment_sender, assignment_receiver) = async_channel::bounded(capacity); + let session_id = Arc::new(RwLock::new(init_session_id)); + let writer_inner = Arc::new(DispatchQueueWriterInner { + session_id: session_id.clone(), + assignment_sender, + assignment_receiver: assignment_receiver.clone(), + }); + let reader_inner = Arc::new(DispatchQueueReaderInner { + session_id, + assignment_receiver, + }); + ( + DispatchQueueWriter { + inner: writer_inner, + }, + DispatchQueueReader { + inner: reader_inner, + }, + ) +} + +struct DispatchQueueWriterInner { + session_id: Arc<RwLock<SessionId>>, + assignment_sender: async_channel::Sender<TaskAssignment>, + assignment_receiver: async_channel::Receiver<TaskAssignment>, +} + +struct DispatchQueueReaderInner { + session_id: Arc<RwLock<SessionId>>, + assignment_receiver: async_channel::Receiver<TaskAssignment>, +} + +#[cfg(test)] +mod tests { + use std::{ + collections::HashMap, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, + time::Duration, + }; + + use anyhow::Result; + use dashmap::{DashMap, DashSet}; + use spider_core::types::id::{JobId, ResourceGroupId, SessionId, TaskId}; + use tokio_util::task::TaskTracker; + + use super::*; + use crate::{error::SchedulerError, types::TaskAssignment}; + + /// Generates a [`TaskId`] backed by a module-local monotonic counter. + /// + /// # Returns + /// + /// A new [`TaskId::Index`] whose inner value is unique within the test binary. + fn next_task_id() -> TaskId { + static COUNTER: AtomicUsize = AtomicUsize::new(0); + TaskId::Index(COUNTER.fetch_add(1, Ordering::Relaxed)) + } + + /// # Returns + /// + /// Forwards [`make_assignment_with_task_id`]'s return values with `task_id` set to + /// [`next_task_id`]'s return value.
+ fn make_assignment() -> TaskAssignment { + make_assignment_with_task_id(next_task_id()) + } + + /// # Returns + /// + /// A new [`TaskAssignment`] with the given `task_id`; the other ID fields are auto-generated. + fn make_assignment_with_task_id(task_id: TaskId) -> TaskAssignment { + TaskAssignment { + resource_group_id: ResourceGroupId::new(), + job_id: JobId::new(), + task_id, + } + } + + /// Spawns `reader_count` reader tasks that each drain the queue with `wait_time` and count the + /// assignments they receive, looping until the queue is closed. + /// + /// # Returns + /// + /// A vector of join handles, one per spawned task; each handle yields the number of assignments + /// that the reader pulled from the queue. + fn spawn_counting_readers( + reader: &DispatchQueueReader, + reader_count: usize, + wait_time: Duration, + ) -> Vec<tokio::task::JoinHandle<usize>> { + (0..reader_count) + .map(|_| { + let r = reader.clone(); + tokio::spawn(async move { + let mut count = 0usize; + loop { + match r.dequeue(wait_time).await { + Ok(Some(_)) => count += 1, + Ok(None) => (), + Err(_) => break, + } + } + count + }) + }) + .collect() + } + + /// Drives the pair-consistency stress scenario for one or more concurrent readers. + /// + /// A single producer issues `ROUNDS` rounds of `[enqueue × k_i; bump_session_id(+1)]` with + /// batch sizes drawn from a 64-bit LCG seeded by `rng_seed`, finishes with a final batch under + /// the latest session, and drops the writer. `reader_count` reader tasks drain the queue + /// concurrently, each delivered assignment is tagged at enqueue time, and pair consistency is + /// verified across the collected results once all readers are closed. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`tokio::task::JoinHandle`]'s return values on failure (indicating a task panic).
+ async fn run_pair_consistency_stress(reader_count: usize, rng_seed: u64) -> Result<()> { + const INIT_SESSION: SessionId = 1; + const ROUNDS: usize = 20; + const CAPACITY: usize = 16; + const FINAL_BATCH: usize = 5; + + assert!(reader_count > 0, "`reader_count` must be positive"); + + let (writer, reader) = create_dispatch_queue(CAPACITY, INIT_SESSION); + let tagged: Arc> = Arc::new(DashMap::new()); + + let tagged_for_writer = tagged.clone(); + let writer_handle = tokio::spawn(async move { + let mut current_session = INIT_SESSION; + let mut rng = rng_seed; + for _ in 0..ROUNDS { + // 64-bit LCG parameters + const LCG_MULTIPLIER: u64 = 6_364_136_223_846_793_005; + const LCG_INCREMENT: u64 = 1_442_695_040_888_963_407; + rng = rng.wrapping_mul(LCG_MULTIPLIER).wrapping_add(LCG_INCREMENT); + let k = usize::try_from(rng % (CAPACITY as u64 + 1)) + .expect("modulo result fits in usize"); + for _ in 0..k { + let id = next_task_id(); + tagged_for_writer.insert(id, current_session); + writer + .enqueue(make_assignment_with_task_id(id)) + .await + .expect("enqueue failed"); + } + current_session += 1; + writer + .bump_session_id(current_session) + .await + .expect("bump failed"); + } + // Final batch under the latest session, which guarantees the readers have something to + // drain post-bump. 
+ for _ in 0..FINAL_BATCH { + let id = next_task_id(); + tagged_for_writer.insert(id, current_session); + writer + .enqueue(make_assignment_with_task_id(id)) + .await + .expect("enqueue failed"); + } + drop(writer); + }); + + let all_delivered: Arc> = Arc::new(DashMap::new()); + let duplicates: Arc> = Arc::new(DashSet::new()); + let tracker = TaskTracker::new(); + for _ in 0..reader_count { + let r = reader.clone(); + let delivered_for_reader = all_delivered.clone(); + let duplicates_for_reader = duplicates.clone(); + tracker.spawn(async move { + loop { + match r.dequeue(Duration::from_millis(500)).await { + Ok(Some((session, assignment))) => { + if delivered_for_reader + .insert(assignment.task_id, session) + .is_some() + { + duplicates_for_reader.insert(assignment.task_id); + } + } + Ok(None) => (), + Err(_) => break, + } + } + }); + } + tracker.close(); + drop(reader); + + writer_handle.await?; + tracker.wait().await; + + assert!( + duplicates.is_empty(), + "duplicate deliveries: {:?}", + duplicates.iter().map(|e| *e.key()).collect::>(), + ); + for entry in all_delivered.iter() { + let task_id = *entry.key(); + let delivered_session = *entry.value(); + let expected = tagged.get(&task_id).map(|e| *e.value()); + assert_eq!( + Some(delivered_session), + expected, + "pair stamp mismatch: task_id={task_id:?}, delivered={delivered_session}, \ + expected={expected:?}", + ); + } + + let delivered_count = all_delivered.len(); + assert!( + delivered_count >= FINAL_BATCH, + "expected at least the final batch ({FINAL_BATCH}) to be delivered, got \ + {delivered_count}", + ); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn sanity_round_trip_and_initial_session() -> Result<()> { + const SESSION_ID: SessionId = 1; + let (writer, reader) = create_dispatch_queue(8, SESSION_ID); + let assignment = make_assignment(); + + writer.enqueue(assignment).await?; + + let (session, received) = reader + .dequeue(Duration::from_millis(1)) + .await? 
+ .expect("expected an assignment"); + assert_eq!(session, SESSION_ID); + assert_eq!(received, assignment); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn load_balancing_across_consumers() -> Result<()> { + const N: usize = 100; + const M: usize = 4; + let (writer, reader) = create_dispatch_queue(32, 1); + + let reader_handles = spawn_counting_readers(&reader, M, Duration::from_millis(500)); + drop(reader); + + for _ in 0..N { + writer + .enqueue(make_assignment()) + .await + .expect("enqueue failed"); + } + drop(writer); + + let mut total = 0usize; + for handle in reader_handles { + total += handle.await?; + } + assert_eq!(total, N); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn many_readers_with_slow_producer() -> Result<()> { + const N: usize = 10; + const M: usize = 16; + let (writer, reader) = create_dispatch_queue(8, 1); + + let reader_handles = spawn_counting_readers(&reader, M, Duration::from_millis(500)); + drop(reader); + + for _ in 0..N { + writer + .enqueue(make_assignment()) + .await + .expect("enqueue failed"); + tokio::time::sleep(Duration::from_millis(10)).await; + } + drop(writer); + + let mut total = 0usize; + for handle in reader_handles { + total += handle.await?; + } + assert_eq!(total, N); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn bump_same_session_id_returns_invalid() -> Result<()> { + const SESSION_ID: SessionId = 5; + let (writer, _reader) = create_dispatch_queue(8, SESSION_ID); + let result = writer.bump_session_id(SESSION_ID).await; + assert!( + matches!(result, Err(SchedulerError::InvalidSessionId(5))), + "expected InvalidSessionId(5), got {result:?}", + ); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn bump_smaller_smaller_session_id_returns_invalid() -> Result<()> { + const SESSION_ID: SessionId = 5; + const SMALLER_SESSION_ID: SessionId = SESSION_ID - 1; + + let 
(writer, _reader) = create_dispatch_queue(8, SESSION_ID); + let result = writer.bump_session_id(SMALLER_SESSION_ID).await; + assert!(matches!( + result, + Err(SchedulerError::InvalidSessionId(SMALLER_SESSION_ID)) + )); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn bump_higher_succeeds() -> Result<()> { + const SESSION_ID: SessionId = 5; + const NEW_SESSION_ID: SessionId = SESSION_ID + 1; + + let (writer, reader) = create_dispatch_queue(8, SESSION_ID); + writer.bump_session_id(NEW_SESSION_ID).await?; + writer.enqueue(make_assignment()).await?; + + let (session, _) = reader + .dequeue(Duration::from_secs(1)) + .await? + .expect("expected an assignment"); + assert_eq!(session, NEW_SESSION_ID); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn pre_bump_items_not_delivered() -> Result<()> { + let (writer, reader) = create_dispatch_queue(8, 1); + writer.enqueue(make_assignment()).await?; + writer.enqueue(make_assignment()).await?; + writer.bump_session_id(2).await?; + + let result = reader.dequeue(Duration::from_millis(100)).await?; + assert_eq!(result, None); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn post_bump_items_paired_with_new_session() -> Result<()> { + let (writer, reader) = create_dispatch_queue(8, 1); + writer.bump_session_id(2).await?; + let assignment = make_assignment(); + writer.enqueue(assignment).await?; + + let (session, received) = reader + .dequeue(Duration::from_secs(1)) + .await? 
+ .expect("expected an assignment"); + assert_eq!(session, 2); + assert_eq!(received, assignment); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn successive_bumps() -> Result<()> { + let (writer, reader) = create_dispatch_queue(8, 1); + writer.bump_session_id(2).await?; + writer.bump_session_id(3).await?; + + let equal = writer.bump_session_id(3).await; + assert!( + matches!(equal, Err(SchedulerError::InvalidSessionId(3))), + "expected InvalidSessionId(3), got {equal:?}", + ); + let smaller = writer.bump_session_id(2).await; + assert!( + matches!(smaller, Err(SchedulerError::InvalidSessionId(2))), + "expected InvalidSessionId(2), got {smaller:?}", + ); + + writer.enqueue(make_assignment()).await?; + let (session, _) = reader + .dequeue(Duration::from_secs(1)) + .await? + .expect("expected an assignment"); + assert_eq!(session, 3); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn size_zero_after_bump() -> Result<()> { + let (writer, _reader) = create_dispatch_queue(8, 1); + writer.enqueue(make_assignment()).await?; + writer.enqueue(make_assignment()).await?; + writer.enqueue(make_assignment()).await?; + assert_eq!(writer.size(), 3); + + writer.bump_session_id(2).await?; + assert_eq!(writer.size(), 0); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn single_bump_pair_consistency() -> Result<()> { + const INIT_SESSION: SessionId = 10; + const MID_SESSION: SessionId = 20; + const FINAL_SESSION: SessionId = 30; + + let (writer, reader) = create_dispatch_queue(8, INIT_SESSION); + + let pre_bump_ids: Vec = (0..3).map(|_| next_task_id()).collect(); + let post_bump_ids: Vec = (0..2).map(|_| next_task_id()).collect(); + let final_id = next_task_id(); + + let pre_bump_for_writer = pre_bump_ids.clone(); + let post_bump_for_writer = post_bump_ids.clone(); + let writer_handle = tokio::spawn(async move { + for &id in &pre_bump_for_writer { + writer + 
.enqueue(make_assignment_with_task_id(id)) + .await + .expect("enqueue failed"); + } + // Wait for the reader to consume the batch before bumping, so the items survive into + // the delivered set instead of being drained. + while writer.size() > 0 { + tokio::time::sleep(Duration::from_millis(50)).await; + } + writer + .bump_session_id(MID_SESSION) + .await + .expect("bump to mid session failed"); + + for &id in &post_bump_for_writer { + writer + .enqueue(make_assignment_with_task_id(id)) + .await + .expect("enqueue failed"); + } + while writer.size() > 0 { + tokio::time::sleep(Duration::from_millis(50)).await; + } + writer + .bump_session_id(FINAL_SESSION) + .await + .expect("bump to final session failed"); + + writer + .enqueue(make_assignment_with_task_id(final_id)) + .await + .expect("enqueue failed"); + drop(writer); + }); + + let mut delivered: HashMap = HashMap::new(); + loop { + match reader.dequeue(Duration::from_millis(100)).await { + Ok(Some((session, assignment))) => { + let prior = delivered.insert(assignment.task_id, session); + assert_eq!( + prior, None, + "duplicate delivery for {:?}", + assignment.task_id + ); + } + Ok(None) => (), + Err(_) => break, + } + } + writer_handle.await?; + + for &id in &pre_bump_ids { + assert_eq!( + delivered.get(&id).copied(), + Some(INIT_SESSION), + "pre-bump item not paired with initial session: {id:?}", + ); + } + for &id in &post_bump_ids { + assert_eq!( + delivered.get(&id).copied(), + Some(MID_SESSION), + "post-bump item not paired with mid session: {id:?}", + ); + } + assert_eq!(delivered.get(&final_id).copied(), Some(FINAL_SESSION)); + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn pair_consistency_stress_with_one_reader() -> Result<()> { + run_pair_consistency_stress(1, 1_234_567).await + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn pair_consistency_stress_with_many_readers() -> Result<()> { + run_pair_consistency_stress(4, 7_654_321).await 
+ } +} From dda7770592f4555d35532d1b011f2e6307810d52 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 3 Jun 2026 11:18:35 -0400 Subject: [PATCH 06/14] Done. --- Cargo.lock | 1 + components/spider-scheduler/Cargo.toml | 3 +- .../examples/round_robin_load.rs | 392 +++++++++ components/spider-scheduler/src/core.rs | 2 +- components/spider-scheduler/src/core_impl.rs | 3 + .../src/core_impl/round_robin.rs | 776 ++++++++++++++++++ components/spider-scheduler/src/error.rs | 6 + components/spider-scheduler/src/lib.rs | 1 + 8 files changed, 1182 insertions(+), 2 deletions(-) create mode 100644 components/spider-scheduler/examples/round_robin_load.rs create mode 100644 components/spider-scheduler/src/core_impl.rs create mode 100644 components/spider-scheduler/src/core_impl/round_robin.rs diff --git a/Cargo.lock b/Cargo.lock index 1742569a..6d96c4b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1508,6 +1508,7 @@ dependencies = [ "async-channel", "async-trait", "dashmap", + "serde", "spider-core", "thiserror", "tokio", diff --git a/components/spider-scheduler/Cargo.toml b/components/spider-scheduler/Cargo.toml index ee803e17..a928fddf 100644 --- a/components/spider-scheduler/Cargo.toml +++ b/components/spider-scheduler/Cargo.toml @@ -12,8 +12,9 @@ async-channel = "2.3.1" async-trait = "0.1.89" spider-core = { path = "../spider-core" } thiserror = "2.0.18" -tokio = { version = "1.52.3", features = ["sync", "time"] } +tokio = { version = "1.52.3", features = ["macros", "rt", "sync", "time"] } tokio-util = "0.7.18" +serde = { version = "1.0.228", features = ["derive"] } [dev-dependencies] anyhow = "1.0.102" diff --git a/components/spider-scheduler/examples/round_robin_load.rs b/components/spider-scheduler/examples/round_robin_load.rs new file mode 100644 index 00000000..9d2d9385 --- /dev/null +++ b/components/spider-scheduler/examples/round_robin_load.rs @@ -0,0 +1,392 @@ +//! Load-test and instrumentation harness for the round-robin scheduler core. +//! +//! Topology: +//! 
+//! ```text +//! submitter ──▶ MockStorage (ready lane) ──poll──▶ RoundRobinCore ──enqueue──▶ dispatch queue ──▶ 64 workers +//! ``` +//! +//! * A mock storage holds 128 jobs of 1000 tasks each, released gradually (one job at a time) to +//! simulate a job-submission cycle rather than making everything ready at `t=0`. +//! * 1% of the tasks are submitted twice (back-to-back) so the scheduler's deduplication can be +//! exercised; workers must still observe every task exactly once. +//! * 64 workers drain the dispatch queue, sleeping 5ms per task to model execution latency. +//! +//! Run with (release recommended so the timings are meaningful): +//! +//! ```bash +//! cargo run -p spider-scheduler --example round_robin_load --release +//! ``` + +use std::{ + sync::{ + Arc, + atomic::{AtomicBool, AtomicUsize, Ordering}, + }, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use dashmap::DashSet; +use spider_core::{ + job::JobState, + types::id::{JobId, ResourceGroupId, SessionId, TaskId}, +}; +use spider_scheduler::{ + DispatchQueueSource, + SchedulerCore, + SchedulerStorageClient, + StorageClientError, + core_impl::RoundRobinConfig, + dispatch_queue::{DispatchQueueReader, DispatchQueueWriter, create_dispatch_queue}, + types::InboundEntry, +}; +use tokio_util::sync::CancellationToken; + +// --------------------------------------------------------------------------------------------- +// Workload parameters +// --------------------------------------------------------------------------------------------- + +const NUM_JOBS: usize = 128; +const TASKS_PER_JOB: usize = 1000; +const TOTAL_UNIQUE_TASKS: usize = NUM_JOBS * TASKS_PER_JOB; + +/// Every `DUP_EVERY`-th task within a job is submitted twice, yielding exactly 1% duplicates. 
+const DUP_EVERY: usize = 100; +const EXPECTED_DUPLICATES_SUBMITTED: usize = NUM_JOBS * TASKS_PER_JOB / DUP_EVERY; + +const NUM_WORKERS: usize = 64; +const WORKER_SLEEP: Duration = Duration::from_millis(5); +const WORKER_POLL_WAIT: Duration = Duration::from_millis(10); + +/// Delay between releasing successive jobs into storage (the "submission cycle"). +const JOB_SUBMIT_INTERVAL: Duration = Duration::from_millis(10); + +/// A fixed session: this harness never bumps the session, so storage and the dispatch queue both +/// start (and stay) here. +const SESSION_ID: SessionId = 0; + +// Round-robin scheduler configuration (as requested). +const ACTIVE_JOB_POOL_CAPACITY: usize = 8; +const DISPATCH_QUEUE_CAPACITY: usize = NUM_WORKERS * 4; // 256 +const STORAGE_POLLING_WAIT_TIME_MS: u64 = 10; // dispatch/poll interval +const READY_TASK_CAPACITY: usize = TASKS_PER_JOB * NUM_WORKERS; // 64_000 +const COMMIT_READY_TASK_CAPACITY: usize = 10; +const CLEANUP_READY_TASK_CAPACITY: usize = 10; + +/// Safety net so a scheduling bug that drops a task cannot hang the harness forever. +const OVERALL_TIMEOUT: Duration = Duration::from_mins(2); + +// --------------------------------------------------------------------------------------------- +// Mock storage +// --------------------------------------------------------------------------------------------- + +/// A mock [`SchedulerStorageClient`] whose regular lane is backed by an unbounded channel that the +/// submitter feeds. Commit and cleanup lanes are always empty. +#[derive(Clone)] +struct MockStorage { + inner: Arc, +} + +struct MockStorageInner { + ready_tx: async_channel::Sender, + ready_rx: async_channel::Receiver, +} + +impl MockStorage { + fn new() -> Self { + let (ready_tx, ready_rx) = async_channel::unbounded(); + Self { + inner: Arc::new(MockStorageInner { ready_tx, ready_rx }), + } + } + + /// # Returns + /// + /// A cloned sender for the regular ready lane, used by the submitter task. 
+ fn sender(&self) -> async_channel::Sender { + self.inner.ready_tx.clone() + } +} + +#[async_trait] +impl SchedulerStorageClient for MockStorage { + async fn poll_ready( + &self, + max_items: usize, + wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + if max_items == 0 { + // The scheduler has no buffer headroom; emulate a real blocking poll that yields + // nothing rather than pulling past the requested cap. + tokio::time::sleep(wait).await; + return Ok((SESSION_ID, Vec::new())); + } + + let mut out = Vec::new(); + // Block up to `wait` for the first entry, mirroring a real long-poll. + match tokio::time::timeout(wait, self.inner.ready_rx.recv()).await { + Ok(Ok(entry)) => out.push(entry), + // Channel closed (never happens here, sender is held by storage) or timed out: return + // whatever we have (possibly nothing). + Ok(Err(_)) | Err(_) => return Ok((SESSION_ID, out)), + } + // Drain the rest without blocking, up to `max_items`. + while out.len() < max_items { + match self.inner.ready_rx.try_recv() { + Ok(entry) => out.push(entry), + Err(_) => break, + } + } + Ok((SESSION_ID, out)) + } + + async fn poll_commit_ready( + &self, + _max_items: usize, + wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + tokio::time::sleep(wait).await; + Ok((SESSION_ID, Vec::new())) + } + + async fn poll_cleanup_ready( + &self, + _max_items: usize, + wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + tokio::time::sleep(wait).await; + Ok((SESSION_ID, Vec::new())) + } + + async fn job_state(&self, _job_id: JobId) -> Result { + Ok(JobState::Running) + } +} + +// --------------------------------------------------------------------------------------------- +// Submitter & workers +// --------------------------------------------------------------------------------------------- + +/// Releases each job's tasks into storage one job at a time, duplicating every `DUP_EVERY`-th task +/// back-to-back so the duplicate lands in the 
same poll batch as its original. +async fn submit_jobs(jobs: Vec<(JobId, ResourceGroupId)>, tx: async_channel::Sender) { + for (job_id, resource_group_id) in jobs { + for i in 0..TASKS_PER_JOB { + let entry = InboundEntry { + resource_group_id, + job_id, + task_id: TaskId::Index(i), + }; + tx.send(entry).await.expect("ready lane closed"); + if i % DUP_EVERY == 0 { + tx.send(entry).await.expect("ready lane closed"); + } + } + tokio::time::sleep(JOB_SUBMIT_INTERVAL).await; + } +} + +/// Shared bookkeeping for the "each task is polled exactly once" check. +struct WorkerStats { + seen: DashSet<(JobId, TaskId)>, + total_received: AtomicUsize, + duplicate_received: AtomicUsize, +} + +/// A single worker: drain the dispatch queue, record each assignment, then sleep to model work. +async fn worker(reader: DispatchQueueReader, stats: Arc, done: Arc) { + loop { + if done.load(Ordering::Relaxed) { + break; + } + match reader.dequeue(WORKER_POLL_WAIT).await { + Ok(Some((_session, assignment))) => { + stats.total_received.fetch_add(1, Ordering::Relaxed); + if !stats.seen.insert((assignment.job_id, assignment.task_id)) { + stats.duplicate_received.fetch_add(1, Ordering::Relaxed); + } + tokio::time::sleep(WORKER_SLEEP).await; + } + Ok(None) => {} + // Dispatch queue closed (scheduler dropped its writer): nothing more will arrive. 
+ Err(_) => break, + } + } +} + +// --------------------------------------------------------------------------------------------- +// Harness +// --------------------------------------------------------------------------------------------- + +#[tokio::main(flavor = "multi_thread")] +async fn main() { + let storage = MockStorage::new(); + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, SESSION_ID); + + let config = RoundRobinConfig::::new( + ACTIVE_JOB_POOL_CAPACITY, + DISPATCH_QUEUE_CAPACITY, + READY_TASK_CAPACITY, + COMMIT_READY_TASK_CAPACITY, + CLEANUP_READY_TASK_CAPACITY, + STORAGE_POLLING_WAIT_TIME_MS, + ); + let metrics = config.metrics(); + + let jobs: Vec<(JobId, ResourceGroupId)> = (0..NUM_JOBS) + .map(|_| (JobId::new(), ResourceGroupId::new())) + .collect(); + + // Scheduler. + let scheduler_token = CancellationToken::new(); + let scheduler_handle = { + let token = scheduler_token.clone(); + let storage = storage.clone(); + tokio::spawn(async move { config.run(storage, writer, token).await }) + }; + + // Workers. + let stats = Arc::new(WorkerStats { + seen: DashSet::with_capacity(TOTAL_UNIQUE_TASKS), + total_received: AtomicUsize::new(0), + duplicate_received: AtomicUsize::new(0), + }); + let done = Arc::new(AtomicBool::new(false)); + let worker_handles: Vec<_> = (0..NUM_WORKERS) + .map(|_| tokio::spawn(worker(reader.clone(), stats.clone(), done.clone()))) + .collect(); + drop(reader); + + // Submitter. + let submit_handle = tokio::spawn(submit_jobs(jobs, storage.sender())); + + // Drive to completion: every unique task delivered, or the safety timeout. Poll tightly so the + // metrics are frozen as soon as the last task arrives, keeping the idle tail out of the averages. 
+ let start = Instant::now(); + let mut timed_out = false; + loop { + if stats.seen.len() >= TOTAL_UNIQUE_TASKS { + break; + } + if start.elapsed() > OVERALL_TIMEOUT { + timed_out = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + // Stop timing now that every task has arrived; the scheduler only spins on empty polls past + // this point and must not pollute the stage 1 & 2 averages. + metrics.stop(); + let wall = start.elapsed(); + + // Tear down. + done.store(true, Ordering::Relaxed); + scheduler_token.cancel(); + submit_handle.abort(); + for handle in worker_handles { + let _ = handle.await; + } + match scheduler_handle.await { + Ok(Ok(())) => {} + Ok(Err(e)) => eprintln!("scheduler returned an error: {e:?}"), + Err(e) => eprintln!("scheduler task panicked: {e:?}"), + } + let _ = submit_handle.await; + + report(&metrics, &stats, wall, timed_out); +} + +/// Prints the collected timing and correctness results. +fn report( + metrics: &spider_scheduler::core_impl::RoundRobinMetrics, + stats: &WorkerStats, + wall: Duration, + timed_out: bool, +) { + let loop_count = metrics.loop_count.load(Ordering::Relaxed); + let total_loop_ns = metrics.total_loop_ns.load(Ordering::Relaxed); + let buffer_count = metrics.buffer_enrich_count.load(Ordering::Relaxed); + let buffer_ns = metrics.buffer_enrich_ns.load(Ordering::Relaxed); + let dispatch_count = metrics.dispatch_enrich_count.load(Ordering::Relaxed); + let dispatch_ns = metrics.dispatch_enrich_ns.load(Ordering::Relaxed); + + let total = stats.total_received.load(Ordering::Relaxed); + let duplicates = stats.duplicate_received.load(Ordering::Relaxed); + let unique = stats.seen.len(); + + println!("\n================ Round-robin scheduler load test ================"); + println!( + "Wall-clock runtime: {:.3} s", + wall.as_secs_f64() + ); + if timed_out { + println!("!! 
TIMED OUT before all unique tasks were delivered !!"); + } + + println!("\n---- Workload ----"); + println!("Jobs: {NUM_JOBS}"); + println!("Tasks per job: {TASKS_PER_JOB}"); + println!("Unique tasks (expected): {TOTAL_UNIQUE_TASKS}"); + println!("Duplicate task entries submitted: {EXPECTED_DUPLICATES_SUBMITTED}"); + println!("Workers: {NUM_WORKERS}"); + + println!("\n---- Scheduling-loop timing ----"); + println!("Scheduling-loop iterations: {loop_count}"); + println!( + "Avg time per scheduling loop: {:>9.3} us", + avg_us(total_loop_ns, loop_count) + ); + println!( + "Avg buffer-enrich time (stage 1): {:>9.3} us (over {buffer_count} iterations that \ + polled a non-empty result)", + avg_us(buffer_ns, buffer_count) + ); + println!( + "Avg dispatch-enrich time (stage 2): {:>9.3} us (over {dispatch_count} iterations that \ + dispatched >=1 task)", + avg_us(dispatch_ns, dispatch_count) + ); + let idle_loops = loop_count.saturating_sub(dispatch_count); + println!( + "No-dispatch loop iterations: {idle_loops} ({:.1}% of all iterations)", + percent(idle_loops, loop_count) + ); + + println!("\n---- Correctness: each task polled exactly once ----"); + println!("Total assignments received: {total}"); + println!("Unique (job, task) pairs received: {unique}"); + println!("Duplicate deliveries observed: {duplicates}"); + + let exactly_once = !timed_out + && duplicates == 0 + && unique == TOTAL_UNIQUE_TASKS + && total == TOTAL_UNIQUE_TASKS; + println!( + "\nRESULT: each task polled exactly once -> {}", + if exactly_once { "PASS" } else { "FAIL" } + ); + println!("=================================================================\n"); +} + +/// # Returns +/// +/// `ns / count` converted to microseconds, or `0.0` when `count` is zero. +fn avg_us(ns: u64, count: u64) -> f64 { + if count == 0 { + 0.0 + } else { + ns as f64 / count as f64 / 1_000.0 + } +} + +/// # Returns +/// +/// `part` as a percentage of `whole`, or `0.0` when `whole` is zero. 
+fn percent(part: u64, whole: u64) -> f64 { + if whole == 0 { + 0.0 + } else { + part as f64 / whole as f64 * 100.0 + } +} diff --git a/components/spider-scheduler/src/core.rs b/components/spider-scheduler/src/core.rs index c6bb661c..f6715341 100644 --- a/components/spider-scheduler/src/core.rs +++ b/components/spider-scheduler/src/core.rs @@ -39,7 +39,7 @@ pub trait SchedulerCore: Send { /// /// Returns a [`SchedulerError`] instance indicating an irrecoverable error. async fn run( - &mut self, + self, storage_client: Self::StorageClient, sink: Self::Sink, cancellation_token: tokio_util::sync::CancellationToken, diff --git a/components/spider-scheduler/src/core_impl.rs b/components/spider-scheduler/src/core_impl.rs new file mode 100644 index 00000000..2e27d2e9 --- /dev/null +++ b/components/spider-scheduler/src/core_impl.rs @@ -0,0 +1,3 @@ +mod round_robin; + +pub use round_robin::*; diff --git a/components/spider-scheduler/src/core_impl/round_robin.rs b/components/spider-scheduler/src/core_impl/round_robin.rs new file mode 100644 index 00000000..f32c36ca --- /dev/null +++ b/components/spider-scheduler/src/core_impl/round_robin.rs @@ -0,0 +1,776 @@ +use std::{ + collections::{HashMap, HashSet, VecDeque}, + sync::{ + Arc, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use spider_core::types::id::{JobId, ResourceGroupId, SessionId, TaskId}; +use tokio::select; +use tokio_util::sync::CancellationToken; +use serde::Deserialize; +use crate::{ + DispatchQueueSink, + InboundEntry, + SchedulerCore, + SchedulerError, + SchedulerStorageClient, + StorageClientError, + TaskAssignment, +}; + +#[derive(Deserialize)] +pub struct RoundRobinConfig< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> { + /// The capacity of the active jobs pool. The scheduler will make task assignments from these + /// jobs in a round-robin manner. 
+ pub active_job_pool_capacity: usize, + + /// The capacity of the dispatch queue. + pub dispatch_queue_capacity: usize, + + /// The capacity of the total pending ready tasks buffered in the scheduler. + pub ready_task_capacity: usize, + + /// The capacity of the total pending commit-ready tasks buffered in the scheduler. + pub commit_ready_task_capacity: usize, + + /// The capacity of the total pending cleanup-ready tasks buffered in the scheduler. + pub cleanup_ready_task_capacity: usize, + + pub storage_polling_wait_time_ms: u64, + + #[serde(skip)] + metrics: Arc, + + #[serde(skip)] + _marker: std::marker::PhantomData<(SchedulerStorageClientType, DispatchQueueSinkType)>, +} + +/// Instrumentation counters for the round-robin scheduling loop. +/// +/// Durations are accumulated in nanoseconds; an average is a `*_ns` total divided by its matching +/// `*_count`. All counters use [`Ordering::Relaxed`] and are meant for coarse profiling only, not +/// for establishing happens-before relationships. +#[derive(Debug, Default)] +pub struct RoundRobinMetrics { + /// Number of completed scheduling-loop iterations (`loop_once` calls). + pub loop_count: AtomicU64, + + /// Total wall-clock time spent across all scheduling-loop iterations. + pub total_loop_ns: AtomicU64, + + /// Number of iterations that processed a fresh inbound polling result. + pub buffer_enrich_count: AtomicU64, + + /// Total time spent draining inbound polling results into the scheduler's buffers ("enrich the + /// buffer", stage 1). + pub buffer_enrich_ns: AtomicU64, + + /// Number of iterations that dispatched at least one assignment. + pub dispatch_enrich_count: AtomicU64, + + /// Total time spent making scheduling decisions and filling the dispatch queue ("enrich the + /// dispatch queue", stage 2). + pub dispatch_enrich_ns: AtomicU64, + + /// When set, the scheduling loop stops accumulating any of the counters above. Used to exclude + /// the idle tail (after all work has drained) from the averages. 
+ stopped: AtomicBool, +} + +impl RoundRobinMetrics { + /// Freezes all counters: subsequent scheduling-loop iterations are not recorded. + pub fn stop(&self) { + self.stopped.store(true, Ordering::Relaxed); + } + + /// # Returns + /// + /// Whether the counters are still being recorded. + fn is_recording(&self) -> bool { + !self.stopped.load(Ordering::Relaxed) + } +} + +impl< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> RoundRobinConfig +{ + /// Creates a new round-robin configuration with a fresh, empty set of metrics. + #[must_use] + pub fn new( + active_job_pool_capacity: usize, + dispatch_queue_capacity: usize, + ready_task_capacity: usize, + commit_ready_task_capacity: usize, + cleanup_ready_task_capacity: usize, + storage_polling_wait_time_ms: u64, + ) -> Self { + Self { + active_job_pool_capacity, + dispatch_queue_capacity, + ready_task_capacity, + commit_ready_task_capacity, + cleanup_ready_task_capacity, + storage_polling_wait_time_ms, + metrics: Arc::new(RoundRobinMetrics::default()), + _marker: std::marker::PhantomData, + } + } + + /// # Returns + /// + /// A shared handle to the loop instrumentation counters, so callers can read them while (or + /// after) the scheduler runs. + #[must_use] + pub fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } +} + +/// # Returns +/// +/// The time elapsed since `start` in nanoseconds, saturating at [`u64::MAX`]. 
+fn elapsed_nanos(start: Instant) -> u64 { + u64::try_from(start.elapsed().as_nanos()).unwrap_or(u64::MAX) +} + +#[async_trait] +impl< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> SchedulerCore for RoundRobinConfig +{ + type StorageClient = SchedulerStorageClientType; + type Sink = DispatchQueueSinkType; + + async fn run( + self, + storage_client: Self::StorageClient, + sink: Self::Sink, + cancellation_token: CancellationToken, + ) -> Result<(), SchedulerError> { + RoundRobin::new( + SessionId::default(), + storage_client, + sink, + cancellation_token, + self, + ) + .run() + .await + } +} + +struct JobEntry { + job_id: JobId, + resource_group_id: ResourceGroupId, + task_ids: VecDeque, +} + +impl JobEntry { + fn new(job_id: JobId, resource_group_id: ResourceGroupId, init_task_id: TaskId) -> Self { + Self { + job_id, + resource_group_id, + task_ids: VecDeque::from([init_task_id]), + } + } + + fn enqueue(&mut self, task_id: TaskId) { + self.task_ids.push_back(task_id); + } + + fn dequeue(&mut self) -> Option { + self.task_ids.pop_front() + } +} + +#[derive(Clone)] +enum ActiveJobQueueEntry { + Ready(JobId), + CommitReady, + CleanupReady, +} + +struct RoundRobin< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> { + storage_client: SchedulerStorageClientType, + sink: DispatchQueueSinkType, + cancellation_token: CancellationToken, + config: RoundRobinConfig, + storage_session_id: SessionId, + ready_set: HashSet<(JobId, TaskId)>, + + active_jobs: HashMap, + active_job_queue: Vec, + active_job_queue_cursor: usize, + + pending_jobs: HashMap, + pending_job_queue: VecDeque, + + commit_ready_queue: VecDeque<(JobId, ResourceGroupId)>, + cleanup_ready_queue: VecDeque<(JobId, ResourceGroupId)>, + + commit_ready_or_cleanup_ready_tasks: HashSet, + + inbound_queue_reader: AsyncInboundQueueReader, +} + +impl< + SchedulerStorageClientType: 
SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> RoundRobin +{ + fn new( + storage_session_id: SessionId, + storage_client: SchedulerStorageClientType, + sink: DispatchQueueSinkType, + cancellation_token: CancellationToken, + config: RoundRobinConfig, + ) -> Self { + let ready_set = HashSet::with_capacity(config.ready_task_capacity); + let active_jobs = HashMap::with_capacity(config.active_job_pool_capacity); + let active_job_queue = Self::new_active_job_queue(config.active_job_pool_capacity); + let active_job_queue_cursor = 0; + let pending_jobs = HashMap::with_capacity(config.active_job_pool_capacity); + let pending_job_queue = VecDeque::with_capacity(config.active_job_pool_capacity); + let commit_ready_queue = VecDeque::with_capacity(config.commit_ready_task_capacity); + let cleanup_ready_queue = VecDeque::with_capacity(config.cleanup_ready_task_capacity); + let commit_ready_or_cleanup_ready_tasks = HashSet::with_capacity( + config.commit_ready_task_capacity + config.cleanup_ready_task_capacity, + ); + let inbound_queue_reader = AsyncInboundQueueReader::new(storage_client.clone()); + Self { + storage_client, + sink, + cancellation_token, + config, + storage_session_id, + ready_set, + active_jobs, + active_job_queue, + active_job_queue_cursor, + pending_jobs, + pending_job_queue, + commit_ready_queue, + cleanup_ready_queue, + commit_ready_or_cleanup_ready_tasks, + inbound_queue_reader, + } + } + + fn new_active_job_queue(active_job_pool_capacity: usize) -> Vec { + let mut active_job_queue = Vec::with_capacity(active_job_pool_capacity + 2); + active_job_queue.push(ActiveJobQueueEntry::CommitReady); + active_job_queue.push(ActiveJobQueueEntry::CleanupReady); + active_job_queue + } + + async fn run(mut self) -> Result<(), SchedulerError> { + loop { + let cancellation_token = self.cancellation_token.clone(); + select! 
{
+                () = cancellation_token.cancelled() => {
+                    return Ok(());
+                }
+                result = self.loop_once() => {
+                    let () = result?;
+                }
+            }
+        }
+    }
+
+    /// Discards every buffered placement decision: all task and job buffers are emptied, the
+    /// active job queue is rebuilt with only its commit-ready and cleanup-ready entries, and the
+    /// round-robin cursor is reset to the front.
+    fn clear_all_placement(&mut self) {
+        self.ready_set.clear();
+        self.active_jobs.clear();
+        self.pending_jobs.clear();
+        self.pending_job_queue.clear();
+        self.commit_ready_queue.clear();
+        self.cleanup_ready_queue.clear();
+        self.commit_ready_or_cleanup_ready_tasks.clear();
+
+        self.active_job_queue = Self::new_active_job_queue(self.config.active_job_pool_capacity);
+        self.active_job_queue_cursor = 0;
+    }
+
+    /// Removes `job_id` from the active job pool and promotes the next pending job, if any, into
+    /// the pool.
+    ///
+    /// Tasks still buffered for the removed job are dropped from `ready_set`. The queue entry is
+    /// removed with `swap_remove`, so the relative order of the remaining active jobs is not
+    /// preserved.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`SchedulerError::Internal`] if `job_id` is missing from the active job queue or
+    /// from the active job map.
+    fn remove_active_job_and_dequeue_next_pending_job(
+        &mut self,
+        job_id: JobId,
+    ) -> Result<(), SchedulerError> {
+        if let Some(index) = self.active_job_queue.iter().position(|entry| match entry {
+            ActiveJobQueueEntry::Ready(id) => *id == job_id,
+            _ => false,
+        }) {
+            self.active_job_queue.swap_remove(index);
+        } else {
+            // `format!` (not `.to_string()`) so that `{job_id:?}` is actually interpolated.
+            return Err(SchedulerError::Internal(format!(
+                "attempt to remove a non-existing active job: {job_id:?}"
+            )));
+        }
+
+        if let Some(entry_to_remove) = self.active_jobs.remove(&job_id) {
+            self.destroy_job_entry(entry_to_remove);
+        } else {
+            return Err(SchedulerError::Internal(format!(
+                "attempt to destroy a non-existing active job: {job_id:?}"
+            )));
+        }
+
+        if let Some(next_pending_job) = self.next_pending_job() {
+            self.active_job_queue
+                .push(ActiveJobQueueEntry::Ready(next_pending_job.job_id));
+            self.active_jobs
+                .insert(next_pending_job.job_id, next_pending_job);
+        }
+        Ok(())
+    }
+
+    /// # Returns
+    ///
+    /// The first job in `pending_job_queue` that still has an entry in `pending_jobs`, removed
+    /// from both, or `None` once the queue is exhausted.
+    fn next_pending_job(&mut self) -> Option<JobEntry> {
+        loop {
+            let job_id = self.pending_job_queue.pop_front()?;
+            // NOTE: The job may have been cancelled and removed from `pending_jobs`, so the ID in
+            // the queue may not necessarily exist in `pending_jobs`.
+ if let Some(pending_job) = self.pending_jobs.remove(&job_id) { + return Some(pending_job); + } + } + } + + fn destroy_job_entry(&mut self, job_entry: JobEntry) { + for task_id in job_entry.task_ids { + self.ready_set.remove(&(job_entry.job_id, task_id)); + } + } + + async fn loop_once(&mut self) -> Result<(), SchedulerError> { + let loop_start = Instant::now(); + let recording = self.config.metrics.is_recording(); + + // Stage 1: Retrieve inbound queue results + let curr_session_id = self.storage_session_id; + let inbound_queue_result = self + .inbound_queue_reader + .poll_ready(curr_session_id) + .await?; + match inbound_queue_result { + InboundQueueResult::Result { + session_id, + ready_entries, + commit_ready_entries, + cleanup_ready_entries, + } => { + let buffer_start = Instant::now(); + let inbound_entry_count = + ready_entries.len() + commit_ready_entries.len() + cleanup_ready_entries.len(); + if session_id < curr_session_id { + return Err(SchedulerError::InvalidSessionId(session_id)); + } + if session_id > curr_session_id { + self.storage_session_id = session_id; + self.clear_all_placement(); + self.sink.bump_session_id(session_id).await?; + } + + // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that + // is already cancelled or commit-ready. 
+ for inbound_entry in commit_ready_entries { + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.commit_ready_or_cleanup_ready_tasks + .insert(inbound_entry.job_id); + self.commit_ready_queue + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.destroy_job_entry(job_entry); + } + } + + for inbound_entry in cleanup_ready_entries { + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.commit_ready_or_cleanup_ready_tasks + .insert(inbound_entry.job_id); + self.cleanup_ready_queue + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.destroy_job_entry(job_entry); + } + } + + for inbound_entry in ready_entries { + if self + .commit_ready_or_cleanup_ready_tasks + .contains(&inbound_entry.job_id) + { + continue; + } + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + if let Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { + active_job.enqueue(inbound_entry.task_id); + continue; + } + if let Some(pending_job) = self.pending_jobs.get_mut(&inbound_entry.job_id) { + pending_job.enqueue(inbound_entry.task_id); + continue; + } + if self.active_jobs.len() < self.config.active_job_pool_capacity { + self.active_jobs.insert( + inbound_entry.job_id, + JobEntry::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.active_job_queue + 
.push(ActiveJobQueueEntry::Ready(inbound_entry.job_id)); + continue; + } + self.pending_jobs.insert( + inbound_entry.job_id, + JobEntry::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.pending_job_queue.push_back(inbound_entry.job_id); + } + + // Only record iterations that actually had entries to enrich, so the average + // reflects real work rather than empty polls when the scheduler is idle. + if recording && inbound_entry_count > 0 { + self.config + .metrics + .buffer_enrich_ns + .fetch_add(elapsed_nanos(buffer_start), Ordering::Relaxed); + self.config + .metrics + .buffer_enrich_count + .fetch_add(1, Ordering::Relaxed); + } + + self.spawn_inbound_queue_reader(); + } + InboundQueueResult::ResultNotReady => {} + InboundQueueResult::HandleNotSpawned => { + self.spawn_inbound_queue_reader(); + } + } + + // Stage 2: Make scheduling decisions to fill the dispatch queue + let dispatch_start = Instant::now(); + let mut dispatch_queue_slots = self + .config + .dispatch_queue_capacity + .saturating_sub(self.sink.size()); + let initial_dispatch_queue_slots = dispatch_queue_slots; + loop { + if dispatch_queue_slots == 0 || self.ready_set.is_empty() { + break; + } + if self.active_job_queue_cursor >= self.active_job_queue.len() { + self.active_job_queue_cursor = 0; + } + let active_job_queue_entry = + match self.active_job_queue.get(self.active_job_queue_cursor) { + Some(entry) => entry.clone(), + None => { + return Err(SchedulerError::Internal( + "active job queue cursor is corrupted".to_string(), + )); + } + }; + self.active_job_queue_cursor += 1; + match active_job_queue_entry { + ActiveJobQueueEntry::CleanupReady => { + let Some((job_id, resource_group_id)) = self.cleanup_ready_queue.pop_front() + else { + continue; + }; + self.sink + .enqueue(TaskAssignment { + job_id, + resource_group_id, + task_id: TaskId::Cleanup, + }) + .await?; + self.ready_set.remove(&(job_id, TaskId::Cleanup)); + 
self.commit_ready_or_cleanup_ready_tasks.remove(&job_id);
+                    dispatch_queue_slots -= 1;
+                }
+                ActiveJobQueueEntry::CommitReady => {
+                    let Some((job_id, resource_group_id)) = self.commit_ready_queue.pop_front()
+                    else {
+                        continue;
+                    };
+                    self.sink
+                        .enqueue(TaskAssignment {
+                            job_id,
+                            resource_group_id,
+                            task_id: TaskId::Commit,
+                        })
+                        .await?;
+                    self.ready_set.remove(&(job_id, TaskId::Commit));
+                    self.commit_ready_or_cleanup_ready_tasks.remove(&job_id);
+                    dispatch_queue_slots -= 1;
+                }
+                ActiveJobQueueEntry::Ready(job_id) => {
+                    // Every `Ready` entry in the active job queue must have a matching entry in
+                    // `active_jobs`; a miss means the two structures have diverged.
+                    let Some(job_entry) = self.active_jobs.get_mut(&job_id) else {
+                        return Err(SchedulerError::Internal(format!(
+                            "active job queue references a non-existing active job: {job_id:?}"
+                        )));
+                    };
+                    if let Some(task_id) = job_entry.dequeue() {
+                        self.sink
+                            .enqueue(TaskAssignment {
+                                job_id,
+                                resource_group_id: job_entry.resource_group_id,
+                                task_id,
+                            })
+                            .await?;
+                        self.ready_set.remove(&(job_id, task_id));
+                        dispatch_queue_slots -= 1;
+                    } else {
+                        // The job has no buffered tasks left: release its slot so that the next
+                        // pending job (if any) can become active.
+                        self.remove_active_job_and_dequeue_next_pending_job(job_id)?;
+                    }
+                }
+            }
+        }
+
+        let dispatched = initial_dispatch_queue_slots - dispatch_queue_slots;
+        if recording && dispatched > 0 {
+            self.config
+                .metrics
+                .dispatch_enrich_ns
+                .fetch_add(elapsed_nanos(dispatch_start), Ordering::Relaxed);
+            self.config
+                .metrics
+                .dispatch_enrich_count
+                .fetch_add(1, Ordering::Relaxed);
+        }
+
+        if recording {
+            self.config
+                .metrics
+                .total_loop_ns
+                .fetch_add(elapsed_nanos(loop_start), Ordering::Relaxed);
+            self.config
+                .metrics
+                .loop_count
+                .fetch_add(1, Ordering::Relaxed);
+        }
+
+        // When the iteration dispatched nothing, the loop is either waiting on an in-flight poll or
+        // back-pressured by a full dispatch queue. In both cases it would otherwise spin without an
+        // await point; because the inbound polls run on tasks this same runtime must schedule, a
+        // non-yielding spin livelocks them and the scheduler never makes progress. Yield to let the
+        // poll tasks and dispatch-queue readers run.
+        if dispatched == 0 {
+            tokio::task::yield_now().await;
+        }
+
+        Ok(())
+    }
+
+    /// Starts the next round of storage polls, capping each request at the headroom left in the
+    /// corresponding scheduler buffer.
+    fn spawn_inbound_queue_reader(&mut self) {
+        let num_commit_ready_tasks = self.commit_ready_queue.len();
+        let num_cleanup_ready_tasks = self.cleanup_ready_queue.len();
+        let max_commit_ready_to_poll = self
+            .config
+            .commit_ready_task_capacity
+            .saturating_sub(num_commit_ready_tasks);
+        let max_cleanup_ready_to_poll = self
+            .config
+            .cleanup_ready_task_capacity
+            .saturating_sub(num_cleanup_ready_tasks);
+        // `ready_set` also tracks the buffered commit-ready and cleanup-ready entries, so they are
+        // subtracted to obtain the number of regular ready tasks. Saturating arithmetic keeps a
+        // broken invariant from panicking (debug) or wrapping around (release), which would
+        // silently pin the poll budget to zero.
+        let num_ready_tasks = self
+            .ready_set
+            .len()
+            .saturating_sub(num_commit_ready_tasks)
+            .saturating_sub(num_cleanup_ready_tasks);
+        let max_ready_to_poll = self
+            .config
+            .ready_task_capacity
+            .saturating_sub(num_ready_tasks);
+        self.inbound_queue_reader.spawn(
+            Duration::from_millis(self.config.storage_polling_wait_time_ms),
+            max_ready_to_poll,
+            max_commit_ready_to_poll,
+            max_cleanup_ready_to_poll,
+        );
+    }
+}
+
+/// The outcome of checking on the in-flight inbound queue polls.
+enum InboundQueueResult {
+    /// All three polls have completed.
+    Result {
+        session_id: SessionId,
+        ready_entries: Vec<InboundEntry>,
+        commit_ready_entries: Vec<InboundEntry>,
+        cleanup_ready_entries: Vec<InboundEntry>,
+    },
+
+    /// At least one of the polls has not finished yet.
+    ResultNotReady,
+
+    /// No poll is currently in flight.
+    HandleNotSpawned,
+}
+
+/// Join handles of the three storage polls (ready, commit-ready, cleanup-ready) that make up one
+/// polling round.
+struct InboundQueuePollingHandle {
+    ready_handle:
+        tokio::task::JoinHandle<Result<(SessionId, Vec<InboundEntry>), StorageClientError>>,
+    commit_ready_handle:
+        tokio::task::JoinHandle<Result<(SessionId, Vec<InboundEntry>), StorageClientError>>,
+    cleanup_ready_handle:
+        tokio::task::JoinHandle<Result<(SessionId, Vec<InboundEntry>), StorageClientError>>,
+}
+
+impl InboundQueuePollingHandle {
+    /// Collects the results of the polling round without waiting on unfinished polls.
+    ///
+    /// # Returns
+    ///
+    /// [`InboundQueueResult::ResultNotReady`] while any of the three polls is still running;
+    /// otherwise the combined result of all three.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a poll task failed to join or the storage client reported an error.
+    async fn poll_ready(
+        &mut self,
+        curr_session_id: SessionId,
+    ) -> Result<InboundQueueResult, SchedulerError> {
+        if !self.ready_handle.is_finished()
+            || !self.commit_ready_handle.is_finished()
+            || !self.cleanup_ready_handle.is_finished()
+        {
+            return Ok(InboundQueueResult::ResultNotReady);
+        }
+
+        // All three tasks have finished, so awaiting the handles resolves immediately.
+        let (ready_session_id, ready_entries) = (&mut self.ready_handle)
+            .await
+            .map_err(|e| SchedulerError::Internal(e.to_string()))??;
+        let (commit_session_id, commit_ready_entries) = (&mut self.commit_ready_handle)
+            .await
+            .map_err(|e| SchedulerError::Internal(e.to_string()))??;
+        let (cleanup_session_id, cleanup_ready_entries) =
+            (&mut self.cleanup_ready_handle)
+                .await
+                .map_err(|e|
SchedulerError::Internal(e.to_string()))??; + + let latest_session_id = curr_session_id + .max(ready_session_id) + .max(commit_session_id) + .max(cleanup_session_id); + + Ok(InboundQueueResult::Result { + session_id: latest_session_id, + ready_entries: Self::drop_if_stale(ready_session_id, latest_session_id, ready_entries), + commit_ready_entries: Self::drop_if_stale( + commit_session_id, + latest_session_id, + commit_ready_entries, + ), + cleanup_ready_entries: Self::drop_if_stale( + cleanup_session_id, + latest_session_id, + cleanup_ready_entries, + ), + }) + } + + fn drop_if_stale( + session_id: SessionId, + latest_session_id: SessionId, + entries: Vec, + ) -> Vec { + if session_id == latest_session_id { + entries + } else { + Vec::new() + } + } +} + +struct AsyncInboundQueueReader { + storage_client: StorageClientType, + handle: Option, +} + +impl + AsyncInboundQueueReader +{ + const fn new(storage_client: StorageClientType) -> Self { + Self { + storage_client, + handle: None, + } + } + + async fn poll_ready( + &mut self, + curr_session_id: SessionId, + ) -> Result { + match &mut self.handle { + None => Ok(InboundQueueResult::HandleNotSpawned), + Some(handle) => { + let inbound_queue_result = handle.poll_ready(curr_session_id).await?; + if !matches!(inbound_queue_result, InboundQueueResult::ResultNotReady) { + self.handle = None; + } + Ok(inbound_queue_result) + } + } + } + + fn spawn( + &mut self, + storage_polling_wait_time: Duration, + max_ready_entries: usize, + max_commit_ready_entries: usize, + max_cleanup_ready_entries: usize, + ) { + let ready_storage_client = self.storage_client.clone(); + let ready_handle = tokio::task::spawn(async move { + ready_storage_client + .poll_ready(max_ready_entries, storage_polling_wait_time) + .await + }); + + let commit_ready_storage_client = self.storage_client.clone(); + let commit_ready_handle = tokio::task::spawn(async move { + commit_ready_storage_client + .poll_commit_ready(max_commit_ready_entries, 
storage_polling_wait_time) + .await + }); + + let cleanup_ready_storage_client = self.storage_client.clone(); + let cleanup_ready_handle = tokio::task::spawn(async move { + cleanup_ready_storage_client + .poll_cleanup_ready(max_cleanup_ready_entries, storage_polling_wait_time) + .await + }); + + self.handle = Some(InboundQueuePollingHandle { + ready_handle, + commit_ready_handle, + cleanup_ready_handle, + }); + } +} diff --git a/components/spider-scheduler/src/error.rs b/components/spider-scheduler/src/error.rs index 6a852c46..bff7571d 100644 --- a/components/spider-scheduler/src/error.rs +++ b/components/spider-scheduler/src/error.rs @@ -28,4 +28,10 @@ pub enum SchedulerError { /// The session ID is invalid. #[error("invalid session ID: {0:?}")] InvalidSessionId(SessionId), + + #[error("internal error: {0}")] + Internal(String), + + #[error("async result not ready")] + ResultNotReady, } diff --git a/components/spider-scheduler/src/lib.rs b/components/spider-scheduler/src/lib.rs index bddd0750..a97580d6 100644 --- a/components/spider-scheduler/src/lib.rs +++ b/components/spider-scheduler/src/lib.rs @@ -32,6 +32,7 @@ //! ``` pub mod core; +pub mod core_impl; pub mod dispatch_queue; pub mod error; pub mod storage_client; From 25045e2cfa062ce6dc4a3a65b0d4c490387818c5 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 3 Jun 2026 13:16:24 -0400 Subject: [PATCH 07/14] Polish. --- .../examples/round_robin_load.rs | 392 ---------------- .../src/core_impl/round_robin.rs | 442 ++++++++---------- 2 files changed, 182 insertions(+), 652 deletions(-) delete mode 100644 components/spider-scheduler/examples/round_robin_load.rs diff --git a/components/spider-scheduler/examples/round_robin_load.rs b/components/spider-scheduler/examples/round_robin_load.rs deleted file mode 100644 index 9d2d9385..00000000 --- a/components/spider-scheduler/examples/round_robin_load.rs +++ /dev/null @@ -1,392 +0,0 @@ -//! Load-test and instrumentation harness for the round-robin scheduler core. 
-//! -//! Topology: -//! -//! ```text -//! submitter ──▶ MockStorage (ready lane) ──poll──▶ RoundRobinCore ──enqueue──▶ dispatch queue ──▶ 64 workers -//! ``` -//! -//! * A mock storage holds 128 jobs of 1000 tasks each, released gradually (one job at a time) to -//! simulate a job-submission cycle rather than making everything ready at `t=0`. -//! * 1% of the tasks are submitted twice (back-to-back) so the scheduler's deduplication can be -//! exercised; workers must still observe every task exactly once. -//! * 64 workers drain the dispatch queue, sleeping 5ms per task to model execution latency. -//! -//! Run with (release recommended so the timings are meaningful): -//! -//! ```bash -//! cargo run -p spider-scheduler --example round_robin_load --release -//! ``` - -use std::{ - sync::{ - Arc, - atomic::{AtomicBool, AtomicUsize, Ordering}, - }, - time::{Duration, Instant}, -}; - -use async_trait::async_trait; -use dashmap::DashSet; -use spider_core::{ - job::JobState, - types::id::{JobId, ResourceGroupId, SessionId, TaskId}, -}; -use spider_scheduler::{ - DispatchQueueSource, - SchedulerCore, - SchedulerStorageClient, - StorageClientError, - core_impl::RoundRobinConfig, - dispatch_queue::{DispatchQueueReader, DispatchQueueWriter, create_dispatch_queue}, - types::InboundEntry, -}; -use tokio_util::sync::CancellationToken; - -// --------------------------------------------------------------------------------------------- -// Workload parameters -// --------------------------------------------------------------------------------------------- - -const NUM_JOBS: usize = 128; -const TASKS_PER_JOB: usize = 1000; -const TOTAL_UNIQUE_TASKS: usize = NUM_JOBS * TASKS_PER_JOB; - -/// Every `DUP_EVERY`-th task within a job is submitted twice, yielding exactly 1% duplicates. 
-const DUP_EVERY: usize = 100; -const EXPECTED_DUPLICATES_SUBMITTED: usize = NUM_JOBS * TASKS_PER_JOB / DUP_EVERY; - -const NUM_WORKERS: usize = 64; -const WORKER_SLEEP: Duration = Duration::from_millis(5); -const WORKER_POLL_WAIT: Duration = Duration::from_millis(10); - -/// Delay between releasing successive jobs into storage (the "submission cycle"). -const JOB_SUBMIT_INTERVAL: Duration = Duration::from_millis(10); - -/// A fixed session: this harness never bumps the session, so storage and the dispatch queue both -/// start (and stay) here. -const SESSION_ID: SessionId = 0; - -// Round-robin scheduler configuration (as requested). -const ACTIVE_JOB_POOL_CAPACITY: usize = 8; -const DISPATCH_QUEUE_CAPACITY: usize = NUM_WORKERS * 4; // 256 -const STORAGE_POLLING_WAIT_TIME_MS: u64 = 10; // dispatch/poll interval -const READY_TASK_CAPACITY: usize = TASKS_PER_JOB * NUM_WORKERS; // 64_000 -const COMMIT_READY_TASK_CAPACITY: usize = 10; -const CLEANUP_READY_TASK_CAPACITY: usize = 10; - -/// Safety net so a scheduling bug that drops a task cannot hang the harness forever. -const OVERALL_TIMEOUT: Duration = Duration::from_mins(2); - -// --------------------------------------------------------------------------------------------- -// Mock storage -// --------------------------------------------------------------------------------------------- - -/// A mock [`SchedulerStorageClient`] whose regular lane is backed by an unbounded channel that the -/// submitter feeds. Commit and cleanup lanes are always empty. -#[derive(Clone)] -struct MockStorage { - inner: Arc, -} - -struct MockStorageInner { - ready_tx: async_channel::Sender, - ready_rx: async_channel::Receiver, -} - -impl MockStorage { - fn new() -> Self { - let (ready_tx, ready_rx) = async_channel::unbounded(); - Self { - inner: Arc::new(MockStorageInner { ready_tx, ready_rx }), - } - } - - /// # Returns - /// - /// A cloned sender for the regular ready lane, used by the submitter task. 
- fn sender(&self) -> async_channel::Sender { - self.inner.ready_tx.clone() - } -} - -#[async_trait] -impl SchedulerStorageClient for MockStorage { - async fn poll_ready( - &self, - max_items: usize, - wait: Duration, - ) -> Result<(SessionId, Vec), StorageClientError> { - if max_items == 0 { - // The scheduler has no buffer headroom; emulate a real blocking poll that yields - // nothing rather than pulling past the requested cap. - tokio::time::sleep(wait).await; - return Ok((SESSION_ID, Vec::new())); - } - - let mut out = Vec::new(); - // Block up to `wait` for the first entry, mirroring a real long-poll. - match tokio::time::timeout(wait, self.inner.ready_rx.recv()).await { - Ok(Ok(entry)) => out.push(entry), - // Channel closed (never happens here, sender is held by storage) or timed out: return - // whatever we have (possibly nothing). - Ok(Err(_)) | Err(_) => return Ok((SESSION_ID, out)), - } - // Drain the rest without blocking, up to `max_items`. - while out.len() < max_items { - match self.inner.ready_rx.try_recv() { - Ok(entry) => out.push(entry), - Err(_) => break, - } - } - Ok((SESSION_ID, out)) - } - - async fn poll_commit_ready( - &self, - _max_items: usize, - wait: Duration, - ) -> Result<(SessionId, Vec), StorageClientError> { - tokio::time::sleep(wait).await; - Ok((SESSION_ID, Vec::new())) - } - - async fn poll_cleanup_ready( - &self, - _max_items: usize, - wait: Duration, - ) -> Result<(SessionId, Vec), StorageClientError> { - tokio::time::sleep(wait).await; - Ok((SESSION_ID, Vec::new())) - } - - async fn job_state(&self, _job_id: JobId) -> Result { - Ok(JobState::Running) - } -} - -// --------------------------------------------------------------------------------------------- -// Submitter & workers -// --------------------------------------------------------------------------------------------- - -/// Releases each job's tasks into storage one job at a time, duplicating every `DUP_EVERY`-th task -/// back-to-back so the duplicate lands in the 
same poll batch as its original. -async fn submit_jobs(jobs: Vec<(JobId, ResourceGroupId)>, tx: async_channel::Sender) { - for (job_id, resource_group_id) in jobs { - for i in 0..TASKS_PER_JOB { - let entry = InboundEntry { - resource_group_id, - job_id, - task_id: TaskId::Index(i), - }; - tx.send(entry).await.expect("ready lane closed"); - if i % DUP_EVERY == 0 { - tx.send(entry).await.expect("ready lane closed"); - } - } - tokio::time::sleep(JOB_SUBMIT_INTERVAL).await; - } -} - -/// Shared bookkeeping for the "each task is polled exactly once" check. -struct WorkerStats { - seen: DashSet<(JobId, TaskId)>, - total_received: AtomicUsize, - duplicate_received: AtomicUsize, -} - -/// A single worker: drain the dispatch queue, record each assignment, then sleep to model work. -async fn worker(reader: DispatchQueueReader, stats: Arc, done: Arc) { - loop { - if done.load(Ordering::Relaxed) { - break; - } - match reader.dequeue(WORKER_POLL_WAIT).await { - Ok(Some((_session, assignment))) => { - stats.total_received.fetch_add(1, Ordering::Relaxed); - if !stats.seen.insert((assignment.job_id, assignment.task_id)) { - stats.duplicate_received.fetch_add(1, Ordering::Relaxed); - } - tokio::time::sleep(WORKER_SLEEP).await; - } - Ok(None) => {} - // Dispatch queue closed (scheduler dropped its writer): nothing more will arrive. 
- Err(_) => break, - } - } -} - -// --------------------------------------------------------------------------------------------- -// Harness -// --------------------------------------------------------------------------------------------- - -#[tokio::main(flavor = "multi_thread")] -async fn main() { - let storage = MockStorage::new(); - let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, SESSION_ID); - - let config = RoundRobinConfig::::new( - ACTIVE_JOB_POOL_CAPACITY, - DISPATCH_QUEUE_CAPACITY, - READY_TASK_CAPACITY, - COMMIT_READY_TASK_CAPACITY, - CLEANUP_READY_TASK_CAPACITY, - STORAGE_POLLING_WAIT_TIME_MS, - ); - let metrics = config.metrics(); - - let jobs: Vec<(JobId, ResourceGroupId)> = (0..NUM_JOBS) - .map(|_| (JobId::new(), ResourceGroupId::new())) - .collect(); - - // Scheduler. - let scheduler_token = CancellationToken::new(); - let scheduler_handle = { - let token = scheduler_token.clone(); - let storage = storage.clone(); - tokio::spawn(async move { config.run(storage, writer, token).await }) - }; - - // Workers. - let stats = Arc::new(WorkerStats { - seen: DashSet::with_capacity(TOTAL_UNIQUE_TASKS), - total_received: AtomicUsize::new(0), - duplicate_received: AtomicUsize::new(0), - }); - let done = Arc::new(AtomicBool::new(false)); - let worker_handles: Vec<_> = (0..NUM_WORKERS) - .map(|_| tokio::spawn(worker(reader.clone(), stats.clone(), done.clone()))) - .collect(); - drop(reader); - - // Submitter. - let submit_handle = tokio::spawn(submit_jobs(jobs, storage.sender())); - - // Drive to completion: every unique task delivered, or the safety timeout. Poll tightly so the - // metrics are frozen as soon as the last task arrives, keeping the idle tail out of the averages. 
- let start = Instant::now(); - let mut timed_out = false; - loop { - if stats.seen.len() >= TOTAL_UNIQUE_TASKS { - break; - } - if start.elapsed() > OVERALL_TIMEOUT { - timed_out = true; - break; - } - tokio::time::sleep(Duration::from_millis(1)).await; - } - // Stop timing now that every task has arrived; the scheduler only spins on empty polls past - // this point and must not pollute the stage 1 & 2 averages. - metrics.stop(); - let wall = start.elapsed(); - - // Tear down. - done.store(true, Ordering::Relaxed); - scheduler_token.cancel(); - submit_handle.abort(); - for handle in worker_handles { - let _ = handle.await; - } - match scheduler_handle.await { - Ok(Ok(())) => {} - Ok(Err(e)) => eprintln!("scheduler returned an error: {e:?}"), - Err(e) => eprintln!("scheduler task panicked: {e:?}"), - } - let _ = submit_handle.await; - - report(&metrics, &stats, wall, timed_out); -} - -/// Prints the collected timing and correctness results. -fn report( - metrics: &spider_scheduler::core_impl::RoundRobinMetrics, - stats: &WorkerStats, - wall: Duration, - timed_out: bool, -) { - let loop_count = metrics.loop_count.load(Ordering::Relaxed); - let total_loop_ns = metrics.total_loop_ns.load(Ordering::Relaxed); - let buffer_count = metrics.buffer_enrich_count.load(Ordering::Relaxed); - let buffer_ns = metrics.buffer_enrich_ns.load(Ordering::Relaxed); - let dispatch_count = metrics.dispatch_enrich_count.load(Ordering::Relaxed); - let dispatch_ns = metrics.dispatch_enrich_ns.load(Ordering::Relaxed); - - let total = stats.total_received.load(Ordering::Relaxed); - let duplicates = stats.duplicate_received.load(Ordering::Relaxed); - let unique = stats.seen.len(); - - println!("\n================ Round-robin scheduler load test ================"); - println!( - "Wall-clock runtime: {:.3} s", - wall.as_secs_f64() - ); - if timed_out { - println!("!! 
TIMED OUT before all unique tasks were delivered !!"); - } - - println!("\n---- Workload ----"); - println!("Jobs: {NUM_JOBS}"); - println!("Tasks per job: {TASKS_PER_JOB}"); - println!("Unique tasks (expected): {TOTAL_UNIQUE_TASKS}"); - println!("Duplicate task entries submitted: {EXPECTED_DUPLICATES_SUBMITTED}"); - println!("Workers: {NUM_WORKERS}"); - - println!("\n---- Scheduling-loop timing ----"); - println!("Scheduling-loop iterations: {loop_count}"); - println!( - "Avg time per scheduling loop: {:>9.3} us", - avg_us(total_loop_ns, loop_count) - ); - println!( - "Avg buffer-enrich time (stage 1): {:>9.3} us (over {buffer_count} iterations that \ - polled a non-empty result)", - avg_us(buffer_ns, buffer_count) - ); - println!( - "Avg dispatch-enrich time (stage 2): {:>9.3} us (over {dispatch_count} iterations that \ - dispatched >=1 task)", - avg_us(dispatch_ns, dispatch_count) - ); - let idle_loops = loop_count.saturating_sub(dispatch_count); - println!( - "No-dispatch loop iterations: {idle_loops} ({:.1}% of all iterations)", - percent(idle_loops, loop_count) - ); - - println!("\n---- Correctness: each task polled exactly once ----"); - println!("Total assignments received: {total}"); - println!("Unique (job, task) pairs received: {unique}"); - println!("Duplicate deliveries observed: {duplicates}"); - - let exactly_once = !timed_out - && duplicates == 0 - && unique == TOTAL_UNIQUE_TASKS - && total == TOTAL_UNIQUE_TASKS; - println!( - "\nRESULT: each task polled exactly once -> {}", - if exactly_once { "PASS" } else { "FAIL" } - ); - println!("=================================================================\n"); -} - -/// # Returns -/// -/// `ns / count` converted to microseconds, or `0.0` when `count` is zero. -fn avg_us(ns: u64, count: u64) -> f64 { - if count == 0 { - 0.0 - } else { - ns as f64 / count as f64 / 1_000.0 - } -} - -/// # Returns -/// -/// `part` as a percentage of `whole`, or `0.0` when `whole` is zero. 
-fn percent(part: u64, whole: u64) -> f64 { - if whole == 0 { - 0.0 - } else { - part as f64 / whole as f64 * 100.0 - } -} diff --git a/components/spider-scheduler/src/core_impl/round_robin.rs b/components/spider-scheduler/src/core_impl/round_robin.rs index f32c36ca..8708a538 100644 --- a/components/spider-scheduler/src/core_impl/round_robin.rs +++ b/components/spider-scheduler/src/core_impl/round_robin.rs @@ -1,17 +1,47 @@ +//! Round-robin scheduler. +//! +//! This scheduler provides basic fairness across jobs using a round-robin scheduling policy. It +//! polls tasks from the inbound queue (maintained by the storage service) and organizes jobs into +//! two sets: +//! +//! * Active jobs: jobs that participate in round-robin scheduling. +//! * Pending jobs: jobs that are buffered but not yet scheduled. When an active job has no +//! remaining schedulable tasks, it is replaced by the next pending job in FIFO order. +//! +//! The scheduler operates in discrete ticks. During each tick, it attempts to consume the results +//! of an asynchronous inbound-queue polling operation and loads any newly available tasks into its +//! internal buffers. It then makes scheduling decisions until the dispatch queue reaches capacity. +//! +//! # Properties +//! +//! * Each round-robin cycle may schedule at most one additional commit task and one additional +//! cleanup task, if available. +//! * All buffered tasks are unique. Tasks loaded from the inbound queue are deduplicated before +//! entering the scheduler's internal buffers. +//! +//! # Configuration +//! +//! * `active_job_pool_capacity`: Maximum number of active jobs maintained by the scheduler. +//! * `dispatch_queue_capacity`: Maximum number of task assignments in the dispatch queue. +//! * `ready_task_capacity`: Maximum number of ready tasks buffered by the scheduler. +//! * `commit_ready_task_capacity`: Maximum number of buffered commit-ready tasks. +//! 
* `cleanup_ready_task_capacity`: Maximum number of buffered cleanup-ready tasks. +//! * `storage_polling_wait_time_ms`: Maximum time, in milliseconds, that inbound-queue polling may +//! block on the storage-service side. +//! * `tick_interval_ms`: Interval, in milliseconds, between scheduler ticks (tick execution time +//! included). + use std::{ collections::{HashMap, HashSet, VecDeque}, - sync::{ - Arc, - atomic::{AtomicBool, AtomicU64, Ordering}, - }, - time::{Duration, Instant}, + time::Duration, }; use async_trait::async_trait; +use serde::Deserialize; use spider_core::types::id::{JobId, ResourceGroupId, SessionId, TaskId}; use tokio::select; use tokio_util::sync::CancellationToken; -use serde::Deserialize; + use crate::{ DispatchQueueSink, InboundEntry, @@ -43,113 +73,25 @@ pub struct RoundRobinConfig< /// The capacity of the total pending cleanup-ready tasks buffered in the scheduler. pub cleanup_ready_task_capacity: usize, + /// The maximum time (in milliseconds) that the scheduler will wait for the storage server to + /// fill the inbound-queue reading request. pub storage_polling_wait_time_ms: u64, - #[serde(skip)] - metrics: Arc, + /// The time (in milliseconds) that the scheduler will spend on each tick. + pub tick_interval_ms: u64, #[serde(skip)] _marker: std::marker::PhantomData<(SchedulerStorageClientType, DispatchQueueSinkType)>, } -/// Instrumentation counters for the round-robin scheduling loop. -/// -/// Durations are accumulated in nanoseconds; an average is a `*_ns` total divided by its matching -/// `*_count`. All counters use [`Ordering::Relaxed`] and are meant for coarse profiling only, not -/// for establishing happens-before relationships. -#[derive(Debug, Default)] -pub struct RoundRobinMetrics { - /// Number of completed scheduling-loop iterations (`loop_once` calls). - pub loop_count: AtomicU64, - - /// Total wall-clock time spent across all scheduling-loop iterations. 
- pub total_loop_ns: AtomicU64, - - /// Number of iterations that processed a fresh inbound polling result. - pub buffer_enrich_count: AtomicU64, - - /// Total time spent draining inbound polling results into the scheduler's buffers ("enrich the - /// buffer", stage 1). - pub buffer_enrich_ns: AtomicU64, - - /// Number of iterations that dispatched at least one assignment. - pub dispatch_enrich_count: AtomicU64, - - /// Total time spent making scheduling decisions and filling the dispatch queue ("enrich the - /// dispatch queue", stage 2). - pub dispatch_enrich_ns: AtomicU64, - - /// When set, the scheduling loop stops accumulating any of the counters above. Used to exclude - /// the idle tail (after all work has drained) from the averages. - stopped: AtomicBool, -} - -impl RoundRobinMetrics { - /// Freezes all counters: subsequent scheduling-loop iterations are not recorded. - pub fn stop(&self) { - self.stopped.store(true, Ordering::Relaxed); - } - - /// # Returns - /// - /// Whether the counters are still being recorded. - fn is_recording(&self) -> bool { - !self.stopped.load(Ordering::Relaxed) - } -} - -impl< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> RoundRobinConfig -{ - /// Creates a new round-robin configuration with a fresh, empty set of metrics. 
- #[must_use] - pub fn new( - active_job_pool_capacity: usize, - dispatch_queue_capacity: usize, - ready_task_capacity: usize, - commit_ready_task_capacity: usize, - cleanup_ready_task_capacity: usize, - storage_polling_wait_time_ms: u64, - ) -> Self { - Self { - active_job_pool_capacity, - dispatch_queue_capacity, - ready_task_capacity, - commit_ready_task_capacity, - cleanup_ready_task_capacity, - storage_polling_wait_time_ms, - metrics: Arc::new(RoundRobinMetrics::default()), - _marker: std::marker::PhantomData, - } - } - - /// # Returns - /// - /// A shared handle to the loop instrumentation counters, so callers can read them while (or - /// after) the scheduler runs. - #[must_use] - pub fn metrics(&self) -> Arc { - Arc::clone(&self.metrics) - } -} - -/// # Returns -/// -/// The time elapsed since `start` in nanoseconds, saturating at [`u64::MAX`]. -fn elapsed_nanos(start: Instant) -> u64 { - u64::try_from(start.elapsed().as_nanos()).unwrap_or(u64::MAX) -} - #[async_trait] impl< SchedulerStorageClientType: SchedulerStorageClient + 'static, DispatchQueueSinkType: DispatchQueueSink, > SchedulerCore for RoundRobinConfig { - type StorageClient = SchedulerStorageClientType; type Sink = DispatchQueueSinkType; + type StorageClient = SchedulerStorageClientType; async fn run( self, @@ -277,16 +219,25 @@ impl< } async fn run(mut self) -> Result<(), SchedulerError> { + let tick_interval = Duration::from_millis(self.config.tick_interval_ms); loop { + let now = tokio::time::Instant::now(); let cancellation_token = self.cancellation_token.clone(); select! 
{ () = cancellation_token.cancelled() => { return Ok(()); } - result = self.loop_once() => { + result = self.tick() => { let () = result?; } } + let elapsed = now.elapsed(); + let sleep_time = tick_interval.saturating_sub(elapsed); + if !sleep_time.is_zero() { + tokio::time::sleep(sleep_time).await; + } else { + tokio::task::yield_now().await; + } } } @@ -352,11 +303,124 @@ impl< } } - async fn loop_once(&mut self) -> Result<(), SchedulerError> { - let loop_start = Instant::now(); - let recording = self.config.metrics.is_recording(); + async fn tick(&mut self) -> Result<(), SchedulerError> { + self.poll_inbound_queue_result().await?; + self.make_schedule_decision().await?; + Ok(()) + } + + async fn load_inbound_queue_result( + &mut self, + curr_session_id: SessionId, + storage_session_id: SessionId, + ready_entries: Vec, + commit_ready_entries: Vec, + cleanup_ready_entries: Vec, + ) -> Result<(), SchedulerError> { + if storage_session_id < curr_session_id { + return Err(SchedulerError::InvalidSessionId(storage_session_id)); + } + if storage_session_id > curr_session_id { + self.storage_session_id = storage_session_id; + self.clear_all_placement(); + self.sink.bump_session_id(storage_session_id).await?; + } + + // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that + // is already cancelled or commit-ready. 
+ for inbound_entry in commit_ready_entries { + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.commit_ready_or_cleanup_ready_tasks + .insert(inbound_entry.job_id); + self.commit_ready_queue + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.destroy_job_entry(job_entry); + } + } + + for inbound_entry in cleanup_ready_entries { + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.commit_ready_or_cleanup_ready_tasks + .insert(inbound_entry.job_id); + self.cleanup_ready_queue + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.destroy_job_entry(job_entry); + } + } + + for inbound_entry in ready_entries { + if self + .commit_ready_or_cleanup_ready_tasks + .contains(&inbound_entry.job_id) + { + continue; + } + if !self + .ready_set + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + if let Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { + active_job.enqueue(inbound_entry.task_id); + continue; + } + if let Some(pending_job) = self.pending_jobs.get_mut(&inbound_entry.job_id) { + pending_job.enqueue(inbound_entry.task_id); + continue; + } + if self.active_jobs.len() < self.config.active_job_pool_capacity { + self.active_jobs.insert( + inbound_entry.job_id, + JobEntry::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.active_job_queue + 
.push(ActiveJobQueueEntry::Ready(inbound_entry.job_id)); + continue; + } + self.pending_jobs.insert( + inbound_entry.job_id, + JobEntry::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.pending_job_queue.push_back(inbound_entry.job_id); + } + + Ok(()) + } - // Stage 1: Retrieve inbound queue results + async fn poll_inbound_queue_result(&mut self) -> Result<(), SchedulerError> { let curr_session_id = self.storage_session_id; let inbound_queue_result = self .inbound_queue_reader @@ -364,127 +428,19 @@ impl< .await?; match inbound_queue_result { InboundQueueResult::Result { - session_id, + session_id: storage_session_id, ready_entries, commit_ready_entries, cleanup_ready_entries, } => { - let buffer_start = Instant::now(); - let inbound_entry_count = - ready_entries.len() + commit_ready_entries.len() + cleanup_ready_entries.len(); - if session_id < curr_session_id { - return Err(SchedulerError::InvalidSessionId(session_id)); - } - if session_id > curr_session_id { - self.storage_session_id = session_id; - self.clear_all_placement(); - self.sink.bump_session_id(session_id).await?; - } - - // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that - // is already cancelled or commit-ready. 
- for inbound_entry in commit_ready_entries { - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - self.commit_ready_or_cleanup_ready_tasks - .insert(inbound_entry.job_id); - self.commit_ready_queue - .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); - - if self.active_jobs.contains_key(&inbound_entry.job_id) { - self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; - continue; - } - - if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { - self.destroy_job_entry(job_entry); - } - } - - for inbound_entry in cleanup_ready_entries { - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - self.commit_ready_or_cleanup_ready_tasks - .insert(inbound_entry.job_id); - self.cleanup_ready_queue - .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); - - if self.active_jobs.contains_key(&inbound_entry.job_id) { - self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; - continue; - } - - if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { - self.destroy_job_entry(job_entry); - } - } - - for inbound_entry in ready_entries { - if self - .commit_ready_or_cleanup_ready_tasks - .contains(&inbound_entry.job_id) - { - continue; - } - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - if let Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { - active_job.enqueue(inbound_entry.task_id); - continue; - } - if let Some(pending_job) = self.pending_jobs.get_mut(&inbound_entry.job_id) { - pending_job.enqueue(inbound_entry.task_id); - continue; - } - if self.active_jobs.len() < self.config.active_job_pool_capacity { - self.active_jobs.insert( - inbound_entry.job_id, - JobEntry::new( - inbound_entry.job_id, - inbound_entry.resource_group_id, - inbound_entry.task_id, - ), - ); - self.active_job_queue - 
.push(ActiveJobQueueEntry::Ready(inbound_entry.job_id)); - continue; - } - self.pending_jobs.insert( - inbound_entry.job_id, - JobEntry::new( - inbound_entry.job_id, - inbound_entry.resource_group_id, - inbound_entry.task_id, - ), - ); - self.pending_job_queue.push_back(inbound_entry.job_id); - } - - // Only record iterations that actually had entries to enrich, so the average - // reflects real work rather than empty polls when the scheduler is idle. - if recording && inbound_entry_count > 0 { - self.config - .metrics - .buffer_enrich_ns - .fetch_add(elapsed_nanos(buffer_start), Ordering::Relaxed); - self.config - .metrics - .buffer_enrich_count - .fetch_add(1, Ordering::Relaxed); - } - + self.load_inbound_queue_result( + curr_session_id, + storage_session_id, + ready_entries, + commit_ready_entries, + cleanup_ready_entries, + ) + .await?; self.spawn_inbound_queue_reader(); } InboundQueueResult::ResultNotReady => {} @@ -493,17 +449,15 @@ impl< } } - // Stage 2: Make scheduling decisions to fill the dispatch queue - let dispatch_start = Instant::now(); + Ok(()) + } + + async fn make_schedule_decision(&mut self) -> Result<(), SchedulerError> { let mut dispatch_queue_slots = self .config .dispatch_queue_capacity .saturating_sub(self.sink.size()); - let initial_dispatch_queue_slots = dispatch_queue_slots; - loop { - if dispatch_queue_slots == 0 || self.ready_set.is_empty() { - break; - } + while dispatch_queue_slots > 0 && !self.ready_set.is_empty() { if self.active_job_queue_cursor >= self.active_job_queue.len() { self.active_job_queue_cursor = 0; } @@ -573,38 +527,6 @@ impl< } } - let dispatched = initial_dispatch_queue_slots - dispatch_queue_slots; - if recording && dispatched > 0 { - self.config - .metrics - .dispatch_enrich_ns - .fetch_add(elapsed_nanos(dispatch_start), Ordering::Relaxed); - self.config - .metrics - .dispatch_enrich_count - .fetch_add(1, Ordering::Relaxed); - } - - if recording { - self.config - .metrics - .total_loop_ns - 
.fetch_add(elapsed_nanos(loop_start), Ordering::Relaxed); - self.config - .metrics - .loop_count - .fetch_add(1, Ordering::Relaxed); - } - - // When the iteration dispatched nothing, the loop is either waiting on an in-flight poll or - // back-pressured by a full dispatch queue. In both cases it would otherwise spin without an - // await point; because the inbound polls run on tasks this same runtime must schedule, a - // non-yielding spin livelocks them and the scheduler never makes progress. Yield to let the - // poll tasks and dispatch-queue readers run. - if dispatched == 0 { - tokio::task::yield_now().await; - } - Ok(()) } From 8b722d777c11d65d4e41f0c16d601366b49f5dcc Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 15:37:09 -0400 Subject: [PATCH 08/14] Core implementation done. --- components/spider-scheduler/src/core.rs | 6 +- .../src/core_impl/round_robin.rs | 698 ------------- .../core_impl/round_robin/implementation.rs | 919 ++++++++++++++++++ .../src/core_impl/round_robin/mod.rs | 39 + .../src/core_impl/round_robin/tests.rs | 1 + components/spider-scheduler/src/error.rs | 3 + 6 files changed, 965 insertions(+), 701 deletions(-) delete mode 100644 components/spider-scheduler/src/core_impl/round_robin.rs create mode 100644 components/spider-scheduler/src/core_impl/round_robin/implementation.rs create mode 100644 components/spider-scheduler/src/core_impl/round_robin/mod.rs create mode 100644 components/spider-scheduler/src/core_impl/round_robin/tests.rs diff --git a/components/spider-scheduler/src/core.rs b/components/spider-scheduler/src/core.rs index f6715341..ebc5c143 100644 --- a/components/spider-scheduler/src/core.rs +++ b/components/spider-scheduler/src/core.rs @@ -16,12 +16,12 @@ use crate::{ /// share the same runtime entry point. #[async_trait] pub trait SchedulerCore: Send { - /// The storage client used by the core to poll and read for placement decisions. 
- type StorageClient: SchedulerStorageClient; - /// The dispatch sink the core writes assignments to. type Sink: DispatchQueueSink; + /// The storage client used by the core to poll and read for placement decisions. + type StorageClient: SchedulerStorageClient; + /// Runs the scheduling loop until `cancellation_token` is triggered. /// /// The core polls the inbound queue through `storage_client`, applies its scheduling algorithm, diff --git a/components/spider-scheduler/src/core_impl/round_robin.rs b/components/spider-scheduler/src/core_impl/round_robin.rs deleted file mode 100644 index 8708a538..00000000 --- a/components/spider-scheduler/src/core_impl/round_robin.rs +++ /dev/null @@ -1,698 +0,0 @@ -//! Round-robin scheduler. -//! -//! This scheduler provides basic fairness across jobs using a round-robin scheduling policy. It -//! polls tasks from the inbound queue (maintained by the storage service) and organizes jobs into -//! two sets: -//! -//! * Active jobs: jobs that participate in round-robin scheduling. -//! * Pending jobs: jobs that are buffered but not yet scheduled. When an active job has no -//! remaining schedulable tasks, it is replaced by the next pending job in FIFO order. -//! -//! The scheduler operates in discrete ticks. During each tick, it attempts to consume the results -//! of an asynchronous inbound-queue polling operation and loads any newly available tasks into its -//! internal buffers. It then makes scheduling decisions until the dispatch queue reaches capacity. -//! -//! # Properties -//! -//! * Each round-robin cycle may schedule at most one additional commit task and one additional -//! cleanup task, if available. -//! * All buffered tasks are unique. Tasks loaded from the inbound queue are deduplicated before -//! entering the scheduler's internal buffers. -//! -//! # Configuration -//! -//! * `active_job_pool_capacity`: Maximum number of active jobs maintained by the scheduler. -//! 
* `dispatch_queue_capacity`: Maximum number of task assignments in the dispatch queue. -//! * `ready_task_capacity`: Maximum number of ready tasks buffered by the scheduler. -//! * `commit_ready_task_capacity`: Maximum number of buffered commit-ready tasks. -//! * `cleanup_ready_task_capacity`: Maximum number of buffered cleanup-ready tasks. -//! * `storage_polling_wait_time_ms`: Maximum time, in milliseconds, that inbound-queue polling may -//! block on the storage-service side. -//! * `tick_interval_ms`: Interval, in milliseconds, between scheduler ticks (tick execution time -//! included). - -use std::{ - collections::{HashMap, HashSet, VecDeque}, - time::Duration, -}; - -use async_trait::async_trait; -use serde::Deserialize; -use spider_core::types::id::{JobId, ResourceGroupId, SessionId, TaskId}; -use tokio::select; -use tokio_util::sync::CancellationToken; - -use crate::{ - DispatchQueueSink, - InboundEntry, - SchedulerCore, - SchedulerError, - SchedulerStorageClient, - StorageClientError, - TaskAssignment, -}; - -#[derive(Deserialize)] -pub struct RoundRobinConfig< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> { - /// The capacity of the active jobs pool. The scheduler will make task assignments from these - /// jobs in a round-robin manner. - pub active_job_pool_capacity: usize, - - /// The capacity of the dispatch queue. - pub dispatch_queue_capacity: usize, - - /// The capacity of the total pending ready tasks buffered in the scheduler. - pub ready_task_capacity: usize, - - /// The capacity of the total pending commit-ready tasks buffered in the scheduler. - pub commit_ready_task_capacity: usize, - - /// The capacity of the total pending cleanup-ready tasks buffered in the scheduler. - pub cleanup_ready_task_capacity: usize, - - /// The maximum time (in milliseconds) that the scheduler will wait for the storage server to - /// fill the inbound-queue reading request. 
- pub storage_polling_wait_time_ms: u64, - - /// The time (in milliseconds) that the scheduler will spend on each tick. - pub tick_interval_ms: u64, - - #[serde(skip)] - _marker: std::marker::PhantomData<(SchedulerStorageClientType, DispatchQueueSinkType)>, -} - -#[async_trait] -impl< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> SchedulerCore for RoundRobinConfig -{ - type Sink = DispatchQueueSinkType; - type StorageClient = SchedulerStorageClientType; - - async fn run( - self, - storage_client: Self::StorageClient, - sink: Self::Sink, - cancellation_token: CancellationToken, - ) -> Result<(), SchedulerError> { - RoundRobin::new( - SessionId::default(), - storage_client, - sink, - cancellation_token, - self, - ) - .run() - .await - } -} - -struct JobEntry { - job_id: JobId, - resource_group_id: ResourceGroupId, - task_ids: VecDeque, -} - -impl JobEntry { - fn new(job_id: JobId, resource_group_id: ResourceGroupId, init_task_id: TaskId) -> Self { - Self { - job_id, - resource_group_id, - task_ids: VecDeque::from([init_task_id]), - } - } - - fn enqueue(&mut self, task_id: TaskId) { - self.task_ids.push_back(task_id); - } - - fn dequeue(&mut self) -> Option { - self.task_ids.pop_front() - } -} - -#[derive(Clone)] -enum ActiveJobQueueEntry { - Ready(JobId), - CommitReady, - CleanupReady, -} - -struct RoundRobin< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> { - storage_client: SchedulerStorageClientType, - sink: DispatchQueueSinkType, - cancellation_token: CancellationToken, - config: RoundRobinConfig, - storage_session_id: SessionId, - ready_set: HashSet<(JobId, TaskId)>, - - active_jobs: HashMap, - active_job_queue: Vec, - active_job_queue_cursor: usize, - - pending_jobs: HashMap, - pending_job_queue: VecDeque, - - commit_ready_queue: VecDeque<(JobId, ResourceGroupId)>, - cleanup_ready_queue: VecDeque<(JobId, ResourceGroupId)>, - - 
commit_ready_or_cleanup_ready_tasks: HashSet, - - inbound_queue_reader: AsyncInboundQueueReader, -} - -impl< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> RoundRobin -{ - fn new( - storage_session_id: SessionId, - storage_client: SchedulerStorageClientType, - sink: DispatchQueueSinkType, - cancellation_token: CancellationToken, - config: RoundRobinConfig, - ) -> Self { - let ready_set = HashSet::with_capacity(config.ready_task_capacity); - let active_jobs = HashMap::with_capacity(config.active_job_pool_capacity); - let active_job_queue = Self::new_active_job_queue(config.active_job_pool_capacity); - let active_job_queue_cursor = 0; - let pending_jobs = HashMap::with_capacity(config.active_job_pool_capacity); - let pending_job_queue = VecDeque::with_capacity(config.active_job_pool_capacity); - let commit_ready_queue = VecDeque::with_capacity(config.commit_ready_task_capacity); - let cleanup_ready_queue = VecDeque::with_capacity(config.cleanup_ready_task_capacity); - let commit_ready_or_cleanup_ready_tasks = HashSet::with_capacity( - config.commit_ready_task_capacity + config.cleanup_ready_task_capacity, - ); - let inbound_queue_reader = AsyncInboundQueueReader::new(storage_client.clone()); - Self { - storage_client, - sink, - cancellation_token, - config, - storage_session_id, - ready_set, - active_jobs, - active_job_queue, - active_job_queue_cursor, - pending_jobs, - pending_job_queue, - commit_ready_queue, - cleanup_ready_queue, - commit_ready_or_cleanup_ready_tasks, - inbound_queue_reader, - } - } - - fn new_active_job_queue(active_job_pool_capacity: usize) -> Vec { - let mut active_job_queue = Vec::with_capacity(active_job_pool_capacity + 2); - active_job_queue.push(ActiveJobQueueEntry::CommitReady); - active_job_queue.push(ActiveJobQueueEntry::CleanupReady); - active_job_queue - } - - async fn run(mut self) -> Result<(), SchedulerError> { - let tick_interval = 
Duration::from_millis(self.config.tick_interval_ms); - loop { - let now = tokio::time::Instant::now(); - let cancellation_token = self.cancellation_token.clone(); - select! { - () = cancellation_token.cancelled() => { - return Ok(()); - } - result = self.tick() => { - let () = result?; - } - } - let elapsed = now.elapsed(); - let sleep_time = tick_interval.saturating_sub(elapsed); - if !sleep_time.is_zero() { - tokio::time::sleep(sleep_time).await; - } else { - tokio::task::yield_now().await; - } - } - } - - fn clear_all_placement(&mut self) { - self.ready_set.clear(); - self.active_jobs.clear(); - self.pending_jobs.clear(); - self.pending_job_queue.clear(); - self.commit_ready_queue.clear(); - self.cleanup_ready_queue.clear(); - self.commit_ready_or_cleanup_ready_tasks.clear(); - - self.active_job_queue = Self::new_active_job_queue(self.config.active_job_pool_capacity); - self.active_job_queue_cursor = 0; - } - - fn remove_active_job_and_dequeue_next_pending_job( - &mut self, - job_id: JobId, - ) -> Result<(), SchedulerError> { - if let Some(index) = self.active_job_queue.iter().position(|entry| match entry { - ActiveJobQueueEntry::Ready(id) => *id == job_id, - _ => false, - }) { - self.active_job_queue.swap_remove(index); - } else { - return Err(SchedulerError::Internal( - "attempt to remove a non-existing active job: {job_id:?}".to_string(), - )); - } - - if let Some(entry_to_remove) = self.active_jobs.remove(&job_id) { - self.destroy_job_entry(entry_to_remove); - } else { - return Err(SchedulerError::Internal( - "attempt to destroy a non-existing active job: {job_id:?}".to_string(), - )); - } - - if let Some(next_pending_job) = self.next_pending_job() { - self.active_job_queue - .push(ActiveJobQueueEntry::Ready(next_pending_job.job_id)); - self.active_jobs - .insert(next_pending_job.job_id, next_pending_job); - } - Ok(()) - } - - fn next_pending_job(&mut self) -> Option { - loop { - let job_id = self.pending_job_queue.pop_front()?; - // NOTE: The job may have 
been cancelled and removed from `pending_jobs`, so the ID in - // the queue may not necessarily exist in `pending_jobs`. - if let Some(pending_job) = self.pending_jobs.remove(&job_id) { - return Some(pending_job); - } - } - } - - fn destroy_job_entry(&mut self, job_entry: JobEntry) { - for task_id in job_entry.task_ids { - self.ready_set.remove(&(job_entry.job_id, task_id)); - } - } - - async fn tick(&mut self) -> Result<(), SchedulerError> { - self.poll_inbound_queue_result().await?; - self.make_schedule_decision().await?; - Ok(()) - } - - async fn load_inbound_queue_result( - &mut self, - curr_session_id: SessionId, - storage_session_id: SessionId, - ready_entries: Vec, - commit_ready_entries: Vec, - cleanup_ready_entries: Vec, - ) -> Result<(), SchedulerError> { - if storage_session_id < curr_session_id { - return Err(SchedulerError::InvalidSessionId(storage_session_id)); - } - if storage_session_id > curr_session_id { - self.storage_session_id = storage_session_id; - self.clear_all_placement(); - self.sink.bump_session_id(storage_session_id).await?; - } - - // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that - // is already cancelled or commit-ready. 
- for inbound_entry in commit_ready_entries { - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - self.commit_ready_or_cleanup_ready_tasks - .insert(inbound_entry.job_id); - self.commit_ready_queue - .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); - - if self.active_jobs.contains_key(&inbound_entry.job_id) { - self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; - continue; - } - - if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { - self.destroy_job_entry(job_entry); - } - } - - for inbound_entry in cleanup_ready_entries { - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - self.commit_ready_or_cleanup_ready_tasks - .insert(inbound_entry.job_id); - self.cleanup_ready_queue - .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); - - if self.active_jobs.contains_key(&inbound_entry.job_id) { - self.remove_active_job_and_dequeue_next_pending_job(inbound_entry.job_id)?; - continue; - } - - if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { - self.destroy_job_entry(job_entry); - } - } - - for inbound_entry in ready_entries { - if self - .commit_ready_or_cleanup_ready_tasks - .contains(&inbound_entry.job_id) - { - continue; - } - if !self - .ready_set - .insert((inbound_entry.job_id, inbound_entry.task_id)) - { - continue; - } - if let Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { - active_job.enqueue(inbound_entry.task_id); - continue; - } - if let Some(pending_job) = self.pending_jobs.get_mut(&inbound_entry.job_id) { - pending_job.enqueue(inbound_entry.task_id); - continue; - } - if self.active_jobs.len() < self.config.active_job_pool_capacity { - self.active_jobs.insert( - inbound_entry.job_id, - JobEntry::new( - inbound_entry.job_id, - inbound_entry.resource_group_id, - inbound_entry.task_id, - ), - ); - self.active_job_queue - 
.push(ActiveJobQueueEntry::Ready(inbound_entry.job_id)); - continue; - } - self.pending_jobs.insert( - inbound_entry.job_id, - JobEntry::new( - inbound_entry.job_id, - inbound_entry.resource_group_id, - inbound_entry.task_id, - ), - ); - self.pending_job_queue.push_back(inbound_entry.job_id); - } - - Ok(()) - } - - async fn poll_inbound_queue_result(&mut self) -> Result<(), SchedulerError> { - let curr_session_id = self.storage_session_id; - let inbound_queue_result = self - .inbound_queue_reader - .poll_ready(curr_session_id) - .await?; - match inbound_queue_result { - InboundQueueResult::Result { - session_id: storage_session_id, - ready_entries, - commit_ready_entries, - cleanup_ready_entries, - } => { - self.load_inbound_queue_result( - curr_session_id, - storage_session_id, - ready_entries, - commit_ready_entries, - cleanup_ready_entries, - ) - .await?; - self.spawn_inbound_queue_reader(); - } - InboundQueueResult::ResultNotReady => {} - InboundQueueResult::HandleNotSpawned => { - self.spawn_inbound_queue_reader(); - } - } - - Ok(()) - } - - async fn make_schedule_decision(&mut self) -> Result<(), SchedulerError> { - let mut dispatch_queue_slots = self - .config - .dispatch_queue_capacity - .saturating_sub(self.sink.size()); - while dispatch_queue_slots > 0 && !self.ready_set.is_empty() { - if self.active_job_queue_cursor >= self.active_job_queue.len() { - self.active_job_queue_cursor = 0; - } - let active_job_queue_entry = - match self.active_job_queue.get(self.active_job_queue_cursor) { - Some(entry) => entry.clone(), - None => { - return Err(SchedulerError::Internal( - "active job queue cursor is corrupted".to_string(), - )); - } - }; - self.active_job_queue_cursor += 1; - match active_job_queue_entry { - ActiveJobQueueEntry::CleanupReady => { - let Some((job_id, resource_group_id)) = self.cleanup_ready_queue.pop_front() - else { - continue; - }; - self.sink - .enqueue(TaskAssignment { - job_id, - resource_group_id, - task_id: TaskId::Cleanup, - }) - 
.await?; - self.ready_set.remove(&(job_id, TaskId::Cleanup)); - self.commit_ready_or_cleanup_ready_tasks.remove(&job_id); - dispatch_queue_slots -= 1; - } - ActiveJobQueueEntry::CommitReady => { - let Some((job_id, resource_group_id)) = self.commit_ready_queue.pop_front() - else { - continue; - }; - self.sink - .enqueue(TaskAssignment { - job_id, - resource_group_id, - task_id: TaskId::Commit, - }) - .await?; - self.ready_set.remove(&(job_id, TaskId::Commit)); - self.commit_ready_or_cleanup_ready_tasks.remove(&job_id); - dispatch_queue_slots -= 1; - } - ActiveJobQueueEntry::Ready(job_id) => { - let Some(job_entry) = self.active_jobs.get_mut(&job_id) else { - return Err(SchedulerError::Internal( - "attempt to remove a non-existing active job: {job_id:?}".to_string(), - )); - }; - if let Some(task_id) = job_entry.dequeue() { - self.sink - .enqueue(TaskAssignment { - job_id, - resource_group_id: job_entry.resource_group_id, - task_id, - }) - .await?; - self.ready_set.remove(&(job_id, task_id)); - dispatch_queue_slots -= 1; - } else { - self.remove_active_job_and_dequeue_next_pending_job(job_id)?; - } - } - } - } - - Ok(()) - } - - fn spawn_inbound_queue_reader(&mut self) { - let num_commit_ready_tasks = self.commit_ready_queue.len(); - let num_cleanup_ready_tasks = self.cleanup_ready_queue.len(); - let max_commit_ready_to_poll = self - .config - .commit_ready_task_capacity - .saturating_sub(num_commit_ready_tasks); - let max_cleanup_ready_to_poll = self - .config - .cleanup_ready_task_capacity - .saturating_sub(num_cleanup_ready_tasks); - let max_ready_to_poll = self.config.ready_task_capacity.saturating_sub( - self.ready_set.len() - num_commit_ready_tasks - num_cleanup_ready_tasks, - ); - self.inbound_queue_reader.spawn( - Duration::from_millis(self.config.storage_polling_wait_time_ms), - max_ready_to_poll, - max_commit_ready_to_poll, - max_cleanup_ready_to_poll, - ); - } -} - -enum InboundQueueResult { - Result { - session_id: SessionId, - ready_entries: Vec, - 
commit_ready_entries: Vec, - cleanup_ready_entries: Vec, - }, - ResultNotReady, - HandleNotSpawned, -} - -struct InboundQueuePollingHandle { - ready_handle: - tokio::task::JoinHandle), StorageClientError>>, - commit_ready_handle: - tokio::task::JoinHandle), StorageClientError>>, - cleanup_ready_handle: - tokio::task::JoinHandle), StorageClientError>>, -} - -impl InboundQueuePollingHandle { - async fn poll_ready( - &mut self, - curr_session_id: SessionId, - ) -> Result { - if !self.ready_handle.is_finished() - || !self.commit_ready_handle.is_finished() - || !self.cleanup_ready_handle.is_finished() - { - return Ok(InboundQueueResult::ResultNotReady); - } - - let (ready_session_id, ready_entries) = (&mut self.ready_handle) - .await - .map_err(|e| SchedulerError::Internal(e.to_string()))??; - let (commit_session_id, commit_ready_entries) = (&mut self.commit_ready_handle) - .await - .map_err(|e| SchedulerError::Internal(e.to_string()))??; - let (cleanup_session_id, cleanup_ready_entries) = - (&mut self.cleanup_ready_handle) - .await - .map_err(|e| SchedulerError::Internal(e.to_string()))??; - - let latest_session_id = curr_session_id - .max(ready_session_id) - .max(commit_session_id) - .max(cleanup_session_id); - - Ok(InboundQueueResult::Result { - session_id: latest_session_id, - ready_entries: Self::drop_if_stale(ready_session_id, latest_session_id, ready_entries), - commit_ready_entries: Self::drop_if_stale( - commit_session_id, - latest_session_id, - commit_ready_entries, - ), - cleanup_ready_entries: Self::drop_if_stale( - cleanup_session_id, - latest_session_id, - cleanup_ready_entries, - ), - }) - } - - fn drop_if_stale( - session_id: SessionId, - latest_session_id: SessionId, - entries: Vec, - ) -> Vec { - if session_id == latest_session_id { - entries - } else { - Vec::new() - } - } -} - -struct AsyncInboundQueueReader { - storage_client: StorageClientType, - handle: Option, -} - -impl - AsyncInboundQueueReader -{ - const fn new(storage_client: 
StorageClientType) -> Self { - Self { - storage_client, - handle: None, - } - } - - async fn poll_ready( - &mut self, - curr_session_id: SessionId, - ) -> Result { - match &mut self.handle { - None => Ok(InboundQueueResult::HandleNotSpawned), - Some(handle) => { - let inbound_queue_result = handle.poll_ready(curr_session_id).await?; - if !matches!(inbound_queue_result, InboundQueueResult::ResultNotReady) { - self.handle = None; - } - Ok(inbound_queue_result) - } - } - } - - fn spawn( - &mut self, - storage_polling_wait_time: Duration, - max_ready_entries: usize, - max_commit_ready_entries: usize, - max_cleanup_ready_entries: usize, - ) { - let ready_storage_client = self.storage_client.clone(); - let ready_handle = tokio::task::spawn(async move { - ready_storage_client - .poll_ready(max_ready_entries, storage_polling_wait_time) - .await - }); - - let commit_ready_storage_client = self.storage_client.clone(); - let commit_ready_handle = tokio::task::spawn(async move { - commit_ready_storage_client - .poll_commit_ready(max_commit_ready_entries, storage_polling_wait_time) - .await - }); - - let cleanup_ready_storage_client = self.storage_client.clone(); - let cleanup_ready_handle = tokio::task::spawn(async move { - cleanup_ready_storage_client - .poll_cleanup_ready(max_cleanup_ready_entries, storage_polling_wait_time) - .await - }); - - self.handle = Some(InboundQueuePollingHandle { - ready_handle, - commit_ready_handle, - cleanup_ready_handle, - }); - } -} diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs new file mode 100644 index 00000000..9ff881bc --- /dev/null +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -0,0 +1,919 @@ +//! The implementation of the round-robin scheduler core. See the parent module's documentation for +//! the scheduling policy and configuration. 
+ +use std::{ + collections::{HashMap, HashSet, VecDeque}, + time::Duration, +}; + +use async_trait::async_trait; +use serde::Deserialize; +use spider_core::types::id::{JobId, ResourceGroupId, SessionId, TaskId}; +use tokio::select; +use tokio_util::sync::CancellationToken; + +use crate::{ + DispatchQueueSink, + InboundEntry, + SchedulerCore, + SchedulerError, + SchedulerStorageClient, + StorageClientError, + TaskAssignment, +}; + +/// The configuration of the round-robin scheduler core. +/// +/// The configuration itself implements [`SchedulerCore`]: consuming it through +/// [`SchedulerCore::run`] creates the underlying scheduler and drives its scheduling loop. +/// +/// # Type Parameters +/// +/// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. +/// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. +#[derive(Deserialize)] +pub struct RoundRobinConfig< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> { + /// The capacity of the active job queue. The scheduler will make task assignments from these + /// jobs in a round-robin manner. + /// + /// Must be greater than 0. + pub active_job_queue_capacity: usize, + + /// The capacity of the dispatch queue. + /// + /// Must be greater than 0. + pub dispatch_queue_capacity: usize, + + /// The capacity of the total pending ready tasks buffered in the scheduler. + /// + /// Must be greater than 0. + pub ready_task_capacity: usize, + + /// The capacity of the total pending commit-ready tasks buffered in the scheduler. + /// + /// Must be greater than 0. + pub commit_ready_task_capacity: usize, + + /// The capacity of the total pending cleanup-ready tasks buffered in the scheduler. + /// + /// Must be greater than 0. 
+ pub cleanup_ready_task_capacity: usize, + + /// The maximum time (in milliseconds) that the scheduler will wait for the storage server to + /// fill the inbound-queue reading request. + pub storage_poll_timeout_ms: u64, + + /// The time (in milliseconds) that the scheduler will spend on each tick. + pub tick_interval_ms: u64, + + #[serde(skip)] + _marker: std::marker::PhantomData<(SchedulerStorageClientType, DispatchQueueSinkType)>, +} + +#[async_trait] +impl< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> SchedulerCore for RoundRobinConfig +{ + type Sink = DispatchQueueSinkType; + type StorageClient = SchedulerStorageClientType; + + async fn run( + self, + storage_client: Self::StorageClient, + sink: Self::Sink, + cancellation_token: CancellationToken, + ) -> Result<(), SchedulerError> { + RoundRobin::new( + SessionId::default(), + storage_client, + sink, + cancellation_token, + self, + )? + .run() + .await + } +} + +/// A FIFO queue of a job's buffered ready tasks. +struct JobTaskQueue { + job_id: JobId, + resource_group_id: ResourceGroupId, + task_ids: VecDeque, +} + +impl JobTaskQueue { + /// Factory function. + /// + /// # Returns + /// + /// A new task queue for the given job, seeded with `init_task_id`. + fn new(job_id: JobId, resource_group_id: ResourceGroupId, init_task_id: TaskId) -> Self { + Self { + job_id, + resource_group_id, + task_ids: VecDeque::from([init_task_id]), + } + } + + fn enqueue(&mut self, task_id: TaskId) { + self.task_ids.push_back(task_id); + } + + /// # Returns + /// + /// * The next ready task ID in FIFO order. + /// * [`None`] if the queue is empty. + fn dequeue(&mut self) -> Option { + self.task_ids.pop_front() + } +} + +/// A slot in the round-robin rotation that the scheduler draws task assignments from. +#[derive(Clone)] +enum RoundRobinSlot { + /// An active job: assignments are drawn from the job's buffered ready tasks. 
+ Job(JobId), + + /// The commit lane: assignments are drawn from the buffered commit-ready jobs. + CommitReady, + + /// The cleanup lane: assignments are drawn from the buffered cleanup-ready jobs. + CleanupReady, +} + +/// The round-robin scheduler core created from a [`RoundRobinConfig`]. +/// +/// # Type Parameters +/// +/// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. +/// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. +struct RoundRobin< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> { + sink: DispatchQueueSinkType, + cancellation_token: CancellationToken, + config: RoundRobinConfig, + storage_session_id: SessionId, + buffered_tasks: HashSet<(JobId, TaskId)>, + + active_jobs: HashMap, + active_job_queue: Vec, + active_job_queue_round_robin_cursor: usize, + + pending_jobs: HashMap, + pending_job_queue: VecDeque, + + commit_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, + cleanup_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, + + finalizing_jobs: HashSet, + + inbound_queue_reader: AsyncInboundQueueReader, +} + +impl< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> RoundRobin +{ + /// Factory function. + /// + /// Creates a [`RoundRobin`] scheduler from the given config. + /// + /// # Returns + /// + /// The constructed [`RoundRobin`] scheduler on success. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::InvalidConfig`] if the config contains invalid values. Check + /// [`RoundRobinConfig`]'s docstring for details. 
+ fn new( + storage_session_id: SessionId, + storage_client: SchedulerStorageClientType, + sink: DispatchQueueSinkType, + cancellation_token: CancellationToken, + config: RoundRobinConfig, + ) -> Result { + if config.active_job_queue_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`active_job_queue_capacity` must be greater than 0".to_string(), + )); + } + + if config.dispatch_queue_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`dispatch_queue_capacity` must be greater than 0".to_string(), + )); + } + + if config.ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`ready_task_capacity` must be greater than 0".to_string(), + )); + } + + if config.commit_ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`commit_ready_task_capacity` must be greater than 0".to_string(), + )); + } + + if config.cleanup_ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`cleanup_ready_task_capacity` must be greater than 0".to_string(), + )); + } + + let buffered_tasks = HashSet::with_capacity(config.ready_task_capacity); + let active_jobs = HashMap::with_capacity(config.active_job_queue_capacity); + let active_job_queue = Self::new_active_job_queue(config.active_job_queue_capacity); + let round_robin_cursor = 0; + let pending_jobs = HashMap::with_capacity(config.active_job_queue_capacity); + let pending_job_queue = VecDeque::with_capacity(config.active_job_queue_capacity); + let commit_ready_jobs = VecDeque::with_capacity(config.commit_ready_task_capacity); + let cleanup_ready_jobs = VecDeque::with_capacity(config.cleanup_ready_task_capacity); + let finalizing_jobs = HashSet::with_capacity( + config.commit_ready_task_capacity + config.cleanup_ready_task_capacity, + ); + let inbound_queue_reader = AsyncInboundQueueReader::new(storage_client); + Ok(Self { + sink, + cancellation_token, + config, + storage_session_id, + buffered_tasks, + active_jobs, + active_job_queue, + 
active_job_queue_round_robin_cursor: round_robin_cursor, + pending_jobs, + pending_job_queue, + commit_ready_jobs, + cleanup_ready_jobs, + finalizing_jobs, + inbound_queue_reader, + }) + } + + /// # Returns + /// + /// A new active job queue containing only the commit-ready and cleanup-ready slots. + fn new_active_job_queue(active_job_pool_capacity: usize) -> Vec { + let mut active_job_queue = Vec::with_capacity(active_job_pool_capacity + 2); + active_job_queue.push(RoundRobinSlot::CommitReady); + active_job_queue.push(RoundRobinSlot::CleanupReady); + active_job_queue + } + + /// Runs the scheduling loop until the cancellation token is triggered. + /// + /// Each iteration executes one [`Self::tick`] and then sleeps for the remainder of the + /// configured tick interval. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::tick`]'s return values on failure. + async fn run(mut self) -> Result<(), SchedulerError> { + let tick_interval = Duration::from_millis(self.config.tick_interval_ms); + loop { + let now = tokio::time::Instant::now(); + let cancellation_token = self.cancellation_token.clone(); + select! { + () = cancellation_token.cancelled() => { + return Ok(()); + } + result = self.tick() => { + let () = result?; + } + } + let elapsed = now.elapsed(); + let sleep_time = tick_interval.saturating_sub(elapsed); + if sleep_time.is_zero() { + tokio::task::yield_now().await; + } else { + tokio::time::sleep(sleep_time).await; + } + } + } + + /// Clears all buffered jobs and tasks, resetting the scheduler to its initial placement state. 
+ fn clear(&mut self) { + self.buffered_tasks.clear(); + self.active_jobs.clear(); + self.pending_jobs.clear(); + self.pending_job_queue.clear(); + self.commit_ready_jobs.clear(); + self.cleanup_ready_jobs.clear(); + self.finalizing_jobs.clear(); + + self.active_job_queue = Self::new_active_job_queue(self.config.active_job_queue_capacity); + self.active_job_queue_round_robin_cursor = 0; + } + + /// Removes the given job from the active set, discards its buffered tasks, and backfills the + /// freed slot with the next pending job, if any. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::Internal`] if the given job is not currently active. + fn retire_active_job(&mut self, job_id: JobId) -> Result<(), SchedulerError> { + if let Some(index) = self.active_job_queue.iter().position(|entry| match entry { + RoundRobinSlot::Job(id) => *id == job_id, + _ => false, + }) { + self.active_job_queue.swap_remove(index); + } else { + return Err(SchedulerError::Internal(format!( + "attempt to remove a non-existing active job: {job_id:?}" + ))); + } + + if let Some(removed_entry) = self.active_jobs.remove(&job_id) { + self.discard_job_tasks(removed_entry); + } else { + return Err(SchedulerError::Internal(format!( + "attempt to destroy a non-existing active job: {job_id:?}" + ))); + } + + if let Some(next_pending_job) = self.pop_next_pending_job() { + self.active_job_queue + .push(RoundRobinSlot::Job(next_pending_job.job_id)); + self.active_jobs + .insert(next_pending_job.job_id, next_pending_job); + } + Ok(()) + } + + /// # Returns + /// + /// The next pending job in FIFO order, or [`None`] if there is no pending job left. + fn pop_next_pending_job(&mut self) -> Option { + loop { + let job_id = self.pending_job_queue.pop_front()?; + // NOTE: The job may have been cancelled and removed from `pending_jobs`, so the ID in + // the queue may not necessarily exist in `pending_jobs`. 
+ if let Some(pending_job) = self.pending_jobs.remove(&job_id) { + return Some(pending_job); + } + } + } + + /// Removes all of the given job's queued tasks from the buffered-task set. + fn discard_job_tasks(&mut self, job_entry: JobTaskQueue) { + for task_id in job_entry.task_ids { + self.buffered_tasks.remove(&(job_entry.job_id, task_id)); + } + } + + /// Executes a single scheduling tick: consumes any completed inbound poll, then makes + /// scheduling decisions to fill the dispatch queue. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::consume_inbound_poll_result`]'s return values on failure. + /// * Forwards [`Self::make_schedule_decisions`]'s return values on failure. + async fn tick(&mut self) -> Result<(), SchedulerError> { + self.consume_inbound_poll_result().await?; + self.make_schedule_decisions().await?; + Ok(()) + } + + /// Loads polled inbound entries into the scheduler's internal buffers. + /// + /// If the polled session is newer than the current session, all existing placement states are + /// cleared and the dispatch queue's session is bumped before loading. Entries whose tasks are + /// already buffered are ignored. + /// + /// A commit-ready or cleanup-ready entry marks its job as finalizing. A finalizing job no + /// longer participates in regular-task scheduling: the job is removed from the active or + /// pending set, its buffered ready tasks are discarded, and its incoming ready entries are + /// ignored. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::InvalidSessionId`] if the polled session is older than the current + /// session. + /// * Forwards [`DispatchQueueSink::bump_session_id`]'s return values on failure. + /// * Forwards [`Self::retire_active_job`]'s return values on failure. 
+ async fn ingest_inbound_entries( + &mut self, + curr_session_id: SessionId, + storage_session_id: SessionId, + ready_entries: Vec, + commit_ready_entries: Vec, + cleanup_ready_entries: Vec, + ) -> Result<(), SchedulerError> { + if storage_session_id < curr_session_id { + return Err(SchedulerError::InvalidSessionId(storage_session_id)); + } + if storage_session_id > curr_session_id { + self.storage_session_id = storage_session_id; + self.clear(); + self.sink.bump_session_id(storage_session_id).await?; + } + + // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that + // is already cancelled or commit-ready. + for inbound_entry in commit_ready_entries { + if !self + .buffered_tasks + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.finalizing_jobs.insert(inbound_entry.job_id); + self.commit_ready_jobs + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.retire_active_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.discard_job_tasks(job_entry); + } + } + + for inbound_entry in cleanup_ready_entries { + if !self + .buffered_tasks + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + self.finalizing_jobs.insert(inbound_entry.job_id); + self.cleanup_ready_jobs + .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); + + if self.active_jobs.contains_key(&inbound_entry.job_id) { + self.retire_active_job(inbound_entry.job_id)?; + continue; + } + + if let Some(job_entry) = self.pending_jobs.remove(&inbound_entry.job_id) { + self.discard_job_tasks(job_entry); + } + } + + for inbound_entry in ready_entries { + if self.finalizing_jobs.contains(&inbound_entry.job_id) { + continue; + } + if !self + .buffered_tasks + .insert((inbound_entry.job_id, inbound_entry.task_id)) + { + continue; + } + if let 
Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { + active_job.enqueue(inbound_entry.task_id); + continue; + } + if let Some(pending_job) = self.pending_jobs.get_mut(&inbound_entry.job_id) { + pending_job.enqueue(inbound_entry.task_id); + continue; + } + if self.active_jobs.len() < self.config.active_job_queue_capacity { + self.active_jobs.insert( + inbound_entry.job_id, + JobTaskQueue::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.active_job_queue + .push(RoundRobinSlot::Job(inbound_entry.job_id)); + continue; + } + self.pending_jobs.insert( + inbound_entry.job_id, + JobTaskQueue::new( + inbound_entry.job_id, + inbound_entry.resource_group_id, + inbound_entry.task_id, + ), + ); + self.pending_job_queue.push_back(inbound_entry.job_id); + } + + Ok(()) + } + + /// Consumes the in-flight inbound poll if it has completed, ingesting its entries and starting + /// the next poll; starts the initial poll if none is in flight. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`AsyncInboundQueueReader::try_collect_result`]'s return values on failure. + /// * Forwards [`Self::ingest_inbound_entries`]'s return values on failure. + /// * Forwards [`Self::start_inbound_poll`]'s return values on failure. 
+ async fn consume_inbound_poll_result(&mut self) -> Result<(), SchedulerError> { + let curr_session_id = self.storage_session_id; + let inbound_poll_state = self + .inbound_queue_reader + .try_collect_result(curr_session_id) + .await?; + match inbound_poll_state { + InboundPollState::Ready { + session_id: storage_session_id, + ready_entries, + commit_ready_entries, + cleanup_ready_entries, + } => { + self.ingest_inbound_entries( + curr_session_id, + storage_session_id, + ready_entries, + commit_ready_entries, + cleanup_ready_entries, + ) + .await?; + self.start_inbound_poll()?; + } + InboundPollState::Pending => {} + InboundPollState::NotStarted => { + self.start_inbound_poll()?; + } + } + + Ok(()) + } + + /// Makes scheduling decisions in round-robin order, writing task assignments to the dispatch + /// queue until it reaches capacity or no buffered task is left. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::Internal`] if the round-robin queue is inconsistent with the scheduler's + /// job bookkeeping. + /// * Forwards [`DispatchQueueSink::enqueue`]'s return values on failure. + /// * Forwards [`Self::retire_active_job`]'s return values on failure. 
+ async fn make_schedule_decisions(&mut self) -> Result<(), SchedulerError> { + let mut remaining_dispatch_slots = self + .config + .dispatch_queue_capacity + .saturating_sub(self.sink.size()); + while remaining_dispatch_slots > 0 && !self.buffered_tasks.is_empty() { + if self.active_job_queue_round_robin_cursor >= self.active_job_queue.len() { + self.active_job_queue_round_robin_cursor = 0; + } + let active_job_queue_entry = match self + .active_job_queue + .get(self.active_job_queue_round_robin_cursor) + { + Some(entry) => entry.clone(), + None => { + return Err(SchedulerError::Internal( + "round-robin cursor is corrupted".to_string(), + )); + } + }; + self.active_job_queue_round_robin_cursor += 1; + + match active_job_queue_entry { + RoundRobinSlot::CleanupReady => { + let Some((job_id, resource_group_id)) = self.cleanup_ready_jobs.pop_front() + else { + continue; + }; + self.sink + .enqueue(TaskAssignment { + job_id, + resource_group_id, + task_id: TaskId::Cleanup, + }) + .await?; + self.buffered_tasks.remove(&(job_id, TaskId::Cleanup)); + self.finalizing_jobs.remove(&job_id); + remaining_dispatch_slots -= 1; + } + RoundRobinSlot::CommitReady => { + let Some((job_id, resource_group_id)) = self.commit_ready_jobs.pop_front() + else { + continue; + }; + self.sink + .enqueue(TaskAssignment { + job_id, + resource_group_id, + task_id: TaskId::Commit, + }) + .await?; + self.buffered_tasks.remove(&(job_id, TaskId::Commit)); + self.finalizing_jobs.remove(&job_id); + remaining_dispatch_slots -= 1; + } + RoundRobinSlot::Job(job_id) => { + let Some(job_entry) = self.active_jobs.get_mut(&job_id) else { + return Err(SchedulerError::Internal(format!( + "attempt to remove a non-existing active job: {job_id:?}" + ))); + }; + if let Some(task_id) = job_entry.dequeue() { + self.sink + .enqueue(TaskAssignment { + job_id, + resource_group_id: job_entry.resource_group_id, + task_id, + }) + .await?; + self.buffered_tasks.remove(&(job_id, task_id)); + remaining_dispatch_slots -= 1; + 
} else { + self.retire_active_job(job_id)?; + } + } + } + } + + Ok(()) + } + + /// Starts a new asynchronous inbound poll, with per-lane entry limits derived from the + /// remaining buffer capacities. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`AsyncInboundQueueReader::start`]'s return values on failure. + fn start_inbound_poll(&mut self) -> Result<(), SchedulerError> { + let num_commit_ready_tasks = self.commit_ready_jobs.len(); + let num_cleanup_ready_tasks = self.cleanup_ready_jobs.len(); + let max_commit_ready_entries = self + .config + .commit_ready_task_capacity + .saturating_sub(num_commit_ready_tasks); + let max_cleanup_ready_entries = self + .config + .cleanup_ready_task_capacity + .saturating_sub(num_cleanup_ready_tasks); + let max_ready_entries = self.config.ready_task_capacity.saturating_sub( + self.buffered_tasks.len() - num_commit_ready_tasks - num_cleanup_ready_tasks, + ); + + self.inbound_queue_reader.start( + Duration::from_millis(self.config.storage_poll_timeout_ms), + max_ready_entries, + max_commit_ready_entries, + max_cleanup_ready_entries, + ) + } +} + +/// The state of an asynchronous inbound-queue poll. +enum InboundPollState { + /// The poll has completed, carrying the polled session and the entries drained from each + /// inbound-queue lane. + Ready { + session_id: SessionId, + ready_entries: Vec, + commit_ready_entries: Vec, + cleanup_ready_entries: Vec, + }, + + /// The poll is still in flight. + Pending, + + /// No poll has been started. + NotStarted, +} + +/// The join handles of one in-flight inbound poll, one per inbound-queue lane. 
+#[allow(clippy::struct_field_names)] +struct InboundPollHandles { + ready_handle: + tokio::task::JoinHandle), StorageClientError>>, + commit_ready_handle: + tokio::task::JoinHandle), StorageClientError>>, + cleanup_ready_handle: + tokio::task::JoinHandle), StorageClientError>>, +} + +impl InboundPollHandles { + /// Tries to collect the results of all lane polls without blocking. + /// + /// Entries from lanes that report an older session than the latest observed session are + /// dropped. + /// + /// # Returns + /// + /// On success: + /// + /// * [`InboundPollState::Pending`] if any lane poll is still in flight. + /// * [`InboundPollState::Ready`] with the latest observed session and its entries otherwise. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::Internal`] if any lane's polling task fails to join. + /// * Forwards [`SchedulerStorageClient::poll_ready`]'s return values on failure. + /// * Forwards [`SchedulerStorageClient::poll_commit_ready`]'s return values on failure. + /// * Forwards [`SchedulerStorageClient::poll_cleanup_ready`]'s return values on failure. 
+ async fn try_collect_result( + &mut self, + curr_session_id: SessionId, + ) -> Result { + if !self.ready_handle.is_finished() + || !self.commit_ready_handle.is_finished() + || !self.cleanup_ready_handle.is_finished() + { + return Ok(InboundPollState::Pending); + } + + let (ready_session_id, ready_entries) = (&mut self.ready_handle) + .await + .map_err(|e| SchedulerError::Internal(e.to_string()))??; + let (commit_session_id, commit_ready_entries) = (&mut self.commit_ready_handle) + .await + .map_err(|e| SchedulerError::Internal(e.to_string()))??; + let (cleanup_session_id, cleanup_ready_entries) = + (&mut self.cleanup_ready_handle) + .await + .map_err(|e| SchedulerError::Internal(e.to_string()))??; + + let latest_session_id = curr_session_id + .max(ready_session_id) + .max(commit_session_id) + .max(cleanup_session_id); + + Ok(InboundPollState::Ready { + session_id: latest_session_id, + ready_entries: Self::drop_if_stale(ready_session_id, latest_session_id, ready_entries), + commit_ready_entries: Self::drop_if_stale( + commit_session_id, + latest_session_id, + commit_ready_entries, + ), + cleanup_ready_entries: Self::drop_if_stale( + cleanup_session_id, + latest_session_id, + cleanup_ready_entries, + ), + }) + } + + /// # Returns + /// + /// `entries` if `session_id` matches `latest_session_id`, or an empty vector otherwise. + fn drop_if_stale( + session_id: SessionId, + latest_session_id: SessionId, + entries: Vec, + ) -> Vec { + if session_id == latest_session_id { + entries + } else { + Vec::new() + } + } +} + +/// A reader that runs inbound-queue polls as background tasks, with at most one polling request +/// (from all three lanes) in flight at a time. +/// +/// # Type Parameters +/// +/// * `StorageClientType` - The storage client used to poll the inbound queue. +struct AsyncInboundQueueReader { + storage_client: StorageClientType, + handle: Option, +} + +impl + AsyncInboundQueueReader +{ + /// Factory function. 
+ /// + /// # Returns + /// + /// A new reader with no poll in flight. + const fn new(storage_client: StorageClientType) -> Self { + Self { + storage_client, + handle: None, + } + } + + /// Tries to collect the result of the in-flight poll without blocking, releasing the poll + /// handles once a result is produced. + /// + /// # Returns + /// + /// On success: + /// + /// * [`InboundPollState::NotStarted`] if no poll is in flight. + /// * Forwards [`InboundPollHandles::try_collect_result`]'s return values otherwise. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`InboundPollHandles::try_collect_result`]'s return values on failure. + async fn try_collect_result( + &mut self, + curr_session_id: SessionId, + ) -> Result { + match &mut self.handle { + None => Ok(InboundPollState::NotStarted), + Some(handle) => { + let inbound_poll_state = handle.try_collect_result(curr_session_id).await?; + if !matches!(inbound_poll_state, InboundPollState::Pending) { + self.handle = None; + } + Ok(inbound_poll_state) + } + } + } + + /// Starts a new inbound poll, polling each inbound-queue lane as a background task. + /// + /// Lanes whose entry limit is 0 are not polled; if all limits are 0, no poll is started. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::Internal`] if a poll is already in flight. 
+ fn start( + &mut self, + storage_poll_timeout: Duration, + max_ready_entries: usize, + max_commit_ready_entries: usize, + max_cleanup_ready_entries: usize, + ) -> Result<(), SchedulerError> { + if self.handle.is_some() { + return Err(SchedulerError::Internal( + "inbound poll handle already exists".to_string(), + )); + } + + if max_ready_entries == 0 && max_commit_ready_entries == 0 && max_cleanup_ready_entries == 0 + { + return Ok(()); + } + + let ready_storage_client = self.storage_client.clone(); + let ready_handle = tokio::task::spawn(async move { + if max_ready_entries == 0 { + return Ok((0, Vec::new())); + } + ready_storage_client + .poll_ready(max_ready_entries, storage_poll_timeout) + .await + }); + + let commit_ready_storage_client = self.storage_client.clone(); + let commit_ready_handle = tokio::task::spawn(async move { + if max_commit_ready_entries == 0 { + return Ok((0, Vec::new())); + } + commit_ready_storage_client + .poll_commit_ready(max_commit_ready_entries, storage_poll_timeout) + .await + }); + + let cleanup_ready_storage_client = self.storage_client.clone(); + let cleanup_ready_handle = tokio::task::spawn(async move { + if max_cleanup_ready_entries == 0 { + return Ok((0, Vec::new())); + } + cleanup_ready_storage_client + .poll_cleanup_ready(max_cleanup_ready_entries, storage_poll_timeout) + .await + }); + + self.handle = Some(InboundPollHandles { + ready_handle, + commit_ready_handle, + cleanup_ready_handle, + }); + + Ok(()) + } +} diff --git a/components/spider-scheduler/src/core_impl/round_robin/mod.rs b/components/spider-scheduler/src/core_impl/round_robin/mod.rs new file mode 100644 index 00000000..ce6f1feb --- /dev/null +++ b/components/spider-scheduler/src/core_impl/round_robin/mod.rs @@ -0,0 +1,39 @@ +//! Round-robin scheduler. +//! +//! This scheduler provides basic fairness across jobs using a round-robin scheduling policy. It +//! polls tasks from the inbound queue (maintained by the storage service) and organizes jobs into +//! 
two sets: +//! +//! * Active jobs: jobs that participate in round-robin scheduling. +//! * Pending jobs: jobs that are buffered but not yet scheduled. When an active job has no +//! remaining schedulable tasks, it is replaced by the next pending job in FIFO order. +//! +//! The scheduler operates in discrete ticks. During each tick, it attempts to consume the results +//! of an asynchronous inbound-queue polling operation and loads any newly available tasks into its +//! internal buffers. It then makes scheduling decisions until the dispatch queue reaches capacity. +//! +//! # Properties +//! +//! * Each round-robin cycle may schedule at most one additional commit task and one additional +//! cleanup task, if available. +//! * All buffered tasks are unique. Tasks loaded from the inbound queue are deduplicated before +//! entering the scheduler's internal buffers. +//! +//! # Configuration +//! +//! * `active_job_queue_capacity`: Maximum number of active jobs maintained by the scheduler. +//! * `dispatch_queue_capacity`: Maximum number of task assignments in the dispatch queue. +//! * `ready_task_capacity`: Maximum number of ready tasks buffered by the scheduler. +//! * `commit_ready_task_capacity`: Maximum number of buffered commit-ready tasks. +//! * `cleanup_ready_task_capacity`: Maximum number of buffered cleanup-ready tasks. +//! * `storage_poll_timeout_ms`: Maximum time, in milliseconds, that inbound-queue polling may block +//! on the storage-service side. +//! * `tick_interval_ms`: Interval, in milliseconds, between scheduler ticks (tick execution time +//! included). + +mod implementation; + +#[cfg(test)] +mod tests; + +pub use implementation::RoundRobinConfig; diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs new file mode 100644 index 00000000..6ad55423 --- /dev/null +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -0,0 +1 @@ +//! 
Unit tests for the round-robin scheduler core. diff --git a/components/spider-scheduler/src/error.rs b/components/spider-scheduler/src/error.rs index bff7571d..50851809 100644 --- a/components/spider-scheduler/src/error.rs +++ b/components/spider-scheduler/src/error.rs @@ -32,6 +32,9 @@ pub enum SchedulerError { #[error("internal error: {0}")] Internal(String), + #[error("invalid config: {0}")] + InvalidConfig(String), + #[error("async result not ready")] ResultNotReady, } From 10680666dfa47c697ef9db828e5616b133d11031 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 18:05:22 -0400 Subject: [PATCH 09/14] Add black-box unit testing. --- .../src/{core_impl.rs => core_impl/mod.rs} | 0 .../core_impl/round_robin/implementation.rs | 154 +++--- .../src/core_impl/round_robin/mod.rs | 2 +- .../src/core_impl/round_robin/tests.rs | 480 ++++++++++++++++++ 4 files changed, 575 insertions(+), 61 deletions(-) rename components/spider-scheduler/src/{core_impl.rs => core_impl/mod.rs} (100%) diff --git a/components/spider-scheduler/src/core_impl.rs b/components/spider-scheduler/src/core_impl/mod.rs similarity index 100% rename from components/spider-scheduler/src/core_impl.rs rename to components/spider-scheduler/src/core_impl/mod.rs diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs index 9ff881bc..3418487e 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -23,19 +23,8 @@ use crate::{ }; /// The configuration of the round-robin scheduler core. -/// -/// The configuration itself implements [`SchedulerCore`]: consuming it through -/// [`SchedulerCore::run`] creates the underlying scheduler and drives its scheduling loop. -/// -/// # Type Parameters -/// -/// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. 
-/// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. #[derive(Deserialize)] -pub struct RoundRobinConfig< - SchedulerStorageClientType: SchedulerStorageClient + 'static, - DispatchQueueSinkType: DispatchQueueSink, -> { +pub struct RoundRobinConfig { /// The capacity of the active job queue. The scheduler will make task assignments from these /// jobs in a round-robin manner. /// @@ -68,8 +57,89 @@ pub struct RoundRobinConfig< /// The time (in milliseconds) that the scheduler will spend on each tick. pub tick_interval_ms: u64, +} - #[serde(skip)] +impl RoundRobinConfig { + /// Validates the configuration and creates a ready-to-run scheduler core from it. + /// + /// # Type Parameters + /// + /// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. + /// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. + /// + /// # Returns + /// + /// A newly created round-robin scheduler core on success. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// + /// * [`SchedulerError::InvalidConfig`] if any of the following configuration entries is 0: + /// * `active_job_queue_capacity` + /// * `dispatch_queue_capacity` + /// * `ready_task_capacity` + /// * `commit_ready_task_capacity` + /// * `cleanup_ready_task_capacity` + pub fn make_core< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, + >( + self, + ) -> Result, SchedulerError> + { + if self.active_job_queue_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`active_job_queue_capacity` must be greater than 0".to_string(), + )); + } + + if self.dispatch_queue_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`dispatch_queue_capacity` must be greater than 0".to_string(), + )); + } + + if self.ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`ready_task_capacity` must be greater than 0".to_string(), + )); + } + + if self.commit_ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`commit_ready_task_capacity` must be greater than 0".to_string(), + )); + } + + if self.cleanup_ready_task_capacity == 0 { + return Err(SchedulerError::InvalidConfig( + "`cleanup_ready_task_capacity` must be greater than 0".to_string(), + )); + } + + Ok(RoundRobinCore { + config: self, + _marker: std::marker::PhantomData, + }) + } +} + +/// The round-robin implementation of [`SchedulerCore`], created from +/// [`RoundRobinConfig::make_core`]. +/// +/// Holding an instance of this type guarantees the wrapped configuration has passed validation, so +/// the scheduling loop can trust its invariants without re-validating. +/// +/// # Type Parameters +/// +/// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. +/// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. 
+pub struct RoundRobinCore< + SchedulerStorageClientType: SchedulerStorageClient + 'static, + DispatchQueueSinkType: DispatchQueueSink, +> { + config: RoundRobinConfig, _marker: std::marker::PhantomData<(SchedulerStorageClientType, DispatchQueueSinkType)>, } @@ -77,7 +147,7 @@ pub struct RoundRobinConfig< impl< SchedulerStorageClientType: SchedulerStorageClient + 'static, DispatchQueueSinkType: DispatchQueueSink, -> SchedulerCore for RoundRobinConfig +> SchedulerCore for RoundRobinCore { type Sink = DispatchQueueSinkType; type StorageClient = SchedulerStorageClientType; @@ -93,8 +163,8 @@ impl< storage_client, sink, cancellation_token, - self, - )? + self.config, + ) .run() .await } @@ -159,7 +229,7 @@ struct RoundRobin< > { sink: DispatchQueueSinkType, cancellation_token: CancellationToken, - config: RoundRobinConfig, + config: RoundRobinConfig, storage_session_id: SessionId, buffered_tasks: HashSet<(JobId, TaskId)>, @@ -185,55 +255,19 @@ impl< { /// Factory function. /// - /// Creates a [`RoundRobin`] scheduler from the given config. + /// Creates a [`RoundRobin`] scheduler from the given config. The config must have been + /// validated through [`RoundRobinConfig::make_core`]. /// /// # Returns /// - /// The constructed [`RoundRobin`] scheduler on success. - /// - /// # Errors - /// - /// Returns an error if: - /// - /// * [`SchedulerError::InvalidConfig`] if the config contains invalid values. Check - /// [`RoundRobinConfig`]'s docstring for details. + /// The constructed [`RoundRobin`] scheduler. 
fn new( storage_session_id: SessionId, storage_client: SchedulerStorageClientType, sink: DispatchQueueSinkType, cancellation_token: CancellationToken, - config: RoundRobinConfig, - ) -> Result { - if config.active_job_queue_capacity == 0 { - return Err(SchedulerError::InvalidConfig( - "`active_job_queue_capacity` must be greater than 0".to_string(), - )); - } - - if config.dispatch_queue_capacity == 0 { - return Err(SchedulerError::InvalidConfig( - "`dispatch_queue_capacity` must be greater than 0".to_string(), - )); - } - - if config.ready_task_capacity == 0 { - return Err(SchedulerError::InvalidConfig( - "`ready_task_capacity` must be greater than 0".to_string(), - )); - } - - if config.commit_ready_task_capacity == 0 { - return Err(SchedulerError::InvalidConfig( - "`commit_ready_task_capacity` must be greater than 0".to_string(), - )); - } - - if config.cleanup_ready_task_capacity == 0 { - return Err(SchedulerError::InvalidConfig( - "`cleanup_ready_task_capacity` must be greater than 0".to_string(), - )); - } - + config: RoundRobinConfig, + ) -> Self { let buffered_tasks = HashSet::with_capacity(config.ready_task_capacity); let active_jobs = HashMap::with_capacity(config.active_job_queue_capacity); let active_job_queue = Self::new_active_job_queue(config.active_job_queue_capacity); @@ -246,7 +280,7 @@ impl< config.commit_ready_task_capacity + config.cleanup_ready_task_capacity, ); let inbound_queue_reader = AsyncInboundQueueReader::new(storage_client); - Ok(Self { + Self { sink, cancellation_token, config, @@ -261,7 +295,7 @@ impl< cleanup_ready_jobs, finalizing_jobs, inbound_queue_reader, - }) + } } /// # Returns diff --git a/components/spider-scheduler/src/core_impl/round_robin/mod.rs b/components/spider-scheduler/src/core_impl/round_robin/mod.rs index ce6f1feb..f1224799 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/mod.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/mod.rs @@ -36,4 +36,4 @@ mod implementation; #[cfg(test)] 
mod tests; -pub use implementation::RoundRobinConfig; +pub use implementation::{RoundRobinConfig, RoundRobinCore}; diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index 6ad55423..2ac650ca 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -1 +1,481 @@ //! Unit tests for the round-robin scheduler core. + +use std::{ + collections::{HashMap, HashSet, VecDeque}, + sync::{ + Arc, + Mutex, + atomic::{AtomicU64, Ordering}, + }, + time::Duration, +}; + +use anyhow::bail; +use async_trait::async_trait; +use spider_core::{ + job::JobState, + types::id::{JobId, ResourceGroupId, SessionId, TaskId}, +}; +use tokio_util::sync::CancellationToken; + +use super::RoundRobinConfig; +use crate::{ + DispatchQueueSource, + InboundEntry, + SchedulerCore, + SchedulerError, + SchedulerStorageClient, + StorageClientError, + TaskAssignment, + dispatch_queue::{DispatchQueueReader, DispatchQueueWriter, create_dispatch_queue}, +}; + +/// The session used by tests that never bump the session. +const DEFAULT_SESSION_ID: SessionId = 0; + +/// The maximum time to wait for expected assignments before failing a test. +const DRAIN_DEADLINE: Duration = Duration::from_secs(5); + +struct MockStorageInner { + session_id: AtomicU64, + ready_batches: Mutex)>>, +} + +/// A mock [`SchedulerStorageClient`] backed by scripted poll batches. +/// +/// Each lane serves its scripted batches in FIFO order, one batch per poll; when a lane's script +/// is empty, polls return an empty batch under the mock's current session immediately (the `wait` +/// parameter is ignored to keep tests fast). +#[derive(Clone)] +struct MockStorageClient { + inner: Arc, +} + +impl MockStorageClient { + /// Factory function. + /// + /// # Returns + /// + /// A new mock storage client with no scripted batches, reporting `session_id` on empty polls. 
+ fn new(session_id: SessionId) -> Self { + Self { + inner: Arc::new(MockStorageInner { + session_id: AtomicU64::new(session_id), + ready_batches: Mutex::new(VecDeque::new()), + }), + } + } + + /// Scripts a batch to be served by the next unserved [`SchedulerStorageClient::poll_ready`] + /// call. + fn push_ready_batch(&self, session_id: SessionId, entries: Vec) { + self.inner + .ready_batches + .lock() + .expect("ready-batch lock poisoned") + .push_back((session_id, entries)); + } + + /// # Returns + /// + /// The session reported on polls that have no scripted batch. + fn current_session(&self) -> SessionId { + self.inner.session_id.load(Ordering::Relaxed) + } +} + +#[async_trait] +impl SchedulerStorageClient for MockStorageClient { + async fn poll_ready( + &self, + max_items: usize, + _wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + let scripted_batch = self + .inner + .ready_batches + .lock() + .expect("ready-batch lock poisoned") + .pop_front(); + let Some((session_id, entries)) = scripted_batch else { + return Ok((self.current_session(), Vec::new())); + }; + assert!( + entries.len() <= max_items, + "scripted batch of {} entries exceeds the scheduler's poll limit of {max_items}", + entries.len(), + ); + Ok((session_id, entries)) + } + + async fn poll_commit_ready( + &self, + _max_items: usize, + _wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + Ok((self.current_session(), Vec::new())) + } + + async fn poll_cleanup_ready( + &self, + _max_items: usize, + _wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + Ok((self.current_session(), Vec::new())) + } + + async fn job_state(&self, _job_id: JobId) -> Result { + Ok(JobState::Running) + } +} + +/// # Returns +/// +/// A config with the given pool and dispatch capacities, and defaults large enough that the other +/// capacities never throttle the tests. 
+fn make_config( + active_job_queue_capacity: usize, + dispatch_queue_capacity: usize, +) -> RoundRobinConfig { + RoundRobinConfig { + active_job_queue_capacity, + dispatch_queue_capacity, + ready_task_capacity: 16_384, + commit_ready_task_capacity: 16, + cleanup_ready_task_capacity: 16, + storage_poll_timeout_ms: 10, + tick_interval_ms: 1, + } +} + +/// # Returns +/// +/// `n` jobs with freshly generated job and resource-group IDs. +fn make_jobs(n: usize) -> Vec<(JobId, ResourceGroupId)> { + (0..n) + .map(|_| (JobId::new(), ResourceGroupId::new())) + .collect() +} + +/// Builds one inbound ready batch containing `tasks_per_job` tasks per job, interleaved across +/// jobs in per-job FIFO order (task 0 of every job, then task 1 of every job, and so on). +/// +/// When `dup_every` is non-zero, every `dup_every`-th entry is duplicated adjacently within the +/// batch, emulating the duplicate task assignments a real storage may return. +/// +/// # Returns +/// +/// The inbound entries of the batch. +fn make_ready_batch( + jobs: &[(JobId, ResourceGroupId)], + tasks_per_job: usize, + dup_every: usize, +) -> Vec { + let mut entries = Vec::new(); + let mut num_emitted = 0_usize; + for task_index in 0..tasks_per_job { + for &(job_id, resource_group_id) in jobs { + let entry = InboundEntry { + resource_group_id, + job_id, + task_id: TaskId::Index(task_index), + }; + entries.push(entry); + num_emitted += 1; + if dup_every > 0 && num_emitted.is_multiple_of(dup_every) { + entries.push(entry); + } + } + } + entries +} + +/// Validates the given config and spawns the scheduler's public run loop as a background task. +/// +/// # Returns +/// +/// A tuple containing: +/// +/// * The join handle yielding the scheduler's exit result. +/// * The cancellation token that stops the scheduler. +/// +/// # Panics +/// +/// Panics if the given config fails validation. 
+fn spawn_scheduler( + config: RoundRobinConfig, + storage_client: MockStorageClient, + sink: DispatchQueueWriter, +) -> ( + tokio::task::JoinHandle>, + CancellationToken, +) { + let core = config.make_core().expect("config validation failed"); + let cancellation_token = CancellationToken::new(); + let scheduler_token = cancellation_token.clone(); + let handle = tokio::spawn(async move { core.run(storage_client, sink, scheduler_token).await }); + (handle, cancellation_token) +} + +/// Drains exactly `n` task assignments from the dispatch queue, playing the worker pool's role. +/// +/// # Returns +/// +/// The drained assignments in FIFO order on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Fewer than `n` assignments arrive within [`DRAIN_DEADLINE`]. +/// * Forwards [`DispatchQueueSource::dequeue`]'s return values on failure. +async fn drain_n(reader: &DispatchQueueReader, n: usize) -> anyhow::Result> { + const DEQUEUE_WAIT: Duration = Duration::from_millis(100); + let deadline = tokio::time::Instant::now() + DRAIN_DEADLINE; + let mut assignments = Vec::with_capacity(n); + while assignments.len() < n { + if tokio::time::Instant::now() > deadline { + bail!( + "timed out draining assignments: got {}, expected {n}", + assignments.len(), + ); + } + if let Some((_session_id, assignment)) = reader.dequeue(DEQUEUE_WAIT).await? { + assignments.push(assignment); + } + } + Ok(assignments) +} + +/// Asserts that no further assignment arrives within a short observation window, proving that +/// duplicated or dropped tasks never leak into the dispatch queue. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`DispatchQueueSource::dequeue`]'s return values on failure. 
+async fn assert_no_more_assignments(reader: &DispatchQueueReader) -> anyhow::Result<()> { + const OBSERVATION_WINDOW: Duration = Duration::from_secs(1); + let unexpected_assignment = reader.dequeue(OBSERVATION_WINDOW).await?; + assert_eq!(unexpected_assignment, None); + Ok(()) +} + +/// Asserts that `assignments` is exactly `rounds` full round-robin rotations over `jobs` in order: +/// rotation `r` consists of task `r` of every job, following the jobs' order, so every job's task +/// indices are dispatched FIFO. +fn assert_strict_rotation( + assignments: &[TaskAssignment], + jobs: &[(JobId, ResourceGroupId)], + rounds: usize, +) { + let expected: Vec<(JobId, ResourceGroupId, TaskId)> = (0..rounds) + .flat_map(|round| { + jobs.iter().map(move |&(job_id, resource_group_id)| { + (job_id, resource_group_id, TaskId::Index(round)) + }) + }) + .collect(); + let actual: Vec<(JobId, ResourceGroupId, TaskId)> = assignments + .iter() + .map(|assignment| { + ( + assignment.job_id, + assignment.resource_group_id, + assignment.task_id, + ) + }) + .collect(); + assert_eq!(actual, expected); +} + +/// Asserts that `assignments` follows the round-robin scheduling policy over `jobs` without pinning +/// down the exact rotation order: +/// +/// * Every aligned window of `jobs.len()` assignments (one full rotation pass) contains each job +/// exactly once. +/// * Each job's task indices are dispatched in FIFO order, with the matching resource group. +/// * Each job receives exactly `tasks_per_job` assignments. +fn assert_round_robin_property( + assignments: &[TaskAssignment], + jobs: &[(JobId, ResourceGroupId)], + tasks_per_job: usize, +) { + assert_eq!(assignments.len(), jobs.len() * tasks_per_job); + + // With equal task counts, no job leaves the rotation mid-phase, so every rotation pass must + // schedule every job exactly once. 
+ for rotation_pass in assignments.chunks(jobs.len()) { + let scheduled_jobs: HashSet = rotation_pass + .iter() + .map(|assignment| assignment.job_id) + .collect(); + assert_eq!( + scheduled_jobs.len(), + jobs.len(), + "a rotation pass repeats or misses a job: {rotation_pass:?}", + ); + } + + let resource_groups: HashMap = jobs.iter().copied().collect(); + let mut next_task_indices: HashMap = HashMap::new(); + for assignment in assignments { + let resource_group_id = *resource_groups + .get(&assignment.job_id) + .expect("assignment belongs to a job outside the given job set"); + assert_eq!(assignment.resource_group_id, resource_group_id); + + let next_task_index = next_task_indices.entry(assignment.job_id).or_insert(0); + assert_eq!(assignment.task_id, TaskId::Index(*next_task_index)); + *next_task_index += 1; + } + + for &(job_id, _) in jobs { + assert_eq!(next_task_indices.get(&job_id).copied(), Some(tasks_per_job)); + } +} + +#[test] +fn zero_capacity_configs_are_rejected() { + let try_make_core = + |config: RoundRobinConfig| config.make_core::(); + + assert!(try_make_core(make_config(2, 2)).is_ok()); + + let zeroed_configs = [ + RoundRobinConfig { + active_job_queue_capacity: 0, + ..make_config(2, 2) + }, + RoundRobinConfig { + dispatch_queue_capacity: 0, + ..make_config(2, 2) + }, + RoundRobinConfig { + ready_task_capacity: 0, + ..make_config(2, 2) + }, + RoundRobinConfig { + commit_ready_task_capacity: 0, + ..make_config(2, 2) + }, + RoundRobinConfig { + cleanup_ready_task_capacity: 0, + ..make_config(2, 2) + }, + ]; + for config in zeroed_configs { + let result = try_make_core(config); + assert!( + matches!(result, Err(SchedulerError::InvalidConfig(_))), + "expected InvalidConfig, got {:?}", + result.err(), + ); + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn single_capacity_pool_schedules_jobs_serially() -> anyhow::Result<()> { + const NUM_JOBS: usize = 3; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 3; + const 
DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(1, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + // With an active job pool of capacity 1, round-robin degenerates to serial job FIFO: the + // rotation holds a single job at a time, so each job's tasks dispatch as one consecutive + // single-job rotation, in job-arrival order. + for (segment, job) in assignments.chunks(TASKS_PER_JOB).zip(&jobs) { + assert_strict_rotation(segment, std::slice::from_ref(job), TASKS_PER_JOB); + } + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn active_jobs_dispatch_in_round_robin_order() -> anyhow::Result<()> { + const NUM_JOBS: usize = 10; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 4; + const DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(NUM_JOBS, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + // All 10 jobs fit into the 
active job pool, so no job ever pends and dispatch follows the + // strict rotation: task 0 of every job in batch order, then task 1 of every job, and so on. The + // exact count of 50 (with no trailing assignments) also proves the in-batch duplicates were + // deduplicated. + assert_strict_rotation(&assignments, &jobs, TASKS_PER_JOB); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn pending_jobs_promote_and_schedule_round_robin() -> anyhow::Result<()> { + const ACTIVE_JOB_QUEUE_CAPACITY: usize = 10; + const NUM_JOBS: usize = 20; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 5; + const DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(ACTIVE_JOB_QUEUE_CAPACITY, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + let (active_jobs, pending_jobs) = jobs.split_at(ACTIVE_JOB_QUEUE_CAPACITY); + let (phase1, phase2) = assignments.split_at(ACTIVE_JOB_QUEUE_CAPACITY * TASKS_PER_JOB); + + // Phase 1: the first 10 jobs in batch order fill the active job pool and dispatch in strict + // rotation; the pending jobs must not appear while the active jobs still have tasks. + assert_strict_rotation(phase1, active_jobs, TASKS_PER_JOB); + + // Phase 2: once the active jobs exhaust, the 10 pending jobs are promoted and scheduled + // round-robin. 
The exact slot order after the retire-and-promote wave is an implementation + // detail of the rotation bookkeeping, so assert the round-robin property instead of one + // hard-coded sequence. + assert_round_robin_property(phase2, pending_jobs, TASKS_PER_JOB); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} From bc619580a6d60d5b3eb8c2e52991ed4623e68062 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 18:17:03 -0400 Subject: [PATCH 10/14] Add cleanup and commit tasks. --- .../src/core_impl/round_robin/tests.rs | 175 +++++++++++++++--- 1 file changed, 146 insertions(+), 29 deletions(-) diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index 2ac650ca..6245d7cb 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -39,6 +39,8 @@ const DRAIN_DEADLINE: Duration = Duration::from_secs(5); struct MockStorageInner { session_id: AtomicU64, ready_batches: Mutex)>>, + commit_ready_batches: Mutex)>>, + cleanup_ready_batches: Mutex)>>, } /// A mock [`SchedulerStorageClient`] backed by scripted poll batches. @@ -62,6 +64,8 @@ impl MockStorageClient { inner: Arc::new(MockStorageInner { session_id: AtomicU64::new(session_id), ready_batches: Mutex::new(VecDeque::new()), + commit_ready_batches: Mutex::new(VecDeque::new()), + cleanup_ready_batches: Mutex::new(VecDeque::new()), }), } } @@ -76,52 +80,81 @@ impl MockStorageClient { .push_back((session_id, entries)); } + /// Scripts a batch to be served by the next unserved + /// [`SchedulerStorageClient::poll_commit_ready`] call. 
+ fn push_commit_ready_batch(&self, session_id: SessionId, entries: Vec) { + self.inner + .commit_ready_batches + .lock() + .expect("commit-ready-batch lock poisoned") + .push_back((session_id, entries)); + } + + /// Scripts a batch to be served by the next unserved + /// [`SchedulerStorageClient::poll_cleanup_ready`] call. + fn push_cleanup_ready_batch(&self, session_id: SessionId, entries: Vec) { + self.inner + .cleanup_ready_batches + .lock() + .expect("cleanup-ready-batch lock poisoned") + .push_back((session_id, entries)); + } + /// # Returns /// /// The session reported on polls that have no scripted batch. fn current_session(&self) -> SessionId { self.inner.session_id.load(Ordering::Relaxed) } -} -#[async_trait] -impl SchedulerStorageClient for MockStorageClient { - async fn poll_ready( + /// Serves one poll from the given lane's script. + /// + /// # Returns + /// + /// The lane's next scripted batch, or an empty batch under the current session if the lane's + /// script is exhausted. 
+ fn serve_batch( &self, + batches: &Mutex)>>, max_items: usize, - _wait: Duration, - ) -> Result<(SessionId, Vec), StorageClientError> { - let scripted_batch = self - .inner - .ready_batches - .lock() - .expect("ready-batch lock poisoned") - .pop_front(); + ) -> (SessionId, Vec) { + let scripted_batch = batches.lock().expect("batch lock poisoned").pop_front(); let Some((session_id, entries)) = scripted_batch else { - return Ok((self.current_session(), Vec::new())); + return (self.current_session(), Vec::new()); }; assert!( entries.len() <= max_items, "scripted batch of {} entries exceeds the scheduler's poll limit of {max_items}", entries.len(), ); - Ok((session_id, entries)) + (session_id, entries) + } +} + +#[async_trait] +impl SchedulerStorageClient for MockStorageClient { + async fn poll_ready( + &self, + max_items: usize, + _wait: Duration, + ) -> Result<(SessionId, Vec), StorageClientError> { + Ok(self.serve_batch(&self.inner.ready_batches, max_items)) } async fn poll_commit_ready( &self, - _max_items: usize, + max_items: usize, _wait: Duration, ) -> Result<(SessionId, Vec), StorageClientError> { - Ok((self.current_session(), Vec::new())) + Ok(self.serve_batch(&self.inner.commit_ready_batches, max_items)) } async fn poll_cleanup_ready( &self, - _max_items: usize, + max_items: usize, _wait: Duration, ) -> Result<(SessionId, Vec), StorageClientError> { - Ok((self.current_session(), Vec::new())) + Ok(self.serve_batch(&self.inner.cleanup_ready_batches, max_items)) } async fn job_state(&self, _job_id: JobId) -> Result { @@ -190,6 +223,22 @@ fn make_ready_batch( entries } +/// Builds one inbound batch that marks each given job as finalizing, with `task_id` (either +/// [`TaskId::Commit`] or [`TaskId::Cleanup`]) set on every entry. +/// +/// # Returns +/// +/// The inbound entries of the batch. 
+fn make_finalizing_batch(jobs: &[(JobId, ResourceGroupId)], task_id: TaskId) -> Vec { + jobs.iter() + .map(|&(job_id, resource_group_id)| InboundEntry { + resource_group_id, + job_id, + task_id, + }) + .collect() +} + /// Validates the given config and spawns the scheduler's public run loop as a background task. /// /// # Returns @@ -262,6 +311,26 @@ async fn assert_no_more_assignments(reader: &DispatchQueueReader) -> anyhow::Res Ok(()) } +/// # Returns +/// +/// A vector of tuples following the order of the input assignments, each tuple containing: +/// +/// * The job ID. +/// * The resource group ID. +/// * The task ID. +fn make_assigment_tuple(assignments: &[TaskAssignment]) -> Vec<(JobId, ResourceGroupId, TaskId)> { + assignments + .iter() + .map(|assignment| { + ( + assignment.job_id, + assignment.resource_group_id, + assignment.task_id, + ) + }) + .collect() +} + /// Asserts that `assignments` is exactly `rounds` full round-robin rotations over `jobs` in order: /// rotation `r` consists of task `r` of every job, following the jobs' order, so every job's task /// indices are dispatched FIFO. 
@@ -277,17 +346,7 @@ fn assert_strict_rotation( }) }) .collect(); - let actual: Vec<(JobId, ResourceGroupId, TaskId)> = assignments - .iter() - .map(|assignment| { - ( - assignment.job_id, - assignment.resource_group_id, - assignment.task_id, - ) - }) - .collect(); - assert_eq!(actual, expected); + assert_eq!(make_assigment_tuple(assignments), expected); } /// Asserts that `assignments` follows the round-robin scheduling policy over `jobs` without pinning @@ -479,3 +538,61 @@ async fn pending_jobs_promote_and_schedule_round_robin() -> anyhow::Result<()> { scheduler_handle.await.expect("scheduler task panicked")?; Ok(()) } + +#[tokio::test(flavor = "multi_thread")] +async fn commit_and_cleanup_dispatch_once_per_cycle() -> anyhow::Result<()> { + const NUM_ACTIVE_JOBS: usize = 4; + const TASKS_PER_JOB: usize = 3; + const NUM_FINALIZING_JOBS_PER_LANE: usize = 3; + const DISPATCH_QUEUE_CAPACITY: usize = 1024; + + let active_jobs = make_jobs(NUM_ACTIVE_JOBS); + let commit_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); + let cleanup_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); + + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&active_jobs, TASKS_PER_JOB, 0), + ); + let mut commit_ready_batch = make_finalizing_batch(&commit_ready_jobs, TaskId::Commit); + // Duplicate one commit-ready entry within the batch: it must dispatch exactly once. 
+ commit_ready_batch.push(commit_ready_batch[0]); + storage_client.push_commit_ready_batch(DEFAULT_SESSION_ID, commit_ready_batch); + storage_client.push_cleanup_ready_batch( + DEFAULT_SESSION_ID, + make_finalizing_batch(&cleanup_ready_jobs, TaskId::Cleanup), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(NUM_ACTIVE_JOBS, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let num_assignments = NUM_ACTIVE_JOBS * TASKS_PER_JOB + 2 * NUM_FINALIZING_JOBS_PER_LANE; + let assignments = drain_n(&reader, num_assignments).await?; + assert_no_more_assignments(&reader).await?; + + // The rotation is [commit lane, cleanup lane, active jobs...], so every cycle dispatches + // exactly one commit task and one cleanup task (while their queues are non-empty), each lane + // drained FIFO, followed by one task of every active job. + let expected: Vec<(JobId, ResourceGroupId, TaskId)> = (0..TASKS_PER_JOB) + .flat_map(|round| { + let (commit_job_id, commit_resource_group_id) = commit_ready_jobs[round]; + let (cleanup_job_id, cleanup_resource_group_id) = cleanup_ready_jobs[round]; + std::iter::once((commit_job_id, commit_resource_group_id, TaskId::Commit)) + .chain(std::iter::once(( + cleanup_job_id, + cleanup_resource_group_id, + TaskId::Cleanup, + ))) + .chain(active_jobs.iter().map(move |&(job_id, resource_group_id)| { + (job_id, resource_group_id, TaskId::Index(round)) + })) + }) + .collect(); + assert_eq!(make_assigment_tuple(&assignments), expected); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} From 8c8bdf3519d1750919556e221544c768f66fb85f Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 18:57:59 -0400 Subject: [PATCH 11/14] Add the last. 
--- .../core_impl/round_robin/implementation.rs | 97 +++--- .../src/core_impl/round_robin/tests.rs | 327 +++++++++++++++++- 2 files changed, 374 insertions(+), 50 deletions(-) diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs index 3418487e..3d7b1214 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -171,7 +171,7 @@ impl< } /// A FIFO queue of a job's buffered ready tasks. -struct JobTaskQueue { +pub(super) struct JobTaskQueue { job_id: JobId, resource_group_id: ResourceGroupId, task_ids: VecDeque, @@ -204,48 +204,40 @@ impl JobTaskQueue { } } -/// A slot in the round-robin rotation that the scheduler draws task assignments from. -#[derive(Clone)] -enum RoundRobinSlot { - /// An active job: assignments are drawn from the job's buffered ready tasks. - Job(JobId), - - /// The commit lane: assignments are drawn from the buffered commit-ready jobs. - CommitReady, - - /// The cleanup lane: assignments are drawn from the buffered cleanup-ready jobs. - CleanupReady, -} - /// The round-robin scheduler core created from a [`RoundRobinConfig`]. /// /// # Type Parameters /// /// * `SchedulerStorageClientType` - The storage client used to poll the inbound queue. /// * `DispatchQueueSinkType` - The dispatch sink that task assignments are written to. -struct RoundRobin< +/// +/// # Note +/// +/// All member variables are marked `pub(super)` to allow the test module to inspect the internal +/// states. 
+pub(super) struct RoundRobin< SchedulerStorageClientType: SchedulerStorageClient + 'static, DispatchQueueSinkType: DispatchQueueSink, > { - sink: DispatchQueueSinkType, - cancellation_token: CancellationToken, - config: RoundRobinConfig, - storage_session_id: SessionId, - buffered_tasks: HashSet<(JobId, TaskId)>, + pub(super) sink: DispatchQueueSinkType, + pub(super) cancellation_token: CancellationToken, + pub(super) config: RoundRobinConfig, + pub(super) storage_session_id: SessionId, + pub(super) buffered_tasks: HashSet<(JobId, TaskId)>, - active_jobs: HashMap, - active_job_queue: Vec, - active_job_queue_round_robin_cursor: usize, + pub(super) active_jobs: HashMap, + pub(super) active_job_queue: Vec, + pub(super) active_job_queue_round_robin_cursor: usize, - pending_jobs: HashMap, - pending_job_queue: VecDeque, + pub(super) pending_jobs: HashMap, + pub(super) pending_job_queue: VecDeque, - commit_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, - cleanup_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, + pub(super) commit_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, + pub(super) cleanup_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, - finalizing_jobs: HashSet, + pub(super) finalizing_jobs: HashSet, - inbound_queue_reader: AsyncInboundQueueReader, + pub(super) inbound_queue_reader: AsyncInboundQueueReader, } impl< @@ -261,7 +253,7 @@ impl< /// # Returns /// /// The constructed [`RoundRobin`] scheduler. - fn new( + pub(super) fn new( storage_session_id: SessionId, storage_client: SchedulerStorageClientType, sink: DispatchQueueSinkType, @@ -298,6 +290,21 @@ impl< } } + /// Executes a single scheduling tick: consumes any completed inbound poll, then makes + /// scheduling decisions to fill the dispatch queue. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::consume_inbound_poll_result`]'s return values on failure. + /// * Forwards [`Self::make_schedule_decisions`]'s return values on failure. 
+ pub(super) async fn tick(&mut self) -> Result<(), SchedulerError> { + self.consume_inbound_poll_result().await?; + self.make_schedule_decisions().await?; + Ok(()) + } + /// # Returns /// /// A new active job queue containing only the commit-ready and cleanup-ready slots. @@ -413,21 +420,6 @@ impl< } } - /// Executes a single scheduling tick: consumes any completed inbound poll, then makes - /// scheduling decisions to fill the dispatch queue. - /// - /// # Errors - /// - /// Returns an error if: - /// - /// * Forwards [`Self::consume_inbound_poll_result`]'s return values on failure. - /// * Forwards [`Self::make_schedule_decisions`]'s return values on failure. - async fn tick(&mut self) -> Result<(), SchedulerError> { - self.consume_inbound_poll_result().await?; - self.make_schedule_decisions().await?; - Ok(()) - } - /// Loads polled inbound entries into the scheduler's internal buffers. /// /// If the polled session is newer than the current session, all existing placement states are @@ -719,6 +711,19 @@ impl< } } +/// A slot in the round-robin rotation that the scheduler draws task assignments from. +#[derive(Clone)] +pub(super) enum RoundRobinSlot { + /// An active job: assignments are drawn from the job's buffered ready tasks. + Job(JobId), + + /// The commit lane: assignments are drawn from the buffered commit-ready jobs. + CommitReady, + + /// The cleanup lane: assignments are drawn from the buffered cleanup-ready jobs. + CleanupReady, +} + /// The state of an asynchronous inbound-queue poll. enum InboundPollState { /// The poll has completed, carrying the polled session and the entries drained from each @@ -834,7 +839,7 @@ impl InboundPollHandles { /// # Type Parameters /// /// * `StorageClientType` - The storage client used to poll the inbound queue. 
-struct AsyncInboundQueueReader { +pub(super) struct AsyncInboundQueueReader { storage_client: StorageClientType, handle: Option, } diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index 6245d7cb..14242482 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -18,7 +18,7 @@ use spider_core::{ }; use tokio_util::sync::CancellationToken; -use super::RoundRobinConfig; +use super::{RoundRobinConfig, implementation::RoundRobin}; use crate::{ DispatchQueueSource, InboundEntry, @@ -33,6 +33,9 @@ use crate::{ /// The session used by tests that never bump the session. const DEFAULT_SESSION_ID: SessionId = 0; +/// The white-box scheduler under test, driven by manual ticks. +type TestScheduler = RoundRobin; + /// The maximum time to wait for expected assignments before failing a test. const DRAIN_DEADLINE: Duration = Duration::from_secs(5); @@ -304,6 +307,10 @@ async fn drain_n(reader: &DispatchQueueReader, n: usize) -> anyhow::Result anyhow::Result<()> { const OBSERVATION_WINDOW: Duration = Duration::from_secs(1); let unexpected_assignment = reader.dequeue(OBSERVATION_WINDOW).await?; @@ -318,7 +325,7 @@ async fn assert_no_more_assignments(reader: &DispatchQueueReader) -> anyhow::Res /// * The job ID. /// * The resource group ID. /// * The task ID. 
-fn make_assigment_tuple(assignments: &[TaskAssignment]) -> Vec<(JobId, ResourceGroupId, TaskId)> { +fn make_assignment_tuple(assignments: &[TaskAssignment]) -> Vec<(JobId, ResourceGroupId, TaskId)> { assignments .iter() .map(|assignment| { @@ -334,6 +341,10 @@ fn make_assigment_tuple(assignments: &[TaskAssignment]) -> Vec<(JobId, ResourceG /// Asserts that `assignments` is exactly `rounds` full round-robin rotations over `jobs` in order: /// rotation `r` consists of task `r` of every job, following the jobs' order, so every job's task /// indices are dispatched FIFO. +/// +/// # Panics +/// +/// Panics if `assignments` deviates from the expected strict rotation. fn assert_strict_rotation( assignments: &[TaskAssignment], jobs: &[(JobId, ResourceGroupId)], @@ -346,7 +357,7 @@ fn assert_strict_rotation( }) }) .collect(); - assert_eq!(make_assigment_tuple(assignments), expected); + assert_eq!(make_assignment_tuple(assignments), expected); } /// Asserts that `assignments` follows the round-robin scheduling policy over `jobs` without pinning @@ -356,6 +367,10 @@ fn assert_strict_rotation( /// exactly once. /// * Each job's task indices are dispatched in FIFO order, with the matching resource group. /// * Each job receives exactly `tasks_per_job` assignments. +/// +/// # Panics +/// +/// Panics if `assignments` violates any of the properties above. fn assert_round_robin_property( assignments: &[TaskAssignment], jobs: &[(JobId, ResourceGroupId)], @@ -590,9 +605,313 @@ async fn commit_and_cleanup_dispatch_once_per_cycle() -> anyhow::Result<()> { })) }) .collect(); - assert_eq!(make_assigment_tuple(&assignments), expected); + assert_eq!(make_assignment_tuple(&assignments), expected); cancellation_token.cancel(); scheduler_handle.await.expect("scheduler task panicked")?; Ok(()) } + +/// # Returns +/// +/// A white-box scheduler wired to the given storage client and sink, to be driven by manual +/// [`RoundRobin::tick`] calls. 
+fn make_scheduler( + config: RoundRobinConfig, + storage_client: MockStorageClient, + sink: DispatchQueueWriter, +) -> TestScheduler { + RoundRobin::new( + DEFAULT_SESSION_ID, + storage_client, + sink, + CancellationToken::new(), + config, + ) +} + +/// Ticks the scheduler until `predicate` holds on its state. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * The predicate does not hold within [`DRAIN_DEADLINE`]. +/// * Forwards [`RoundRobin::tick`]'s return values on failure. +async fn tick_until( + scheduler: &mut TestScheduler, + predicate: impl Fn(&TestScheduler) -> bool, +) -> anyhow::Result<()> { + let deadline = tokio::time::Instant::now() + DRAIN_DEADLINE; + while !predicate(scheduler) { + if tokio::time::Instant::now() > deadline { + bail!("timed out waiting for the tick predicate to hold"); + } + scheduler.tick().await?; + tokio::task::yield_now().await; + } + Ok(()) +} + +/// Drains exactly `n` task assignments while manually ticking the scheduler to refill the dispatch +/// queue (the white-box counterpart of [`drain_n`]). +/// +/// # Returns +/// +/// The drained assignments in FIFO order on success. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Fewer than `n` assignments arrive within [`DRAIN_DEADLINE`]. +/// * Forwards [`RoundRobin::tick`]'s return values on failure. +/// * Forwards [`DispatchQueueSource::dequeue`]'s return values on failure. +async fn tick_and_drain_n( + scheduler: &mut TestScheduler, + reader: &DispatchQueueReader, + n: usize, +) -> anyhow::Result> { + let deadline = tokio::time::Instant::now() + DRAIN_DEADLINE; + let mut assignments = Vec::with_capacity(n); + while assignments.len() < n { + if tokio::time::Instant::now() > deadline { + bail!( + "timed out draining assignments: got {}, expected {n}", + assignments.len(), + ); + } + scheduler.tick().await?; + while let Some((_session_id, assignment)) = reader.dequeue(Duration::ZERO).await? 
{ + assignments.push(assignment); + } + tokio::task::yield_now().await; + } + Ok(assignments) +} + +/// Ticks the scheduler a few extra rounds and asserts that no further assignment is dispatched. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * Forwards [`RoundRobin::tick`]'s return values on failure. +/// * Forwards [`DispatchQueueSource::dequeue`]'s return values on failure. +/// +/// # Panics +/// +/// Panics if a further assignment is dispatched. +async fn assert_no_further_assignments( + scheduler: &mut TestScheduler, + reader: &DispatchQueueReader, +) -> anyhow::Result<()> { + const EXTRA_TICKS: usize = 8; + for _ in 0..EXTRA_TICKS { + scheduler.tick().await?; + tokio::task::yield_now().await; + } + let unexpected_assignment = reader.dequeue(Duration::from_millis(50)).await?; + assert_eq!(unexpected_assignment, None); + Ok(()) +} + +/// Drives the shared scenario where a finalizing batch drops one active and one pending job. +/// +/// The finalizing lane is selected by `finalizing_task_id`: commit-ready for [`TaskId::Commit`], +/// or cleanup-ready for [`TaskId::Cleanup`]. The scenario: +/// +/// 1. Buffers four jobs (two active, two pending) and freezes dispatch via a full dispatch queue. +/// 2. Delivers a finalizing batch for one active job and one pending job mid-stream. +/// 3. Asserts both jobs leave the placement state with their buffered regular tasks discarded. +/// 4. Unfreezes and asserts the drained sequence: each finalized job dispatches its finalizing task +/// exactly once and no further regular task, while the surviving jobs complete in FIFO order. +/// +/// # Errors +/// +/// Returns an error if: +/// +/// * `finalizing_task_id` is a regular [`TaskId::Index`] task. +/// * Forwards [`tick_until`]'s return values on failure. +/// * Forwards [`tick_and_drain_n`]'s return values on failure. +/// * Forwards [`assert_no_further_assignments`]'s return values on failure. 
+/// +/// # Panics +/// +/// Panics if any scheduling-behavior assertion of the scenario fails. +#[allow(clippy::too_many_lines, clippy::similar_names)] +async fn assert_finalizing_ready_drops_jobs(finalizing_task_id: TaskId) -> anyhow::Result<()> { + // NOTE: We disable two linting rules for the following reasons: + // * `clippy::too_many_lines`: This test case is long, but we want to avoid breaking it into + // smaller functions since that would also make the overall flow hard to navigate. + // * `clippy::similar_names`: The linter complains about `job_a_regular`, `job_b_regular`, etc., + // but these names are fine for test cases. + const ACTIVE_JOB_QUEUE_CAPACITY: usize = 2; + const DISPATCH_QUEUE_CAPACITY: usize = 2; + const TASKS_PER_JOB: usize = 3; + const NUM_PRE_FREEZE_ASSIGNMENTS: usize = DISPATCH_QUEUE_CAPACITY; + const NUM_FINALIZED_JOBS: usize = 2; + + if matches!(finalizing_task_id, TaskId::Index(_)) { + bail!("`finalizing_task_id` must be `TaskId::Commit` or `TaskId::Cleanup`"); + } + let is_commit = finalizing_task_id == TaskId::Commit; + + // Batch order makes `job_a` and `job_b` active, `job_p` and `job_q` pending. + let jobs = make_jobs(4); + let (job_a, job_b, job_p, job_q) = (jobs[0], jobs[1], jobs[2], jobs[3]); + + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, 0), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let mut scheduler = make_scheduler( + make_config(ACTIVE_JOB_QUEUE_CAPACITY, DISPATCH_QUEUE_CAPACITY), + storage_client.clone(), + writer, + ); + + // Step 1: ingest the ready batch. The ingesting tick also dispatches exactly two assignments + // (`job_a.t0`, `job_b.t0`), filling the dispatch queue; dispatch is frozen from here on because + // the test does not drain yet. 
+ tick_until(&mut scheduler, |scheduler| { + !scheduler.buffered_tasks.is_empty() + }) + .await?; + assert_eq!( + scheduler + .active_jobs + .keys() + .copied() + .collect::>(), + HashSet::from([job_a.0, job_b.0]), + ); + assert_eq!( + scheduler + .pending_jobs + .keys() + .copied() + .collect::>(), + HashSet::from([job_p.0, job_q.0]), + ); + + // Step 2: with dispatch frozen, deliver the finalizing batch for one active job, `job_b`, and + // one pending job, `job_q`, before any of their remaining tasks can dispatch. + let finalizing_batch = make_finalizing_batch(&[job_b, job_q], finalizing_task_id); + if is_commit { + storage_client.push_commit_ready_batch(DEFAULT_SESSION_ID, finalizing_batch); + } else { + storage_client.push_cleanup_ready_batch(DEFAULT_SESSION_ID, finalizing_batch); + } + tick_until(&mut scheduler, |scheduler| { + scheduler.finalizing_jobs.contains(&job_b.0) && scheduler.finalizing_jobs.contains(&job_q.0) + }) + .await?; + + // Step 3: both jobs left the placement state and their buffered regular tasks are discarded; + // only their finalizing assignments remain queued, in arrival order. + assert!(!scheduler.active_jobs.contains_key(&job_b.0)); + assert!(!scheduler.pending_jobs.contains_key(&job_q.0)); + assert!( + scheduler.buffered_tasks.iter().all(|&(job_id, task_id)| { + (job_id != job_b.0 && job_id != job_q.0) || !matches!(task_id, TaskId::Index(_)) + }), + "a finalized job still has buffered regular tasks", + ); + let finalizing_queue = if is_commit { + &scheduler.commit_ready_jobs + } else { + &scheduler.cleanup_ready_jobs + }; + assert_eq!( + finalizing_queue.iter().copied().collect::>(), + vec![job_b, job_q], + ); + + // Step 4: unfreeze. Every remaining assignment is accounted for below: the pre-freeze + // assignments already queued, one finalizing task per finalized job, `job_a`'s remaining + // tasks (its first task dispatched pre-freeze), and the full task set of `job_p`, which + // backfills `job_b`'s freed slot. 
+ + // total number of assignments = pre-freeze assignments + finalizing assignments + + // remaining `job_a` assignments + full `job_p` assignments + let num_assignments = + NUM_PRE_FREEZE_ASSIGNMENTS + NUM_FINALIZED_JOBS + (TASKS_PER_JOB - 1) + TASKS_PER_JOB; + let assignments = tick_and_drain_n(&mut scheduler, &reader, num_assignments).await?; + assert_no_further_assignments(&mut scheduler, &reader).await?; + assert_eq!(scheduler.buffered_tasks.len(), 0); + + let triples = make_assignment_tuple(&assignments); + + // The pre-freeze head is exactly `job_a.t0`, `job_b.t0`. + assert_eq!( + &triples[..NUM_PRE_FREEZE_ASSIGNMENTS], + &[ + (job_a.0, job_a.1, TaskId::Index(0)), + (job_b.0, job_b.1, TaskId::Index(0)), + ], + ); + + // Each finalized job's finalizing task dispatches exactly once, in arrival (FIFO) order. + let finalizing_assignments: Vec<_> = triples + .iter() + .filter(|&&(_, _, task_id)| task_id == finalizing_task_id) + .copied() + .collect(); + assert_eq!( + finalizing_assignments, + vec![ + (job_b.0, job_b.1, finalizing_task_id), + (job_q.0, job_q.1, finalizing_task_id), + ], + ); + + let job_a_tasks: Vec = triples + .iter() + .filter(|&&(job_id, ..)| job_id == job_a.0) + .map(|&(_, _, task_id)| task_id) + .collect(); + assert_eq!( + job_a_tasks, + vec![TaskId::Index(0), TaskId::Index(1), TaskId::Index(2)], + ); + + let job_b_regular: Vec<_> = triples + .iter() + .filter(|&&(job_id, _, task_id)| job_id == job_b.0 && matches!(task_id, TaskId::Index(_))) + .copied() + .collect(); + assert_eq!(job_b_regular, vec![(job_b.0, job_b.1, TaskId::Index(0))]); + + let job_p_tasks: Vec = triples + .iter() + .filter(|&&(job_id, ..)| job_id == job_p.0) + .map(|&(_, _, task_id)| task_id) + .collect(); + assert_eq!( + job_p_tasks, + vec![TaskId::Index(0), TaskId::Index(1), TaskId::Index(2)], + ); + + let job_q_regular: Vec<_> = triples + .iter() + .filter(|&&(job_id, _, task_id)| job_id == job_q.0 && matches!(task_id, TaskId::Index(_))) + .copied() + .collect(); + 
assert_eq!(job_q_regular, []); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn cleanup_ready_drops_active_and_pending_jobs() -> anyhow::Result<()> { + assert_finalizing_ready_drops_jobs(TaskId::Cleanup).await +} + +#[tokio::test(flavor = "multi_thread")] +async fn commit_ready_drops_active_and_pending_jobs() -> anyhow::Result<()> { + assert_finalizing_ready_drops_jobs(TaskId::Commit).await +} From 9ef086caca2d2ce47089fe18c10904e298a54353 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 19:03:25 -0400 Subject: [PATCH 12/14] Add commit and cleanup testing. --- .../src/core_impl/round_robin/implementation.rs | 2 ++ .../spider-scheduler/src/core_impl/round_robin/tests.rs | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs index 3d7b1214..c3947683 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -171,6 +171,8 @@ impl< } /// A FIFO queue of a job's buffered ready tasks. 
+#[derive(Eq, PartialEq)] +#[derive(Debug)] pub(super) struct JobTaskQueue { job_id: JobId, resource_group_id: ResourceGroupId, diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index 14242482..f5d4118c 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -903,6 +903,12 @@ async fn assert_finalizing_ready_drops_jobs(finalizing_task_id: TaskId) -> anyho .collect(); assert_eq!(job_q_regular, []); + assert!(scheduler.buffered_tasks.is_empty()); + assert!(scheduler.pending_jobs.is_empty()); + assert!(scheduler.pending_job_queue.is_empty()); + assert!(scheduler.commit_ready_jobs.is_empty()); + assert!(scheduler.cleanup_ready_jobs.is_empty()); + Ok(()) } From 7af3840b7569d5e78014303f52f744c76aa14541 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 19:17:35 -0400 Subject: [PATCH 13/14] Add test for session bump. --- .../core_impl/round_robin/implementation.rs | 3 +- .../src/core_impl/round_robin/tests.rs | 438 +++++++++++------- 2 files changed, 271 insertions(+), 170 deletions(-) diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs index c3947683..68fb27c9 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -171,8 +171,7 @@ impl< } /// A FIFO queue of a job's buffered ready tasks. 
-#[derive(Eq, PartialEq)] -#[derive(Debug)] +#[derive(Eq, PartialEq, Debug)] pub(super) struct JobTaskQueue { job_id: JobId, resource_group_id: ResourceGroupId, diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index f5d4118c..86d4b092 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -110,6 +110,11 @@ impl MockStorageClient { self.inner.session_id.load(Ordering::Relaxed) } + /// Sets the session reported on polls that have no scripted batch. + fn set_session(&self, session_id: SessionId) { + self.inner.session_id.store(session_id, Ordering::Relaxed); + } + /// Serves one poll from the given lane's script. /// /// # Returns @@ -449,169 +454,6 @@ fn zero_capacity_configs_are_rejected() { } } -#[tokio::test(flavor = "multi_thread")] -async fn single_capacity_pool_schedules_jobs_serially() -> anyhow::Result<()> { - const NUM_JOBS: usize = 3; - const TASKS_PER_JOB: usize = 5; - const DUP_EVERY: usize = 3; - const DISPATCH_QUEUE_CAPACITY: usize = 32; - - let jobs = make_jobs(NUM_JOBS); - let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); - storage_client.push_ready_batch( - DEFAULT_SESSION_ID, - make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), - ); - - let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); - let config = make_config(1, DISPATCH_QUEUE_CAPACITY); - let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); - - let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; - assert_no_more_assignments(&reader).await?; - - // With an active job pool of capacity 1, round-robin degenerates to serial job FIFO: the - // rotation holds a single job at a time, so each job's tasks dispatch as one consecutive - // single-job rotation, in job-arrival order. 
- for (segment, job) in assignments.chunks(TASKS_PER_JOB).zip(&jobs) { - assert_strict_rotation(segment, std::slice::from_ref(job), TASKS_PER_JOB); - } - - cancellation_token.cancel(); - scheduler_handle.await.expect("scheduler task panicked")?; - Ok(()) -} - -#[tokio::test(flavor = "multi_thread")] -async fn active_jobs_dispatch_in_round_robin_order() -> anyhow::Result<()> { - const NUM_JOBS: usize = 10; - const TASKS_PER_JOB: usize = 5; - const DUP_EVERY: usize = 4; - const DISPATCH_QUEUE_CAPACITY: usize = 32; - - let jobs = make_jobs(NUM_JOBS); - let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); - storage_client.push_ready_batch( - DEFAULT_SESSION_ID, - make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), - ); - - let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); - let config = make_config(NUM_JOBS, DISPATCH_QUEUE_CAPACITY); - let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); - - let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; - assert_no_more_assignments(&reader).await?; - - // All 10 jobs fit into the active job pool, so no job ever pends and dispatch follows the - // strict rotation: task 0 of every job in batch order, then task 1 of every job, and so on. The - // exact count of 50 (with no trailing assignments) also proves the in-batch duplicates were - // deduplicated. 
- assert_strict_rotation(&assignments, &jobs, TASKS_PER_JOB); - - cancellation_token.cancel(); - scheduler_handle.await.expect("scheduler task panicked")?; - Ok(()) -} - -#[tokio::test(flavor = "multi_thread")] -async fn pending_jobs_promote_and_schedule_round_robin() -> anyhow::Result<()> { - const ACTIVE_JOB_QUEUE_CAPACITY: usize = 10; - const NUM_JOBS: usize = 20; - const TASKS_PER_JOB: usize = 5; - const DUP_EVERY: usize = 5; - const DISPATCH_QUEUE_CAPACITY: usize = 32; - - let jobs = make_jobs(NUM_JOBS); - let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); - storage_client.push_ready_batch( - DEFAULT_SESSION_ID, - make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), - ); - - let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); - let config = make_config(ACTIVE_JOB_QUEUE_CAPACITY, DISPATCH_QUEUE_CAPACITY); - let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); - - let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; - assert_no_more_assignments(&reader).await?; - - let (active_jobs, pending_jobs) = jobs.split_at(ACTIVE_JOB_QUEUE_CAPACITY); - let (phase1, phase2) = assignments.split_at(ACTIVE_JOB_QUEUE_CAPACITY * TASKS_PER_JOB); - - // Phase 1: the first 10 jobs in batch order fill the active job pool and dispatch in strict - // rotation; the pending jobs must not appear while the active jobs still have tasks. - assert_strict_rotation(phase1, active_jobs, TASKS_PER_JOB); - - // Phase 2: once the active jobs exhaust, the 10 pending jobs are promoted and scheduled - // round-robin. The exact slot order after the retire-and-promote wave is an implementation - // detail of the rotation bookkeeping, so assert the round-robin property instead of one - // hard-coded sequence. 
- assert_round_robin_property(phase2, pending_jobs, TASKS_PER_JOB); - - cancellation_token.cancel(); - scheduler_handle.await.expect("scheduler task panicked")?; - Ok(()) -} - -#[tokio::test(flavor = "multi_thread")] -async fn commit_and_cleanup_dispatch_once_per_cycle() -> anyhow::Result<()> { - const NUM_ACTIVE_JOBS: usize = 4; - const TASKS_PER_JOB: usize = 3; - const NUM_FINALIZING_JOBS_PER_LANE: usize = 3; - const DISPATCH_QUEUE_CAPACITY: usize = 1024; - - let active_jobs = make_jobs(NUM_ACTIVE_JOBS); - let commit_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); - let cleanup_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); - - let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); - storage_client.push_ready_batch( - DEFAULT_SESSION_ID, - make_ready_batch(&active_jobs, TASKS_PER_JOB, 0), - ); - let mut commit_ready_batch = make_finalizing_batch(&commit_ready_jobs, TaskId::Commit); - // Duplicate one commit-ready entry within the batch: it must dispatch exactly once. - commit_ready_batch.push(commit_ready_batch[0]); - storage_client.push_commit_ready_batch(DEFAULT_SESSION_ID, commit_ready_batch); - storage_client.push_cleanup_ready_batch( - DEFAULT_SESSION_ID, - make_finalizing_batch(&cleanup_ready_jobs, TaskId::Cleanup), - ); - - let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); - let config = make_config(NUM_ACTIVE_JOBS, DISPATCH_QUEUE_CAPACITY); - let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); - - let num_assignments = NUM_ACTIVE_JOBS * TASKS_PER_JOB + 2 * NUM_FINALIZING_JOBS_PER_LANE; - let assignments = drain_n(&reader, num_assignments).await?; - assert_no_more_assignments(&reader).await?; - - // The rotation is [commit lane, cleanup lane, active jobs...], so every cycle dispatches - // exactly one commit task and one cleanup task (while their queues are non-empty), each lane - // drained FIFO, followed by one task of every active job. 
- let expected: Vec<(JobId, ResourceGroupId, TaskId)> = (0..TASKS_PER_JOB) - .flat_map(|round| { - let (commit_job_id, commit_resource_group_id) = commit_ready_jobs[round]; - let (cleanup_job_id, cleanup_resource_group_id) = cleanup_ready_jobs[round]; - std::iter::once((commit_job_id, commit_resource_group_id, TaskId::Commit)) - .chain(std::iter::once(( - cleanup_job_id, - cleanup_resource_group_id, - TaskId::Cleanup, - ))) - .chain(active_jobs.iter().map(move |&(job_id, resource_group_id)| { - (job_id, resource_group_id, TaskId::Index(round)) - })) - }) - .collect(); - assert_eq!(make_assignment_tuple(&assignments), expected); - - cancellation_token.cancel(); - scheduler_handle.await.expect("scheduler task panicked")?; - Ok(()) -} - /// # Returns /// /// A white-box scheduler wired to the given storage client and sink, to be driven by manual @@ -658,7 +500,8 @@ async fn tick_until( /// /// # Returns /// -/// The drained assignments in FIFO order on success. +/// The drained assignments in FIFO order on success, each paired with the session under which it +/// was dequeued. /// /// # Errors /// @@ -671,7 +514,7 @@ async fn tick_and_drain_n( scheduler: &mut TestScheduler, reader: &DispatchQueueReader, n: usize, -) -> anyhow::Result> { +) -> anyhow::Result> { let deadline = tokio::time::Instant::now() + DRAIN_DEADLINE; let mut assignments = Vec::with_capacity(n); while assignments.len() < n { @@ -682,8 +525,8 @@ async fn tick_and_drain_n( ); } scheduler.tick().await?; - while let Some((_session_id, assignment)) = reader.dequeue(Duration::ZERO).await? { - assignments.push(assignment); + while let Some((session_id, assignment)) = reader.dequeue(Duration::ZERO).await? 
{ + assignments.push((session_id, assignment)); } tokio::task::yield_now().await; } @@ -840,7 +683,12 @@ async fn assert_finalizing_ready_drops_jobs(finalizing_task_id: TaskId) -> anyho // remaining `job_a` assignments + full `job_p` assignments let num_assignments = NUM_PRE_FREEZE_ASSIGNMENTS + NUM_FINALIZED_JOBS + (TASKS_PER_JOB - 1) + TASKS_PER_JOB; - let assignments = tick_and_drain_n(&mut scheduler, &reader, num_assignments).await?; + let assignments: Vec = + tick_and_drain_n(&mut scheduler, &reader, num_assignments) + .await? + .into_iter() + .map(|(_session_id, assignment)| assignment) + .collect(); assert_no_further_assignments(&mut scheduler, &reader).await?; assert_eq!(scheduler.buffered_tasks.len(), 0); @@ -912,6 +760,169 @@ async fn assert_finalizing_ready_drops_jobs(finalizing_task_id: TaskId) -> anyho Ok(()) } +#[tokio::test(flavor = "multi_thread")] +async fn single_capacity_pool_schedules_jobs_serially() -> anyhow::Result<()> { + const NUM_JOBS: usize = 3; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 3; + const DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(1, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + // With an active job pool of capacity 1, round-robin degenerates to serial job FIFO: the + // rotation holds a single job at a time, so each job's tasks dispatch as one consecutive + // single-job rotation, in job-arrival order. 
+ for (segment, job) in assignments.chunks(TASKS_PER_JOB).zip(&jobs) { + assert_strict_rotation(segment, std::slice::from_ref(job), TASKS_PER_JOB); + } + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn active_jobs_dispatch_in_round_robin_order() -> anyhow::Result<()> { + const NUM_JOBS: usize = 10; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 4; + const DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(NUM_JOBS, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + // All 10 jobs fit into the active job pool, so no job ever pends and dispatch follows the + // strict rotation: task 0 of every job in batch order, then task 1 of every job, and so on. The + // exact count of 50 (with no trailing assignments) also proves the in-batch duplicates were + // deduplicated. 
+ assert_strict_rotation(&assignments, &jobs, TASKS_PER_JOB); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn pending_jobs_promote_and_schedule_round_robin() -> anyhow::Result<()> { + const ACTIVE_JOB_QUEUE_CAPACITY: usize = 10; + const NUM_JOBS: usize = 20; + const TASKS_PER_JOB: usize = 5; + const DUP_EVERY: usize = 5; + const DISPATCH_QUEUE_CAPACITY: usize = 32; + + let jobs = make_jobs(NUM_JOBS); + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&jobs, TASKS_PER_JOB, DUP_EVERY), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(ACTIVE_JOB_QUEUE_CAPACITY, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let assignments = drain_n(&reader, NUM_JOBS * TASKS_PER_JOB).await?; + assert_no_more_assignments(&reader).await?; + + let (active_jobs, pending_jobs) = jobs.split_at(ACTIVE_JOB_QUEUE_CAPACITY); + let (phase1, phase2) = assignments.split_at(ACTIVE_JOB_QUEUE_CAPACITY * TASKS_PER_JOB); + + // Phase 1: the first 10 jobs in batch order fill the active job pool and dispatch in strict + // rotation; the pending jobs must not appear while the active jobs still have tasks. + assert_strict_rotation(phase1, active_jobs, TASKS_PER_JOB); + + // Phase 2: once the active jobs exhaust, the 10 pending jobs are promoted and scheduled + // round-robin. The exact slot order after the retire-and-promote wave is an implementation + // detail of the rotation bookkeeping, so assert the round-robin property instead of one + // hard-coded sequence. 
+ assert_round_robin_property(phase2, pending_jobs, TASKS_PER_JOB); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn commit_and_cleanup_dispatch_once_per_cycle() -> anyhow::Result<()> { + const NUM_ACTIVE_JOBS: usize = 4; + const TASKS_PER_JOB: usize = 3; + const NUM_FINALIZING_JOBS_PER_LANE: usize = 3; + const DISPATCH_QUEUE_CAPACITY: usize = 1024; + + let active_jobs = make_jobs(NUM_ACTIVE_JOBS); + let commit_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); + let cleanup_ready_jobs = make_jobs(NUM_FINALIZING_JOBS_PER_LANE); + + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&active_jobs, TASKS_PER_JOB, 0), + ); + let mut commit_ready_batch = make_finalizing_batch(&commit_ready_jobs, TaskId::Commit); + // Duplicate one commit-ready entry within the batch: it must dispatch exactly once. + commit_ready_batch.push(commit_ready_batch[0]); + storage_client.push_commit_ready_batch(DEFAULT_SESSION_ID, commit_ready_batch); + storage_client.push_cleanup_ready_batch( + DEFAULT_SESSION_ID, + make_finalizing_batch(&cleanup_ready_jobs, TaskId::Cleanup), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let config = make_config(NUM_ACTIVE_JOBS, DISPATCH_QUEUE_CAPACITY); + let (scheduler_handle, cancellation_token) = spawn_scheduler(config, storage_client, writer); + + let num_assignments = NUM_ACTIVE_JOBS * TASKS_PER_JOB + 2 * NUM_FINALIZING_JOBS_PER_LANE; + let assignments = drain_n(&reader, num_assignments).await?; + assert_no_more_assignments(&reader).await?; + + // The rotation is [commit lane, cleanup lane, active jobs...], so every cycle dispatches + // exactly one commit task and one cleanup task (while their queues are non-empty), each lane + // drained FIFO, followed by one task of every active job. 
+ let expected: Vec<(JobId, ResourceGroupId, TaskId)> = (0..TASKS_PER_JOB) + .flat_map(|round| { + let (commit_job_id, commit_resource_group_id) = commit_ready_jobs[round]; + let (cleanup_job_id, cleanup_resource_group_id) = cleanup_ready_jobs[round]; + std::iter::once((commit_job_id, commit_resource_group_id, TaskId::Commit)) + .chain(std::iter::once(( + cleanup_job_id, + cleanup_resource_group_id, + TaskId::Cleanup, + ))) + .chain(active_jobs.iter().map(move |&(job_id, resource_group_id)| { + (job_id, resource_group_id, TaskId::Index(round)) + })) + }) + .collect(); + assert_eq!(make_assignment_tuple(&assignments), expected); + + cancellation_token.cancel(); + scheduler_handle.await.expect("scheduler task panicked")?; + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn cleanup_ready_drops_active_and_pending_jobs() -> anyhow::Result<()> { assert_finalizing_ready_drops_jobs(TaskId::Cleanup).await @@ -921,3 +932,94 @@ async fn cleanup_ready_drops_active_and_pending_jobs() -> anyhow::Result<()> { async fn commit_ready_drops_active_and_pending_jobs() -> anyhow::Result<()> { assert_finalizing_ready_drops_jobs(TaskId::Commit).await } + +#[tokio::test(flavor = "multi_thread")] +async fn session_bump_clears_buffered_tasks() -> anyhow::Result<()> { + const ACTIVE_JOB_QUEUE_CAPACITY: usize = 4; + const DISPATCH_QUEUE_CAPACITY: usize = 4; + const TASKS_PER_JOB: usize = 4; + const NEW_SESSION_ID: SessionId = DEFAULT_SESSION_ID + 1; + const NEW_TASKS_PER_JOB: usize = 2; + + let old_jobs = make_jobs(4); + let new_jobs = make_jobs(2); + + let storage_client = MockStorageClient::new(DEFAULT_SESSION_ID); + storage_client.push_ready_batch( + DEFAULT_SESSION_ID, + make_ready_batch(&old_jobs, TASKS_PER_JOB, 0), + ); + + let (writer, reader) = create_dispatch_queue(DISPATCH_QUEUE_CAPACITY, DEFAULT_SESSION_ID); + let mut scheduler = make_scheduler( + make_config(ACTIVE_JOB_QUEUE_CAPACITY, DISPATCH_QUEUE_CAPACITY), + storage_client.clone(), + writer, + ); + + // Step 1: 
ingest the old-session batch. The ingesting tick dispatches enough assignments to + // fill the dispatch queue (which the test never drains); the rest will stay in the buffer. + tick_until(&mut scheduler, |scheduler| { + !scheduler.buffered_tasks.is_empty() + }) + .await?; + assert_eq!(scheduler.active_jobs.len(), old_jobs.len()); + assert_eq!( + scheduler.buffered_tasks.len(), + old_jobs.len() * TASKS_PER_JOB - DISPATCH_QUEUE_CAPACITY, + ); + + // Step 2: bump the session on the storage side and deliver a batch under the new session. + storage_client.set_session(NEW_SESSION_ID); + storage_client.push_ready_batch( + NEW_SESSION_ID, + make_ready_batch(&new_jobs, NEW_TASKS_PER_JOB, 0), + ); + tick_until(&mut scheduler, |scheduler| { + scheduler.storage_session_id == NEW_SESSION_ID + && new_jobs + .iter() + .all(|(job_id, _)| scheduler.active_jobs.contains_key(job_id)) + }) + .await?; + + assert_eq!( + scheduler + .active_jobs + .keys() + .copied() + .collect::>(), + new_jobs + .iter() + .map(|&(job_id, _)| job_id) + .collect::>(), + ); + assert_eq!(scheduler.pending_jobs.len(), 0); + assert!( + scheduler.buffered_tasks.iter().all(|(job_id, _)| { + new_jobs + .iter() + .any(|&(new_job_id, _)| *job_id == new_job_id) + }), + "an old-session task survived the session bump", + ); + + // The session bump drained the dispatch queue: the frozen old-session assignments are gone, and + // draining yields exactly the new jobs' tasks in strict rotation, each paired with the new + // session. 
+ let num_new_assignments = new_jobs.len() * NEW_TASKS_PER_JOB; + let session_stamped = tick_and_drain_n(&mut scheduler, &reader, num_new_assignments).await?; + assert_no_further_assignments(&mut scheduler, &reader).await?; + + for &(session_id, _) in &session_stamped { + assert_eq!(session_id, NEW_SESSION_ID); + } + + let assignments: Vec = session_stamped + .into_iter() + .map(|(_session_id, assignment)| assignment) + .collect(); + assert_strict_rotation(&assignments, &new_jobs, NEW_TASKS_PER_JOB); + + Ok(()) +} From 4c300f37fb6946294e78649e4689fd9329b6d7e0 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 4 Jun 2026 21:46:37 -0400 Subject: [PATCH 14/14] Done with implementation and testing. --- Cargo.lock | 1 + components/spider-scheduler/Cargo.toml | 3 +- .../core_impl/round_robin/implementation.rs | 184 ++++++++++++++++-- .../src/core_impl/round_robin/tests.rs | 25 +++ components/spider-scheduler/src/error.rs | 3 + 5 files changed, 202 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6d96c4b7..4fc8b10b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1513,6 +1513,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", + "tracing", ] [[package]] diff --git a/components/spider-scheduler/Cargo.toml b/components/spider-scheduler/Cargo.toml index a928fddf..042dcd49 100644 --- a/components/spider-scheduler/Cargo.toml +++ b/components/spider-scheduler/Cargo.toml @@ -10,11 +10,12 @@ path = "src/lib.rs" [dependencies] async-channel = "2.3.1" async-trait = "0.1.89" +serde = { version = "1.0.228", features = ["derive"] } spider-core = { path = "../spider-core" } thiserror = "2.0.18" tokio = { version = "1.52.3", features = ["macros", "rt", "sync", "time"] } tokio-util = "0.7.18" -serde = { version = "1.0.228", features = ["derive"] } +tracing = { version = "0.1.41", default-features = false, features = ["std"] } [dev-dependencies] anyhow = "1.0.102" diff --git a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs 
b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs index 68fb27c9..51dbb160 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/implementation.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/implementation.rs @@ -3,7 +3,7 @@ use std::{ collections::{HashMap, HashSet, VecDeque}, - time::Duration, + time::{Duration, SystemTime}, }; use async_trait::async_trait; @@ -23,7 +23,7 @@ use crate::{ }; /// The configuration of the round-robin scheduler core. -#[derive(Deserialize)] +#[derive(Debug, Deserialize)] pub struct RoundRobinConfig { /// The capacity of the active job queue. The scheduler will make task assignments from these /// jobs in a round-robin manner. @@ -224,6 +224,7 @@ pub(super) struct RoundRobin< pub(super) cancellation_token: CancellationToken, pub(super) config: RoundRobinConfig, pub(super) storage_session_id: SessionId, + pub(super) buffered_tasks: HashSet<(JobId, TaskId)>, pub(super) active_jobs: HashMap, @@ -237,6 +238,7 @@ pub(super) struct RoundRobin< pub(super) cleanup_ready_jobs: VecDeque<(JobId, ResourceGroupId)>, pub(super) finalizing_jobs: HashSet, + pub(super) finalizing_job_queue: VecDeque<(JobId, SystemTime)>, pub(super) inbound_queue_reader: AsyncInboundQueueReader, } @@ -272,6 +274,7 @@ impl< let finalizing_jobs = HashSet::with_capacity( config.commit_ready_task_capacity + config.cleanup_ready_task_capacity, ); + let finalizing_job_queue = VecDeque::new(); let inbound_queue_reader = AsyncInboundQueueReader::new(storage_client); Self { sink, @@ -287,6 +290,7 @@ impl< commit_ready_jobs, cleanup_ready_jobs, finalizing_jobs, + finalizing_job_queue, inbound_queue_reader, } } @@ -300,9 +304,12 @@ impl< /// /// * Forwards [`Self::consume_inbound_poll_result`]'s return values on failure. /// * Forwards [`Self::make_schedule_decisions`]'s return values on failure. + /// * Forwards [`Self::retire_expired_finalizing_jobs`]'s return values on failure. 
pub(super) async fn tick(&mut self) -> Result<(), SchedulerError> { + tracing::info!("Starting scheduling tick."); self.consume_inbound_poll_result().await?; self.make_schedule_decisions().await?; + self.retire_expired_finalizing_jobs()?; Ok(()) } @@ -327,16 +334,25 @@ impl< /// /// * Forwards [`Self::tick`]'s return values on failure. async fn run(mut self) -> Result<(), SchedulerError> { + tracing::info!( + config = ? self.config, + init_session_id = self.storage_session_id, + "Round-robin scheduler started." + ); let tick_interval = Duration::from_millis(self.config.tick_interval_ms); loop { let now = tokio::time::Instant::now(); let cancellation_token = self.cancellation_token.clone(); select! { () = cancellation_token.cancelled() => { + tracing::info!("Round-robin scheduler cancelled. Shutting down."); return Ok(()); } result = self.tick() => { - let () = result?; + result.inspect_err(|err| tracing::error!( + err = % err, + "Round-robin scheduler exits on error." + ))?; } } let elapsed = now.elapsed(); @@ -358,6 +374,7 @@ impl< self.commit_ready_jobs.clear(); self.cleanup_ready_jobs.clear(); self.finalizing_jobs.clear(); + self.finalizing_job_queue.clear(); self.active_job_queue = Self::new_active_job_queue(self.config.active_job_queue_capacity); self.active_job_queue_round_robin_cursor = 0; @@ -372,6 +389,7 @@ impl< /// /// * [`SchedulerError::Internal`] if the given job is not currently active. fn retire_active_job(&mut self, job_id: JobId) -> Result<(), SchedulerError> { + tracing::info!(job_id = ? job_id, "Retiring active job."); if let Some(index) = self.active_job_queue.iter().position(|entry| match entry { RoundRobinSlot::Job(id) => *id == job_id, _ => false, @@ -392,6 +410,10 @@ impl< } if let Some(next_pending_job) = self.pop_next_pending_job() { + tracing::info!( + job_id = ? next_pending_job.job_id, + "Pending job promoted to active job." 
+ ); self.active_job_queue .push(RoundRobinSlot::Job(next_pending_job.job_id)); self.active_jobs @@ -416,11 +438,50 @@ impl< /// Removes all of the given job's queued tasks from the buffered-task set. fn discard_job_tasks(&mut self, job_entry: JobTaskQueue) { + tracing::info!( + job_id = ? job_entry.job_id, + num_tasks = job_entry.task_ids.len(), + "Discarding job tasks." + ); for task_id in job_entry.task_ids { self.buffered_tasks.remove(&(job_entry.job_id, task_id)); } } + /// Inserts a job as it is considered finalizing (commit-ready or cleanup-ready). Once inserted, + /// any further tasks for the job will be ignored until this queue is reset. + fn mark_job_finalizing(&mut self, job_id: JobId) { + if self.finalizing_jobs.insert(job_id) { + self.finalizing_job_queue + .push_back((job_id, SystemTime::now())); + } + } + + /// Retires expired finalizing jobs. + /// + /// A finalizing job is considered expired once it has remained in the finalizing state for more + /// than 6 hours. This timeout is currently hard-coded but may be made configurable through + /// [`RoundRobinConfig`] in the future. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`SystemTime::elapsed`]'s return values on failure. + fn retire_expired_finalizing_jobs(&mut self) -> Result<(), SchedulerError> { + const EXPIRATION_TIME: Duration = Duration::from_hours(6); + while let Some((job_id, insertion_time)) = self.finalizing_job_queue.front() { + if insertion_time.elapsed()? > EXPIRATION_TIME { + tracing::info!(job_id = ? job_id, "Finalizing job retired."); + self.finalizing_jobs.remove(job_id); + self.finalizing_job_queue.pop_front(); + } else { + break; + } + } + Ok(()) + } + /// Loads polled inbound entries into the scheduler's internal buffers. 
/// /// If the polled session is newer than the current session, all existing placement states are @@ -439,7 +500,8 @@ impl< /// * [`SchedulerError::InvalidSessionId`] if the polled session is older than the current /// session. /// * Forwards [`DispatchQueueSink::bump_session_id`]'s return values on failure. - /// * Forwards [`Self::retire_active_job`]'s return values on failure. + /// * Forwards [`Self::enqueue_commit_ready_entries`]'s return values on failure. + /// * Forwards [`Self::enqueue_cleanup_ready_entries`]'s return values on failure. async fn ingest_inbound_entries( &mut self, curr_session_id: SessionId, @@ -452,13 +514,40 @@ impl< return Err(SchedulerError::InvalidSessionId(storage_session_id)); } if storage_session_id > curr_session_id { + tracing::info!( + curr_session_id = ? curr_session_id, + storage_session_id = ? storage_session_id, + "New session detected. Clearing existing placement state and bumping dispatch \ + queue session." + ); self.storage_session_id = storage_session_id; self.clear(); self.sink.bump_session_id(storage_session_id).await?; } - // Load commit ready tasks and cleanup ready tasks first to avoid loading a job that - // is already cancelled or commit-ready. + // Load commit-ready tasks and cleanup-ready tasks first to avoid loading a job that is + // already finalizing. + self.enqueue_commit_ready_entries(commit_ready_entries)?; + self.enqueue_cleanup_ready_entries(cleanup_ready_entries)?; + self.enqueue_ready_entries(ready_entries); + + Ok(()) + } + + /// Enqueues polled commit-ready entries: each entry's job is marked finalizing, queued for a + /// commit-task assignment, and removed from the active or pending set. + /// + /// Entries whose tasks are already buffered are ignored. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::retire_active_job`]'s return values on failure. 
+ fn enqueue_commit_ready_entries( + &mut self, + commit_ready_entries: Vec, + ) -> Result<(), SchedulerError> { for inbound_entry in commit_ready_entries { if !self .buffered_tasks @@ -466,7 +555,13 @@ impl< { continue; } - self.finalizing_jobs.insert(inbound_entry.job_id); + + tracing::info!( + job_id = ? inbound_entry.job_id, + "Commit-ready task received. Finalizing job." + ); + + self.mark_job_finalizing(inbound_entry.job_id); self.commit_ready_jobs .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); @@ -480,6 +575,23 @@ impl< } } + Ok(()) + } + + /// Enqueues polled cleanup-ready entries: each entry's job is marked finalizing, queued for a + /// cleanup-task assignment, and removed from the active or pending set. + /// + /// Entries whose tasks are already buffered are ignored. + /// + /// # Errors + /// + /// Returns an error if: + /// + /// * Forwards [`Self::retire_active_job`]'s return values on failure. + fn enqueue_cleanup_ready_entries( + &mut self, + cleanup_ready_entries: Vec, + ) -> Result<(), SchedulerError> { for inbound_entry in cleanup_ready_entries { if !self .buffered_tasks @@ -487,7 +599,13 @@ impl< { continue; } - self.finalizing_jobs.insert(inbound_entry.job_id); + + tracing::info!( + job_id = ? inbound_entry.job_id, + "Cleanup-ready task received. Finalizing job." + ); + + self.mark_job_finalizing(inbound_entry.job_id); self.cleanup_ready_jobs .push_back((inbound_entry.job_id, inbound_entry.resource_group_id)); @@ -501,8 +619,19 @@ impl< } } + Ok(()) + } + + /// Enqueues polled regular ready entries into their jobs' task queues + /// + /// Entries of finalizing jobs and entries whose tasks are already buffered are ignored. + fn enqueue_ready_entries(&mut self, ready_entries: Vec) { for inbound_entry in ready_entries { if self.finalizing_jobs.contains(&inbound_entry.job_id) { + tracing::info!( + job_id = ? inbound_entry.job_id, + "Ready task received for a finalizing job. Ignored." 
+ ); continue; } if !self @@ -511,6 +640,13 @@ impl< { continue; } + + tracing::debug!( + job_id = ? inbound_entry.job_id, + task_id = ? inbound_entry.task_id, + "Inbound task received." + ); + if let Some(active_job) = self.active_jobs.get_mut(&inbound_entry.job_id) { active_job.enqueue(inbound_entry.task_id); continue; @@ -519,7 +655,12 @@ impl< pending_job.enqueue(inbound_entry.task_id); continue; } + if self.active_jobs.len() < self.config.active_job_queue_capacity { + tracing::info!( + job_id = ? inbound_entry.job_id, + "New job received. Placing in active job queue." + ); self.active_jobs.insert( inbound_entry.job_id, JobTaskQueue::new( @@ -532,6 +673,11 @@ impl< .push(RoundRobinSlot::Job(inbound_entry.job_id)); continue; } + + tracing::info!( + job_id = ? inbound_entry.job_id, + "New job received. Placing in pending job queue." + ); self.pending_jobs.insert( inbound_entry.job_id, JobTaskQueue::new( @@ -542,8 +688,6 @@ impl< ); self.pending_job_queue.push_back(inbound_entry.job_id); } - - Ok(()) } /// Consumes the in-flight inbound poll if it has completed, ingesting its entries and starting @@ -569,6 +713,7 @@ impl< commit_ready_entries, cleanup_ready_entries, } => { + tracing::info!("Inbound poll completed."); self.ingest_inbound_entries( curr_session_id, storage_session_id, @@ -600,10 +745,11 @@ impl< /// * Forwards [`DispatchQueueSink::enqueue`]'s return values on failure. /// * Forwards [`Self::retire_active_job`]'s return values on failure. 
async fn make_schedule_decisions(&mut self) -> Result<(), SchedulerError> { - let mut remaining_dispatch_slots = self + let dispatch_slots = self .config .dispatch_queue_capacity .saturating_sub(self.sink.size()); + let mut remaining_dispatch_slots = dispatch_slots; while remaining_dispatch_slots > 0 && !self.buffered_tasks.is_empty() { if self.active_job_queue_round_robin_cursor >= self.active_job_queue.len() { self.active_job_queue_round_robin_cursor = 0; @@ -635,7 +781,6 @@ impl< }) .await?; self.buffered_tasks.remove(&(job_id, TaskId::Cleanup)); - self.finalizing_jobs.remove(&job_id); remaining_dispatch_slots -= 1; } RoundRobinSlot::CommitReady => { @@ -651,7 +796,6 @@ impl< }) .await?; self.buffered_tasks.remove(&(job_id, TaskId::Commit)); - self.finalizing_jobs.remove(&job_id); remaining_dispatch_slots -= 1; } RoundRobinSlot::Job(job_id) => { @@ -677,6 +821,12 @@ impl< } } + tracing::info!( + dispatch_slots = dispatch_slots, + num_task_assignments_enqueued = dispatch_slots - remaining_dispatch_slots, + "Decision-making loop completed." + ); + Ok(()) } @@ -915,6 +1065,7 @@ impl if max_ready_entries == 0 && max_commit_ready_entries == 0 && max_cleanup_ready_entries == 0 { + tracing::info!("Inbound poll skipped: all entry limits are 0."); return Ok(()); } @@ -954,6 +1105,13 @@ impl cleanup_ready_handle, }); + tracing::info!( + max_ready_entries = ? max_ready_entries, + max_commit_ready_entries = ? max_commit_ready_entries, + max_cleanup_ready_entries = ? max_cleanup_ready_entries, + "Inbound poll initiated." + ); + Ok(()) } } diff --git a/components/spider-scheduler/src/core_impl/round_robin/tests.rs b/components/spider-scheduler/src/core_impl/round_robin/tests.rs index 86d4b092..e63fe532 100644 --- a/components/spider-scheduler/src/core_impl/round_robin/tests.rs +++ b/components/spider-scheduler/src/core_impl/round_robin/tests.rs @@ -569,6 +569,9 @@ async fn assert_no_further_assignments( /// 3. 
Asserts both jobs leave the placement state with their buffered regular tasks discarded. /// 4. Unfreezes and asserts the drained sequence: each finalized job dispatches its finalizing task /// exactly once and no further regular task, while the surviving jobs complete in FIFO order. +/// 5. Re-delivers regular ready tasks for the finalized jobs alongside a fresh canary job. Asserts +/// the re-delivered tasks are ignored (the finalizing gate persists after the finalizing tasks +/// are dispatched) while the canary job schedules normally. /// /// # Errors /// @@ -756,6 +759,28 @@ async fn assert_finalizing_ready_drops_jobs(finalizing_task_id: TaskId) -> anyho assert!(scheduler.pending_job_queue.is_empty()); assert!(scheduler.commit_ready_jobs.is_empty()); assert!(scheduler.cleanup_ready_jobs.is_empty()); + assert_eq!(scheduler.finalizing_jobs.len(), NUM_FINALIZED_JOBS); + + assert!(scheduler.finalizing_jobs.contains(&job_b.0)); + assert!(scheduler.finalizing_jobs.contains(&job_q.0)); + + // Step 5: The finalizing gate remains active after the finalizing tasks have been dispatched, + // so re-delivered regular tasks for finalized jobs must be ignored. A fresh canary job is + // included in the same batch. Since a batch is ingested atomically within a single tick, + // successful dispatch of the canary's tasks proves that the finalized jobs' entries have + // already been processed (and ignored), rather than still being in flight. + let canary_jobs = make_jobs(1); + let mut late_batch = make_ready_batch(&[job_b, job_q], TASKS_PER_JOB, 0); + late_batch.extend(make_ready_batch(&canary_jobs, TASKS_PER_JOB, 0)); + storage_client.push_ready_batch(DEFAULT_SESSION_ID, late_batch); + + let late_assignments: Vec<_> = tick_and_drain_n(&mut scheduler, &reader, TASKS_PER_JOB) + .await? 
+ .into_iter() + .map(|(_session_id, assignment)| assignment) + .collect(); + assert_strict_rotation(&late_assignments, &canary_jobs, TASKS_PER_JOB); + assert_no_further_assignments(&mut scheduler, &reader).await?; Ok(()) } diff --git a/components/spider-scheduler/src/error.rs b/components/spider-scheduler/src/error.rs index 50851809..34bb631e 100644 --- a/components/spider-scheduler/src/error.rs +++ b/components/spider-scheduler/src/error.rs @@ -37,4 +37,7 @@ pub enum SchedulerError { #[error("async result not ready")] ResultNotReady, + + #[error(transparent)] + SystemTime(#[from] std::time::SystemTimeError), }