-
Notifications
You must be signed in to change notification settings - Fork 10
feat(spider-scheduler): Add scheduler crate skeleton with trait and type abstractions. #330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
LinZhihao-723
wants to merge
5
commits into
y-scope:main
Choose a base branch
from
LinZhihao-723:scheduler-skeleton
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
ac0dd77
WIP
LinZhihao-723 dced92e
Done.
LinZhihao-723 7e87422
Merge branch 'main' into scheduler-skeleton
LinZhihao-723 9c436bd
Update dispatch queue's trait.
LinZhihao-723 2013cbf
Merge branch 'scheduler-skeleton' of https://github.com/LinZhihao-723…
LinZhihao-723 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| [package] | ||
| name = "spider-scheduler" | ||
| version = "0.1.0" | ||
| edition = "2024" | ||
|
|
||
| [lib] | ||
| name = "spider_scheduler" | ||
| path = "src/lib.rs" | ||
|
|
||
| [dependencies] | ||
| async-trait = "0.1.89" | ||
| spider-core = { path = "../spider-core" } | ||
| thiserror = "2.0.18" | ||
| tokio-util = "0.7.18" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| //! The abstract core of a Spider scheduler. | ||
|
|
||
| use async_trait::async_trait; | ||
|
|
||
| use crate::{ | ||
| dispatch_queue::DispatchQueueSink, | ||
| error::SchedulerError, | ||
| storage_client::SchedulerStorageClient, | ||
| }; | ||
|
|
||
| /// An abstracted core for a scheduling algorithm. | ||
| /// | ||
| /// A core owns its decision loop: it polls the inbound queue through a [`SchedulerStorageClient`], | ||
| /// applies its algorithm (reading storage as needed for placement), and writes assignments to a | ||
| /// [`DispatchQueueSink`]. Modeling the algorithm as a trait lets different scheduling strategies | ||
| /// share the same runtime entry point. | ||
| /// | ||
| /// Implementors must be [`Send`], as required by the supertrait bound. | ||
| #[async_trait] | ||
| pub trait SchedulerCore: Send { | ||
| /// The storage client used by the core to poll and read for placement decisions. | ||
| type StorageClient: SchedulerStorageClient; | ||
|
|
||
| /// The dispatch sink the core writes assignments to. | ||
| type Sink: DispatchQueueSink; | ||
|
|
||
| /// Runs the scheduling loop until `cancellation_token` is triggered. | ||
| /// | ||
| /// The core polls the inbound queue through `storage_client`, applies its scheduling algorithm, | ||
| /// and writes assignments to `sink`, repeating until `cancellation_token` is triggered, at | ||
| /// which point it returns. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `storage_client` - The storage client used to poll the inbound queue and read state for | ||
| /// placement. Taken by value, so the core owns it for the duration of the call. | ||
| /// * `sink` - The dispatch sink that assignments are written to. Taken by value. | ||
| /// * `cancellation_token` - The token to signal the scheduling loop to stop. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// `Ok(())` once the loop has stopped in response to `cancellation_token`. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns a [`SchedulerError`] instance indicating an irrecoverable error. | ||
| async fn run( | ||
| &mut self, | ||
| storage_client: Self::StorageClient, | ||
| sink: Self::Sink, | ||
| cancellation_token: tokio_util::sync::CancellationToken, | ||
| ) -> Result<(), SchedulerError>; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| //! The dispatching queue that decouples the scheduler core's placement decisions from the | ||
| //! execution-manager-facing service. | ||
|
|
||
| use std::time::Duration; | ||
|
|
||
| use async_trait::async_trait; | ||
| use spider_core::types::id::SessionId; | ||
|
|
||
| use crate::{error::SchedulerError, types::TaskAssignment}; | ||
|
|
||
| /// The writer side of the dispatching queue used by the scheduler core. | ||
| /// | ||
| /// Implementors must be [`Send`], [`Sync`], and [`Clone`]. | ||
| /// | ||
| /// NOTE(review): clones presumably refer to the same underlying queue — confirm with | ||
| /// implementations. | ||
| #[async_trait] | ||
| pub trait DispatchQueueSink: Send + Sync + Clone { | ||
| /// Enqueues a task assignment for execution managers to consume. | ||
| /// | ||
| /// NOTE(review): the crate-level docs describe the queue as bounded with back-pressure, so this | ||
| /// call presumably waits while the queue is full — confirm with implementations. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `assignment` - The task assignment to enqueue. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. | ||
| async fn enqueue(&self, assignment: TaskAssignment) -> Result<(), SchedulerError>; | ||
|
|
||
| /// Bumps the session ID and invalidates all queued task assignments. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `new_session_id` - The new session ID. Must be greater than the current session ID. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. | ||
| /// * [`SchedulerError::InvalidSessionId`] if the new session ID is not greater than the current | ||
| /// session ID. | ||
| async fn bump_session_id(&self, new_session_id: SessionId) -> Result<(), SchedulerError>; | ||
|
|
||
| /// Reports the current size of the dispatch queue. | ||
| /// | ||
| /// Unlike the other methods of this trait, this one is synchronous and infallible. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// The current size of the dispatch queue (presumably the number of queued task assignments — | ||
| /// confirm with implementations). | ||
| fn size(&self) -> usize; | ||
| } | ||
|
|
||
| /// The reader side of the dispatching queue, drained by the execution-manager-facing service. | ||
| /// | ||
| /// Implementors must be [`Send`], [`Sync`], and [`Clone`]. | ||
| #[async_trait] | ||
| pub trait DispatchQueueSource: Send + Sync + Clone { | ||
| /// Dequeues the next task assignment for an execution manager to execute. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `wait_time` - The maximum amount of time to wait for a task assignment. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// `None` if no task assignment is available within the specified wait time, or a tuple | ||
| /// containing: | ||
| /// | ||
| /// * The [`SessionId`] of the storage session associated with the assignment. | ||
| /// * The next [`TaskAssignment`] ready to execute. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`SchedulerError::DispatchQueueClosed`] if the dispatching queue is closed. | ||
| async fn dequeue( | ||
| &self, | ||
| wait_time: Duration, | ||
| ) -> Result<Option<(SessionId, TaskAssignment)>, SchedulerError>; | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| //! The error types used in this crate. | ||
|
|
||
| use spider_core::types::id::{JobId, SessionId}; | ||
|
|
||
| /// Errors returned by [`crate::storage_client::SchedulerStorageClient`] operations. | ||
| /// | ||
| /// The `#[error(...)]` attribute on each variant supplies its `Display` message. | ||
| #[derive(Debug, thiserror::Error)] | ||
| pub enum StorageClientError { | ||
| /// The inbound queue is closed and can no longer yield ready entries. | ||
| #[error("inbound queue is closed")] | ||
| InboundClosed, | ||
|
|
||
| /// No job with the requested identifier exists. | ||
| /// | ||
| /// Carries the [`JobId`] reported as missing, rendered with its `Debug` formatting in the | ||
| /// error message. | ||
| #[error("job not found: {0:?}")] | ||
| JobNotFound(JobId), | ||
| } | ||
|
|
||
| /// Errors returned by the scheduler runtime and its components. | ||
| /// | ||
| /// The `#[error(...)]` attribute on each variant supplies its `Display` message. | ||
| #[derive(Debug, thiserror::Error)] | ||
| pub enum SchedulerError { | ||
| /// Forwarded from the storage client. | ||
| /// | ||
| /// `#[error(transparent)]` delegates `Display` and `source` to the wrapped | ||
| /// [`StorageClientError`], and `#[from]` generates `From<StorageClientError>` so that `?` | ||
| /// converts storage errors into this variant. | ||
| #[error(transparent)] | ||
| Storage(#[from] StorageClientError), | ||
|
|
||
| /// The dispatching queue is closed and can no longer accept assignments. | ||
| #[error("dispatching queue is closed")] | ||
| DispatchQueueClosed, | ||
|
|
||
| /// The session ID is invalid. | ||
| /// | ||
| /// Carries a [`SessionId`], rendered with its `Debug` formatting in the error message. | ||
| /// NOTE(review): presumably the rejected new session ID rather than the current one — confirm. | ||
| #[error("invalid session ID: {0:?}")] | ||
| InvalidSessionId(SessionId), | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| //! Trait and type abstractions for the Spider scheduler. | ||
| //! | ||
| //! The scheduler is the serial decision maker that turns ready tasks discovered by the storage | ||
| //! layer into assignments for execution managers. It owns placement and ordering policy, not | ||
| //! dependency resolution: storage decides *what* is ready, and the scheduler decides *in what | ||
| //! order* and *with what throttling* ready tasks are offered to the fleet. | ||
| //! | ||
| //! The crate defines three trait seams wired into a single pipeline — a storage client that polls | ||
| //! the ready queue, a core that makes serial decisions, and a dispatching queue that fans those | ||
| //! decisions out to execution managers: | ||
| //! | ||
| //! ```text | ||
| //! storage ── authoritative ready queue (owned by the storage layer, not this crate) | ||
| //! │ | ||
| //! │ poll_ready / poll_commit_ready / poll_cleanup_ready (SchedulerStorageClient) | ||
| //! ▼ | ||
| //! ┌───────────────────┐ | ||
| //! │ SchedulerCore │ serial loop: poll → decide → enqueue | ||
| //! └───────────────────┘ | ||
| //! │ | ||
| //! │ enqueue (DispatchQueueSink — writer side) | ||
| //! ▼ | ||
| //! ┌───────────────────┐ | ||
| //! │ dispatch queue │ bounded SPMC; a full queue back-pressures the core | ||
| //! └───────────────────┘ | ||
| //! │ | ||
| //! │ dequeue (DispatchQueueSource — reader side) | ||
| //! ▼ | ||
| //! ┌───────────────────┐ | ||
| //! │ scheduler service │ ──▶ execution managers (concurrent fan-out) | ||
| //! └───────────────────┘ | ||
| //! ``` | ||
|
|
||
| pub mod core; | ||
| pub mod dispatch_queue; | ||
| pub mod error; | ||
| pub mod storage_client; | ||
| pub mod types; | ||
|
|
||
| pub use crate::{ | ||
| core::SchedulerCore, | ||
| dispatch_queue::{DispatchQueueSink, DispatchQueueSource}, | ||
| error::{SchedulerError, StorageClientError}, | ||
| storage_client::SchedulerStorageClient, | ||
| types::{InboundEntry, TaskAssignment}, | ||
| }; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| //! The scheduler's view of the storage layer, abstracting inbound polling and placement-time reads. | ||
|
|
||
| use std::time::Duration; | ||
|
|
||
| use async_trait::async_trait; | ||
| use spider_core::{ | ||
| job::JobState, | ||
| types::id::{JobId, SessionId}, | ||
| }; | ||
|
|
||
| use crate::{error::StorageClientError, types::InboundEntry}; | ||
|
|
||
| /// The scheduler's view of the storage layer. | ||
| /// | ||
| /// Abstracts the storage-owned inbound queue and the read-only queries a scheduling algorithm | ||
| /// needs to make placement decisions. Modeled as a trait so the scheduler runtime can be driven by | ||
| /// a real storage client in production or a mock in tests. | ||
| /// | ||
| /// Implementors must be [`Send`], [`Sync`], and [`Clone`]. | ||
| /// | ||
| /// NOTE(review): the `poll_*` methods do not specify what is returned when `wait` elapses with no | ||
| /// ready entries (presumably an empty `Vec`) — confirm with implementations. | ||
| #[async_trait] | ||
| pub trait SchedulerStorageClient: Send + Sync + Clone { | ||
| /// Polls the regular-task lane of the storage-owned inbound queue for ready tasks. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `max_items` - The maximum number of entries to return from a single poll. | ||
| /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// A tuple on success, containing: | ||
| /// | ||
| /// * The [`SessionId`] of the storage session the poll was served under. | ||
| /// * The ready regular tasks drained from the lane. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`StorageClientError::InboundClosed`] if the regular-task lane is closed and can no longer | ||
| /// yield entries. | ||
| async fn poll_ready( | ||
| &self, | ||
| max_items: usize, | ||
| wait: Duration, | ||
| ) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>; | ||
|
|
||
| /// Polls the commit-task lane of the storage-owned inbound queue for ready tasks. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `max_items` - The maximum number of entries to return from a single poll. | ||
| /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// A tuple on success, containing: | ||
| /// | ||
| /// * The [`SessionId`] of the storage session the poll was served under. | ||
| /// * The ready commit tasks drained from the lane. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`StorageClientError::InboundClosed`] if the commit-task lane is closed and can no longer | ||
| /// yield entries. | ||
| async fn poll_commit_ready( | ||
| &self, | ||
| max_items: usize, | ||
| wait: Duration, | ||
| ) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>; | ||
|
|
||
| /// Polls the cleanup-task lane of the storage-owned inbound queue for ready tasks. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `max_items` - The maximum number of entries to return from a single poll. | ||
| /// * `wait` - The maximum duration to block waiting for ready entries on the storage side. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// A tuple on success, containing: | ||
| /// | ||
| /// * The [`SessionId`] of the storage session the poll was served under. | ||
| /// * The ready cleanup tasks drained from the lane. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`StorageClientError::InboundClosed`] if the cleanup-task lane is closed and can no longer | ||
| /// yield entries. | ||
| async fn poll_cleanup_ready( | ||
| &self, | ||
| max_items: usize, | ||
| wait: Duration, | ||
| ) -> Result<(SessionId, Vec<InboundEntry>), StorageClientError>; | ||
|
|
||
| /// Reads the current state of a job. | ||
| /// | ||
| /// # Parameters | ||
| /// | ||
| /// * `job_id` - The identifier of the job to query. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// The job's current [`JobState`] on success. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`StorageClientError::JobNotFound`] if no job with the given identifier exists. | ||
| async fn job_state(&self, job_id: JobId) -> Result<JobState, StorageClientError>; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| //! The data types the scheduler exchanges with the storage layer and execution managers. | ||
|
|
||
| use spider_core::types::id::{JobId, ResourceGroupId, TaskId}; | ||
|
|
||
| /// A ready task drained from the storage-owned inbound queue. | ||
| /// | ||
| /// The storage client flattens storage's three ready lanes (regular, commit, and cleanup tasks) | ||
| /// into this uniform entry, resolving each to its [`TaskId`] so the scheduler core can treat every | ||
| /// ready task identically. | ||
| /// | ||
| /// The entry holds only identifiers; it derives `Copy` and compares by value. | ||
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] | ||
| pub struct InboundEntry { | ||
| /// The resource group that owns the job. | ||
| pub resource_group_id: ResourceGroupId, | ||
|
|
||
| /// The job the task belongs to. | ||
| pub job_id: JobId, | ||
|
|
||
| /// The ready task. | ||
| pub task_id: TaskId, | ||
| } | ||
|
|
||
| /// A task placement decision written by the scheduler core to the dispatching queue. | ||
| /// | ||
| /// Carries the same three identifiers as [`InboundEntry`]; it derives `Copy` and compares by | ||
| /// value. | ||
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] | ||
| pub struct TaskAssignment { | ||
| /// The resource group that owns the job. | ||
| pub resource_group_id: ResourceGroupId, | ||
|
|
||
| /// The job the task belongs to. | ||
| pub job_id: JobId, | ||
|
|
||
| /// The task to dispatch. | ||
| pub task_id: TaskId, | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should probably have a batched enqueue method for better performance.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current planned implementation won't benefit from a batch operation: