-
Notifications
You must be signed in to change notification settings - Fork 10
feat(spider-storage): Adds job recovery for restarted storage cache layer. #339
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: storage-service-dev
Are you sure you want to change the base?
Changes from all commits
6e812cc
86c7bea
27091f0
85a5130
100129e
d95057f
e030af8
0683275
6e6727d
e39e319
a069c77
29bbb6f
8015294
a667a79
da87450
e3db65f
5776c7d
022223f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -172,6 +172,30 @@ impl TaskGraph { | |
| &self.outputs | ||
| } | ||
|
|
||
| /// Restores graph outputs from persisted job outputs. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns an error if: | ||
| /// | ||
| /// * [`InternalError::TaskOutputsLengthMismatch`] if the number of persisted outputs does not | ||
| /// match the number of graph outputs. | ||
| pub async fn restore_outputs( | ||
| &self, | ||
| persisted_outputs: Vec<TaskOutput>, | ||
| ) -> Result<(), InternalError> { | ||
| if persisted_outputs.len() != self.outputs.len() { | ||
| return Err(InternalError::TaskOutputsLengthMismatch( | ||
| self.outputs.len(), | ||
| persisted_outputs.len(), | ||
| )); | ||
| } | ||
| for (output_reader, output) in self.outputs.iter().zip(persisted_outputs) { | ||
| *output_reader.writer().write().await = Some(output); | ||
| } | ||
|
Comment on lines
+193
to
+195
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have you read this change carefully (assume it's done by your coding agent)?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually after my fight with codex. The original is to directly add a |
||
| Ok(()) | ||
| } | ||
|
|
||
| #[must_use] | ||
| pub const fn has_commit_task(&self) -> bool { | ||
| self.commit_task.is_some() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,9 +5,10 @@ use const_format::formatcp; | |
| use secrecy::ExposeSecret; | ||
| use spider_core::{ | ||
| job::JobState, | ||
| task::TaskGraph, | ||
| types::{ | ||
| id::{ExecutionManagerId, JobId, ResourceGroupId, SessionId}, | ||
| io::TaskOutput, | ||
| io::{TaskInput, TaskOutput}, | ||
| }, | ||
| }; | ||
| use spider_derive::MySqlEnum; | ||
|
|
@@ -22,6 +23,7 @@ use crate::{ | |
| ExecutionManagerLivenessManagement, | ||
| ExternalJobOrchestration, | ||
| InternalJobOrchestration, | ||
| RecoverableJob, | ||
| ResourceGroupManagement, | ||
| SessionManagement, | ||
| error::ExpectedStates, | ||
|
|
@@ -380,6 +382,63 @@ impl InternalJobOrchestration for MariaDbStorageConnector { | |
| tx.commit().await?; | ||
| Ok(deleted_job_ids) | ||
| } | ||
|
|
||
| async fn get_recoverable_jobs(&self) -> Result<Vec<RecoverableJob>, DbError> { | ||
| const SELECT_QUERY: &str = formatcp!( | ||
| "SELECT `id`, `resource_group_id`, `state`, `serialized_task_graph`, \ | ||
| `serialized_job_inputs`, `serialized_job_outputs` FROM `{table}` WHERE `state` IN \ | ||
| ('{running_state}','{commit_ready_state}','{cleanup_ready_state}');", | ||
| table = JOBS_TABLE_NAME, | ||
| running_state = JobState::Running.as_str(), | ||
| commit_ready_state = JobState::CommitReady.as_str(), | ||
| cleanup_ready_state = JobState::CleanupReady.as_str(), | ||
| ); | ||
|
|
||
| let rows = sqlx::query_as::< | ||
| _, | ||
| ( | ||
| JobId, | ||
| ResourceGroupId, | ||
| JobState, | ||
| String, | ||
| Vec<u8>, | ||
| Option<Vec<u8>>, | ||
| ), | ||
| >(SELECT_QUERY) | ||
| .fetch_all(&self.pool) | ||
| .await?; | ||
|
Comment on lines
+397
to
+409
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bound recovery reads to avoid startup memory blow-ups. Line 397 fetches all recoverable rows (including blob payloads) into memory, then Line 411 materializes another vector. Under a large in-flight backlog, restart recovery can spike memory and fail availability. Please switch to paged/streamed recovery (e.g., Also applies to: 411-440 🤖 Prompt for AI Agents |
||
|
|
||
| rows.into_iter() | ||
| .map( | ||
| |( | ||
| id, | ||
| resource_group_id, | ||
| state, | ||
| serialized_task_graph, | ||
| serialized_job_inputs, | ||
| serialized_job_outputs, | ||
| )| { | ||
| let task_graph = TaskGraph::from_json(&serialized_task_graph) | ||
| .map_err(DbError::task_graph_de)?; | ||
| let job_inputs: Vec<TaskInput> = | ||
| rmp_serde::from_slice(&serialized_job_inputs).map_err(DbError::value_de)?; | ||
| let job_submission = ValidatedJobSubmission::create(task_graph, job_inputs) | ||
| .map_err(|e| DbError::CorruptedDbState(e.to_string()))?; | ||
| let job_outputs = serialized_job_outputs | ||
| .map(|outputs| rmp_serde::from_slice(&outputs).map_err(DbError::value_de)) | ||
| .transpose()?; | ||
|
|
||
|
sitaowang1998 marked this conversation as resolved.
|
||
| Ok(RecoverableJob { | ||
| id, | ||
| resource_group_id, | ||
| state, | ||
| job_submission, | ||
| job_outputs, | ||
| }) | ||
| }, | ||
| ) | ||
| .collect() | ||
| } | ||
| } | ||
|
|
||
| #[async_trait] | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.