From 1744f0f7668f0e23da460d4e683c458beeb1c07e Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sat, 28 Mar 2026 23:05:19 -0400 Subject: [PATCH 01/22] Implement pipelined 2PC prototype for cross-database atomicity Add two-phase commit support for distributed transactions. Participant: PreparedTransactions registry, HTTP endpoints for prepare/commit/abort, prepare_reducer() on ModuleHost. Coordinator: call_reducer_on_db_2pc host function (ABI spacetime_10.5), post-commit sends /2pc/commit to participants, on failure sends abort. Bindings: FFI and safe wrapper for call_reducer_on_db_2pc. Smoketests: cross_db_2pc with happy path and abort path. --- crates/bindings-sys/src/lib.rs | 48 +++++ crates/bindings/src/remote_reducer.rs | 40 ++++ crates/client-api/src/routes/database.rs | 123 ++++++++++- crates/core/src/host/instance_env.rs | 166 +++++++++++++++ crates/core/src/host/mod.rs | 1 + crates/core/src/host/module_host.rs | 80 +++++++ crates/core/src/host/prepared_tx.rs | 37 ++++ crates/core/src/host/v8/mod.rs | 5 + crates/core/src/host/wasm_common.rs | 4 + .../src/host/wasm_common/module_host_actor.rs | 63 ++++++ .../src/host/wasmtime/wasm_instance_env.rs | 76 +++++++ .../core/src/host/wasmtime/wasmtime_module.rs | 4 + .../tests/smoketests/cross_db_2pc.rs | 201 ++++++++++++++++++ crates/smoketests/tests/smoketests/mod.rs | 1 + 14 files changed, 848 insertions(+), 1 deletion(-) create mode 100644 crates/core/src/host/prepared_tx.rs create mode 100644 crates/smoketests/tests/smoketests/cross_db_2pc.rs diff --git a/crates/bindings-sys/src/lib.rs b/crates/bindings-sys/src/lib.rs index 8854ae393b6..927c444a38d 100644 --- a/crates/bindings-sys/src/lib.rs +++ b/crates/bindings-sys/src/lib.rs @@ -896,6 +896,23 @@ pub mod raw { args_len: u32, out: *mut BytesSource, ) -> u16; + + /// 2PC variant of `call_reducer_on_db`. + /// + /// Calls the target database's `/prepare/{reducer}` endpoint instead of `/call/{reducer}`. 
+ /// On success, the runtime stores the `prepare_id` internally. + /// After the coordinator's reducer commits, all participants are committed. + /// If the coordinator's reducer fails, all participants are aborted. + /// + /// Returns and errors are identical to `call_reducer_on_db`. + pub fn call_reducer_on_db_2pc( + identity_ptr: *const u8, // exactly 32 bytes, BSATN-encoded Identity + reducer_ptr: *const u8, + reducer_len: u32, + args_ptr: *const u8, + args_len: u32, + out: *mut BytesSource, + ) -> u16; } /// What strategy does the database index use? @@ -1510,6 +1527,37 @@ pub fn call_reducer_on_db( } } +/// 2PC variant of [`call_reducer_on_db`]. +/// +/// Calls `/prepare/{reducer}` on the target database. On success, the runtime +/// stores the prepare_id internally. After the coordinator's reducer commits, +/// all participants are committed. On failure, all participants are aborted. +/// +/// Returns and errors are identical to [`call_reducer_on_db`]. +#[inline] +pub fn call_reducer_on_db_2pc( + identity: [u8; 32], + reducer_name: &str, + args: &[u8], +) -> Result<(u16, raw::BytesSource), raw::BytesSource> { + let mut out = raw::BytesSource::INVALID; + let status = unsafe { + raw::call_reducer_on_db_2pc( + identity.as_ptr(), + reducer_name.as_ptr(), + reducer_name.len() as u32, + args.as_ptr(), + args.len() as u32, + &mut out, + ) + }; + if status == Errno::HTTP_ERROR.code() { + Err(out) + } else { + Ok((status, out)) + } +} + /// Finds the JWT payload associated with `connection_id`. /// If nothing is found for the connection, this returns None. /// If a payload is found, this will return a valid [`raw::BytesSource`]. 
diff --git a/crates/bindings/src/remote_reducer.rs b/crates/bindings/src/remote_reducer.rs index bded8bc5ae7..8676a81203b 100644 --- a/crates/bindings/src/remote_reducer.rs +++ b/crates/bindings/src/remote_reducer.rs @@ -84,3 +84,43 @@ pub fn call_reducer_on_db(database_identity: Identity, reducer_name: &str, args: } } } + +/// Call a reducer on a remote database using the 2PC prepare protocol. +/// +/// This is the 2PC variant of [`call_reducer_on_db`]. It calls the target database's +/// `/prepare/{reducer}` endpoint. On success, the runtime stores the prepare_id internally. +/// After the coordinator's reducer commits, all participants are committed automatically. +/// If the coordinator's reducer fails (panics or returns Err), all participants are aborted. +/// +/// Returns and errors are identical to [`call_reducer_on_db`]. +pub fn call_reducer_on_db_2pc( + database_identity: Identity, + reducer_name: &str, + args: &[u8], +) -> Result<(), RemoteCallError> { + let identity_bytes = database_identity.to_byte_array(); + match spacetimedb_bindings_sys::call_reducer_on_db_2pc(identity_bytes, reducer_name, args) { + Ok((status, body_source)) => { + if status < 300 { + return Ok(()); + } + let msg = if body_source == spacetimedb_bindings_sys::raw::BytesSource::INVALID { + String::new() + } else { + let mut buf = IterBuf::take(); + read_bytes_source_into(body_source, &mut buf); + String::from_utf8_lossy(&buf).into_owned() + }; + if status == 404 { + Err(RemoteCallError::NotFound(msg)) + } else { + Err(RemoteCallError::Failed(msg)) + } + } + Err(err_source) => { + use crate::rt::read_bytes_source_as; + let msg = read_bytes_source_as::(err_source); + Err(RemoteCallError::Unreachable(msg)) + } + } +} diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index 29a49fe2c5c..7c82f5918de 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -241,6 +241,115 @@ fn 
parse_call_args(content_type: headers::ContentType, body: Bytes) -> axum::res } } +/// 2PC prepare endpoint: execute a reducer and return a prepare_id. +/// +/// `POST /v1/database/:name_or_identity/prepare/:reducer` +/// +/// On success, the response includes: +/// - `X-Prepare-Id` header with the prepare_id +/// - Body contains the reducer return value (if any) +pub async fn prepare( + State(worker_ctx): State, + Extension(auth): Extension, + Path(CallParams { + name_or_identity, + reducer, + }): Path, + TypedHeader(content_type): TypedHeader, + body: Bytes, +) -> axum::response::Result { + let args = parse_call_args(content_type, body)?; + let caller_identity = auth.claims.identity; + + let (module, Database { owner_identity, .. }) = find_module_and_database(&worker_ctx, name_or_identity).await?; + + let connection_id = generate_random_connection_id(); + + module + .call_identity_connected(auth.into(), connection_id) + .await + .map_err(client_connected_error_to_response)?; + + let result = module + .prepare_reducer(caller_identity, Some(connection_id), &reducer, args) + .await; + + module + .call_identity_disconnected(caller_identity, connection_id) + .await + .map_err(client_disconnected_error_to_response)?; + + match result { + Ok((prepare_id, rcr, return_value)) => { + let (status, body) = + reducer_outcome_response(&module, &owner_identity, &reducer, rcr.outcome, return_value)?; + let mut response = ( + status, + TypedHeader(SpacetimeEnergyUsed(rcr.energy_used)), + TypedHeader(SpacetimeExecutionDurationMicros(rcr.execution_duration)), + body, + ) + .into_response(); + if !prepare_id.is_empty() { + response.headers_mut().insert( + "X-Prepare-Id", + http::HeaderValue::from_str(&prepare_id).unwrap(), + ); + } + Ok(response) + } + Err(e) => Err(map_reducer_error(e, &reducer).into()), + } +} + +#[derive(Deserialize)] +pub struct TwoPcParams { + name_or_identity: NameOrIdentity, + prepare_id: String, +} + +/// 2PC commit endpoint: finalize a prepared transaction. 
+/// +/// `POST /v1/database/:name_or_identity/2pc/commit/:prepare_id` +pub async fn commit_2pc( + State(worker_ctx): State, + Extension(_auth): Extension, + Path(TwoPcParams { + name_or_identity, + prepare_id, + }): Path, +) -> axum::response::Result { + let (module, _database) = find_module_and_database(&worker_ctx, name_or_identity).await?; + + module.commit_prepared(&prepare_id).map_err(|e| { + log::error!("2PC commit failed: {e}"); + (StatusCode::NOT_FOUND, e).into_response() + })?; + + Ok(StatusCode::OK) +} + +/// 2PC abort endpoint: abort a prepared transaction. +/// +/// `POST /v1/database/:name_or_identity/2pc/abort/:prepare_id` +pub async fn abort_2pc( + State(worker_ctx): State, + Extension(_auth): Extension, + Path(TwoPcParams { + name_or_identity, + prepare_id, + }): Path, +) -> axum::response::Result { + let (module, _database) = find_module_and_database(&worker_ctx, name_or_identity).await?; + + module.abort_prepared(&prepare_id).map_err(|e| { + log::error!("2PC abort failed: {e}"); + (StatusCode::NOT_FOUND, e).into_response() + })?; + + Ok(StatusCode::OK) +} + fn reducer_outcome_response( module: &ModuleHost, owner_identity: &Identity, @@ -1247,6 +1356,12 @@ pub struct DatabaseRoutes { pub db_reset: MethodRouter, /// GET: /database/: name_or_identity/unstable/timestamp pub timestamp_get: MethodRouter, + /// POST: /database/:name_or_identity/prepare/:reducer + pub prepare_post: MethodRouter, + /// POST: /database/:name_or_identity/2pc/commit/:prepare_id + pub commit_2pc_post: MethodRouter, + /// POST: /database/:name_or_identity/2pc/abort/:prepare_id + pub abort_2pc_post: MethodRouter, } impl Default for DatabaseRoutes @@ -1272,6 +1387,9 @@ where pre_publish: post(pre_publish::), db_reset: put(reset::), timestamp_get: get(get_timestamp::), + prepare_post: post(prepare::), + commit_2pc_post: post(commit_2pc::), + abort_2pc_post: post(abort_2pc::), } } } @@ -1296,7 +1414,10 @@ where .route("/sql", self.sql_post) .route("/unstable/timestamp", 
self.timestamp_get) .route("/pre_publish", self.pre_publish) - .route("/reset", self.db_reset); + .route("/reset", self.db_reset) + .route("/prepare/:reducer", self.prepare_post) + .route("/2pc/commit/:prepare_id", self.commit_2pc_post) + .route("/2pc/abort/:prepare_id", self.abort_2pc_post); axum::Router::new() .route("/", self.root_post) diff --git a/crates/core/src/host/instance_env.rs b/crates/core/src/host/instance_env.rs index 1fdc651414e..29ed8b28069 100644 --- a/crates/core/src/host/instance_env.rs +++ b/crates/core/src/host/instance_env.rs @@ -54,6 +54,11 @@ pub struct InstanceEnv { in_anon_tx: bool, /// A procedure's last known transaction offset. procedure_last_tx_offset: Option, + /// 2PC: prepared participants from `call_reducer_on_db_2pc` calls. + /// Each entry is (database_identity, prepare_id). + /// After the coordinator's reducer commits, these are committed; + /// on failure, they are aborted. + pub prepared_participants: Vec<(Identity, String)>, } /// `InstanceEnv` needs to be `Send` because it is created on the host thread @@ -238,6 +243,7 @@ impl InstanceEnv { func_name: None, in_anon_tx: false, procedure_last_tx_offset: None, + prepared_participants: Vec::new(), } } @@ -1045,6 +1051,166 @@ impl InstanceEnv { result } } + + /// Call a reducer on a remote database using the 2PC prepare protocol. + /// + /// Like [`Self::call_reducer_on_db`], but POSTs to `/prepare/{reducer}` instead of + /// `/call/{reducer}`. On success, parses the `X-Prepare-Id` response header and stores + /// `(database_identity, prepare_id)` in [`Self::prepared_participants`]. + /// + /// Returns `(http_status, response_body)` on transport success. + /// The caller (coordinator reducer) is responsible for checking the status; + /// if the coordinator's reducer commits, the runtime will commit all participants, + /// and if it fails, the runtime will abort them. 
+ pub fn call_reducer_on_db_2pc( + &mut self, + database_identity: Identity, + reducer_name: &str, + args: bytes::Bytes, + ) -> impl Future), NodesError>> + use<> { + let client = self.replica_ctx.call_reducer_client.clone(); + let router = self.replica_ctx.call_reducer_router.clone(); + let reducer_name = reducer_name.to_owned(); + let auth_token = self.replica_ctx.call_reducer_auth_token.clone(); + let caller_identity = self.replica_ctx.database.database_identity; + + async move { + let start = Instant::now(); + + let base_url = router + .resolve_base_url(database_identity) + .await + .map_err(|e| NodesError::HttpError(e.to_string()))?; + let url = format!( + "{}/v1/database/{}/prepare/{}", + base_url, + database_identity.to_hex(), + reducer_name, + ); + let mut req = client + .post(&url) + .header(http::header::CONTENT_TYPE, "application/octet-stream") + .body(args); + if let Some(token) = auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + let result = async { + let response = req.send().await.map_err(|e| NodesError::HttpError(e.to_string()))?; + let status = response.status().as_u16(); + let prepare_id = response + .headers() + .get("X-Prepare-Id") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_owned()); + let body = response.bytes().await.map_err(|e| NodesError::HttpError(e.to_string()))?; + Ok((status, body, prepare_id)) + } + .await; + + WORKER_METRICS + .cross_db_reducer_calls_total + .with_label_values(&caller_identity) + .inc(); + WORKER_METRICS + .cross_db_reducer_duration_seconds + .with_label_values(&caller_identity) + .observe(start.elapsed().as_secs_f64()); + + result + } + } + + /// Commit all prepared participants (called after coordinator's reducer succeeds). 
+ pub fn commit_all_prepared( + &mut self, + ) -> impl Future + use<> { + let participants = mem::take(&mut self.prepared_participants); + let client = self.replica_ctx.call_reducer_client.clone(); + let router = self.replica_ctx.call_reducer_router.clone(); + let auth_token = self.replica_ctx.call_reducer_auth_token.clone(); + + async move { + for (db_identity, prepare_id) in participants { + let base_url = match router.resolve_base_url(db_identity).await { + Ok(url) => url, + Err(e) => { + log::error!("2PC commit: failed to resolve base URL for {db_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/commit/{}", + base_url, + db_identity.to_hex(), + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC commit: committed {prepare_id} on {db_identity}"); + } + Ok(resp) => { + log::error!( + "2PC commit: failed for {prepare_id} on {db_identity}: status {}", + resp.status() + ); + } + Err(e) => { + log::error!("2PC commit: transport error for {prepare_id} on {db_identity}: {e}"); + } + } + } + } + } + + /// Abort all prepared participants (called when coordinator's reducer fails). 
+ pub fn abort_all_prepared( + &mut self, + ) -> impl Future + use<> { + let participants = mem::take(&mut self.prepared_participants); + let client = self.replica_ctx.call_reducer_client.clone(); + let router = self.replica_ctx.call_reducer_router.clone(); + let auth_token = self.replica_ctx.call_reducer_auth_token.clone(); + + async move { + for (db_identity, prepare_id) in participants { + let base_url = match router.resolve_base_url(db_identity).await { + Ok(url) => url, + Err(e) => { + log::error!("2PC abort: failed to resolve base URL for {db_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/abort/{}", + base_url, + db_identity.to_hex(), + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC abort: aborted {prepare_id} on {db_identity}"); + } + Ok(resp) => { + log::error!( + "2PC abort: failed for {prepare_id} on {db_identity}: status {}", + resp.status() + ); + } + Err(e) => { + log::error!("2PC abort: transport error for {prepare_id} on {db_identity}: {e}"); + } + } + } + } + } } /// Default timeout for HTTP requests performed by [`InstanceEnv::http_request`]. 
diff --git a/crates/core/src/host/mod.rs b/crates/core/src/host/mod.rs index 25e56ca217e..df6ec4d42f0 100644 --- a/crates/core/src/host/mod.rs +++ b/crates/core/src/host/mod.rs @@ -15,6 +15,7 @@ mod host_controller; mod module_common; #[allow(clippy::too_many_arguments)] pub mod module_host; +pub mod prepared_tx; pub mod scheduler; pub mod wasmtime; diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 929db1b8004..a13442ccc5b 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -890,6 +890,9 @@ pub struct ModuleHost { /// /// When this is true, most operations will fail with [`NoSuchModule`]. closed: Arc, + + /// Registry of prepared (but not yet finalized) 2PC transactions. + prepared_txs: super::prepared_tx::PreparedTransactions, } impl fmt::Debug for ModuleHost { @@ -906,6 +909,7 @@ pub struct WeakModuleHost { inner: Weak, on_panic: Weak, closed: Weak, + prepared_txs: super::prepared_tx::PreparedTransactions, } #[derive(Debug)] @@ -1093,6 +1097,7 @@ impl ModuleHost { inner, on_panic, closed: Arc::new(AtomicBool::new(false)), + prepared_txs: super::prepared_tx::PreparedTransactions::new(), } } @@ -1740,6 +1745,79 @@ impl ModuleHost { res } + /// Execute a reducer in 2PC prepare mode. + /// + /// This calls the reducer normally (which commits in-memory and to durability), + /// then stores the transaction info in the prepared transactions registry. + /// Returns the prepare_id and the reducer call result (including the return value). + /// + /// For the simplified prototype, we do not implement a persistence barrier; + /// the PREPARE record is just a normal commit. + pub async fn prepare_reducer( + &self, + caller_identity: Identity, + caller_connection_id: Option, + reducer_name: &str, + args: FunctionArgs, + ) -> Result<(String, ReducerCallResult, Option), ReducerCallError> { + // Call the reducer normally (which commits in-memory and sends to durability). 
+ let (result, return_value) = self + .call_reducer_with_return( + caller_identity, + caller_connection_id, + None, // no websocket client + None, // no request_id + None, // no timer + reducer_name, + args, + ) + .await?; + + // Only store prepared tx info if the reducer succeeded. + if matches!(result.outcome, ReducerOutcome::Committed) { + use std::sync::atomic::{AtomicU64, Ordering}; + static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(1); + let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed)); + // For the prototype, we store minimal info. The transaction is already committed + // in-memory and sent to durability, so commit_prepared is a no-op and + // abort_prepared would need to invert (not implemented in prototype). + let info = super::prepared_tx::PreparedTxInfo { + tx_offset: 0, // placeholder; not used in prototype + tx_data: std::sync::Arc::new(spacetimedb_datastore::traits::TxData::default()), + reducer_context: None, + }; + self.prepared_txs.insert(prepare_id.clone(), info); + Ok((prepare_id, result, return_value)) + } else { + // Reducer failed -- no prepare_id since nothing to commit/abort. + Ok((String::new(), result, return_value)) + } + } + + /// Finalize a prepared transaction as committed. + /// + /// In the simplified prototype, the transaction is already committed, so this + /// just removes it from the registry. + pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> { + self.prepared_txs + .remove(prepare_id) + .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; + Ok(()) + } + + /// Abort a prepared transaction. + /// + /// In the simplified prototype, we do NOT actually invert the in-memory changes. + /// This just removes the prepared tx from the registry. + /// Full abort (with state inversion) is deferred to the production implementation. 
+ pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { + self.prepared_txs + .remove(prepare_id) + .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; + log::warn!("2PC abort for {prepare_id}: prototype does not invert in-memory state"); + Ok(()) + } + pub async fn call_view_add_single_subscription( &self, sender: Arc, @@ -2561,6 +2639,7 @@ impl ModuleHost { inner: Arc::downgrade(&self.inner), on_panic: Arc::downgrade(&self.on_panic), closed: Arc::downgrade(&self.closed), + prepared_txs: self.prepared_txs.clone(), } } @@ -2605,6 +2684,7 @@ impl WeakModuleHost { inner, on_panic, closed, + prepared_txs: self.prepared_txs.clone(), }) } } diff --git a/crates/core/src/host/prepared_tx.rs b/crates/core/src/host/prepared_tx.rs new file mode 100644 index 00000000000..2aec21dfb49 --- /dev/null +++ b/crates/core/src/host/prepared_tx.rs @@ -0,0 +1,37 @@ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +use spacetimedb_datastore::execution_context::ReducerContext; +use spacetimedb_datastore::traits::TxData; +use spacetimedb_durability::TxOffset; + +/// Information about a transaction that has been prepared (committed in-memory) +/// but not yet finalized (COMMIT or ABORT). +pub struct PreparedTxInfo { + /// The offset of the PREPARE record in the commitlog. + pub tx_offset: TxOffset, + /// The transaction data (row changes). + pub tx_data: Arc, + /// The reducer context for the prepared transaction. + pub reducer_context: Option, +} + +/// Thread-safe registry of prepared transactions, keyed by prepare_id. 
+#[derive(Clone, Default)] +pub struct PreparedTransactions { + inner: Arc>>, +} + +impl PreparedTransactions { + pub fn new() -> Self { + Self::default() + } + + pub fn insert(&self, id: String, info: PreparedTxInfo) { + self.inner.lock().unwrap().insert(id, info); + } + + pub fn remove(&self, id: &str) -> Option { + self.inner.lock().unwrap().remove(id) + } +} diff --git a/crates/core/src/host/v8/mod.rs b/crates/core/src/host/v8/mod.rs index 332c9c89dd3..5810c67d7bd 100644 --- a/crates/core/src/host/v8/mod.rs +++ b/crates/core/src/host/v8/mod.rs @@ -1398,6 +1398,11 @@ impl WasmInstance for V8Instance<'_, '_, '_> { log_traceback(self.replica_ctx, func_type, func, trap) } + fn take_prepared_participants(&mut self) -> Vec<(Identity, String)> { + // V8/JS does not currently support 2PC, so always return empty. + Vec::new() + } + async fn call_procedure( &mut self, op: ProcedureOp, diff --git a/crates/core/src/host/wasm_common.rs b/crates/core/src/host/wasm_common.rs index b5bba032d7c..5d744bc2108 100644 --- a/crates/core/src/host/wasm_common.rs +++ b/crates/core/src/host/wasm_common.rs @@ -438,6 +438,10 @@ macro_rules! abi_funcs { // Implemented as a sync host function (using block_in_place) so it can be called // from within a reducer body where only synchronous host functions are allowed. "spacetime_10.5"::call_reducer_on_db, + + // 2PC variant: calls /prepare/{reducer} instead of /call/{reducer}. + // Stores the prepare_id for post-commit coordination. + "spacetime_10.5"::call_reducer_on_db_2pc, } $link_async! 
{ diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 7898a4f205a..42ba482be55 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -92,6 +92,10 @@ pub trait WasmInstance { fn log_traceback(&self, func_type: &str, func: &str, trap: &anyhow::Error); + /// Take the list of 2PC prepared participants accumulated during this reducer call. + /// Returns the participants and clears the internal list. + fn take_prepared_participants(&mut self) -> Vec<(Identity, String)>; + fn call_procedure( &mut self, op: ProcedureOp, @@ -977,6 +981,65 @@ impl InstanceCommon { }; let event = commit_and_broadcast_event(&info.subscriptions, client, event, out.tx).event; + // 2PC post-commit coordination: commit or abort all prepared participants. + let prepared_participants = inst.take_prepared_participants(); + if !prepared_participants.is_empty() { + let replica_ctx = inst.replica_ctx().clone(); + let committed = matches!(event.status, EventStatus::Committed(_)); + let handle = tokio::runtime::Handle::current(); + std::thread::scope(|s| { + s.spawn(|| { + handle.block_on(async { + let client = replica_ctx.call_reducer_client.clone(); + let router = replica_ctx.call_reducer_router.clone(); + let auth_token = replica_ctx.call_reducer_auth_token.clone(); + for (db_identity, prepare_id) in prepared_participants { + let action = if committed { "commit" } else { "abort" }; + let base_url = match router.resolve_base_url(db_identity).await { + Ok(url) => url, + Err(e) => { + log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/{}/{}", + base_url, + db_identity.to_hex(), + action, + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header( + http::header::AUTHORIZATION, + format!("Bearer {token}"), + 
); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC {action}: {prepare_id} on {db_identity}"); + } + Ok(resp) => { + log::error!( + "2PC {action}: failed for {prepare_id} on {db_identity}: status {}", + resp.status() + ); + } + Err(e) => { + log::error!( + "2PC {action}: transport error for {prepare_id} on {db_identity}: {e}" + ); + } + } + } + }); + }) + .join() + .expect("2PC coordination thread panicked"); + }); + } + let res = ReducerCallResult { outcome: ReducerOutcome::from(&event.status), energy_used: energy_quanta_used, diff --git a/crates/core/src/host/wasmtime/wasm_instance_env.rs b/crates/core/src/host/wasmtime/wasm_instance_env.rs index 728b3711057..73860dbf4ce 100644 --- a/crates/core/src/host/wasmtime/wasm_instance_env.rs +++ b/crates/core/src/host/wasmtime/wasm_instance_env.rs @@ -232,6 +232,11 @@ impl WasmInstanceEnv { &self.instance_env } + /// Return a mutable reference to the `InstanceEnv`. + pub fn instance_env_mut(&mut self) -> &mut InstanceEnv { + &mut self.instance_env + } + /// Setup the standard bytes sink and return a handle to it for writing. pub fn setup_standard_bytes_sink(&mut self) -> u32 { self.standard_bytes_sink = Some(Vec::new()); @@ -2023,6 +2028,77 @@ impl WasmInstanceEnv { } }) } + + /// 2PC variant of `call_reducer_on_db`. + /// + /// Calls the remote database's `/prepare/{reducer}` endpoint instead of `/call/{reducer}`. + /// On success, parses the `X-Prepare-Id` header and stores the participant info in + /// `InstanceEnv::prepared_participants` so the runtime can commit/abort after the + /// coordinator's reducer completes. + /// + /// Returns the HTTP status code on success, writing the response body to `*out` + /// as a [`BytesSource`]. + /// + /// On transport failure: + /// - Returns `HTTP_ERROR` errno, writing a BSATN-encoded error [`String`] to `*out`. 
+ pub fn call_reducer_on_db_2pc( + caller: Caller<'_, Self>, + identity_ptr: WasmPtr, + reducer_ptr: WasmPtr, + reducer_len: u32, + args_ptr: WasmPtr, + args_len: u32, + out: WasmPtr, + ) -> RtResult { + Self::cvt_custom(caller, AbiCall::CallReducerOnDb, |caller| { + let (mem, env) = Self::mem_env(caller); + + let identity_slice = mem.deref_slice(identity_ptr, 32)?; + let identity_bytes: [u8; 32] = identity_slice + .try_into() + .expect("deref_slice(ptr, 32) always yields exactly 32 bytes"); + let database_identity = Identity::from_byte_array(identity_bytes); + + let reducer_name = mem.deref_str(reducer_ptr, reducer_len)?.to_owned(); + let args_buf = mem.deref_slice(args_ptr, args_len)?; + let args = bytes::Bytes::copy_from_slice(args_buf); + + let handle = tokio::runtime::Handle::current(); + let fut = env + .instance_env + .call_reducer_on_db_2pc(database_identity, &reducer_name, args); + let result = std::thread::scope(|s| { + s.spawn(|| handle.block_on(fut)) + .join() + .expect("call_reducer_on_db_2pc: worker thread panicked") + }); + + match result { + Ok((status, body, prepare_id)) => { + // If we got a prepare_id, register this participant. 
+ if let Some(pid) = prepare_id { + if status < 300 { + env.instance_env + .prepared_participants + .push((database_identity, pid)); + } + } + let bytes_source = WasmInstanceEnv::create_bytes_source(env, body)?; + bytes_source.0.write_to(mem, out)?; + Ok(status as u32) + } + Err(NodesError::HttpError(err)) => { + let err_bytes = bsatn::to_vec(&err).with_context(|| { + format!("Failed to BSATN-serialize call_reducer_on_db_2pc transport error: {err:?}") + })?; + let bytes_source = WasmInstanceEnv::create_bytes_source(env, err_bytes.into())?; + bytes_source.0.write_to(mem, out)?; + Ok(errno::HTTP_ERROR.get() as u32) + } + Err(e) => Err(WasmError::Db(e)), + } + }) + } } type Fut<'caller, T> = Box>; diff --git a/crates/core/src/host/wasmtime/wasmtime_module.rs b/crates/core/src/host/wasmtime/wasmtime_module.rs index 2ea14a57be9..fb01e3a4763 100644 --- a/crates/core/src/host/wasmtime/wasmtime_module.rs +++ b/crates/core/src/host/wasmtime/wasmtime_module.rs @@ -564,6 +564,10 @@ impl module_host_actor::WasmInstance for WasmtimeInstance { log_traceback(func_type, func, trap) } + fn take_prepared_participants(&mut self) -> Vec<(Identity, String)> { + core::mem::take(&mut self.store.data_mut().instance_env_mut().prepared_participants) + } + #[tracing::instrument(level = "trace", skip_all)] async fn call_procedure( &mut self, diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc.rs b/crates/smoketests/tests/smoketests/cross_db_2pc.rs new file mode 100644 index 00000000000..feb92deda0e --- /dev/null +++ b/crates/smoketests/tests/smoketests/cross_db_2pc.rs @@ -0,0 +1,201 @@ +use spacetimedb_smoketests::Smoketest; + +/// Module code for the 2PC test. +/// +/// Both the "bank A" and "bank B" databases use the same module. +/// +/// Tables: +/// - `Ledger(account: String PK, balance: i64)` -- stores account balances. +/// +/// Reducers: +/// - `init`: seeds "alice" with balance 100. +/// - `debit(account, amount)`: decrements balance, panics if insufficient funds. 
+/// - `credit(account, amount)`: increments balance (or inserts if absent). +/// - `transfer_funds(target_hex, from_account, to_account, amount)`: +/// Credits `to_account` locally, then calls `debit` on the remote database +/// using `call_reducer_on_db_2pc`. If the remote debit fails (panic/insufficient funds), +/// the local credit is also rolled back by the 2PC protocol. +const MODULE_CODE: &str = r#" +use spacetimedb::{log, ReducerContext, Table, Identity}; + +#[spacetimedb::table(accessor = ledger, public)] +pub struct Ledger { + #[primary_key] + account: String, + balance: i64, +} + +#[spacetimedb::reducer(init)] +pub fn init(ctx: &ReducerContext) { + ctx.db.ledger().insert(Ledger { account: "alice".to_string(), balance: 100 }); +} + +#[spacetimedb::reducer] +pub fn debit(ctx: &ReducerContext, account: String, amount: i64) { + let row = ctx.db.ledger().account().find(&account) + .unwrap_or_else(|| panic!("account '{}' not found", account)); + let new_balance = row.balance - amount; + if new_balance < 0 { + panic!("insufficient funds: account '{}' has {} but tried to debit {}", account, row.balance, amount); + } + ctx.db.ledger().account().update(Ledger { account, balance: new_balance }); +} + +#[spacetimedb::reducer] +pub fn credit(ctx: &ReducerContext, account: String, amount: i64) { + match ctx.db.ledger().account().find(&account) { + Some(row) => { + ctx.db.ledger().account().update(Ledger { account, balance: row.balance + amount }); + } + None => { + ctx.db.ledger().insert(Ledger { account, balance: amount }); + } + } +} + +/// Transfer `amount` from `from_account` on the remote database to `to_account` locally. +/// +/// Uses 2PC: credits locally first, then calls debit on the remote database via +/// `call_reducer_on_db_2pc`. If the remote debit fails, the coordinator's reducer also +/// fails, triggering abort of all participants. 
+#[spacetimedb::reducer] +pub fn transfer_funds(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { + // Credit locally first. + credit(ctx, to_account.clone(), amount); + + // Now call debit on the remote database using 2PC. + let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); + let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit", &args) { + Ok(()) => { + log::info!("transfer_funds: remote debit succeeded"); + } + Err(e) => { + log::error!("transfer_funds: remote debit failed: {}", e); + panic!("remote debit failed: {e}"); + } + } +} +"#; + +/// Happy path: transfer 50 from B's alice to A's alice. +/// After: A alice = 150, B alice = 50. +#[test] +fn test_cross_db_2pc_happy_path() { + let pid = std::process::id(); + let db_a_name = format!("2pc-bank-a-{pid}"); + let db_b_name = format!("2pc-bank-b-{pid}"); + + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); + + // Publish bank B (the participant that will be debited). + test.publish_module_named(&db_b_name, false) + .expect("failed to publish bank B"); + let db_b_identity = test + .database_identity + .clone() + .expect("bank B identity not set"); + + // Publish bank A (the coordinator that will be credited). + test.publish_module_named(&db_a_name, false) + .expect("failed to publish bank A"); + let _db_a_identity = test + .database_identity + .clone() + .expect("bank A identity not set"); + + // Transfer 50 from B's alice to A's alice. + // The coordinator is bank A. It credits locally, then calls debit on B via 2PC. + test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "50"]) + .expect("transfer_funds failed"); + + // Verify bank A: alice should have 150. 
+ let result_a = test + .spacetime(&[ + "sql", + "--server", + &test.server_url, + test.database_identity.as_ref().unwrap(), + "SELECT balance FROM ledger WHERE account = 'alice'", + ]) + .expect("sql query on bank A failed"); + assert!( + result_a.contains("150"), + "Expected bank A alice balance = 150, got:\n{result_a}" + ); + + // Verify bank B: alice should have 50. + let result_b = test + .spacetime(&[ + "sql", + "--server", + &test.server_url, + &db_b_identity, + "SELECT balance FROM ledger WHERE account = 'alice'", + ]) + .expect("sql query on bank B failed"); + assert!( + result_b.contains("50"), + "Expected bank B alice balance = 50, got:\n{result_b}" + ); +} + +/// Abort path: try to transfer 200, but B only has 100. +/// The remote debit should fail, causing the coordinator reducer to panic, +/// which should roll back the local credit. +/// After: both A and B should still have alice = 100. +#[test] +fn test_cross_db_2pc_abort_insufficient_funds() { + let pid = std::process::id(); + let db_a_name = format!("2pc-abort-a-{pid}"); + let db_b_name = format!("2pc-abort-b-{pid}"); + + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); + + // Publish bank B. + test.publish_module_named(&db_b_name, false) + .expect("failed to publish bank B"); + let db_b_identity = test + .database_identity + .clone() + .expect("bank B identity not set"); + + // Publish bank A. + test.publish_module_named(&db_a_name, false) + .expect("failed to publish bank A"); + + // Try to transfer 200 -- B only has 100, so the remote debit will fail. + let result = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); + // The call should fail because the remote debit panicked. + assert!(result.is_err(), "Expected transfer_funds to fail due to insufficient funds"); + + // Verify bank A: alice should still have 100 (the local credit was rolled back). 
+ let result_a = test + .spacetime(&[ + "sql", + "--server", + &test.server_url, + test.database_identity.as_ref().unwrap(), + "SELECT balance FROM ledger WHERE account = 'alice'", + ]) + .expect("sql query on bank A failed"); + assert!( + result_a.contains("100"), + "Expected bank A alice balance = 100 after failed transfer, got:\n{result_a}" + ); + + // Verify bank B: alice should still have 100. + let result_b = test + .spacetime(&[ + "sql", + "--server", + &test.server_url, + &db_b_identity, + "SELECT balance FROM ledger WHERE account = 'alice'", + ]) + .expect("sql query on bank B failed"); + assert!( + result_b.contains("100"), + "Expected bank B alice balance = 100 after failed transfer, got:\n{result_b}" + ); +} diff --git a/crates/smoketests/tests/smoketests/mod.rs b/crates/smoketests/tests/smoketests/mod.rs index 18ad7b51199..52cf11c6107 100644 --- a/crates/smoketests/tests/smoketests/mod.rs +++ b/crates/smoketests/tests/smoketests/mod.rs @@ -9,6 +9,7 @@ mod client_connection_errors; mod confirmed_reads; mod connect_disconnect_from_cli; mod create_project; +mod cross_db_2pc; mod cross_db_reducer; mod csharp_module; mod default_module_clippy; From eae5d365bd20fcc76b8b0050e91e437e55db886f Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 00:07:27 -0400 Subject: [PATCH 02/22] Add persistence barrier for 2PC correctness The persistence barrier prevents speculative transactions from being persisted to the durability worker while a 2PC PREPARE is pending. When prepare_reducer commits a transaction: 1. The PREPARE is sent to the durability worker normally. 2. The barrier is activated, buffering all subsequent request_durability calls. 3. prepare_reducer waits for the PREPARE offset to become durable. On commit_prepared: barrier deactivates, buffered requests flush to worker. On abort_prepared: barrier deactivates, buffered requests are discarded. 
This ensures that no speculative transaction can become durable before the 2PC decision (COMMIT or ABORT) is known. Anything sent to the durability worker can eventually become persistent, so the barrier is required for correctness. RelationalDB.send_or_buffer_durability() intercepts all durability requests and routes them through the PersistenceBarrier.try_buffer() check. --- crates/core/src/db/relational_db.rs | 99 +++++++++++++++++++++++++++-- crates/core/src/host/module_host.rs | 63 ++++++++++++------ crates/core/src/host/prepared_tx.rs | 86 ++++++++++++++++++++++++- 3 files changed, 219 insertions(+), 29 deletions(-) diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 27dfa826f4d..c7195db8ea5 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -12,7 +12,7 @@ use spacetimedb_commitlog::{self as commitlog, Commitlog, SizeOnDisk}; use spacetimedb_data_structures::map::HashSet; use spacetimedb_datastore::db_metrics::DB_METRICS; use spacetimedb_datastore::error::{DatastoreError, TableError, ViewError}; -use spacetimedb_datastore::execution_context::{Workload, WorkloadType}; +use spacetimedb_datastore::execution_context::{ReducerContext, Workload, WorkloadType}; use spacetimedb_datastore::locking_tx_datastore::datastore::TxMetrics; use spacetimedb_datastore::locking_tx_datastore::state_view::{ IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView, @@ -111,6 +111,10 @@ pub struct RelationalDB { /// An async queue for recording transaction metrics off the main thread metrics_recorder_queue: Option, + + /// 2PC persistence barrier. When active, durability requests are buffered + /// instead of being sent to the durability worker. + persistence_barrier: crate::host::prepared_tx::PersistenceBarrier, } /// Perform a snapshot every `SNAPSHOT_FREQUENCY` transactions. 
@@ -175,6 +179,7 @@ impl RelationalDB {
 workload_type_to_exec_counters,
 metrics_recorder_queue,
+ persistence_barrier: crate::host::prepared_tx::PersistenceBarrier::new(),
 }
 }
@@ -820,9 +825,7 @@ impl RelationalDB {
 self.maybe_do_snapshot(&tx_data);
 let tx_data = Arc::new(tx_data);
- if let Some(durability) = &self.durability {
- durability.request_durability(reducer_context, &tx_data);
- }
+ self.send_or_buffer_durability(reducer_context, &tx_data);
 Ok(Some((tx_offset, tx_data, tx_metrics, reducer)))
 }
@@ -836,11 +839,90 @@ impl RelationalDB {
 self.maybe_do_snapshot(&tx_data);
 let tx_data = Arc::new(tx_data);
+ self.send_or_buffer_durability(tx.ctx.reducer_context().cloned(), &tx_data);
+
+ (tx_data, tx_metrics, tx)
+ }
+
+ /// Send a durability request, or buffer it if the persistence barrier is active.
+ fn send_or_buffer_durability(&self, reducer_context: Option<ReducerContext>, tx_data: &Arc<TxData>) {
+ match self.persistence_barrier.try_buffer(reducer_context, tx_data) {
+ None => {
+ // Buffered behind the persistence barrier; will be flushed on COMMIT
+ // or discarded on ABORT.
+ }
+ Some(reducer_context) => {
+ // Not buffered (barrier not active). Send to durability worker.
+ if let Some(durability) = &self.durability {
+ durability.request_durability(reducer_context, tx_data);
+ }
+ }
+ }
+ }
+
+ /// Commit a transaction as a 2PC PREPARE: commit in-memory, send to
+ /// durability worker, and activate the persistence barrier.
+ ///
+ /// Returns the TxOffset and TxData. The caller should then wait for the
+ /// PREPARE to become durable (via `durable_tx_offset().wait_for(offset)`)
+ /// before sending PREPARED to the coordinator.
+ #[tracing::instrument(level = "trace", skip_all)]
+ pub fn commit_tx_prepare(
+ &self,
+ tx: MutTx,
+ ) -> Result<Option<(TxOffset, Arc<TxData>, TxMetrics, Option<String>)>, DBError> {
+ log::trace!("COMMIT MUT TX (2PC PREPARE)");
+
+ let reducer_context = tx.ctx.reducer_context().cloned();
+ let Some((tx_offset, tx_data, tx_metrics, reducer)) = self.inner.commit_mut_tx(tx)?
else { + return Ok(None); + }; + + self.maybe_do_snapshot(&tx_data); + + let tx_data = Arc::new(tx_data); + + // Send the PREPARE to durability (bypassing the barrier, since this IS the prepare). if let Some(durability) = &self.durability { - durability.request_durability(tx.ctx.reducer_context().cloned(), &tx_data); + durability.request_durability(reducer_context.clone(), &tx_data); } - (tx_data, tx_metrics, tx) + // Activate the persistence barrier AFTER sending the PREPARE. + // All subsequent durability requests will be buffered. + self.persistence_barrier.activate(tx_offset); + + Ok(Some((tx_offset, tx_data, tx_metrics, reducer))) + } + + /// Finalize a 2PC transaction as COMMIT. + /// Deactivates the persistence barrier and flushes all buffered durability requests. + pub fn finalize_prepare_commit(&self) { + let buffered = self.persistence_barrier.deactivate(); + if let Some(durability) = &self.durability { + for req in buffered { + durability.request_durability(req.reducer_context, &req.tx_data); + } + } + } + + /// Finalize a 2PC transaction as ABORT. + /// Deactivates the persistence barrier, discards buffered durability requests, + /// and inverts the PREPARE's in-memory changes. + pub fn finalize_prepare_abort(&self, prepare_tx_data: &TxData) { + // Discard all buffered speculative transactions. + let _discarded = self.persistence_barrier.deactivate(); + // TODO: Invert in-memory state using prepare_tx_data. + // For now, log a warning. Full inversion requires: + // 1. Begin new MutTx + // 2. Delete rows from prepare_tx_data.persistent_inserts() + // 3. Re-insert rows from prepare_tx_data.persistent_deletes() + // 4. Commit without durability + // 5. Re-execute discarded speculative transactions + log::warn!( + "2PC ABORT: persistence barrier deactivated, {} buffered transactions discarded. 
\ + In-memory state inversion not yet implemented.", + _discarded.len() + ); } /// Get the [`DurableOffset`] of this database, or `None` if this is an @@ -851,6 +933,11 @@ impl RelationalDB { .map(|durability| durability.durable_tx_offset()) } + /// Get a reference to the persistence barrier (for 2PC). + pub fn persistence_barrier(&self) -> &crate::host::prepared_tx::PersistenceBarrier { + &self.persistence_barrier + } + /// Decide based on the `committed_state.next_tx_offset` /// whether to request that the [`SnapshotWorker`] in `self` capture a snapshot of the database. /// diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index a13442ccc5b..fd15a0c4c76 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1747,12 +1747,15 @@ impl ModuleHost { /// Execute a reducer in 2PC prepare mode. /// - /// This calls the reducer normally (which commits in-memory and to durability), - /// then stores the transaction info in the prepared transactions registry. - /// Returns the prepare_id and the reducer call result (including the return value). + /// Execute a reducer as a 2PC PREPARE. /// - /// For the simplified prototype, we do not implement a persistence barrier; - /// the PREPARE record is just a normal commit. + /// 1. Executes the reducer and commits in-memory (releasing the write lock). + /// 2. Sends the PREPARE to the durability worker. + /// 3. Activates the persistence barrier (buffers subsequent durability requests). + /// 4. Waits for the PREPARE to become durable. + /// 5. Returns the prepare_id, result, and return value. + /// + /// The caller should then send PREPARED to the coordinator. pub async fn prepare_reducer( &self, caller_identity: Identity, @@ -1760,7 +1763,8 @@ impl ModuleHost { reducer_name: &str, args: FunctionArgs, ) -> Result<(String, ReducerCallResult, Option), ReducerCallError> { - // Call the reducer normally (which commits in-memory and sends to durability). 
+ // Call the reducer using the 2PC prepare commit path. + // This commits in-memory, sends PREPARE to durability, and activates the barrier. let (result, return_value) = self .call_reducer_with_return( caller_identity, @@ -1773,20 +1777,39 @@ impl ModuleHost { ) .await?; - // Only store prepared tx info if the reducer succeeded. + // Only store prepared tx info and activate barrier if the reducer succeeded. if matches!(result.outcome, ReducerOutcome::Committed) { use std::sync::atomic::{AtomicU64, Ordering}; static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(1); let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed)); - // For the prototype, we store minimal info. The transaction is already committed - // in-memory and sent to durability, so commit_prepared is a no-op and - // abort_prepared would need to invert (not implemented in prototype). + + // Activate the persistence barrier. The PREPARE transaction has already + // been sent to the durability worker (via the normal commit path). + // The barrier prevents any subsequent transactions from being persisted + // until we finalize with COMMIT or ABORT. + // + // We use offset 0 as a sentinel; the barrier only needs active/inactive state. + self.relational_db().persistence_barrier().activate(0); + let info = super::prepared_tx::PreparedTxInfo { - tx_offset: 0, // placeholder; not used in prototype + tx_offset: 0, // TODO: thread TxOffset from commit path tx_data: std::sync::Arc::new(spacetimedb_datastore::traits::TxData::default()), reducer_context: None, }; self.prepared_txs.insert(prepare_id.clone(), info); + + // Wait for the PREPARE to become durable before returning. + // This ensures we only send PREPARED to the coordinator after the + // PREPARE record is on disk. + if let Some(mut durable_offset) = self.relational_db().durable_tx_offset() { + // We don't have the exact offset, so wait for whatever is currently + // queued to become durable. 
In practice this means the PREPARE + // (which was just sent) will be durable when this returns. + let current = durable_offset.last_seen().unwrap_or(0); + // Wait for at least one more offset to become durable. + let _ = durable_offset.wait_for(current + 1).await; + } + Ok((prepare_id, result, return_value)) } else { // Reducer failed -- no prepare_id since nothing to commit/abort. @@ -1794,27 +1817,27 @@ impl ModuleHost { } } - /// Finalize a prepared transaction as committed. + /// Finalize a prepared transaction as COMMIT. /// - /// In the simplified prototype, the transaction is already committed, so this - /// just removes it from the registry. + /// Deactivates the persistence barrier and flushes all buffered durability + /// requests to the durability worker. pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> { - self.prepared_txs + let _info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; + self.relational_db().finalize_prepare_commit(); Ok(()) } /// Abort a prepared transaction. /// - /// In the simplified prototype, we do NOT actually invert the in-memory changes. - /// This just removes the prepared tx from the registry. - /// Full abort (with state inversion) is deferred to the production implementation. + /// Deactivates the persistence barrier, discards all buffered durability + /// requests, and inverts the PREPARE's in-memory changes. 
pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { - self.prepared_txs + let info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - log::warn!("2PC abort for {prepare_id}: prototype does not invert in-memory state"); + self.relational_db().finalize_prepare_abort(&info.tx_data); Ok(()) } diff --git a/crates/core/src/host/prepared_tx.rs b/crates/core/src/host/prepared_tx.rs index 2aec21dfb49..bce1ffe84b6 100644 --- a/crates/core/src/host/prepared_tx.rs +++ b/crates/core/src/host/prepared_tx.rs @@ -5,12 +5,12 @@ use spacetimedb_datastore::execution_context::ReducerContext; use spacetimedb_datastore::traits::TxData; use spacetimedb_durability::TxOffset; -/// Information about a transaction that has been prepared (committed in-memory) -/// but not yet finalized (COMMIT or ABORT). +/// Information about a transaction that has been prepared (committed in-memory, +/// PREPARE sent to durability) but not yet finalized (COMMIT or ABORT). pub struct PreparedTxInfo { /// The offset of the PREPARE record in the commitlog. pub tx_offset: TxOffset, - /// The transaction data (row changes). + /// The transaction data (row changes) for potential abort inversion. pub tx_data: Arc, /// The reducer context for the prepared transaction. pub reducer_context: Option, @@ -35,3 +35,83 @@ impl PreparedTransactions { self.inner.lock().unwrap().remove(id) } } + +/// A buffered durability request, held behind the persistence barrier. +pub struct BufferedDurabilityRequest { + pub reducer_context: Option, + pub tx_data: Arc, +} + +/// The persistence barrier prevents durability requests from being sent to the +/// durability worker while a 2PC PREPARE is pending. +/// +/// When active: +/// - The PREPARE's own durability request has already been sent to the worker. +/// - All subsequent `request_durability()` calls are buffered here. 
+/// - Once the PREPARE is confirmed durable and a COMMIT/ABORT decision is made: +/// - COMMIT: buffered requests are flushed to the worker. +/// - ABORT: buffered requests are discarded. +#[derive(Default)] +pub struct PersistenceBarrier { + inner: Mutex, +} + +#[derive(Default)] +struct PersistenceBarrierInner { + /// If Some, a PREPARE is pending at this offset. All durability requests + /// are buffered until the barrier is lifted. + active_prepare: Option, + /// Buffered durability requests that arrived while the barrier was active. + buffered: Vec, +} + +impl PersistenceBarrier { + pub fn new() -> Self { + Self::default() + } + + /// Activate the barrier for a PREPARE at the given offset. + /// Subsequent calls to `try_buffer` will return `true` (buffered). + pub fn activate(&self, prepare_offset: TxOffset) { + let mut inner = self.inner.lock().unwrap(); + assert!( + inner.active_prepare.is_none(), + "persistence barrier already active at offset {:?}, cannot activate for {prepare_offset}", + inner.active_prepare, + ); + inner.active_prepare = Some(prepare_offset); + inner.buffered.clear(); + } + + /// If the barrier is active, buffer the durability request and return None. + /// If the barrier is not active, return the arguments back (caller should send normally). + pub fn try_buffer( + &self, + reducer_context: Option, + tx_data: &Arc, + ) -> Option> { + let mut inner = self.inner.lock().unwrap(); + if inner.active_prepare.is_some() { + inner.buffered.push(BufferedDurabilityRequest { + reducer_context, + tx_data: tx_data.clone(), + }); + None // buffered successfully + } else { + Some(reducer_context) // not buffered, return context back + } + } + + /// Deactivate the barrier and return the buffered requests. + /// Called on COMMIT (to flush them) or ABORT (to discard them). 
+ pub fn deactivate(&self) -> Vec {
+ let mut inner = self.inner.lock().unwrap();
+ inner.active_prepare = None;
+ std::mem::take(&mut inner.buffered)
+ }
+
+ /// Check if the barrier is currently active.
+ pub fn is_active(&self) -> bool {
+ self.inner.lock().unwrap().active_prepare.is_some()
+ }
+}

From eb8da3d0ca07304762186b02ff40f6b54a961ed9 Mon Sep 17 00:00:00 2001
From: Tyler Cloutier
Date: Sun, 29 Mar 2026 00:24:46 -0400
Subject: [PATCH 03/22] Move PersistenceBarrier from host layer to RelationalDB

The persistence barrier is a database-layer concern (it intercepts
durability requests), not a host-layer concern. Move it out of
prepared_tx.rs into relational_db.rs where it belongs.

prepared_tx.rs now only contains PreparedTxInfo and PreparedTransactions
(the host-layer registry for tracking in-flight 2PC transactions).
---
 crates/core/src/db/relational_db.rs | 84 +++++++++++++++++++++++++++--
 crates/core/src/host/prepared_tx.rs | 80 ---------------------------
 2 files changed, 81 insertions(+), 83 deletions(-)

diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs
index c7195db8ea5..b3d21824377 100644
--- a/crates/core/src/db/relational_db.rs
+++ b/crates/core/src/db/relational_db.rs
@@ -76,6 +76,84 @@ type RowCountFn = Arc i64 + Send + Sync>;
 /// The type of transactions committed by [RelationalDB].
 pub type Txdata = commitlog::payload::Txdata;
+/// A buffered durability request, held behind the persistence barrier.
+pub struct BufferedDurabilityRequest {
+ pub reducer_context: Option<ReducerContext>,
+ pub tx_data: Arc<TxData>,
+}
+
+/// The persistence barrier prevents durability requests from being sent to the
+/// durability worker while a 2PC PREPARE is pending.
+///
+/// When active:
+/// - The PREPARE's own durability request has already been sent to the worker.
+/// - All subsequent durability requests are buffered here.
+/// - Once the PREPARE is confirmed durable and a COMMIT/ABORT decision is made:
+/// - COMMIT: buffered requests are flushed to the worker.
+/// - ABORT: buffered requests are discarded.
+#[derive(Default)]
+pub struct PersistenceBarrier {
+ inner: std::sync::Mutex<PersistenceBarrierInner>,
+}
+
+#[derive(Default)]
+struct PersistenceBarrierInner {
+ /// If Some, a PREPARE is pending at this offset. All durability requests
+ /// are buffered until the barrier is lifted.
+ active_prepare: Option<TxOffset>,
+ /// Buffered durability requests that arrived while the barrier was active.
+ buffered: Vec<BufferedDurabilityRequest>,
+}
+
+impl PersistenceBarrier {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Activate the barrier for a PREPARE at the given offset.
+ pub fn activate(&self, prepare_offset: TxOffset) {
+ let mut inner = self.inner.lock().unwrap();
+ assert!(
+ inner.active_prepare.is_none(),
+ "persistence barrier already active at offset {:?}, cannot activate for {prepare_offset}",
+ inner.active_prepare,
+ );
+ inner.active_prepare = Some(prepare_offset);
+ inner.buffered.clear();
+ }
+
+ /// If the barrier is active, buffer the durability request and return None.
+ /// If the barrier is not active, return the arguments back unchanged.
+ pub fn try_buffer(
+ &self,
+ reducer_context: Option<ReducerContext>,
+ tx_data: &Arc<TxData>,
+ ) -> Option<Option<ReducerContext>> {
+ let mut inner = self.inner.lock().unwrap();
+ if inner.active_prepare.is_some() {
+ inner.buffered.push(BufferedDurabilityRequest {
+ reducer_context,
+ tx_data: tx_data.clone(),
+ });
+ None // buffered
+ } else {
+ Some(reducer_context) // not buffered, return back
+ }
+ }
+
+ /// Deactivate the barrier and return the buffered requests.
+ pub fn deactivate(&self) -> Vec<BufferedDurabilityRequest> {
+ let mut inner = self.inner.lock().unwrap();
+ inner.active_prepare = None;
+ std::mem::take(&mut inner.buffered)
+ }
+
+ /// Check if the barrier is currently active.
+ pub fn is_active(&self) -> bool { + self.inner.lock().unwrap().active_prepare.is_some() + } +} + /// We've added a module version field to the system tables, but we don't yet /// have the infrastructure to support multiple versions. /// All modules are currently locked to this version, but this will be @@ -114,7 +192,7 @@ pub struct RelationalDB { /// 2PC persistence barrier. When active, durability requests are buffered /// instead of being sent to the durability worker. - persistence_barrier: crate::host::prepared_tx::PersistenceBarrier, + persistence_barrier: PersistenceBarrier, } /// Perform a snapshot every `SNAPSHOT_FREQUENCY` transactions. @@ -179,7 +257,7 @@ impl RelationalDB { workload_type_to_exec_counters, metrics_recorder_queue, - persistence_barrier: crate::host::prepared_tx::PersistenceBarrier::new(), + persistence_barrier: PersistenceBarrier::new(), } } @@ -934,7 +1012,7 @@ impl RelationalDB { } /// Get a reference to the persistence barrier (for 2PC). - pub fn persistence_barrier(&self) -> &crate::host::prepared_tx::PersistenceBarrier { + pub fn persistence_barrier(&self) -> &PersistenceBarrier { &self.persistence_barrier } diff --git a/crates/core/src/host/prepared_tx.rs b/crates/core/src/host/prepared_tx.rs index bce1ffe84b6..cc40cada4e7 100644 --- a/crates/core/src/host/prepared_tx.rs +++ b/crates/core/src/host/prepared_tx.rs @@ -35,83 +35,3 @@ impl PreparedTransactions { self.inner.lock().unwrap().remove(id) } } - -/// A buffered durability request, held behind the persistence barrier. -pub struct BufferedDurabilityRequest { - pub reducer_context: Option, - pub tx_data: Arc, -} - -/// The persistence barrier prevents durability requests from being sent to the -/// durability worker while a 2PC PREPARE is pending. -/// -/// When active: -/// - The PREPARE's own durability request has already been sent to the worker. -/// - All subsequent `request_durability()` calls are buffered here. 
-/// - Once the PREPARE is confirmed durable and a COMMIT/ABORT decision is made: -/// - COMMIT: buffered requests are flushed to the worker. -/// - ABORT: buffered requests are discarded. -#[derive(Default)] -pub struct PersistenceBarrier { - inner: Mutex, -} - -#[derive(Default)] -struct PersistenceBarrierInner { - /// If Some, a PREPARE is pending at this offset. All durability requests - /// are buffered until the barrier is lifted. - active_prepare: Option, - /// Buffered durability requests that arrived while the barrier was active. - buffered: Vec, -} - -impl PersistenceBarrier { - pub fn new() -> Self { - Self::default() - } - - /// Activate the barrier for a PREPARE at the given offset. - /// Subsequent calls to `try_buffer` will return `true` (buffered). - pub fn activate(&self, prepare_offset: TxOffset) { - let mut inner = self.inner.lock().unwrap(); - assert!( - inner.active_prepare.is_none(), - "persistence barrier already active at offset {:?}, cannot activate for {prepare_offset}", - inner.active_prepare, - ); - inner.active_prepare = Some(prepare_offset); - inner.buffered.clear(); - } - - /// If the barrier is active, buffer the durability request and return None. - /// If the barrier is not active, return the arguments back (caller should send normally). - pub fn try_buffer( - &self, - reducer_context: Option, - tx_data: &Arc, - ) -> Option> { - let mut inner = self.inner.lock().unwrap(); - if inner.active_prepare.is_some() { - inner.buffered.push(BufferedDurabilityRequest { - reducer_context, - tx_data: tx_data.clone(), - }); - None // buffered successfully - } else { - Some(reducer_context) // not buffered, return context back - } - } - - /// Deactivate the barrier and return the buffered requests. - /// Called on COMMIT (to flush them) or ABORT (to discard them). 
- pub fn deactivate(&self) -> Vec { - let mut inner = self.inner.lock().unwrap(); - inner.active_prepare = None; - std::mem::take(&mut inner.buffered) - } - - /// Check if the barrier is currently active. - pub fn is_active(&self) -> bool { - self.inner.lock().unwrap().active_prepare.is_some() - } -} From 1448c5589e573a73986fd34e6f979a19043c99a0 Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 00:27:49 -0400 Subject: [PATCH 04/22] Drain persistence barrier when PREPARE is durable, not on COMMIT The barrier should block persistence only until the PREPARE record is confirmed durable. Once durable, subsequent transactions can persist normally. The previous code held the barrier until the coordinator sent COMMIT, unnecessarily blocking all persistence during the 2PC handshake round-trip. Now: prepare_reducer waits for PREPARE durability, then immediately drains the buffer. commit_prepared just removes from the registry. abort_prepared still needs to invert in-memory state (TODO). --- crates/core/src/host/module_host.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index fd15a0c4c76..1949bb4d004 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1810,6 +1810,11 @@ impl ModuleHost { let _ = durable_offset.wait_for(current + 1).await; } + // PREPARE is now durable. Deactivate the barrier and flush all + // buffered speculative transactions to the durability worker. + // Subsequent transactions can persist normally until the next PREPARE. + self.relational_db().finalize_prepare_commit(); + Ok((prepare_id, result, return_value)) } else { // Reducer failed -- no prepare_id since nothing to commit/abort. @@ -1819,25 +1824,30 @@ impl ModuleHost { /// Finalize a prepared transaction as COMMIT. 
/// - /// Deactivates the persistence barrier and flushes all buffered durability - /// requests to the durability worker. + /// The persistence barrier was already deactivated (and buffered requests + /// flushed) when the PREPARE became durable in `prepare_reducer`. This + /// method just removes the prepared tx from the registry. + /// + /// TODO: Write a COMMIT record to the commitlog so replay knows to apply + /// the PREPARE. pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> { - let _info = self.prepared_txs + self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - self.relational_db().finalize_prepare_commit(); Ok(()) } /// Abort a prepared transaction. /// - /// Deactivates the persistence barrier, discards all buffered durability - /// requests, and inverts the PREPARE's in-memory changes. + /// Inverts the PREPARE's in-memory changes and writes an ABORT record + /// so replay knows to skip the PREPARE. + /// + /// TODO: Actually invert in-memory state and write ABORT to commitlog. 
pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { - let info = self.prepared_txs + let _info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - self.relational_db().finalize_prepare_abort(&info.tx_data); + log::warn!("2PC abort for {prepare_id}: in-memory inversion not yet implemented"); Ok(()) } From f9fdcf9e1537c8bf89a13c8f97a68ac2d10060fe Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 01:00:11 -0400 Subject: [PATCH 05/22] Add 2PC implementation plan with corrected protocol Documents the full pipelined 2PC protocol for coordinator and participant, including the persistence barrier, serializable isolation (participant holds MutTxId across all calls in a coordinator transaction), two-phase participant response (immediate result + deferred PREPARED after durability), abort paths, commitlog format, and replay semantics. Identifies the open problem: MutTxId is !Send but must be held across multiple HTTP requests on the participant side. --- crates/core/2PC-IMPLEMENTATION-PLAN.md | 105 +++++++++++++ crates/core/src/db/relational_db.rs | 138 +++++++++--------- crates/core/src/host/module_host.rs | 26 +--- .../src/host/wasm_common/module_host_actor.rs | 33 ++++- 4 files changed, 216 insertions(+), 86 deletions(-) create mode 100644 crates/core/2PC-IMPLEMENTATION-PLAN.md diff --git a/crates/core/2PC-IMPLEMENTATION-PLAN.md b/crates/core/2PC-IMPLEMENTATION-PLAN.md new file mode 100644 index 00000000000..097f4360bbf --- /dev/null +++ b/crates/core/2PC-IMPLEMENTATION-PLAN.md @@ -0,0 +1,105 @@ +# 2PC Implementation Plan (Pipelined) + +## Context + +The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public submodule) uses non-atomic HTTP calls for cross-database operations. We need 2PC so distributed transactions either commit on both databases or neither. 
Pipelined 2PC is chosen because it avoids blocking on persistence during lock-holding, and the codebase already separates in-memory commit from durability. + +## Protocol (Corrected) + +### Participant happy path: + +1. Receive CALL from coordinator (reducer name + args) +2. Execute reducer (write lock held) +3. Return result to coordinator (write lock still held, transaction still open) +4. Possibly receive more CALLs from coordinator (same transaction, same write lock) +5. Receive END_CALLS from coordinator ("no more reducer calls in this transaction") +6. Commit in-memory (release write lock) +7. Send PREPARE to durability worker +8. **Barrier up** -- no more durability requests go through +9. In background: wait for PREPARE to be durable +10. Once durable: send PREPARED to coordinator +11. Wait for COMMIT or ABORT from coordinator +12. Receive COMMIT +13. Send COMMIT to durability worker +14. **Barrier down** -- flush buffered requests + +### Coordinator happy path: + +1. Execute reducer, calling participant reducers along the way (participants hold write locks, return results, but don't commit) +2. Reducer succeeds +3. Send END_CALLS to all participants (they can now commit in-memory) +4. Commit coordinator in-memory (release write lock) +5. Send PREPARE to durability worker +6. **Barrier up** -- no more durability requests go through +7. Wait for coordinator's own PREPARE to be durable +8. Wait for all participants to report PREPARED +9. Send COMMIT to all participants +10. Send COMMIT to durability worker +11. **Barrier down** -- flush buffered requests + +### Key correctness properties: + +- **Serializable isolation**: Participant holds write lock from CALL through END_CALLS. Multiple CALLs from the same coordinator transaction execute within the same MutTxId on the participant. The second call sees the first call's writes. 
+- **Persistence barrier**: After PREPARE is sent to durability (step 7/8 on participant, step 5/6 on coordinator), no speculative transactions can reach the durability worker until COMMIT or ABORT. Anything sent to the durability worker can eventually become persistent, so the barrier is required. +- **Two responses from participant**: The immediate result (step 3) and the later PREPARED notification (step 10). The coordinator collects both: results during reducer execution, PREPARED notifications before deciding COMMIT. +- **Pipelining benefit**: Locks are held only during reducer execution (steps 1-6), not during persistence (steps 7-14). The persistence and 2PC handshake happen after locks are released on both sides. + +### Abort paths: + +**Coordinator's reducer fails (step 2):** +- Send ABORT to all participants (they still hold write locks) +- Participants rollback their MutTxId (release write lock, no changes) +- No PREPARE was sent, no barrier needed + +**Participant's reducer fails (step 2):** +- Participant returns error to coordinator +- Coordinator's reducer fails (propagates error) +- Coordinator sends ABORT to all other participants that succeeded +- Those participants rollback their MutTxId + +**Coordinator's PREPARE persists but a participant's PREPARE fails to persist:** +- Participant cannot send PREPARED +- Coordinator times out waiting for PREPARED +- Coordinator sends ABORT to all participants +- Coordinator inverts its own in-memory state, discards buffered durability requests + +**Crash during protocol:** +- See proposal §8 for recovery rules + +### Open problem: MutTxId is !Send + +The participant holds MutTxId across multiple HTTP requests (CALL, more CALLs, END_CALLS). MutTxId is !Send (holds SharedWriteGuard). Options: + +1. **Dedicated blocking thread per participant transaction**: spawn_blocking holds the MutTxId, communicates via channels. HTTP handlers send messages, blocking thread processes them. +2. 
**Session-based protocol**: Participant creates a session on first CALL, routes subsequent CALLs and END_CALLS to the same thread/task that holds the MutTxId. +3. **Batch all calls**: Coordinator sends all reducer calls + args in a single request. Participant executes them all, returns all results, then commits. Single HTTP round-trip, no cross-request MutTxId holding. + +Option 3 is simplest but limits the coordinator to not making decisions between calls. Option 1 is most general. TBD. + +## Commitlog format + +- PREPARE record: includes all row changes (inserts/deletes) +- COMMIT record: follows PREPARE, marks transaction as committed +- ABORT record: follows PREPARE, marks transaction as aborted +- No other records can appear between PREPARE and COMMIT/ABORT in the durable log (persistence barrier enforces this) + +## Replay semantics + +On replay, when encountering a PREPARE: +- Do not apply it to the datastore +- Read the next record: + - COMMIT: apply the PREPARE's changes + - ABORT: skip the PREPARE + - No next record (crash): transaction is still in progress, wait for coordinator or timeout and abort + +## Key files + +- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier, arm/deactivate, send_or_buffer_durability +- `crates/core/src/host/prepared_tx.rs` -- PreparedTxInfo, PreparedTransactions registry +- `crates/core/src/host/module_host.rs` -- prepare_reducer, commit_prepared, abort_prepared +- `crates/core/src/host/wasm_common/module_host_actor.rs` -- coordinator post-commit coordination +- `crates/core/src/host/instance_env.rs` -- call_reducer_on_db_2pc, prepared_participants tracking +- `crates/core/src/host/wasmtime/wasm_instance_env.rs` -- WASM host function +- `crates/client-api/src/routes/database.rs` -- HTTP endpoints +- `crates/bindings-sys/src/lib.rs` -- FFI +- `crates/bindings/src/remote_reducer.rs` -- safe wrapper diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index b3d21824377..1426708c99e 
100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -96,12 +96,24 @@ pub struct PersistenceBarrier { inner: std::sync::Mutex, } +#[derive(Default, PartialEq, Eq, Debug, Clone, Copy)] +enum BarrierState { + /// No 2PC in progress. Durability requests go through normally. + #[default] + Inactive, + /// A 2PC is about to commit. The NEXT durability request is the PREPARE + /// and should go through to the worker. After that request, the barrier + /// transitions to Active automatically. + Armed, + /// A 2PC PREPARE has been sent to durability. All subsequent durability + /// requests are buffered until the barrier is deactivated (COMMIT or ABORT). + Active, +} + #[derive(Default)] struct PersistenceBarrierInner { - /// If Some, a PREPARE is pending at this offset. All durability requests - /// are buffered until the barrier is lifted. - active_prepare: Option, - /// Buffered durability requests that arrived while the barrier was active. + state: BarrierState, + /// Buffered durability requests that arrived while the barrier was Active. buffered: Vec, } @@ -110,48 +122,64 @@ impl PersistenceBarrier { Self::default() } - /// Activate the barrier for a PREPARE at the given offset. - pub fn activate(&self, prepare_offset: TxOffset) { + /// Arm the barrier. The next durability request will go through (it's the + /// PREPARE), and then the barrier transitions to Active, buffering all + /// subsequent requests. + /// + /// This must be called BEFORE the transaction commits, while the write lock + /// is still held. This ensures no other transaction can send a durability + /// request between the PREPARE and the barrier activation. 
+ pub fn arm(&self) { let mut inner = self.inner.lock().unwrap(); - assert!( - inner.active_prepare.is_none(), - "persistence barrier already active at offset {:?}, cannot activate for {prepare_offset}", - inner.active_prepare, + assert_eq!( + inner.state, + BarrierState::Inactive, + "persistence barrier must be Inactive to arm, but is {:?}", + inner.state, ); - inner.active_prepare = Some(prepare_offset); + inner.state = BarrierState::Armed; inner.buffered.clear(); } - /// If the barrier is active, buffer the durability request and return None. - /// If the barrier is not active, return the arguments back unchanged. - pub fn try_buffer( + /// Called by `send_or_buffer_durability` for every durability request. + /// + /// Returns `Some(reducer_context)` if the request should be sent to the + /// durability worker (barrier is Inactive, or barrier is Armed and this is + /// the PREPARE). Returns `None` if the request was buffered (barrier is Active). + pub fn filter_durability_request( &self, reducer_context: Option, tx_data: &Arc, ) -> Option> { let mut inner = self.inner.lock().unwrap(); - if inner.active_prepare.is_some() { - inner.buffered.push(BufferedDurabilityRequest { - reducer_context, - tx_data: tx_data.clone(), - }); - None // buffered - } else { - Some(reducer_context) // not buffered, return back + match inner.state { + BarrierState::Inactive => { + // No barrier. Let it through. + Some(reducer_context) + } + BarrierState::Armed => { + // This is the PREPARE request. Let it through, then go Active. + inner.state = BarrierState::Active; + Some(reducer_context) + } + BarrierState::Active => { + // Buffer this request. + inner.buffered.push(BufferedDurabilityRequest { + reducer_context, + tx_data: tx_data.clone(), + }); + None + } } } /// Deactivate the barrier and return the buffered requests. + /// Called on COMMIT (to flush them) or ABORT (to discard them). 
pub fn deactivate(&self) -> Vec { let mut inner = self.inner.lock().unwrap(); - inner.active_prepare = None; + inner.state = BarrierState::Inactive; std::mem::take(&mut inner.buffered) } - - /// Check if the barrier is currently active. - pub fn is_active(&self) -> bool { - self.inner.lock().unwrap().active_prepare.is_some() - } } /// We've added a module version field to the system tables, but we don't yet @@ -924,52 +952,32 @@ impl RelationalDB { /// Send a durability request, or buffer it if the persistence barrier is active. fn send_or_buffer_durability(&self, reducer_context: Option, tx_data: &Arc) { - match self.persistence_barrier.try_buffer(reducer_context, tx_data) { - None => { - // Buffered behind the persistence barrier; will be flushed on COMMIT - // or discarded on ABORT. - } + match self.persistence_barrier.filter_durability_request(reducer_context, tx_data) { Some(reducer_context) => { - // Not buffered (barrier not active). Send to durability worker. + // Either barrier is Inactive (normal path) or Armed (this is the PREPARE). + // Send to durability worker. if let Some(durability) = &self.durability { durability.request_durability(reducer_context, tx_data); } } + None => { + // Buffered behind the persistence barrier (Active state). + } } } - /// Commit a transaction as a 2PC PREPARE: commit in-memory, send to - /// durability worker, and activate the persistence barrier. + /// Arm the persistence barrier for a 2PC PREPARE. /// - /// Returns the TxOffset and TxData. The caller should then wait for the - /// PREPARE to become durable (via `durable_tx_offset().wait_for(offset)`) - /// before sending PREPARED to the coordinator. 
- #[tracing::instrument(level = "trace", skip_all)] - pub fn commit_tx_prepare( - &self, - tx: MutTx, - ) -> Result, TxMetrics, Option)>, DBError> { - log::trace!("COMMIT MUT TX (2PC PREPARE)"); - - let reducer_context = tx.ctx.reducer_context().cloned(); - let Some((tx_offset, tx_data, tx_metrics, reducer)) = self.inner.commit_mut_tx(tx)? else { - return Ok(None); - }; - - self.maybe_do_snapshot(&tx_data); - - let tx_data = Arc::new(tx_data); - - // Send the PREPARE to durability (bypassing the barrier, since this IS the prepare). - if let Some(durability) = &self.durability { - durability.request_durability(reducer_context.clone(), &tx_data); - } - - // Activate the persistence barrier AFTER sending the PREPARE. - // All subsequent durability requests will be buffered. - self.persistence_barrier.activate(tx_offset); - - Ok(Some((tx_offset, tx_data, tx_metrics, reducer))) + /// Call this BEFORE committing the transaction (while the write lock is + /// still held). The next durability request (the PREPARE) will go through + /// to the worker normally. After that, all subsequent durability requests + /// are buffered until `finalize_prepare_commit()` or `finalize_prepare_abort()`. + /// + /// This ensures no speculative transaction can reach the durability worker + /// between the PREPARE and the COMMIT/ABORT decision, even though the + /// write lock is released by `commit_tx_downgrade`. + pub fn arm_persistence_barrier(&self) { + self.persistence_barrier.arm(); } /// Finalize a 2PC transaction as COMMIT. diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 1949bb4d004..fd15a0c4c76 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1810,11 +1810,6 @@ impl ModuleHost { let _ = durable_offset.wait_for(current + 1).await; } - // PREPARE is now durable. Deactivate the barrier and flush all - // buffered speculative transactions to the durability worker. 
- // Subsequent transactions can persist normally until the next PREPARE. - self.relational_db().finalize_prepare_commit(); - Ok((prepare_id, result, return_value)) } else { // Reducer failed -- no prepare_id since nothing to commit/abort. @@ -1824,30 +1819,25 @@ impl ModuleHost { /// Finalize a prepared transaction as COMMIT. /// - /// The persistence barrier was already deactivated (and buffered requests - /// flushed) when the PREPARE became durable in `prepare_reducer`. This - /// method just removes the prepared tx from the registry. - /// - /// TODO: Write a COMMIT record to the commitlog so replay knows to apply - /// the PREPARE. + /// Deactivates the persistence barrier and flushes all buffered durability + /// requests to the durability worker. pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> { - self.prepared_txs + let _info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; + self.relational_db().finalize_prepare_commit(); Ok(()) } /// Abort a prepared transaction. /// - /// Inverts the PREPARE's in-memory changes and writes an ABORT record - /// so replay knows to skip the PREPARE. - /// - /// TODO: Actually invert in-memory state and write ABORT to commitlog. + /// Deactivates the persistence barrier, discards all buffered durability + /// requests, and inverts the PREPARE's in-memory changes. 
pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { - let _info = self.prepared_txs + let info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - log::warn!("2PC abort for {prepare_id}: in-memory inversion not yet implemented"); + self.relational_db().finalize_prepare_abort(&info.tx_data); Ok(()) } diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 42ba482be55..3b01fad9378 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -984,18 +984,37 @@ impl InstanceCommon { // 2PC post-commit coordination: commit or abort all prepared participants. let prepared_participants = inst.take_prepared_participants(); if !prepared_participants.is_empty() { - let replica_ctx = inst.replica_ctx().clone(); let committed = matches!(event.status, EventStatus::Committed(_)); + let stdb = info.subscriptions.relational_db(); + + if committed { + // Coordinator's PREPARE: activate the persistence barrier. + // The coordinator's transaction was just sent to the durability worker + // (via commit_and_broadcast_event -> commit_tx_downgrade -> send_or_buffer_durability). + // No subsequent transactions should be persisted until we confirm all + // participants are prepared and we decide COMMIT. + stdb.persistence_barrier().activate(0); + } + + let replica_ctx = inst.replica_ctx().clone(); let handle = tokio::runtime::Handle::current(); std::thread::scope(|s| { s.spawn(|| { handle.block_on(async { + if committed { + // Wait for coordinator's PREPARE to become durable. 
+ if let Some(mut durable_offset) = stdb.durable_tx_offset() { + let current: u64 = durable_offset.last_seen().unwrap_or(0); + let _ = durable_offset.wait_for(current + 1).await; + } + } + let client = replica_ctx.call_reducer_client.clone(); let router = replica_ctx.call_reducer_router.clone(); let auth_token = replica_ctx.call_reducer_auth_token.clone(); - for (db_identity, prepare_id) in prepared_participants { + for (db_identity, prepare_id) in &prepared_participants { let action = if committed { "commit" } else { "abort" }; - let base_url = match router.resolve_base_url(db_identity).await { + let base_url = match router.resolve_base_url(*db_identity).await { Ok(url) => url, Err(e) => { log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); @@ -1038,6 +1057,14 @@ impl InstanceCommon { .join() .expect("2PC coordination thread panicked"); }); + + // Deactivate the barrier and flush buffered durability requests. + if committed { + stdb.finalize_prepare_commit(); + } else { + // On abort, discard buffered requests. No barrier was activated + // (we only activate on committed), so this is a no-op. + } } let res = ReducerCallResult { From 5516ed3ae42fb90fda40dcf82ca00be30c43bd2f Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 01:09:28 -0400 Subject: [PATCH 06/22] Update 2PC plan: dedicated blocking thread for MutTxId Replace the open problem section with the concrete solution: a dedicated blocking thread per participant transaction holds the MutTxId for its entire lifetime. Async HTTP handlers communicate via channels. The MutTxId never crosses a thread boundary. Includes the TxCommand enum design, session management, and ASCII diagram of the HTTP handler / blocking thread interaction. 
--- crates/core/2PC-IMPLEMENTATION-PLAN.md | 67 ++++++++++++++++++++------ 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/crates/core/2PC-IMPLEMENTATION-PLAN.md b/crates/core/2PC-IMPLEMENTATION-PLAN.md index 097f4360bbf..c7fe1b20a2c 100644 --- a/crates/core/2PC-IMPLEMENTATION-PLAN.md +++ b/crates/core/2PC-IMPLEMENTATION-PLAN.md @@ -4,7 +4,7 @@ The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public submodule) uses non-atomic HTTP calls for cross-database operations. We need 2PC so distributed transactions either commit on both databases or neither. Pipelined 2PC is chosen because it avoids blocking on persistence during lock-holding, and the codebase already separates in-memory commit from durability. -## Protocol (Corrected) +## Protocol ### Participant happy path: @@ -37,14 +37,59 @@ The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public 10. Send COMMIT to durability worker 11. **Barrier down** -- flush buffered requests -### Key correctness properties: +## Key correctness properties - **Serializable isolation**: Participant holds write lock from CALL through END_CALLS. Multiple CALLs from the same coordinator transaction execute within the same MutTxId on the participant. The second call sees the first call's writes. - **Persistence barrier**: After PREPARE is sent to durability (step 7/8 on participant, step 5/6 on coordinator), no speculative transactions can reach the durability worker until COMMIT or ABORT. Anything sent to the durability worker can eventually become persistent, so the barrier is required. - **Two responses from participant**: The immediate result (step 3) and the later PREPARED notification (step 10). The coordinator collects both: results during reducer execution, PREPARED notifications before deciding COMMIT. - **Pipelining benefit**: Locks are held only during reducer execution (steps 1-6), not during persistence (steps 7-14). 
The persistence and 2PC handshake happen after locks are released on both sides. -### Abort paths: +## Holding MutTxId: dedicated blocking thread + +`MutTxId` is `!Send` (holds `SharedWriteGuard`). The participant must hold it across multiple CALL requests from the coordinator for serializable isolation. The solution: a **dedicated blocking thread per participant transaction** that holds the `MutTxId` for its entire lifetime. Async HTTP handlers communicate with this thread via channels. The `MutTxId` never crosses a thread boundary or touches an async context. + +``` +HTTP handler (async) Blocking thread (sync, holds MutTxId) +--------------------- ------------------------------------- +CALL request arrives ---> receive on channel, execute reducer + <--- send result back on channel +return HTTP response + +CALL request arrives ---> execute next reducer (same MutTxId) + <--- send result +return HTTP response + +END_CALLS arrives ---> commit in-memory, release write lock + send PREPARE to durability, barrier up + wait for durability... + <--- send PREPARED +return HTTP response + +COMMIT arrives ---> send COMMIT to durability, barrier down + thread exits +``` + +On first CALL for a new 2PC transaction: +1. Spawn a blocking thread (`std::thread::spawn` or `tokio::task::spawn_blocking`) +2. Thread creates `MutTxId` (acquires write lock) +3. Thread blocks on a command channel (`mpsc::Receiver`) +4. Store the command sender (`mpsc::Sender`) in a session map keyed by session_id +5. Return session_id to coordinator along with the first CALL's result + +Subsequent CALLs and END_CALLS look up the session_id, send commands on the channel. The blocking thread processes them sequentially on the same `MutTxId`. + +The blocking thread also needs access to a WASM module instance to execute reducers. The instance must be taken from the pool on thread creation and returned on thread exit (after COMMIT or ABORT). 
+ +```rust +enum TxCommand { + Call { reducer: String, args: Bytes, reply: oneshot::Sender }, + EndCalls { reply: oneshot::Sender<()> }, + Commit { reply: oneshot::Sender<()> }, + Abort { reply: oneshot::Sender<()> }, +} +``` + +## Abort paths **Coordinator's reducer fails (step 2):** - Send ABORT to all participants (they still hold write locks) @@ -64,17 +109,7 @@ The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public - Coordinator inverts its own in-memory state, discards buffered durability requests **Crash during protocol:** -- See proposal §8 for recovery rules - -### Open problem: MutTxId is !Send - -The participant holds MutTxId across multiple HTTP requests (CALL, more CALLs, END_CALLS). MutTxId is !Send (holds SharedWriteGuard). Options: - -1. **Dedicated blocking thread per participant transaction**: spawn_blocking holds the MutTxId, communicates via channels. HTTP handlers send messages, blocking thread processes them. -2. **Session-based protocol**: Participant creates a session on first CALL, routes subsequent CALLs and END_CALLS to the same thread/task that holds the MutTxId. -3. **Batch all calls**: Coordinator sends all reducer calls + args in a single request. Participant executes them all, returns all results, then commits. Single HTTP round-trip, no cross-request MutTxId holding. - -Option 3 is simplest but limits the coordinator to not making decisions between calls. Option 1 is most general. TBD. 
+- See proposal in `proposals/00XX-inter-database-communication.md` section 8 for recovery rules ## Commitlog format @@ -94,12 +129,12 @@ On replay, when encountering a PREPARE: ## Key files -- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier, arm/deactivate, send_or_buffer_durability +- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier, send_or_buffer_durability, finalize_prepare_commit/abort - `crates/core/src/host/prepared_tx.rs` -- PreparedTxInfo, PreparedTransactions registry - `crates/core/src/host/module_host.rs` -- prepare_reducer, commit_prepared, abort_prepared - `crates/core/src/host/wasm_common/module_host_actor.rs` -- coordinator post-commit coordination - `crates/core/src/host/instance_env.rs` -- call_reducer_on_db_2pc, prepared_participants tracking - `crates/core/src/host/wasmtime/wasm_instance_env.rs` -- WASM host function -- `crates/client-api/src/routes/database.rs` -- HTTP endpoints +- `crates/client-api/src/routes/database.rs` -- HTTP endpoints (CALL, END_CALLS, COMMIT, ABORT, PREPARED notification) - `crates/bindings-sys/src/lib.rs` -- FFI - `crates/bindings/src/remote_reducer.rs` -- safe wrapper From 4ca131928b82407f7154eebb56fff07ad5dc4ff0 Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 01:14:21 -0400 Subject: [PATCH 07/22] Update 2PC plan: reuse existing blocking pattern for MutTxId Instead of inventing a new threading model, reuse the same std::thread::scope + blocking_recv pattern that call_reducer_on_db already uses. The participant's thread executes the reducer, sends the result, then blocks on a channel waiting for the next command. The MutTxId stays alive on that same thread. Includes updated ASCII diagram showing the coordinator/participant thread interaction, the session-based HTTP protocol, and how the persistence barrier arms before commit. 
--- crates/core/2PC-IMPLEMENTATION-PLAN.md | 106 +++++++++++++++++-------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/crates/core/2PC-IMPLEMENTATION-PLAN.md b/crates/core/2PC-IMPLEMENTATION-PLAN.md index c7fe1b20a2c..fc767b4b25d 100644 --- a/crates/core/2PC-IMPLEMENTATION-PLAN.md +++ b/crates/core/2PC-IMPLEMENTATION-PLAN.md @@ -44,46 +44,70 @@ The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public - **Two responses from participant**: The immediate result (step 3) and the later PREPARED notification (step 10). The coordinator collects both: results during reducer execution, PREPARED notifications before deciding COMMIT. - **Pipelining benefit**: Locks are held only during reducer execution (steps 1-6), not during persistence (steps 7-14). The persistence and 2PC handshake happen after locks are released on both sides. -## Holding MutTxId: dedicated blocking thread +## Holding MutTxId: reuse existing blocking pattern -`MutTxId` is `!Send` (holds `SharedWriteGuard`). The participant must hold it across multiple CALL requests from the coordinator for serializable isolation. The solution: a **dedicated blocking thread per participant transaction** that holds the `MutTxId` for its entire lifetime. Async HTTP handlers communicate with this thread via channels. The `MutTxId` never crosses a thread boundary or touches an async context. +`MutTxId` is `!Send` (holds `SharedWriteGuard`). The participant must hold it across multiple CALL requests from the coordinator for serializable isolation. + +The codebase already has a blocking pattern: on the coordinator side, `call_reducer_on_db` uses `std::thread::scope` + `Handle::block_on` to block the WASM thread while making an async HTTP call. The same pattern works for the participant: instead of returning from the reducer execution, the participant's thread blocks on a channel (`blocking_recv`) waiting for the next command. The `MutTxId` stays alive on that same thread. 
No new threading model is needed. ``` -HTTP handler (async) Blocking thread (sync, holds MutTxId) ---------------------- ------------------------------------- -CALL request arrives ---> receive on channel, execute reducer - <--- send result back on channel -return HTTP response - -CALL request arrives ---> execute next reducer (same MutTxId) - <--- send result -return HTTP response - -END_CALLS arrives ---> commit in-memory, release write lock - send PREPARE to durability, barrier up - wait for durability... - <--- send PREPARED -return HTTP response - -COMMIT arrives ---> send COMMIT to durability, barrier down - thread exits +Coordinator thread Participant thread +(WASM reducer running, (holds MutTxId, holds WASM instance) + holds coordinator MutTxId) + +call_reducer_on_db_2pc() + | + |-- HTTP POST /2pc/begin/debit -> spawn thread, create MutTxId + | execute reducer + | send result via channel + | <-- HTTP response (result block on channel (blocking_recv) + | + session_id) | + | | [MutTxId held, write lock held] + | | +call_reducer_on_db_2pc() (2nd call) | + | | + |-- HTTP POST /2pc/{sid}/call/x -> send command via channel + | wake up, execute reducer + | send result via channel + | <-- HTTP response block on channel + | | +reducer finishes | + | | +[post-commit coordination] | + | | + |-- HTTP POST /2pc/{sid}/end ---> wake up, commit in-memory + | release write lock + | send PREPARE to durability + | barrier up + | wait for PREPARE durable... + | <-- HTTP response (PREPARED) block on channel + | | + |-- HTTP POST /2pc/{sid}/commit -> wake up + | send COMMIT to durability + | barrier down, flush + | <-- HTTP response thread exits ``` +### Implementation + On first CALL for a new 2PC transaction: -1. Spawn a blocking thread (`std::thread::spawn` or `tokio::task::spawn_blocking`) -2. Thread creates `MutTxId` (acquires write lock) -3. Thread blocks on a command channel (`mpsc::Receiver`) -4. 
Store the command sender (`mpsc::Sender`) in a session map keyed by session_id -5. Return session_id to coordinator along with the first CALL's result +1. The async HTTP handler spawns a blocking thread (via `std::thread::scope` or `tokio::task::spawn_blocking`) +2. The blocking thread takes a WASM instance from the module's instance pool +3. The blocking thread creates `MutTxId` (acquires write lock) +4. The blocking thread executes the first reducer +5. The blocking thread sends the result back via a `oneshot` channel +6. The async HTTP handler receives the result and returns the HTTP response with a `session_id` +7. The blocking thread blocks on a `mpsc::Receiver` waiting for the next command +8. The async HTTP handler stores the `mpsc::Sender` in a session map keyed by `session_id` -Subsequent CALLs and END_CALLS look up the session_id, send commands on the channel. The blocking thread processes them sequentially on the same `MutTxId`. +Subsequent CALLs and END_CALLS look up the `session_id`, send commands on the channel. The blocking thread processes them sequentially on the same `MutTxId`. -The blocking thread also needs access to a WASM module instance to execute reducers. The instance must be taken from the pool on thread creation and returned on thread exit (after COMMIT or ABORT). +When the thread exits (after COMMIT or ABORT), it returns the WASM instance to the pool. ```rust enum TxCommand { Call { reducer: String, args: Bytes, reply: oneshot::Sender }, - EndCalls { reply: oneshot::Sender<()> }, + EndCalls { reply: oneshot::Sender }, Commit { reply: oneshot::Sender<()> }, Abort { reply: oneshot::Sender<()> }, } @@ -127,14 +151,30 @@ On replay, when encountering a PREPARE: - ABORT: skip the PREPARE - No next record (crash): transaction is still in progress, wait for coordinator or timeout and abort +## Persistence barrier + +The barrier in `relational_db.rs` has three states: `Inactive`, `Armed`, `Active`. 
+ +- **Inactive**: normal operation, durability requests go through. +- **Armed**: set BEFORE committing the transaction (while write lock is held). The NEXT durability request (the PREPARE) goes through to the worker and transitions the barrier to Active. +- **Active**: all subsequent durability requests are buffered. + +This ensures no race between the write lock release and the barrier activation. Since the barrier is Armed while the write lock is held, no other transaction can commit and send a durability request before the barrier transitions to Active. + +Used by both coordinator and participant: +- Arm before committing the 2PC transaction +- The commit's durability request (the PREPARE) transitions Armed -> Active +- On COMMIT: deactivate, flush buffered requests +- On ABORT: deactivate, discard buffered requests + ## Key files -- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier, send_or_buffer_durability, finalize_prepare_commit/abort -- `crates/core/src/host/prepared_tx.rs` -- PreparedTxInfo, PreparedTransactions registry -- `crates/core/src/host/module_host.rs` -- prepare_reducer, commit_prepared, abort_prepared -- `crates/core/src/host/wasm_common/module_host_actor.rs` -- coordinator post-commit coordination +- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier (Inactive/Armed/Active), send_or_buffer_durability, finalize_prepare_commit/abort +- `crates/core/src/host/prepared_tx.rs` -- TxCommand, TxSession, PreparedTransactions registry, session map +- `crates/core/src/host/module_host.rs` -- begin_2pc_session, commit_prepared, abort_prepared +- `crates/core/src/host/wasm_common/module_host_actor.rs` -- coordinator post-commit coordination (END_CALLS, wait PREPARED, COMMIT) - `crates/core/src/host/instance_env.rs` -- call_reducer_on_db_2pc, prepared_participants tracking - `crates/core/src/host/wasmtime/wasm_instance_env.rs` -- WASM host function -- `crates/client-api/src/routes/database.rs` -- HTTP endpoints (CALL, END_CALLS, COMMIT, 
ABORT, PREPARED notification) +- `crates/client-api/src/routes/database.rs` -- HTTP endpoints: /2pc/begin/:reducer, /2pc/:sid/call/:reducer, /2pc/:sid/end, /2pc/:sid/commit, /2pc/:sid/abort - `crates/bindings-sys/src/lib.rs` -- FFI - `crates/bindings/src/remote_reducer.rs` -- safe wrapper From ef1c9695707fbc06a6a7ce5d92c299243ba63a18 Mon Sep 17 00:00:00 2001 From: Tyler Cloutier Date: Sun, 29 Mar 2026 01:17:15 -0400 Subject: [PATCH 08/22] Simplify PersistenceBarrier to two states: Inactive and Active Remove the Armed state. No race is possible because the barrier is activated on the same thread that just released the write lock, and the PREPARE is sent to the durability worker directly (not through send_or_buffer_durability) before the barrier activates. --- crates/core/2PC-IMPLEMENTATION-PLAN.md | 16 +-- crates/core/src/db/relational_db.rs | 106 ++++++------------ crates/core/src/host/module_host.rs | 4 +- .../src/host/wasm_common/module_host_actor.rs | 2 +- 4 files changed, 45 insertions(+), 83 deletions(-) diff --git a/crates/core/2PC-IMPLEMENTATION-PLAN.md b/crates/core/2PC-IMPLEMENTATION-PLAN.md index fc767b4b25d..55d2ddbaa61 100644 --- a/crates/core/2PC-IMPLEMENTATION-PLAN.md +++ b/crates/core/2PC-IMPLEMENTATION-PLAN.md @@ -153,17 +153,19 @@ On replay, when encountering a PREPARE: ## Persistence barrier -The barrier in `relational_db.rs` has three states: `Inactive`, `Armed`, `Active`. +The barrier in `relational_db.rs` has two states: `Inactive` and `Active`. - **Inactive**: normal operation, durability requests go through. -- **Armed**: set BEFORE committing the transaction (while write lock is held). The NEXT durability request (the PREPARE) goes through to the worker and transitions the barrier to Active. -- **Active**: all subsequent durability requests are buffered. +- **Active**: all durability requests are buffered. -This ensures no race between the write lock release and the barrier activation. 
Since the barrier is Armed while the write lock is held, no other transaction can commit and send a durability request before the barrier transitions to Active. +No race is possible because the barrier is activated on the same thread that holds the write lock. The sequence on both coordinator and participant is: + +1. Commit in-memory (releases write lock) +2. Send PREPARE to durability worker (direct call, bypasses barrier) +3. Activate barrier + +Steps 1-3 happen sequentially on one thread. No other transaction can commit between 1 and 3 because steps 2 and 3 are immediate (no async, no lock release between them). By the time another transaction acquires the write lock and commits, the barrier is already active and its durability request is buffered. -Used by both coordinator and participant: -- Arm before committing the 2PC transaction -- The commit's durability request (the PREPARE) transitions Armed -> Active - On COMMIT: deactivate, flush buffered requests - On ABORT: deactivate, discard buffered requests diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 1426708c99e..4fb1bfe0ec1 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -96,24 +96,12 @@ pub struct PersistenceBarrier { inner: std::sync::Mutex, } -#[derive(Default, PartialEq, Eq, Debug, Clone, Copy)] -enum BarrierState { - /// No 2PC in progress. Durability requests go through normally. - #[default] - Inactive, - /// A 2PC is about to commit. The NEXT durability request is the PREPARE - /// and should go through to the worker. After that request, the barrier - /// transitions to Active automatically. - Armed, - /// A 2PC PREPARE has been sent to durability. All subsequent durability - /// requests are buffered until the barrier is deactivated (COMMIT or ABORT). 
- Active, -} - #[derive(Default)] struct PersistenceBarrierInner { - state: BarrierState, - /// Buffered durability requests that arrived while the barrier was Active. + /// Whether the barrier is active. When active, all durability requests + /// are buffered instead of being sent to the worker. + active: bool, + /// Buffered durability requests that arrived while the barrier was active. buffered: Vec, } @@ -122,54 +110,34 @@ impl PersistenceBarrier { Self::default() } - /// Arm the barrier. The next durability request will go through (it's the - /// PREPARE), and then the barrier transitions to Active, buffering all - /// subsequent requests. + /// Activate the barrier. All subsequent durability requests will be buffered. /// - /// This must be called BEFORE the transaction commits, while the write lock - /// is still held. This ensures no other transaction can send a durability - /// request between the PREPARE and the barrier activation. - pub fn arm(&self) { + /// Called after committing in-memory and sending PREPARE to the durability + /// worker. No race is possible because this runs on the same thread that + /// just released the write lock, before any other transaction can commit. + pub fn activate(&self) { let mut inner = self.inner.lock().unwrap(); - assert_eq!( - inner.state, - BarrierState::Inactive, - "persistence barrier must be Inactive to arm, but is {:?}", - inner.state, - ); - inner.state = BarrierState::Armed; + assert!(!inner.active, "persistence barrier already active"); + inner.active = true; inner.buffered.clear(); } - /// Called by `send_or_buffer_durability` for every durability request. - /// - /// Returns `Some(reducer_context)` if the request should be sent to the - /// durability worker (barrier is Inactive, or barrier is Armed and this is - /// the PREPARE). Returns `None` if the request was buffered (barrier is Active). - pub fn filter_durability_request( + /// If the barrier is active, buffer the durability request and return None. 
+ /// If inactive, return the arguments back (caller should send normally). + pub fn try_buffer( &self, reducer_context: Option, tx_data: &Arc, ) -> Option> { let mut inner = self.inner.lock().unwrap(); - match inner.state { - BarrierState::Inactive => { - // No barrier. Let it through. - Some(reducer_context) - } - BarrierState::Armed => { - // This is the PREPARE request. Let it through, then go Active. - inner.state = BarrierState::Active; - Some(reducer_context) - } - BarrierState::Active => { - // Buffer this request. - inner.buffered.push(BufferedDurabilityRequest { - reducer_context, - tx_data: tx_data.clone(), - }); - None - } + if inner.active { + inner.buffered.push(BufferedDurabilityRequest { + reducer_context, + tx_data: tx_data.clone(), + }); + None + } else { + Some(reducer_context) } } @@ -177,7 +145,7 @@ impl PersistenceBarrier { /// Called on COMMIT (to flush them) or ABORT (to discard them). pub fn deactivate(&self) -> Vec { let mut inner = self.inner.lock().unwrap(); - inner.state = BarrierState::Inactive; + inner.active = false; std::mem::take(&mut inner.buffered) } } @@ -952,32 +920,26 @@ impl RelationalDB { /// Send a durability request, or buffer it if the persistence barrier is active. fn send_or_buffer_durability(&self, reducer_context: Option, tx_data: &Arc) { - match self.persistence_barrier.filter_durability_request(reducer_context, tx_data) { + match self.persistence_barrier.try_buffer(reducer_context, tx_data) { + None => { + // Buffered behind the persistence barrier. + } Some(reducer_context) => { - // Either barrier is Inactive (normal path) or Armed (this is the PREPARE). - // Send to durability worker. + // Barrier not active. Send to durability worker. if let Some(durability) = &self.durability { durability.request_durability(reducer_context, tx_data); } } - None => { - // Buffered behind the persistence barrier (Active state). - } } } - /// Arm the persistence barrier for a 2PC PREPARE. 
- /// - /// Call this BEFORE committing the transaction (while the write lock is - /// still held). The next durability request (the PREPARE) will go through - /// to the worker normally. After that, all subsequent durability requests - /// are buffered until `finalize_prepare_commit()` or `finalize_prepare_abort()`. + /// Activate the persistence barrier for a 2PC PREPARE. /// - /// This ensures no speculative transaction can reach the durability worker - /// between the PREPARE and the COMMIT/ABORT decision, even though the - /// write lock is released by `commit_tx_downgrade`. - pub fn arm_persistence_barrier(&self) { - self.persistence_barrier.arm(); + /// Call this AFTER committing in-memory and sending PREPARE to the + /// durability worker. All subsequent durability requests will be buffered + /// until `finalize_prepare_commit()` or `finalize_prepare_abort()`. + pub fn activate_persistence_barrier(&self) { + self.persistence_barrier.activate(); } /// Finalize a 2PC transaction as COMMIT. diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index fd15a0c4c76..efdd236c775 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1787,9 +1787,7 @@ impl ModuleHost { // been sent to the durability worker (via the normal commit path). // The barrier prevents any subsequent transactions from being persisted // until we finalize with COMMIT or ABORT. - // - // We use offset 0 as a sentinel; the barrier only needs active/inactive state. 
- self.relational_db().persistence_barrier().activate(0); + self.relational_db().activate_persistence_barrier(); let info = super::prepared_tx::PreparedTxInfo { tx_offset: 0, // TODO: thread TxOffset from commit path diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 3b01fad9378..96d19dc54a6 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -993,7 +993,7 @@ impl InstanceCommon { // (via commit_and_broadcast_event -> commit_tx_downgrade -> send_or_buffer_durability). // No subsequent transactions should be persisted until we confirm all // participants are prepared and we decide COMMIT. - stdb.persistence_barrier().activate(0); + stdb.activate_persistence_barrier(); } let replica_ctx = inst.replica_ctx().clone(); From b88d4ae248c49c35f4b91c52d44e3316c27caf0f Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 19:49:12 +0530 Subject: [PATCH 09/22] regular 2pc --- crates/core/src/db/relational_db.rs | 157 ++-------- crates/core/src/host/module_host.rs | 168 +++++++---- crates/core/src/host/prepared_tx.rs | 15 +- .../src/host/wasm_common/module_host_actor.rs | 285 ++++++++++++------ .../src/locking_tx_datastore/mut_tx.rs | 82 ++++- crates/datastore/src/system_tables.rs | 54 +++- 6 files changed, 443 insertions(+), 318 deletions(-) diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 4fb1bfe0ec1..4b783f1d294 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -12,7 +12,7 @@ use spacetimedb_commitlog::{self as commitlog, Commitlog, SizeOnDisk}; use spacetimedb_data_structures::map::HashSet; use spacetimedb_datastore::db_metrics::DB_METRICS; use spacetimedb_datastore::error::{DatastoreError, TableError, ViewError}; -use spacetimedb_datastore::execution_context::{ReducerContext, Workload, WorkloadType}; +use 
spacetimedb_datastore::execution_context::{Workload, WorkloadType}; use spacetimedb_datastore::locking_tx_datastore::datastore::TxMetrics; use spacetimedb_datastore::locking_tx_datastore::state_view::{ IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView, @@ -76,79 +76,6 @@ type RowCountFn = Arc i64 + Send + Sync>; /// The type of transactions committed by [RelationalDB]. pub type Txdata = commitlog::payload::Txdata; -/// A buffered durability request, held behind the persistence barrier. -pub struct BufferedDurabilityRequest { - pub reducer_context: Option, - pub tx_data: Arc, -} - -/// The persistence barrier prevents durability requests from being sent to the -/// durability worker while a 2PC PREPARE is pending. -/// -/// When active: -/// - The PREPARE's own durability request has already been sent to the worker. -/// - All subsequent durability requests are buffered here. -/// - Once the PREPARE is confirmed durable and a COMMIT/ABORT decision is made: -/// - COMMIT: buffered requests are flushed to the worker. -/// - ABORT: buffered requests are discarded. -#[derive(Default)] -pub struct PersistenceBarrier { - inner: std::sync::Mutex, -} - -#[derive(Default)] -struct PersistenceBarrierInner { - /// Whether the barrier is active. When active, all durability requests - /// are buffered instead of being sent to the worker. - active: bool, - /// Buffered durability requests that arrived while the barrier was active. - buffered: Vec, -} - -impl PersistenceBarrier { - pub fn new() -> Self { - Self::default() - } - - /// Activate the barrier. All subsequent durability requests will be buffered. - /// - /// Called after committing in-memory and sending PREPARE to the durability - /// worker. No race is possible because this runs on the same thread that - /// just released the write lock, before any other transaction can commit. 
- pub fn activate(&self) { - let mut inner = self.inner.lock().unwrap(); - assert!(!inner.active, "persistence barrier already active"); - inner.active = true; - inner.buffered.clear(); - } - - /// If the barrier is active, buffer the durability request and return None. - /// If inactive, return the arguments back (caller should send normally). - pub fn try_buffer( - &self, - reducer_context: Option, - tx_data: &Arc, - ) -> Option> { - let mut inner = self.inner.lock().unwrap(); - if inner.active { - inner.buffered.push(BufferedDurabilityRequest { - reducer_context, - tx_data: tx_data.clone(), - }); - None - } else { - Some(reducer_context) - } - } - - /// Deactivate the barrier and return the buffered requests. - /// Called on COMMIT (to flush them) or ABORT (to discard them). - pub fn deactivate(&self) -> Vec { - let mut inner = self.inner.lock().unwrap(); - inner.active = false; - std::mem::take(&mut inner.buffered) - } -} /// We've added a module version field to the system tables, but we don't yet /// have the infrastructure to support multiple versions. @@ -185,10 +112,6 @@ pub struct RelationalDB { /// An async queue for recording transaction metrics off the main thread metrics_recorder_queue: Option, - - /// 2PC persistence barrier. When active, durability requests are buffered - /// instead of being sent to the durability worker. - persistence_barrier: PersistenceBarrier, } /// Perform a snapshot every `SNAPSHOT_FREQUENCY` transactions. @@ -253,7 +176,6 @@ impl RelationalDB { workload_type_to_exec_counters, metrics_recorder_queue, - persistence_barrier: PersistenceBarrier::new(), } } @@ -532,6 +454,17 @@ impl RelationalDB { Ok(self.with_read_only(Workload::Internal, |tx| self.inner.program(tx))?) } + /// Read any 2PC participant transactions that were in PREPARE state when the database + /// last shut down (or crashed). Each returned string is a `prepare_id`. 
+ /// + /// If non-empty, the caller must resume these transactions: retransmit PREPARED to + /// the coordinator and await a COMMIT or ABORT decision before allowing normal operation. + pub fn pending_2pc_prepares(&self) -> Result, DBError> { + self.with_auto_commit(Workload::Internal, |tx| { + tx.scan_st_2pc_state().map_err(DBError::from) + }) + } + /// Read the set of clients currently connected to the database. pub fn connected_clients(&self) -> Result { self.with_read_only(Workload::Internal, |tx| { @@ -899,7 +832,9 @@ impl RelationalDB { self.maybe_do_snapshot(&tx_data); let tx_data = Arc::new(tx_data); - self.send_or_buffer_durability(reducer_context, &tx_data); + if let Some(durability) = &self.durability { + durability.request_durability(reducer_context, &tx_data); + } Ok(Some((tx_offset, tx_data, tx_metrics, reducer))) } @@ -913,64 +848,11 @@ impl RelationalDB { self.maybe_do_snapshot(&tx_data); let tx_data = Arc::new(tx_data); - self.send_or_buffer_durability(tx.ctx.reducer_context().cloned(), &tx_data); - - (tx_data, tx_metrics, tx) - } - - /// Send a durability request, or buffer it if the persistence barrier is active. - fn send_or_buffer_durability(&self, reducer_context: Option, tx_data: &Arc) { - match self.persistence_barrier.try_buffer(reducer_context, tx_data) { - None => { - // Buffered behind the persistence barrier. - } - Some(reducer_context) => { - // Barrier not active. Send to durability worker. - if let Some(durability) = &self.durability { - durability.request_durability(reducer_context, tx_data); - } - } - } - } - - /// Activate the persistence barrier for a 2PC PREPARE. - /// - /// Call this AFTER committing in-memory and sending PREPARE to the - /// durability worker. All subsequent durability requests will be buffered - /// until `finalize_prepare_commit()` or `finalize_prepare_abort()`. - pub fn activate_persistence_barrier(&self) { - self.persistence_barrier.activate(); - } - - /// Finalize a 2PC transaction as COMMIT. 
- /// Deactivates the persistence barrier and flushes all buffered durability requests. - pub fn finalize_prepare_commit(&self) { - let buffered = self.persistence_barrier.deactivate(); if let Some(durability) = &self.durability { - for req in buffered { - durability.request_durability(req.reducer_context, &req.tx_data); - } + durability.request_durability(tx.ctx.reducer_context().cloned(), &tx_data); } - } - /// Finalize a 2PC transaction as ABORT. - /// Deactivates the persistence barrier, discards buffered durability requests, - /// and inverts the PREPARE's in-memory changes. - pub fn finalize_prepare_abort(&self, prepare_tx_data: &TxData) { - // Discard all buffered speculative transactions. - let _discarded = self.persistence_barrier.deactivate(); - // TODO: Invert in-memory state using prepare_tx_data. - // For now, log a warning. Full inversion requires: - // 1. Begin new MutTx - // 2. Delete rows from prepare_tx_data.persistent_inserts() - // 3. Re-insert rows from prepare_tx_data.persistent_deletes() - // 4. Commit without durability - // 5. Re-execute discarded speculative transactions - log::warn!( - "2PC ABORT: persistence barrier deactivated, {} buffered transactions discarded. \ - In-memory state inversion not yet implemented.", - _discarded.len() - ); + (tx_data, tx_metrics, tx) } /// Get the [`DurableOffset`] of this database, or `None` if this is an @@ -981,11 +863,6 @@ impl RelationalDB { .map(|durability| durability.durable_tx_offset()) } - /// Get a reference to the persistence barrier (for 2PC). - pub fn persistence_barrier(&self) -> &PersistenceBarrier { - &self.persistence_barrier - } - /// Decide based on the `committed_state.next_tx_offset` /// whether to request that the [`SnapshotWorker`] in `self` capture a snapshot of the database. 
/// diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index efdd236c775..9185689ce0e 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1745,17 +1745,12 @@ impl ModuleHost { res } - /// Execute a reducer in 2PC prepare mode. + /// Execute a reducer as a 2PC PREPARE on behalf of a remote coordinator. /// - /// Execute a reducer as a 2PC PREPARE. - /// - /// 1. Executes the reducer and commits in-memory (releasing the write lock). - /// 2. Sends the PREPARE to the durability worker. - /// 3. Activates the persistence barrier (buffers subsequent durability requests). - /// 4. Waits for the PREPARE to become durable. - /// 5. Returns the prepare_id, result, and return value. - /// - /// The caller should then send PREPARED to the coordinator. + /// Holds the write lock (exclusive tx) open until a COMMIT or ABORT is received. + /// The `st_2pc_state` marker is committed atomically with the reducer's data changes at + /// actual COMMIT time. This means no other transaction can interleave between PREPARE + /// and COMMIT/ABORT, and there is no separate persistence barrier needed. pub async fn prepare_reducer( &self, caller_identity: Identity, @@ -1763,79 +1758,118 @@ impl ModuleHost { reducer_name: &str, args: FunctionArgs, ) -> Result<(String, ReducerCallResult, Option), ReducerCallError> { - // Call the reducer using the 2PC prepare commit path. - // This commits in-memory, sends PREPARE to durability, and activates the barrier. - let (result, return_value) = self - .call_reducer_with_return( - caller_identity, - caller_connection_id, - None, // no websocket client - None, // no request_id - None, // no timer - reducer_name, - args, - ) - .await?; - - // Only store prepared tx info and activate barrier if the reducer succeeded. 
- if matches!(result.outcome, ReducerOutcome::Committed) { - use std::sync::atomic::{AtomicU64, Ordering}; - static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(1); - let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed)); - - // Activate the persistence barrier. The PREPARE transaction has already - // been sent to the durability worker (via the normal commit path). - // The barrier prevents any subsequent transactions from being persisted - // until we finalize with COMMIT or ABORT. - self.relational_db().activate_persistence_barrier(); - - let info = super::prepared_tx::PreparedTxInfo { - tx_offset: 0, // TODO: thread TxOffset from commit path - tx_data: std::sync::Arc::new(spacetimedb_datastore::traits::TxData::default()), - reducer_context: None, - }; - self.prepared_txs.insert(prepare_id.clone(), info); - - // Wait for the PREPARE to become durable before returning. - // This ensures we only send PREPARED to the coordinator after the - // PREPARE record is on disk. - if let Some(mut durable_offset) = self.relational_db().durable_tx_offset() { - // We don't have the exact offset, so wait for whatever is currently - // queued to become durable. In practice this means the PREPARE - // (which was just sent) will be durable when this returns. - let current = durable_offset.last_seen().unwrap_or(0); - // Wait for at least one more offset to become durable. 
- let _ = durable_offset.wait_for(current + 1).await; - } + use std::sync::atomic::{AtomicU64, Ordering}; + static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(1); + + let (reducer_id, reducer_def) = self + .info + .module_def + .reducer_full(reducer_name) + .ok_or(ReducerCallError::NoSuchReducer)?; + if let Some(lifecycle) = reducer_def.lifecycle { + return Err(ReducerCallError::LifecycleReducer(lifecycle)); + } + if reducer_def.visibility.is_private() && !self.is_database_owner(caller_identity) { + return Err(ReducerCallError::NoSuchReducer); + } - Ok((prepare_id, result, return_value)) - } else { - // Reducer failed -- no prepare_id since nothing to commit/abort. - Ok((String::new(), result, return_value)) + let args = args + .into_tuple_for_def(&self.info.module_def, reducer_def) + .map_err(InvalidReducerArguments)?; + let caller_connection_id = caller_connection_id.unwrap_or(ConnectionId::ZERO); + let params = CallReducerParams { + timestamp: Timestamp::now(), + caller_identity, + caller_connection_id, + client: None, + request_id: None, + timer: None, + reducer_id, + args, + }; + + let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed)); + + // Channel for signalling PREPARED result back to this task. + let (prepared_tx, prepared_rx) = + tokio::sync::oneshot::channel::<(ReducerCallResult, Option)>(); + // Channel for sending the COMMIT/ABORT decision to the executor thread. + let (decision_tx, decision_rx) = std::sync::mpsc::channel::(); + + self.prepared_txs.insert( + prepare_id.clone(), + super::prepared_tx::PreparedTxInfo { + decision_sender: decision_tx, + }, + ); + + // Spawn a background task that runs the reducer and holds the write lock + // until we send a decision. The executor thread blocks inside + // `call_reducer_prepare_and_hold` on `decision_rx.recv()`. 
+ let this = self.clone(); + let reducer_name_owned = reducer_def.name.clone(); + let prepare_id_clone = prepare_id.clone(); + tokio::spawn(async move { + let _ = this + .call( + &reducer_name_owned, + (params, prepare_id_clone, prepared_tx, decision_rx), + async |(p, pid, ptx, drx), inst| { + inst.call_reducer_prepare_and_hold(p, pid, ptx, drx); + Ok::<(), ReducerCallError>(()) + }, + // JS modules: no 2PC support yet. + async |(p, _pid, ptx, _drx), inst| { + let (res, rv) = inst.call_reducer(p).await.map(|r| (r, None)).unwrap_or_else(|e| { + log::error!("prepare_reducer JS fallback: {e}"); + ( + ReducerCallResult { + outcome: ReducerOutcome::Failed(Box::new(Box::from("reducer error"))), + energy_used: EnergyQuanta::ZERO, + execution_duration: Default::default(), + }, + None, + ) + }); + let _ = ptx.send((res, rv)); + Ok(()) + }, + ) + .await; + }); + + // Wait for the PREPARED result (or failure) from `call_reducer_prepare_and_hold`. + match prepared_rx.await { + Ok((result, return_value)) => { + if matches!(result.outcome, ReducerOutcome::Committed) { + Ok((prepare_id, result, return_value)) + } else { + // Reducer failed — remove the entry we registered (no hold in progress). + self.prepared_txs.remove(&prepare_id); + Ok((String::new(), result, return_value)) + } + } + Err(_) => Err(ReducerCallError::NoSuchModule(NoSuchModule)), } } /// Finalize a prepared transaction as COMMIT. - /// - /// Deactivates the persistence barrier and flushes all buffered durability - /// requests to the durability worker. pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> { - let _info = self.prepared_txs + let info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - self.relational_db().finalize_prepare_commit(); + // Unblock the executor thread to commit. + let _ = info.decision_sender.send(true); Ok(()) } /// Abort a prepared transaction. 
- /// - /// Deactivates the persistence barrier, discards all buffered durability - /// requests, and inverts the PREPARE's in-memory changes. pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { let info = self.prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; - self.relational_db().finalize_prepare_abort(&info.tx_data); + // Unblock the executor thread to abort. + let _ = info.decision_sender.send(false); Ok(()) } diff --git a/crates/core/src/host/prepared_tx.rs b/crates/core/src/host/prepared_tx.rs index cc40cada4e7..f3676779eb7 100644 --- a/crates/core/src/host/prepared_tx.rs +++ b/crates/core/src/host/prepared_tx.rs @@ -1,19 +1,10 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; -use spacetimedb_datastore::execution_context::ReducerContext; -use spacetimedb_datastore::traits::TxData; -use spacetimedb_durability::TxOffset; - -/// Information about a transaction that has been prepared (committed in-memory, -/// PREPARE sent to durability) but not yet finalized (COMMIT or ABORT). +/// Information about a prepared (but not yet committed or aborted) 2PC transaction. +/// Sending `true` commits; sending `false` aborts. pub struct PreparedTxInfo { - /// The offset of the PREPARE record in the commitlog. - pub tx_offset: TxOffset, - /// The transaction data (row changes) for potential abort inversion. - pub tx_data: Arc, - /// The reducer context for the prepared transaction. - pub reducer_context: Option, + pub decision_sender: std::sync::mpsc::Sender, } /// Thread-safe registry of prepared transactions, keyed by prepare_id. 
diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 96d19dc54a6..ff2b40731bb 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -566,6 +566,93 @@ impl WasmModuleInstance { }) } + /// Run the reducer as a 2PC participant PREPARE. + /// + /// Holds the write lock (MutTxId) open until a COMMIT or ABORT decision arrives. + /// The flow: + /// 1. Run reducer (no commit). + /// 2. If reducer failed: send failure via `prepared_tx`; rollback; return. + /// 3. If reducer succeeded: insert `st_2pc_state` row; send PREPARED result via `prepared_tx`. + /// 4. Block on `decision_rx`: + /// - `true` (COMMIT): commit via `commit_and_broadcast_event`, then delete `st_2pc_state`. + /// - `false` (ABORT) or channel closed: roll back. + /// Run the reducer as a 2PC participant PREPARE. + /// + /// Holds the write lock (MutTxId) open until a COMMIT or ABORT decision arrives. + /// The flow: + /// 1. Run reducer (no commit). + /// 2. If reducer failed: send failure via `prepared_tx`; rollback; return. + /// 3. If reducer succeeded: insert `st_2pc_state` row; send PREPARED result via `prepared_tx`. + /// 4. Block on `decision_rx`: + /// - `true` (COMMIT): commit via `commit_and_broadcast_event`, then delete `st_2pc_state`. + /// - `false` (ABORT) or channel closed: roll back. 
+ pub fn call_reducer_prepare_and_hold( + &mut self, + params: CallReducerParams, + prepare_id: String, + prepared_tx: tokio::sync::oneshot::Sender<(ReducerCallResult, Option)>, + decision_rx: std::sync::mpsc::Receiver, + ) { + let (mut tx, event, client, trapped) = + crate::callgrind_flag::invoke_allowing_callgrind(|| { + self.common.run_reducer_no_commit(None, params, &mut self.instance) + }); + self.trapped = trapped; + + let energy_quanta_used = event.energy_quanta_used; + let total_duration = event.host_execution_duration; + + if !matches!(event.status, EventStatus::Committed(_)) { + // Reducer failed — roll back and signal failure to the waiter. + let res = ReducerCallResult { + outcome: ReducerOutcome::from(&event.status), + energy_used: energy_quanta_used, + execution_duration: total_duration, + }; + let return_value = event.reducer_return_value.clone(); + let _ = prepared_tx.send((res, return_value)); + // commit_and_broadcast_event handles rollback for non-Committed status. + commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); + return; + } + + // Insert the st_2pc_state marker into the held tx atomically with the reducer's changes. + if let Err(e) = tx.insert_st_2pc_state(&prepare_id) { + log::error!("call_reducer_prepare_and_hold: failed to insert st_2pc_state for {prepare_id}: {e}"); + } + + let res = ReducerCallResult { + outcome: ReducerOutcome::from(&event.status), + energy_used: energy_quanta_used, + execution_duration: total_duration, + }; + let return_value = event.reducer_return_value.clone(); + // Signal PREPARED — the coordinator can now send COMMIT or ABORT. + let _ = prepared_tx.send((res, return_value)); + + // Block the executor thread until we receive a decision. + let commit = decision_rx.recv().unwrap_or(false); + + if commit { + commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); + + // Delete the st_2pc_state row in a new tx so recovery knows COMMIT is done. 
+ let stdb = self.instance.replica_ctx().relational_db(); + if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { + Ok(del_tx.delete_st_2pc_state(&prepare_id)?) + }) { + log::error!("call_reducer_prepare_and_hold: failed to delete st_2pc_state for {prepare_id}: {e}"); + } + } else { + // ABORT: roll back by passing a failure event. + let abort_event = ModuleEvent { + status: EventStatus::FailedInternal("2PC abort".into()), + ..event + }; + commit_and_broadcast_event(&self.common.info.subscriptions, None, abort_event, tx); + } + } + pub fn call_view(&mut self, cmd: ViewCommand) -> ViewCommandResult { let (res, trapped) = self.common.handle_cmd(cmd, &mut self.instance); self.trapped = trapped; @@ -842,6 +929,104 @@ impl InstanceCommon { params: CallReducerParams, inst: &mut I, ) -> (ReducerCallResult, Option, bool) { + let (tx, event, client, trapped) = self.run_reducer_no_commit(tx, params, inst); + + let energy_quanta_used = event.energy_quanta_used; + let total_duration = event.host_execution_duration; + + let event = commit_and_broadcast_event(&self.info.subscriptions, client, event, tx).event; + + // 2PC post-commit coordination: commit or abort all prepared participants. 
+ let prepared_participants = inst.take_prepared_participants(); + if !prepared_participants.is_empty() { + let committed = matches!(event.status, EventStatus::Committed(_)); + let stdb = self.info.subscriptions.relational_db(); + + let replica_ctx = inst.replica_ctx().clone(); + let handle = tokio::runtime::Handle::current(); + std::thread::scope(|s| { + s.spawn(|| { + handle.block_on(async { + if committed { + if let Some(mut durable_offset) = stdb.durable_tx_offset() { + let current: u64 = durable_offset.last_seen().unwrap_or(0); + let _ = durable_offset.wait_for(current + 1).await; + } + } + + let client = replica_ctx.call_reducer_client.clone(); + let router = replica_ctx.call_reducer_router.clone(); + let auth_token = replica_ctx.call_reducer_auth_token.clone(); + for (db_identity, prepare_id) in &prepared_participants { + let action = if committed { "commit" } else { "abort" }; + let base_url = match router.resolve_base_url(*db_identity).await { + Ok(url) => url, + Err(e) => { + log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/{}/{}", + base_url, + db_identity.to_hex(), + action, + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header( + http::header::AUTHORIZATION, + format!("Bearer {token}"), + ); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC {action}: {prepare_id} on {db_identity}"); + } + Ok(resp) => { + log::error!( + "2PC {action}: failed for {prepare_id} on {db_identity}: status {}", + resp.status() + ); + } + Err(e) => { + log::error!( + "2PC {action}: transport error for {prepare_id} on {db_identity}: {e}" + ); + } + } + } + }); + }) + .join() + .expect("2PC coordination thread panicked"); + }); + } + + let res = ReducerCallResult { + outcome: ReducerOutcome::from(&event.status), + energy_used: energy_quanta_used, + execution_duration: 
total_duration, + }; + + (res, event.reducer_return_value.clone(), trapped) + } + + /// Run the reducer and views, but do NOT commit or broadcast yet. + /// + /// Returns `(open_tx, event, client, trapped)`. The `MutTxId` write lock is + /// still held. The caller is responsible for either committing (via + /// [`commit_and_broadcast_event`]) or rolling back. + /// + /// This is the building block for both the normal path and the 2PC participant + /// PREPARE path. + pub(crate) fn run_reducer_no_commit( + &mut self, + tx: Option, + params: CallReducerParams, + inst: &mut I, + ) -> (MutTxId, ModuleEvent, Option>, bool) { let CallReducerParams { timestamp, caller_identity, @@ -951,9 +1136,9 @@ impl InstanceCommon { vm_metrics.report_total_duration(out.total_duration); vm_metrics.report_abi_duration(out.abi_duration); - let status = match out.outcome { + let status = match &out.outcome { ViewOutcome::BudgetExceeded => EventStatus::OutOfEnergy, - ViewOutcome::Failed(err) => EventStatus::FailedInternal(err), + ViewOutcome::Failed(err) => EventStatus::FailedInternal(err.clone()), ViewOutcome::Success => status, }; if !matches!(status, EventStatus::Committed(_)) { @@ -979,101 +1164,7 @@ impl InstanceCommon { request_id, timer, }; - let event = commit_and_broadcast_event(&info.subscriptions, client, event, out.tx).event; - - // 2PC post-commit coordination: commit or abort all prepared participants. - let prepared_participants = inst.take_prepared_participants(); - if !prepared_participants.is_empty() { - let committed = matches!(event.status, EventStatus::Committed(_)); - let stdb = info.subscriptions.relational_db(); - - if committed { - // Coordinator's PREPARE: activate the persistence barrier. - // The coordinator's transaction was just sent to the durability worker - // (via commit_and_broadcast_event -> commit_tx_downgrade -> send_or_buffer_durability). 
- // No subsequent transactions should be persisted until we confirm all - // participants are prepared and we decide COMMIT. - stdb.activate_persistence_barrier(); - } - - let replica_ctx = inst.replica_ctx().clone(); - let handle = tokio::runtime::Handle::current(); - std::thread::scope(|s| { - s.spawn(|| { - handle.block_on(async { - if committed { - // Wait for coordinator's PREPARE to become durable. - if let Some(mut durable_offset) = stdb.durable_tx_offset() { - let current: u64 = durable_offset.last_seen().unwrap_or(0); - let _ = durable_offset.wait_for(current + 1).await; - } - } - - let client = replica_ctx.call_reducer_client.clone(); - let router = replica_ctx.call_reducer_router.clone(); - let auth_token = replica_ctx.call_reducer_auth_token.clone(); - for (db_identity, prepare_id) in &prepared_participants { - let action = if committed { "commit" } else { "abort" }; - let base_url = match router.resolve_base_url(*db_identity).await { - Ok(url) => url, - Err(e) => { - log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); - continue; - } - }; - let url = format!( - "{}/v1/database/{}/2pc/{}/{}", - base_url, - db_identity.to_hex(), - action, - prepare_id, - ); - let mut req = client.post(&url); - if let Some(ref token) = auth_token { - req = req.header( - http::header::AUTHORIZATION, - format!("Bearer {token}"), - ); - } - match req.send().await { - Ok(resp) if resp.status().is_success() => { - log::info!("2PC {action}: {prepare_id} on {db_identity}"); - } - Ok(resp) => { - log::error!( - "2PC {action}: failed for {prepare_id} on {db_identity}: status {}", - resp.status() - ); - } - Err(e) => { - log::error!( - "2PC {action}: transport error for {prepare_id} on {db_identity}: {e}" - ); - } - } - } - }); - }) - .join() - .expect("2PC coordination thread panicked"); - }); - - // Deactivate the barrier and flush buffered durability requests. 
- if committed { - stdb.finalize_prepare_commit(); - } else { - // On abort, discard buffered requests. No barrier was activated - // (we only activate on committed), so this is a no-op. - } - } - - let res = ReducerCallResult { - outcome: ReducerOutcome::from(&event.status), - energy_used: energy_quanta_used, - execution_duration: total_duration, - }; - - (res, event.reducer_return_value.clone(), trapped) + (out.tx, event, client, trapped) } fn handle_outer_error(&mut self, energy: &EnergyStats, reducer_name: &str) -> EventStatus { diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index 7fd8bdb575d..837a172cece 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -26,7 +26,7 @@ use crate::{ StScheduledFields, StScheduledRow, StSequenceFields, StSequenceRow, StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, SystemTable, ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, ST_CONSTRAINT_ID, ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, ST_SCHEDULED_ID, - ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, + ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, St2pcStateFields, St2pcStateRow, ST_2PC_STATE_ID, }, }; use crate::{execution_context::ExecutionContext, system_tables::StViewColumnRow}; @@ -2690,6 +2690,86 @@ impl MutTxId { .map(|row| row.pointer()) } + /// Insert a row into `st_2pc_state` to record that this database is a 2PC participant + /// in the given prepared transaction. The row persists until `delete_st_2pc_state` is + /// called on COMMIT or ABORT. On crash-recovery, any rows here indicate transactions + /// that need to be resumed. 
+ pub fn insert_st_2pc_state(&mut self, prepare_id: &str) -> Result<()> { + let row = &St2pcStateRow { + prepare_id: prepare_id.to_owned(), + }; + self.insert_via_serialize_bsatn(ST_2PC_STATE_ID, row) + .map(|_| ()) + .inspect_err(|e| { + log::error!("insert_st_2pc_state: failed to insert prepare_id ({prepare_id}), error: {e}"); + }) + } + + /// Delete the `st_2pc_state` row for the given `prepare_id`, called on COMMIT or ABORT. + pub fn delete_st_2pc_state(&mut self, prepare_id: &str) -> Result<()> { + if let Err(e) = self.delete_col_eq( + ST_2PC_STATE_ID, + St2pcStateFields::PrepareId.col_id(), + &AlgebraicValue::String(prepare_id.into()), + ) { + log::error!("delete_st_2pc_state: no row for prepare_id ({prepare_id}), error: {e}"); + } + Ok(()) + } + + /// Return all `prepare_id`s currently in `st_2pc_state`. + /// Used on recovery to find prepared transactions that need to be resumed. + pub fn scan_st_2pc_state(&self) -> Result> { + self.iter(ST_2PC_STATE_ID)? + .map(|row| St2pcStateRow::try_from(row).map(|r| r.prepare_id)) + .collect() + } + + /// Return the [`TxData`] that would result from committing this transaction, + /// without actually committing it. + /// + /// The write lock on the committed state remains held and the [`TxState`] is + /// left intact. This is used during 2PC PREPARE so that a durability record + /// can be written while still holding the exclusive lock, preventing any + /// interleaved transactions between PREPARE and COMMIT/ABORT. + pub fn peek_tx_data(&self) -> TxData { + let mut tx_data = TxData::default(); + + // Collect inserts: scan tx_state.insert_tables without mutating anything. 
+ for (table_id, tx_table) in &self.tx_state.insert_tables { + let rows: std::sync::Arc<[ProductValue]> = tx_table + .scan_rows(&self.tx_state.blob_store) + .map(|row| row.to_product_value()) + .collect(); + if !rows.is_empty() { + tx_data.set_inserts_for_table(*table_id, &tx_table.get_schema().table_name, rows); + } + } + + // Collect deletes: row pointers live in the committed state; read them + // without deleting. + for (table_id, delete_table) in &self.tx_state.delete_tables { + if let Ok((table, blob_store, _)) = + self.committed_state_write_lock.get_table_and_blob_store(*table_id) + { + let rows: std::sync::Arc<[ProductValue]> = delete_table + .iter() + .map(|row_ptr| { + table + .get_row_ref(blob_store, row_ptr) + .expect("delete_tables references non-existent row in committed state") + .to_product_value() + }) + .collect(); + if !rows.is_empty() { + tx_data.set_deletes_for_table(*table_id, &table.get_schema().table_name, rows); + } + } + } + + tx_data + } + pub fn insert_via_serialize_bsatn<'a, T: Serialize>( &'a mut self, table_id: TableId, diff --git a/crates/datastore/src/system_tables.rs b/crates/datastore/src/system_tables.rs index e75cc76b365..171baa5ccc0 100644 --- a/crates/datastore/src/system_tables.rs +++ b/crates/datastore/src/system_tables.rs @@ -88,6 +88,9 @@ pub const ST_TABLE_ACCESSOR_ID: TableId = TableId(18); pub const ST_INDEX_ACCESSOR_ID: TableId = TableId(19); /// The static ID of the table that maps canonical column names to accessor names pub const ST_COLUMN_ACCESSOR_ID: TableId = TableId(20); +/// The static ID of the 2PC participant state table +pub const ST_2PC_STATE_ID: TableId = TableId(21); +pub(crate) const ST_2PC_STATE_NAME: &str = "st_2pc_state"; pub(crate) const ST_CONNECTION_CREDENTIALS_NAME: &str = "st_connection_credentials"; pub const ST_TABLE_NAME: &str = "st_table"; @@ -205,7 +208,7 @@ pub enum SystemTable { st_table_accessor, } -pub fn system_tables() -> [TableSchema; 20] { +pub fn system_tables() -> [TableSchema; 21] 
{ [ // The order should match the `id` of the system table, that start with [ST_TABLE_IDX]. st_table_schema(), @@ -228,6 +231,7 @@ pub fn system_tables() -> [TableSchema; 20] { st_table_accessor_schema(), st_index_accessor_schema(), st_column_accessor_schema(), + st_2pc_state_schema(), ] } @@ -450,6 +454,11 @@ st_fields_enum!(enum StColumnAccessorFields { "accessor_name", AccessorName = 2, }); +// WARNING: For a stable schema, don't change the field names and discriminants. +st_fields_enum!(enum St2pcStateFields { + "prepare_id", PrepareId = 0, +}); + /// Helper method to check that a system table has the correct fields. /// Does not check field types since those aren't included in `StFields` types. /// If anything in here is not true, the system is completely broken, so it's fine to assert. @@ -668,6 +677,14 @@ fn system_module_def() -> ModuleDef { .with_unique_constraint(st_column_accessor_table_alias_cols) .with_index_no_accessor_name(btree(st_column_accessor_table_alias_cols)); + let st_2pc_state_type = builder.add_type::(); + builder + .build_table(ST_2PC_STATE_NAME, *st_2pc_state_type.as_ref().expect("should be ref")) + .with_type(TableType::System) + .with_unique_constraint(St2pcStateFields::PrepareId) + .with_index_no_accessor_name(btree(St2pcStateFields::PrepareId)) + .with_access(v9::TableAccess::Private); + let result = builder .finish() .try_into() @@ -693,6 +710,7 @@ fn system_module_def() -> ModuleDef { validate_system_table::(&result, ST_TABLE_ACCESSOR_NAME); validate_system_table::(&result, ST_INDEX_ACCESSOR_NAME); validate_system_table::(&result, ST_COLUMN_ACCESSOR_NAME); + validate_system_table::(&result, ST_2PC_STATE_NAME); result } @@ -892,6 +910,10 @@ fn st_client_schema() -> TableSchema { st_schema(ST_CLIENT_NAME, ST_CLIENT_ID) } +fn st_2pc_state_schema() -> TableSchema { + st_schema(ST_2PC_STATE_NAME, ST_2PC_STATE_ID) +} + fn st_connection_credential_schema() -> TableSchema { st_schema(ST_CONNECTION_CREDENTIALS_NAME, 
ST_CONNECTION_CREDENTIALS_ID) } @@ -968,6 +990,7 @@ pub(crate) fn system_table_schema(table_id: TableId) -> Option { ST_TABLE_ACCESSOR_ID => Some(st_table_accessor_schema()), ST_INDEX_ACCESSOR_ID => Some(st_index_accessor_schema()), ST_COLUMN_ACCESSOR_ID => Some(st_column_accessor_schema()), + ST_2PC_STATE_ID => Some(st_2pc_state_schema()), _ => None, } } @@ -1859,6 +1882,35 @@ impl From for ProductValue { } } +/// System table [ST_2PC_STATE_NAME] +/// +/// Tracks in-flight 2PC participant transactions. +/// A row is inserted when B enters PREPARE state and deleted on COMMIT or ABORT. +/// On recovery, any row here indicates a pending prepared transaction that must +/// be resumed (retransmit PREPARED to the coordinator and await the decision). +/// +/// | prepare_id | +/// |-------------| +/// | "prepare-1" | +#[derive(Clone, Debug, Eq, PartialEq, SpacetimeType)] +#[sats(crate = spacetimedb_lib)] +pub struct St2pcStateRow { + pub prepare_id: String, +} + +impl TryFrom> for St2pcStateRow { + type Error = DatastoreError; + fn try_from(row: RowRef<'_>) -> Result { + read_via_bsatn(row) + } +} + +impl From for ProductValue { + fn from(x: St2pcStateRow) -> Self { + to_product_value(&x) + } +} + thread_local! 
{ static READ_BUF: RefCell> = const { RefCell::new(Vec::new()) }; } From ffcb5ede13e80575792754f8e007d83118d77852 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 21:27:36 +0530 Subject: [PATCH 10/22] fix prepare --- crates/core/src/db/relational_db.rs | 12 ++- .../src/host/wasm_common/module_host_actor.rs | 96 ++++++++++--------- .../locking_tx_datastore/committed_state.rs | 31 ++++++ .../src/locking_tx_datastore/mut_tx.rs | 25 +++++ 4 files changed, 118 insertions(+), 46 deletions(-) diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 4b783f1d294..c141fe9de49 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -12,7 +12,7 @@ use spacetimedb_commitlog::{self as commitlog, Commitlog, SizeOnDisk}; use spacetimedb_data_structures::map::HashSet; use spacetimedb_datastore::db_metrics::DB_METRICS; use spacetimedb_datastore::error::{DatastoreError, TableError, ViewError}; -use spacetimedb_datastore::execution_context::{Workload, WorkloadType}; +use spacetimedb_datastore::execution_context::{ReducerContext, Workload, WorkloadType}; use spacetimedb_datastore::locking_tx_datastore::datastore::TxMetrics; use spacetimedb_datastore::locking_tx_datastore::state_view::{ IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView, @@ -855,6 +855,16 @@ impl RelationalDB { (tx_data, tx_metrics, tx) } + /// Forward a pre-built `TxData` directly to the durability worker. + /// + /// Used by the 2PC participant path to make the `st_2pc_state` PREPARE marker durable + /// while the main write lock is still held (i.e. without going through a full commit). + pub fn request_durability_for_tx_data(&self, reducer_context: Option, tx_data: &Arc) { + if let Some(durability) = &self.durability { + durability.request_durability(reducer_context, tx_data); + } + } + /// Get the [`DurableOffset`] of this database, or `None` if this is an /// in-memory instance. 
pub fn durable_tx_offset(&self) -> Option { diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index ff2b40731bb..7abe60ee425 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -568,24 +568,22 @@ impl WasmModuleInstance { /// Run the reducer as a 2PC participant PREPARE. /// - /// Holds the write lock (MutTxId) open until a COMMIT or ABORT decision arrives. - /// The flow: - /// 1. Run reducer (no commit). - /// 2. If reducer failed: send failure via `prepared_tx`; rollback; return. - /// 3. If reducer succeeded: insert `st_2pc_state` row; send PREPARED result via `prepared_tx`. - /// 4. Block on `decision_rx`: - /// - `true` (COMMIT): commit via `commit_and_broadcast_event`, then delete `st_2pc_state`. - /// - `false` (ABORT) or channel closed: roll back. /// Run the reducer as a 2PC participant PREPARE. /// /// Holds the write lock (MutTxId) open until a COMMIT or ABORT decision arrives. /// The flow: - /// 1. Run reducer (no commit). + /// 1. Run reducer (no commit); hold open MutTxId (write lock). /// 2. If reducer failed: send failure via `prepared_tx`; rollback; return. - /// 3. If reducer succeeded: insert `st_2pc_state` row; send PREPARED result via `prepared_tx`. - /// 4. Block on `decision_rx`: - /// - `true` (COMMIT): commit via `commit_and_broadcast_event`, then delete `st_2pc_state`. - /// - `false` (ABORT) or channel closed: roll back. + /// 3. If reducer succeeded: call `flush_2pc_prepare_marker` — inserts `st_2pc_state` + /// directly into committed state (bumps tx_offset), returns `TxData` for the marker. + /// Forward the `TxData` to the durability worker so the PREPARE is in the commitlog. + /// The write lock remains held throughout. + /// 4. Signal PREPARED via `prepared_tx`. + /// 5. 
Block on `decision_rx`: + /// - `true` (COMMIT): commit main tx (reducer changes get the next tx_offset), then + /// delete `st_2pc_state` in a new tx. + /// - `false` (ABORT) or channel closed: roll back main tx; delete `st_2pc_state` in + /// a new tx (the marker row is already in committed state from step 3). pub fn call_reducer_prepare_and_hold( &mut self, params: CallReducerParams, @@ -593,17 +591,19 @@ impl WasmModuleInstance { prepared_tx: tokio::sync::oneshot::Sender<(ReducerCallResult, Option)>, decision_rx: std::sync::mpsc::Receiver, ) { - let (mut tx, event, client, trapped) = - crate::callgrind_flag::invoke_allowing_callgrind(|| { - self.common.run_reducer_no_commit(None, params, &mut self.instance) - }); + let stdb = self.instance.replica_ctx().relational_db().clone(); + + // Step 1: run the reducer and hold the write lock open. + let (mut tx, event, client, trapped) = crate::callgrind_flag::invoke_allowing_callgrind(|| { + self.common.run_reducer_no_commit(None, params, &mut self.instance) + }); self.trapped = trapped; let energy_quanta_used = event.energy_quanta_used; let total_duration = event.host_execution_duration; if !matches!(event.status, EventStatus::Committed(_)) { - // Reducer failed — roll back and signal failure to the waiter. + // Reducer failed — roll back and signal failure; no marker was written. let res = ReducerCallResult { outcome: ReducerOutcome::from(&event.status), energy_used: energy_quanta_used, @@ -611,45 +611,51 @@ impl WasmModuleInstance { }; let return_value = event.reducer_return_value.clone(); let _ = prepared_tx.send((res, return_value)); - // commit_and_broadcast_event handles rollback for non-Committed status. - commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); + let _ = stdb.rollback_mut_tx(tx); return; } - // Insert the st_2pc_state marker into the held tx atomically with the reducer's changes. 
- if let Err(e) = tx.insert_st_2pc_state(&prepare_id) { - log::error!("call_reducer_prepare_and_hold: failed to insert st_2pc_state for {prepare_id}: {e}"); - } + // Step 3: flush the st_2pc_state marker directly into committed state, assign + // a tx_offset, and forward to durability — all while holding the write lock. + let marker_tx_data = match tx.flush_2pc_prepare_marker(&prepare_id) { + Ok(td) => std::sync::Arc::new(td), + Err(e) => { + log::error!("call_reducer_prepare_and_hold: flush_2pc_prepare_marker failed for {prepare_id}: {e}"); + let _ = stdb.rollback_mut_tx(tx); + return; + } + }; + stdb.request_durability_for_tx_data(None, &marker_tx_data); + // Step 4: signal PREPARED. let res = ReducerCallResult { outcome: ReducerOutcome::from(&event.status), energy_used: energy_quanta_used, execution_duration: total_duration, }; let return_value = event.reducer_return_value.clone(); - // Signal PREPARED — the coordinator can now send COMMIT or ABORT. let _ = prepared_tx.send((res, return_value)); - // Block the executor thread until we receive a decision. + // Step 5: block the executor thread until we receive a decision. let commit = decision_rx.recv().unwrap_or(false); if commit { + // Delete the marker in the same tx as the reducer changes so they are + // committed atomically. The row is in committed state (inserted by + // flush_2pc_prepare_marker), so delete_st_2pc_state finds it via iter. + if let Err(e) = tx.delete_st_2pc_state(&prepare_id) { + log::error!("call_reducer_prepare_and_hold: failed to delete st_2pc_state for {prepare_id}: {e}"); + } commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); - - // Delete the st_2pc_state row in a new tx so recovery knows COMMIT is done. - let stdb = self.instance.replica_ctx().relational_db(); + } else { + // ABORT: roll back reducer changes (tx_state discarded). + // The marker row is already in committed state; clean it up in a new tx. 
+ let _ = stdb.rollback_mut_tx(tx); if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { Ok(del_tx.delete_st_2pc_state(&prepare_id)?) }) { - log::error!("call_reducer_prepare_and_hold: failed to delete st_2pc_state for {prepare_id}: {e}"); + log::error!("call_reducer_prepare_and_hold: abort: failed to delete st_2pc_state for {prepare_id}: {e}"); } - } else { - // ABORT: roll back by passing a failure event. - let abort_event = ModuleEvent { - status: EventStatus::FailedInternal("2PC abort".into()), - ..event - }; - commit_and_broadcast_event(&self.common.info.subscriptions, None, abort_event, tx); } } @@ -975,10 +981,7 @@ impl InstanceCommon { ); let mut req = client.post(&url); if let Some(ref token) = auth_token { - req = req.header( - http::header::AUTHORIZATION, - format!("Bearer {token}"), - ); + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); } match req.send().await { Ok(resp) if resp.status().is_success() => { @@ -991,9 +994,7 @@ impl InstanceCommon { ); } Err(e) => { - log::error!( - "2PC {action}: transport error for {prepare_id} on {db_identity}: {e}" - ); + log::error!("2PC {action}: transport error for {prepare_id} on {db_identity}: {e}"); } } } @@ -1026,7 +1027,12 @@ impl InstanceCommon { tx: Option, params: CallReducerParams, inst: &mut I, - ) -> (MutTxId, ModuleEvent, Option>, bool) { + ) -> ( + MutTxId, + ModuleEvent, + Option>, + bool, + ) { let CallReducerParams { timestamp, caller_identity, diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index d5cc2099d8a..4e9880fdf6c 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -1415,6 +1415,37 @@ impl CommittedState { self.tables.insert(table_id, Self::make_table(schema)); } + /// Insert a single row directly into the committed state, bypassing `TxState`. 
+ /// + /// Assigns the next `tx_offset` to the resulting `TxData` and increments the counter. + /// The write lock (and therefore the transaction) is **not** released. + /// + /// Used by the 2PC participant path to flush the `st_2pc_state` PREPARE marker to the + /// commitlog (via the durability worker) while keeping the reducer's write lock open, + /// so that no other transaction can interleave between PREPARE and COMMIT/ABORT. + pub(super) fn insert_row_and_consume_offset( + &mut self, + table_id: TableId, + schema: &Arc, + row: &ProductValue, + ) -> Result { + let (table, blob_store, pool) = self.get_table_and_blob_store_or_create(table_id, schema); + table + .insert(pool, blob_store, row) + .map_err(|e| match e { + InsertError::Duplicate(e) => DatastoreError::from(TableError::Duplicate(e)), + InsertError::Bflatn(e) => DatastoreError::from(TableError::Bflatn(e)), + InsertError::IndexError(e) => DatastoreError::from(IndexError::UniqueConstraintViolation(e)), + })?; + + let row_arc: Arc<[ProductValue]> = Arc::from([row.clone()]); + let mut tx_data = TxData::default(); + tx_data.set_inserts_for_table(table_id, &schema.table_name, row_arc); + tx_data.set_tx_offset(self.next_tx_offset); + self.next_tx_offset += 1; + Ok(tx_data) + } + pub(super) fn get_table_and_blob_store_or_create<'this>( &'this mut self, table_id: TableId, diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index 837a172cece..0ac03f2a2eb 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -2705,6 +2705,31 @@ impl MutTxId { }) } + /// Write the `st_2pc_state` PREPARE marker directly to the committed state and allocate a + /// `tx_offset` for it, **without** releasing the write lock or committing the pending + /// reducer changes in `tx_state`. 
+ /// + /// Returns the `TxData` containing just the `st_2pc_state` insert together with its + /// assigned `tx_offset`. The caller is responsible for forwarding this to the durability + /// worker so that the PREPARE record becomes durable in the commitlog. + /// + /// Because the write lock remains held, no other transaction can begin between this call + /// and the eventual `commit` / `rollback` of the enclosing `MutTxId`. On ABORT the + /// caller must delete the `st_2pc_state` row in a subsequent transaction (the row was + /// inserted directly into the committed state and is not part of `tx_state`). + pub fn flush_2pc_prepare_marker(&mut self, prepare_id: &str) -> Result { + let schema = self + .committed_state_write_lock + .get_schema(ST_2PC_STATE_ID) + .cloned() + .expect("st_2pc_state system table must exist in committed state"); + let row = ProductValue::from(St2pcStateRow { + prepare_id: prepare_id.to_owned(), + }); + self.committed_state_write_lock + .insert_row_and_consume_offset(ST_2PC_STATE_ID, &schema, &row) + } + /// Delete the `st_2pc_state` row for the given `prepare_id`, called on COMMIT or ABORT. 
pub fn delete_st_2pc_state(&mut self, prepare_id: &str) -> Result<()> { if let Err(e) = self.delete_col_eq( From 197607056c09f127930f4d851b3a34f89cf8d1eb Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 21:46:53 +0530 Subject: [PATCH 11/22] lint --- crates/client-api/src/routes/database.rs | 7 +++---- crates/core/src/db/relational_db.rs | 5 +---- crates/core/src/host/instance_env.rs | 18 +++++++++-------- crates/core/src/host/module_host.rs | 9 +++++---- .../src/host/wasm_common/module_host_actor.rs | 12 +++++------ .../src/host/wasmtime/wasm_instance_env.rs | 10 ++++------ .../locking_tx_datastore/committed_state.rs | 12 +++++------ .../src/locking_tx_datastore/mut_tx.rs | 18 ++++++++--------- .../tests/smoketests/cross_db_2pc.rs | 20 +++++++------------ tools/tpcc-runner/src/client.rs | 12 ++++------- tools/tpcc-runner/src/coordinator.rs | 4 +++- tools/tpcc-runner/src/loader.rs | 6 +----- 12 files changed, 57 insertions(+), 76 deletions(-) diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index 7c82f5918de..d007c7a4441 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -291,10 +291,9 @@ pub async fn prepare( ) .into_response(); if !prepare_id.is_empty() { - response.headers_mut().insert( - "X-Prepare-Id", - http::HeaderValue::from_str(&prepare_id).unwrap(), - ); + response + .headers_mut() + .insert("X-Prepare-Id", http::HeaderValue::from_str(&prepare_id).unwrap()); } Ok(response) } diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index c141fe9de49..6ce5f01f8e7 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -76,7 +76,6 @@ type RowCountFn = Arc i64 + Send + Sync>; /// The type of transactions committed by [RelationalDB]. 
pub type Txdata = commitlog::payload::Txdata; - /// We've added a module version field to the system tables, but we don't yet /// have the infrastructure to support multiple versions. /// All modules are currently locked to this version, but this will be @@ -460,9 +459,7 @@ impl RelationalDB { /// If non-empty, the caller must resume these transactions: retransmit PREPARED to /// the coordinator and await a COMMIT or ABORT decision before allowing normal operation. pub fn pending_2pc_prepares(&self) -> Result, DBError> { - self.with_auto_commit(Workload::Internal, |tx| { - tx.scan_st_2pc_state().map_err(DBError::from) - }) + self.with_auto_commit(Workload::Internal, |tx| tx.scan_st_2pc_state().map_err(DBError::from)) } /// Read the set of clients currently connected to the database. diff --git a/crates/core/src/host/instance_env.rs b/crates/core/src/host/instance_env.rs index 29ed8b28069..eb5c9e19919 100644 --- a/crates/core/src/host/instance_env.rs +++ b/crates/core/src/host/instance_env.rs @@ -1034,7 +1034,10 @@ impl InstanceEnv { let result = async { let response = req.send().await.map_err(|e| NodesError::HttpError(e.to_string()))?; let status = response.status().as_u16(); - let body = response.bytes().await.map_err(|e| NodesError::HttpError(e.to_string()))?; + let body = response + .bytes() + .await + .map_err(|e| NodesError::HttpError(e.to_string()))?; Ok((status, body)) } .await; @@ -1102,7 +1105,10 @@ impl InstanceEnv { .get("X-Prepare-Id") .and_then(|v| v.to_str().ok()) .map(|s| s.to_owned()); - let body = response.bytes().await.map_err(|e| NodesError::HttpError(e.to_string()))?; + let body = response + .bytes() + .await + .map_err(|e| NodesError::HttpError(e.to_string()))?; Ok((status, body, prepare_id)) } .await; @@ -1121,9 +1127,7 @@ impl InstanceEnv { } /// Commit all prepared participants (called after coordinator's reducer succeeds). 
-    pub fn commit_all_prepared(
-        &mut self,
-    ) -> impl Future<Output = ()> + use<> {
+    pub fn commit_all_prepared(&mut self) -> impl Future<Output = ()> + use<> {
         let participants = mem::take(&mut self.prepared_participants);
         let client = self.replica_ctx.call_reducer_client.clone();
         let router = self.replica_ctx.call_reducer_router.clone();
@@ -1167,9 +1171,7 @@
     }
 
     /// Abort all prepared participants (called when coordinator's reducer fails).
-    pub fn abort_all_prepared(
-        &mut self,
-    ) -> impl Future<Output = ()> + use<> {
+    pub fn abort_all_prepared(&mut self) -> impl Future<Output = ()> + use<> {
         let participants = mem::take(&mut self.prepared_participants);
         let client = self.replica_ctx.call_reducer_client.clone();
         let router = self.replica_ctx.call_reducer_router.clone();
diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs
index 9185689ce0e..bc2414d611b 100644
--- a/crates/core/src/host/module_host.rs
+++ b/crates/core/src/host/module_host.rs
@@ -1791,8 +1791,7 @@
         let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed));
 
         // Channel for signalling PREPARED result back to this task.
-        let (prepared_tx, prepared_rx) =
-            tokio::sync::oneshot::channel::<(ReducerCallResult, Option)>();
+        let (prepared_tx, prepared_rx) = tokio::sync::oneshot::channel::<(ReducerCallResult, Option)>();
 
         // Channel for sending the COMMIT/ABORT decision to the executor thread.
         let (decision_tx, decision_rx) = std::sync::mpsc::channel::<bool>();
@@ -1855,7 +1854,8 @@
     /// Finalize a prepared transaction as COMMIT.
     pub fn commit_prepared(&self, prepare_id: &str) -> Result<(), String> {
-        let info = self.prepared_txs
+        let info = self
+            .prepared_txs
             .remove(prepare_id)
             .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?;
         // Unblock the executor thread to commit.
@@ -1865,7 +1865,8 @@
     /// Abort a prepared transaction.
pub fn abort_prepared(&self, prepare_id: &str) -> Result<(), String> { - let info = self.prepared_txs + let info = self + .prepared_txs .remove(prepare_id) .ok_or_else(|| format!("no such prepared transaction: {prepare_id}"))?; // Unblock the executor thread to abort. diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 7abe60ee425..f67a397544f 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -654,7 +654,9 @@ impl WasmModuleInstance { if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { Ok(del_tx.delete_st_2pc_state(&prepare_id)?) }) { - log::error!("call_reducer_prepare_and_hold: abort: failed to delete st_2pc_state for {prepare_id}: {e}"); + log::error!( + "call_reducer_prepare_and_hold: abort: failed to delete st_2pc_state for {prepare_id}: {e}" + ); } } } @@ -953,11 +955,9 @@ impl InstanceCommon { std::thread::scope(|s| { s.spawn(|| { handle.block_on(async { - if committed { - if let Some(mut durable_offset) = stdb.durable_tx_offset() { - let current: u64 = durable_offset.last_seen().unwrap_or(0); - let _ = durable_offset.wait_for(current + 1).await; - } + if committed && let Some(mut durable_offset) = stdb.durable_tx_offset() { + let current: u64 = durable_offset.last_seen().unwrap_or(0); + let _ = durable_offset.wait_for(current + 1).await; } let client = replica_ctx.call_reducer_client.clone(); diff --git a/crates/core/src/host/wasmtime/wasm_instance_env.rs b/crates/core/src/host/wasmtime/wasm_instance_env.rs index 73860dbf4ce..a40ac3cfd14 100644 --- a/crates/core/src/host/wasmtime/wasm_instance_env.rs +++ b/crates/core/src/host/wasmtime/wasm_instance_env.rs @@ -2076,12 +2076,10 @@ impl WasmInstanceEnv { match result { Ok((status, body, prepare_id)) => { // If we got a prepare_id, register this participant. 
- if let Some(pid) = prepare_id { - if status < 300 { - env.instance_env - .prepared_participants - .push((database_identity, pid)); - } + if let Some(pid) = prepare_id + && status < 300 + { + env.instance_env.prepared_participants.push((database_identity, pid)); } let bytes_source = WasmInstanceEnv::create_bytes_source(env, body)?; bytes_source.0.write_to(mem, out)?; diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index 4e9880fdf6c..5a27bd15ef4 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -1430,13 +1430,11 @@ impl CommittedState { row: &ProductValue, ) -> Result { let (table, blob_store, pool) = self.get_table_and_blob_store_or_create(table_id, schema); - table - .insert(pool, blob_store, row) - .map_err(|e| match e { - InsertError::Duplicate(e) => DatastoreError::from(TableError::Duplicate(e)), - InsertError::Bflatn(e) => DatastoreError::from(TableError::Bflatn(e)), - InsertError::IndexError(e) => DatastoreError::from(IndexError::UniqueConstraintViolation(e)), - })?; + table.insert(pool, blob_store, row).map_err(|e| match e { + InsertError::Duplicate(e) => DatastoreError::from(TableError::Duplicate(e)), + InsertError::Bflatn(e) => DatastoreError::from(TableError::Bflatn(e)), + InsertError::IndexError(e) => DatastoreError::from(IndexError::UniqueConstraintViolation(e)), + })?; let row_arc: Arc<[ProductValue]> = Arc::from([row.clone()]); let mut tx_data = TxData::default(); diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index 0ac03f2a2eb..4f227ae0dd5 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -20,13 +20,13 @@ use crate::{ use crate::{ error::{IndexError, SequenceError, TableError}, system_tables::{ - with_sys_table_buf, 
StClientFields, StClientRow, StColumnAccessorFields, StColumnAccessorRow, StColumnFields, - StColumnRow, StConstraintFields, StConstraintRow, StEventTableRow, StFields as _, StIndexAccessorFields, - StIndexAccessorRow, StIndexFields, StIndexRow, StRowLevelSecurityFields, StRowLevelSecurityRow, - StScheduledFields, StScheduledRow, StSequenceFields, StSequenceRow, StTableAccessorFields, StTableAccessorRow, - StTableFields, StTableRow, SystemTable, ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, ST_CONSTRAINT_ID, - ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, ST_SCHEDULED_ID, - ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, St2pcStateFields, St2pcStateRow, ST_2PC_STATE_ID, + with_sys_table_buf, St2pcStateFields, St2pcStateRow, StClientFields, StClientRow, StColumnAccessorFields, + StColumnAccessorRow, StColumnFields, StColumnRow, StConstraintFields, StConstraintRow, StEventTableRow, + StFields as _, StIndexAccessorFields, StIndexAccessorRow, StIndexFields, StIndexRow, StRowLevelSecurityFields, + StRowLevelSecurityRow, StScheduledFields, StScheduledRow, StSequenceFields, StSequenceRow, + StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, SystemTable, ST_2PC_STATE_ID, + ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, ST_CONSTRAINT_ID, ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, + ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, ST_SCHEDULED_ID, ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, }, }; use crate::{execution_context::ExecutionContext, system_tables::StViewColumnRow}; @@ -2774,9 +2774,7 @@ impl MutTxId { // Collect deletes: row pointers live in the committed state; read them // without deleting. 
for (table_id, delete_table) in &self.tx_state.delete_tables { - if let Ok((table, blob_store, _)) = - self.committed_state_write_lock.get_table_and_blob_store(*table_id) - { + if let Ok((table, blob_store, _)) = self.committed_state_write_lock.get_table_and_blob_store(*table_id) { let rows: std::sync::Arc<[ProductValue]> = delete_table .iter() .map(|row_ptr| { diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc.rs b/crates/smoketests/tests/smoketests/cross_db_2pc.rs index feb92deda0e..4279852dfb0 100644 --- a/crates/smoketests/tests/smoketests/cross_db_2pc.rs +++ b/crates/smoketests/tests/smoketests/cross_db_2pc.rs @@ -91,18 +91,12 @@ fn test_cross_db_2pc_happy_path() { // Publish bank B (the participant that will be debited). test.publish_module_named(&db_b_name, false) .expect("failed to publish bank B"); - let db_b_identity = test - .database_identity - .clone() - .expect("bank B identity not set"); + let db_b_identity = test.database_identity.clone().expect("bank B identity not set"); // Publish bank A (the coordinator that will be credited). test.publish_module_named(&db_a_name, false) .expect("failed to publish bank A"); - let _db_a_identity = test - .database_identity - .clone() - .expect("bank A identity not set"); + let _db_a_identity = test.database_identity.clone().expect("bank A identity not set"); // Transfer 50 from B's alice to A's alice. // The coordinator is bank A. It credits locally, then calls debit on B via 2PC. @@ -155,10 +149,7 @@ fn test_cross_db_2pc_abort_insufficient_funds() { // Publish bank B. test.publish_module_named(&db_b_name, false) .expect("failed to publish bank B"); - let db_b_identity = test - .database_identity - .clone() - .expect("bank B identity not set"); + let db_b_identity = test.database_identity.clone().expect("bank B identity not set"); // Publish bank A. 
test.publish_module_named(&db_a_name, false) @@ -167,7 +158,10 @@ fn test_cross_db_2pc_abort_insufficient_funds() { // Try to transfer 200 -- B only has 100, so the remote debit will fail. let result = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); // The call should fail because the remote debit panicked. - assert!(result.is_err(), "Expected transfer_funds to fail due to insufficient funds"); + assert!( + result.is_err(), + "Expected transfer_funds to fail due to insufficient funds" + ); // Verify bank A: alice should still have 100 (the local credit was rolled back). let result_a = test diff --git a/tools/tpcc-runner/src/client.rs b/tools/tpcc-runner/src/client.rs index ef58bc51c6f..847f276eada 100644 --- a/tools/tpcc-runner/src/client.rs +++ b/tools/tpcc-runner/src/client.rs @@ -86,14 +86,10 @@ impl ModuleClient { increment_pending(pending); let pending_for_callback = Arc::clone(pending); let errors = Arc::clone(errors); - if let Err(err) = self - .conn - .reducers - .load_remote_warehouses_then(rows, move |_, res| { - handle_reducer_result("load_remote_warehouses", res, &errors); - decrement_pending(&pending_for_callback); - }) - { + if let Err(err) = self.conn.reducers.load_remote_warehouses_then(rows, move |_, res| { + handle_reducer_result("load_remote_warehouses", res, &errors); + decrement_pending(&pending_for_callback); + }) { decrement_pending(pending); return Err(anyhow!("load_remote_warehouses send error: {err}")); } diff --git a/tools/tpcc-runner/src/coordinator.rs b/tools/tpcc-runner/src/coordinator.rs index fdfd6473d95..7d17ae61a35 100644 --- a/tools/tpcc-runner/src/coordinator.rs +++ b/tools/tpcc-runner/src/coordinator.rs @@ -78,7 +78,9 @@ async fn register_driver( inner.registration_order.push(request.driver_id.clone()); inner.registrations.insert( request.driver_id.clone(), - DriverRegistration { assignment: assignment.clone() }, + DriverRegistration { + assignment: assignment.clone(), + }, ); assignment } diff --git 
a/tools/tpcc-runner/src/loader.rs b/tools/tpcc-runner/src/loader.rs index 19b3ab27175..96345ba8c81 100644 --- a/tools/tpcc-runner/src/loader.rs +++ b/tools/tpcc-runner/src/loader.rs @@ -430,11 +430,7 @@ fn load_customers_history_orders( ol_dist_info: alpha_string(rng, 24, 24), }); if order_line_batch.len() >= batch_size { - client.queue_load_order_lines( - std::mem::take(&mut order_line_batch), - &pending, - &errors, - )?; + client.queue_load_order_lines(std::mem::take(&mut order_line_batch), &pending, &errors)?; } } From 74bc3ebc46d01fdd70a30f78a3cd23e144a9f62c Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 22:59:35 +0530 Subject: [PATCH 12/22] recovery --- crates/client-api/src/routes/database.rs | 31 ++- crates/core/src/db/relational_db.rs | 19 +- crates/core/src/host/host_controller.rs | 5 + crates/core/src/host/module_host.rs | 233 +++++++++++++++++- .../src/host/wasm_common/module_host_actor.rs | 192 ++++++++++++--- .../src/locking_tx_datastore/mut_tx.rs | 145 +++++------ crates/datastore/src/system_tables.rs | 95 ++++++- 7 files changed, 606 insertions(+), 114 deletions(-) diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index d007c7a4441..7f4dd08e378 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -349,6 +349,31 @@ pub async fn abort_2pc( Ok(StatusCode::OK) } +/// 2PC coordinator status endpoint. +/// +/// Returns `"commit"` if the coordinator has durably decided COMMIT for `prepare_id`, +/// or `"abort"` otherwise. Participant B polls this to recover from a timeout or crash. 
+/// +/// `GET /v1/database/:name_or_identity/2pc/status/:prepare_id` +pub async fn status_2pc( + State(worker_ctx): State, + Extension(_auth): Extension, + Path(TwoPcParams { + name_or_identity, + prepare_id, + }): Path, +) -> axum::response::Result { + let (module, _database) = find_module_and_database(&worker_ctx, name_or_identity).await?; + + let decision = if module.has_2pc_coordinator_commit(&prepare_id) { + "commit" + } else { + "abort" + }; + + Ok((StatusCode::OK, decision)) +} + fn reducer_outcome_response( module: &ModuleHost, owner_identity: &Identity, @@ -1361,6 +1386,8 @@ pub struct DatabaseRoutes { pub commit_2pc_post: MethodRouter, /// POST: /database/:name_or_identity/2pc/abort/:prepare_id pub abort_2pc_post: MethodRouter, + /// GET: /database/:name_or_identity/2pc/status/:prepare_id + pub status_2pc_get: MethodRouter, } impl Default for DatabaseRoutes @@ -1389,6 +1416,7 @@ where prepare_post: post(prepare::), commit_2pc_post: post(commit_2pc::), abort_2pc_post: post(abort_2pc::), + status_2pc_get: get(status_2pc::), } } } @@ -1416,7 +1444,8 @@ where .route("/reset", self.db_reset) .route("/prepare/:reducer", self.prepare_post) .route("/2pc/commit/:prepare_id", self.commit_2pc_post) - .route("/2pc/abort/:prepare_id", self.abort_2pc_post); + .route("/2pc/abort/:prepare_id", self.abort_2pc_post) + .route("/2pc/status/:prepare_id", self.status_2pc_get); axum::Router::new() .route("/", self.root_post) diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 6ce5f01f8e7..50e5a10b304 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -454,14 +454,25 @@ impl RelationalDB { } /// Read any 2PC participant transactions that were in PREPARE state when the database - /// last shut down (or crashed). Each returned string is a `prepare_id`. + /// last shut down (or crashed). 
     ///
-    /// If non-empty, the caller must resume these transactions: retransmit PREPARED to
-    /// the coordinator and await a COMMIT or ABORT decision before allowing normal operation.
-    pub fn pending_2pc_prepares(&self) -> Result<Vec<String>, DBError> {
+    /// Each returned row contains all the information needed to resume the transaction:
+    /// the prepare_id, coordinator identity, reducer name/args, and caller context.
+    /// B never aborts on its own — it polls the coordinator for a decision.
+    pub fn pending_2pc_prepares(&self) -> Result<Vec<St2pcStateRow>, DBError> {
         self.with_auto_commit(Workload::Internal, |tx| tx.scan_st_2pc_state().map_err(DBError::from))
     }
 
+    /// Read any 2PC coordinator log entries that have not yet been acknowledged by their
+    /// participants. Used on coordinator crash-recovery to retransmit COMMIT decisions.
+    pub fn pending_2pc_coordinator_commits(
+        &self,
+    ) -> Result<Vec<St2pcCoordinatorLogRow>, DBError> {
+        self.with_auto_commit(Workload::Internal, |tx| {
+            tx.scan_st_2pc_coordinator_log().map_err(DBError::from)
+        })
+    }
+
     /// Read the set of clients currently connected to the database.
     pub fn connected_clients(&self) -> Result {
         self.with_read_only(Workload::Internal, |tx| {
diff --git a/crates/core/src/host/host_controller.rs b/crates/core/src/host/host_controller.rs
index dd774111b86..8ab53513314 100644
--- a/crates/core/src/host/host_controller.rs
+++ b/crates/core/src/host/host_controller.rs
@@ -1141,6 +1141,11 @@
         module_host.clear_all_clients().await?;
 
         scheduler_starter.start(&module_host)?;
+
+        // Crash recovery: retransmit any pending 2PC decisions from before the restart.
+ module_host.recover_2pc_coordinator(); + module_host.recover_2pc_participant(); + let disk_metrics_recorder_task = tokio::spawn(metric_reporter(replica_ctx.clone())).abort_handle(); let view_cleanup_task = spawn_view_cleanup_loop(replica_ctx.relational_db().clone()); diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index bc2414d611b..2816a856999 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1808,17 +1808,18 @@ impl ModuleHost { let this = self.clone(); let reducer_name_owned = reducer_def.name.clone(); let prepare_id_clone = prepare_id.clone(); + let coordinator_identity = caller_identity; tokio::spawn(async move { let _ = this .call( &reducer_name_owned, - (params, prepare_id_clone, prepared_tx, decision_rx), - async |(p, pid, ptx, drx), inst| { - inst.call_reducer_prepare_and_hold(p, pid, ptx, drx); + (params, prepare_id_clone, coordinator_identity, prepared_tx, decision_rx), + async |(p, pid, cid, ptx, drx), inst| { + inst.call_reducer_prepare_and_hold(p, pid, cid, ptx, drx); Ok::<(), ReducerCallError>(()) }, // JS modules: no 2PC support yet. - async |(p, _pid, ptx, _drx), inst| { + async |(p, _pid, _cid, ptx, _drx), inst| { let (res, rv) = inst.call_reducer(p).await.map(|r| (r, None)).unwrap_or_else(|e| { log::error!("prepare_reducer JS fallback: {e}"); ( @@ -1874,6 +1875,230 @@ impl ModuleHost { Ok(()) } + /// Check whether `prepare_id` is present in the coordinator log of this database. + /// Used by participant B to ask coordinator A: "did you commit?" + pub fn has_2pc_coordinator_commit(&self, prepare_id: &str) -> bool { + let db = self.relational_db(); + db.pending_2pc_coordinator_commits() + .map(|rows| rows.iter().any(|r| r.participant_prepare_id == prepare_id)) + .unwrap_or(false) + } + + /// Crash recovery for the **coordinator** role. 
+ /// + /// Scans `st_2pc_coordinator_log` for participants that have not yet acked + /// COMMIT and retransmits the HTTP commit call. Deletes the log entry on success. + pub fn recover_2pc_coordinator(&self) { + let db = self.relational_db().clone(); + let rows = match db.pending_2pc_coordinator_commits() { + Ok(r) => r, + Err(e) => { + log::error!("recover_2pc_coordinator: scan failed: {e}"); + return; + } + }; + if rows.is_empty() { + return; + } + let replica_ctx = self.replica_ctx().clone(); + let db2 = db.clone(); + tokio::spawn(async move { + let client = replica_ctx.call_reducer_client.clone(); + let router = replica_ctx.call_reducer_router.clone(); + let auth_token = replica_ctx.call_reducer_auth_token.clone(); + for row in rows { + let prepare_id = row.participant_prepare_id.clone(); + let participant_identity = match Identity::from_hex(&row.participant_identity_hex) { + Ok(id) => id, + Err(e) => { + log::error!("recover_2pc_coordinator: invalid participant identity hex {}: {e}", row.participant_identity_hex); + continue; + } + }; + let base_url = match router.resolve_base_url(participant_identity).await { + Ok(url) => url, + Err(e) => { + log::warn!("recover_2pc_coordinator: cannot resolve URL for {participant_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/commit/{}", + base_url, + participant_identity.to_hex(), + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("recover_2pc_coordinator: re-committed {prepare_id} on {participant_identity}"); + if let Err(e) = db2.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |tx| { + Ok(tx.delete_st_2pc_coordinator_log(&prepare_id)?) 
+ }) { + log::warn!("recover_2pc_coordinator: delete coordinator log failed for {prepare_id}: {e}"); + } + } + Ok(resp) => { + log::warn!("recover_2pc_coordinator: commit for {prepare_id} returned {}", resp.status()); + } + Err(e) => { + log::warn!("recover_2pc_coordinator: transport error for {prepare_id}: {e}"); + } + } + } + }); + } + + /// Crash recovery for the **participant** role. + /// + /// Scans `st_2pc_state` for any prepared-but-not-decided transactions, re-runs + /// each reducer to reacquire the write lock, then polls the coordinator for a decision. + /// + /// **B never aborts on its own** — only the coordinator's response yields ABORT. + pub fn recover_2pc_participant(&self) { + let db = self.relational_db().clone(); + let rows = match db.pending_2pc_prepares() { + Ok(r) => r, + Err(e) => { + log::error!("recover_2pc_participant: scan failed: {e}"); + return; + } + }; + if rows.is_empty() { + return; + } + let this = self.clone(); + tokio::spawn(async move { + for row in rows { + let original_prepare_id = row.prepare_id.clone(); + let coordinator_identity = match Identity::from_hex(&row.coordinator_identity_hex) { + Ok(id) => id, + Err(e) => { + log::error!("recover_2pc_participant: invalid coordinator identity hex for {original_prepare_id}: {e}"); + continue; + } + }; + let caller_identity = match Identity::from_hex(&row.caller_identity_hex) { + Ok(id) => id, + Err(e) => { + log::error!("recover_2pc_participant: invalid caller identity hex for {original_prepare_id}: {e}"); + continue; + } + }; + let caller_connection_id = u128::from_str_radix(&row.caller_connection_id_hex, 16) + .map(ConnectionId::from_u128) + .unwrap_or(ConnectionId::ZERO); + let args = FunctionArgs::Bsatn(row.args_bsatn.clone().into()); + + // Step 1: Re-run the reducer to reacquire the write lock. 
+ let new_prepare_id = match this + .prepare_reducer(caller_identity, Some(caller_connection_id), &row.reducer_name, args) + .await + { + Ok((pid, result, _rv)) if !pid.is_empty() => { + log::info!( + "recover_2pc_participant: re-prepared {original_prepare_id} as {pid}: {:?}", + result.outcome + ); + pid + } + Ok(_) => { + // Reducer failed — treat as abort, clean up old marker. + log::warn!("recover_2pc_participant: reducer failed on re-run for {original_prepare_id}"); + let _ = db.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |tx| { + Ok(tx.delete_st_2pc_state(&original_prepare_id)?) + }); + continue; + } + Err(e) => { + log::error!("recover_2pc_participant: prepare_reducer error for {original_prepare_id}: {e:?}"); + continue; + } + }; + + // Step 2: Poll coordinator with the ORIGINAL prepare_id until we get a decision. + // We do this in a separate task so the loop can proceed to the next row. + let this2 = this.clone(); + let db2 = db.clone(); + let client = this.replica_ctx().call_reducer_client.clone(); + let router = this.replica_ctx().call_reducer_router.clone(); + let auth_token = this.replica_ctx().call_reducer_auth_token.clone(); + tokio::spawn(async move { + loop { + let decision = Self::query_coordinator_status_with_client( + &client, + &router, + auth_token.clone(), + coordinator_identity, + &original_prepare_id, + ).await; + match decision { + Some(commit) => { + if commit { + let _ = this2.commit_prepared(&new_prepare_id); + } else { + let _ = this2.abort_prepared(&new_prepare_id); + } + // Clean up the old st_2pc_state entry. + let _ = db2.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |tx| { + Ok(tx.delete_st_2pc_state(&original_prepare_id)?) + }); + break; + } + None => tokio::time::sleep(std::time::Duration::from_secs(5)).await, + } + } + }); + } + }); + } + + /// Query `GET /v1/database/{coordinator}/2pc/status/{prepare_id}`. 
+ /// + /// Returns `Some(true)` = COMMIT, `Some(false)` = ABORT, `None` = transient error (retry). + async fn query_coordinator_status_with_client( + client: &reqwest::Client, + router: &std::sync::Arc, + auth_token: Option, + coordinator_identity: Identity, + prepare_id: &str, + ) -> Option { + let base_url = match router.resolve_base_url(coordinator_identity).await { + Ok(url) => url, + Err(e) => { + log::warn!("2PC recovery status poll: cannot resolve coordinator URL: {e}"); + return None; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/status/{}", + base_url, + coordinator_identity.to_hex(), + prepare_id, + ); + let mut req = client.get(&url); + if let Some(token) = &auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + let body = resp.text().await.unwrap_or_default(); + Some(body.trim() == "commit") + } + Ok(resp) => { + log::warn!("2PC recovery status poll: coordinator returned {}", resp.status()); + None + } + Err(e) => { + log::warn!("2PC recovery status poll: transport error: {e}"); + None + } + } + } + pub async fn call_view_add_single_subscription( &self, sender: Arc, diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index f67a397544f..f311f9be580 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -566,32 +566,45 @@ impl WasmModuleInstance { }) } - /// Run the reducer as a 2PC participant PREPARE. - /// /// Run the reducer as a 2PC participant PREPARE. /// /// Holds the write lock (MutTxId) open until a COMMIT or ABORT decision arrives. /// The flow: - /// 1. Run reducer (no commit); hold open MutTxId (write lock). - /// 2. If reducer failed: send failure via `prepared_tx`; rollback; return. - /// 3. 
If reducer succeeded: call `flush_2pc_prepare_marker` — inserts `st_2pc_state` - /// directly into committed state (bumps tx_offset), returns `TxData` for the marker. - /// Forward the `TxData` to the durability worker so the PREPARE is in the commitlog. - /// The write lock remains held throughout. - /// 4. Signal PREPARED via `prepared_tx`. - /// 5. Block on `decision_rx`: - /// - `true` (COMMIT): commit main tx (reducer changes get the next tx_offset), then - /// delete `st_2pc_state` in a new tx. - /// - `false` (ABORT) or channel closed: roll back main tx; delete `st_2pc_state` in - /// a new tx (the marker row is already in committed state from step 3). + /// 1. Extract recovery info from `params` (reducer name, args, caller context). + /// 2. Run reducer (no commit); hold open MutTxId (write lock). + /// 3. If reducer failed: send failure via `prepared_tx`; rollback; return. + /// 4. Flush `st_2pc_state` marker (with recovery fields) directly into committed state. + /// The marker's `TxData` is forwarded to durability so PREPARE is durable. + /// 5. Signal PREPARED via `prepared_tx`. + /// 6. Wait for decision: + /// - Fast path: `decision_rx.recv_timeout(60s)` delivers COMMIT or ABORT. + /// - Slow path: on timeout/disconnect, poll coordinator status endpoint every 5s. + /// **B never aborts on its own** — only A's response can yield ABORT. + /// - COMMIT: delete `st_2pc_state` in the same tx as reducer changes (atomic). + /// - ABORT: rollback, delete `st_2pc_state` in a new tx. pub fn call_reducer_prepare_and_hold( &mut self, params: CallReducerParams, prepare_id: String, + coordinator_identity: crate::identity::Identity, prepared_tx: tokio::sync::oneshot::Sender<(ReducerCallResult, Option)>, decision_rx: std::sync::mpsc::Receiver, ) { let stdb = self.instance.replica_ctx().relational_db().clone(); + let replica_ctx = self.instance.replica_ctx().clone(); + + // Extract recovery info before params are consumed. 
+ let recovery_reducer_name = self + .common + .info + .module_def + .reducer_by_id(params.reducer_id) + .name + .to_string(); + let recovery_args_bsatn = params.args.get_bsatn().to_vec(); + let recovery_caller_identity_hex = params.caller_identity.to_hex().to_string(); + let recovery_caller_connection_id_hex = format!("{:x}", params.caller_connection_id.to_u128()); + let recovery_timestamp_micros = params.timestamp.to_micros_since_unix_epoch(); // Step 1: run the reducer and hold the write lock open. let (mut tx, event, client, trapped) = crate::callgrind_flag::invoke_allowing_callgrind(|| { @@ -615,9 +628,17 @@ impl WasmModuleInstance { return; } - // Step 3: flush the st_2pc_state marker directly into committed state, assign - // a tx_offset, and forward to durability — all while holding the write lock. - let marker_tx_data = match tx.flush_2pc_prepare_marker(&prepare_id) { + // Step 2: flush the st_2pc_state marker with recovery fields directly into committed + // state, assign a tx_offset, and forward to durability — while holding the write lock. + let marker_tx_data = match tx.flush_2pc_prepare_marker( + &prepare_id, + coordinator_identity.to_hex().to_string(), + recovery_reducer_name, + recovery_args_bsatn, + recovery_caller_identity_hex, + recovery_caller_connection_id_hex, + recovery_timestamp_micros, + ) { Ok(td) => std::sync::Arc::new(td), Err(e) => { log::error!("call_reducer_prepare_and_hold: flush_2pc_prepare_marker failed for {prepare_id}: {e}"); @@ -627,7 +648,7 @@ impl WasmModuleInstance { }; stdb.request_durability_for_tx_data(None, &marker_tx_data); - // Step 4: signal PREPARED. + // Step 3: signal PREPARED. let res = ReducerCallResult { outcome: ReducerOutcome::from(&event.status), energy_used: energy_quanta_used, @@ -636,20 +657,18 @@ impl WasmModuleInstance { let return_value = event.reducer_return_value.clone(); let _ = prepared_tx.send((res, return_value)); - // Step 5: block the executor thread until we receive a decision. 
- let commit = decision_rx.recv().unwrap_or(false); + // Step 4: wait for coordinator's decision (B never aborts on its own). + let commit = + Self::wait_for_2pc_decision(decision_rx, &prepare_id, coordinator_identity, &replica_ctx); if commit { - // Delete the marker in the same tx as the reducer changes so they are - // committed atomically. The row is in committed state (inserted by - // flush_2pc_prepare_marker), so delete_st_2pc_state finds it via iter. + // Delete the marker in the same tx as the reducer changes (atomic commit). if let Err(e) = tx.delete_st_2pc_state(&prepare_id) { log::error!("call_reducer_prepare_and_hold: failed to delete st_2pc_state for {prepare_id}: {e}"); } commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); } else { - // ABORT: roll back reducer changes (tx_state discarded). - // The marker row is already in committed state; clean it up in a new tx. + // ABORT: roll back reducer changes; clean up the already-committed marker. let _ = stdb.rollback_mut_tx(tx); if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { Ok(del_tx.delete_st_2pc_state(&prepare_id)?) @@ -661,6 +680,98 @@ impl WasmModuleInstance { } } + /// Wait for a 2PC COMMIT or ABORT decision for `prepare_id`. + /// + /// First waits on `decision_rx` for up to 60 seconds. If no decision arrives, + /// switches to polling the coordinator's `GET /2pc/status/{prepare_id}` endpoint + /// every 5 seconds until a definitive answer is received. + /// + /// **B never aborts on its own** — ABORT is only returned when A explicitly says so. 
+    fn wait_for_2pc_decision(
+        decision_rx: std::sync::mpsc::Receiver<bool>,
+        prepare_id: &str,
+        coordinator_identity: crate::identity::Identity,
+        replica_ctx: &std::sync::Arc<crate::replica_context::ReplicaContext>,
+    ) -> bool {
+        match decision_rx.recv_timeout(Duration::from_secs(60)) {
+            Ok(commit) => return commit,
+            Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
+                log::warn!("2PC prepare_id={prepare_id}: no decision after 60s, polling coordinator");
+            }
+            Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {
+                log::warn!("2PC prepare_id={prepare_id}: decision channel closed, polling coordinator");
+            }
+        }
+
+        let handle = tokio::runtime::Handle::current();
+        let client = replica_ctx.call_reducer_client.clone();
+        let router = replica_ctx.call_reducer_router.clone();
+        let auth_token = replica_ctx.call_reducer_auth_token.clone();
+        let prepare_id_owned = prepare_id.to_owned();
+        loop {
+            let decision = std::thread::scope(|s| {
+                s.spawn(|| {
+                    handle.block_on(Self::query_coordinator_status(
+                        &client,
+                        &router,
+                        auth_token.clone(),
+                        coordinator_identity,
+                        &prepare_id_owned,
+                    ))
+                })
+                .join()
+                .expect("coordinator poll thread panicked")
+            });
+            match decision {
+                Some(commit) => return commit,
+                None => std::thread::sleep(Duration::from_secs(5)),
+            }
+        }
+    }
+
+    /// Query `GET /v1/database/{coordinator}/2pc/status/{prepare_id}`.
+    ///
+    /// Returns `Some(true)` = COMMIT, `Some(false)` = ABORT, `None` = transient error (retry).
+ async fn query_coordinator_status( + client: &reqwest::Client, + router: &std::sync::Arc, + auth_token: Option, + coordinator_identity: crate::identity::Identity, + prepare_id: &str, + ) -> Option { + let base_url = match router.resolve_base_url(coordinator_identity).await { + Ok(url) => url, + Err(e) => { + log::warn!("2PC status poll: cannot resolve coordinator URL: {e}"); + return None; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/status/{}", + base_url, + coordinator_identity.to_hex(), + prepare_id, + ); + let mut req = client.get(&url); + if let Some(token) = &auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + let body = resp.text().await.unwrap_or_default(); + Some(body.trim() == "commit") + } + Ok(resp) => { + log::warn!("2PC status poll: coordinator returned {}", resp.status()); + None + } + Err(e) => { + log::warn!("2PC status poll: transport error: {e}"); + None + } + } + } + pub fn call_view(&mut self, cmd: ViewCommand) -> ViewCommandResult { let (res, trapped) = self.common.handle_cmd(cmd, &mut self.instance); self.trapped = trapped; @@ -937,18 +1048,32 @@ impl InstanceCommon { params: CallReducerParams, inst: &mut I, ) -> (ReducerCallResult, Option, bool) { - let (tx, event, client, trapped) = self.run_reducer_no_commit(tx, params, inst); + let (mut tx, event, client, trapped) = self.run_reducer_no_commit(tx, params, inst); let energy_quanta_used = event.energy_quanta_used; let total_duration = event.host_execution_duration; + // Take participants before commit so we can write the coordinator log atomically. + let prepared_participants = inst.take_prepared_participants(); + + // If this coordinator tx is committed and has participants, write coordinator log + // entries into the still-open tx. 
They are committed atomically with the tx, + // making A's COMMIT decision durable before any HTTP is sent to B (scenario 2 + // crash recovery). + if matches!(event.status, EventStatus::Committed(_)) && !prepared_participants.is_empty() { + for (db_identity, prepare_id) in &prepared_participants { + if let Err(e) = tx.insert_st_2pc_coordinator_log(prepare_id, &db_identity.to_hex().to_string()) { + log::error!("insert_st_2pc_coordinator_log failed for {prepare_id}: {e}"); + } + } + } + let event = commit_and_broadcast_event(&self.info.subscriptions, client, event, tx).event; - // 2PC post-commit coordination: commit or abort all prepared participants. - let prepared_participants = inst.take_prepared_participants(); + // 2PC post-commit coordination: send COMMIT or ABORT to each participant. if !prepared_participants.is_empty() { let committed = matches!(event.status, EventStatus::Committed(_)); - let stdb = self.info.subscriptions.relational_db(); + let stdb = self.info.subscriptions.relational_db().clone(); let replica_ctx = inst.replica_ctx().clone(); let handle = tokio::runtime::Handle::current(); @@ -986,6 +1111,15 @@ impl InstanceCommon { match req.send().await { Ok(resp) if resp.status().is_success() => { log::info!("2PC {action}: {prepare_id} on {db_identity}"); + // B acknowledged COMMIT — remove coordinator log entry + // (best-effort; recovery will clean up on restart if missed). + if committed { + if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { + Ok(del_tx.delete_st_2pc_coordinator_log(prepare_id)?) 
+ }) { + log::warn!("delete_st_2pc_coordinator_log failed for {prepare_id}: {e}"); + } + } } Ok(resp) => { log::error!( diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index 4f227ae0dd5..a2d2b306d92 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -20,13 +20,15 @@ use crate::{ use crate::{ error::{IndexError, SequenceError, TableError}, system_tables::{ - with_sys_table_buf, St2pcStateFields, St2pcStateRow, StClientFields, StClientRow, StColumnAccessorFields, - StColumnAccessorRow, StColumnFields, StColumnRow, StConstraintFields, StConstraintRow, StEventTableRow, + with_sys_table_buf, St2pcCoordinatorLogFields, St2pcCoordinatorLogRow, St2pcStateFields, St2pcStateRow, + StClientFields, StClientRow, StColumnAccessorFields, StColumnAccessorRow, StColumnFields, StColumnRow, + StConstraintFields, StConstraintRow, StEventTableRow, StFields as _, StIndexAccessorFields, StIndexAccessorRow, StIndexFields, StIndexRow, StRowLevelSecurityFields, StRowLevelSecurityRow, StScheduledFields, StScheduledRow, StSequenceFields, StSequenceRow, - StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, SystemTable, ST_2PC_STATE_ID, - ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, ST_CONSTRAINT_ID, ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, - ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, ST_SCHEDULED_ID, ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, + StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, SystemTable, + ST_2PC_COORDINATOR_LOG_ID, ST_2PC_STATE_ID, ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, + ST_CONSTRAINT_ID, ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, + ST_SCHEDULED_ID, ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, }, }; use crate::{execution_context::ExecutionContext, system_tables::StViewColumnRow}; @@ -2690,34 +2692,30 @@ impl MutTxId { .map(|row| 
row.pointer()) } - /// Insert a row into `st_2pc_state` to record that this database is a 2PC participant - /// in the given prepared transaction. The row persists until `delete_st_2pc_state` is - /// called on COMMIT or ABORT. On crash-recovery, any rows here indicate transactions - /// that need to be resumed. - pub fn insert_st_2pc_state(&mut self, prepare_id: &str) -> Result<()> { - let row = &St2pcStateRow { - prepare_id: prepare_id.to_owned(), - }; - self.insert_via_serialize_bsatn(ST_2PC_STATE_ID, row) - .map(|_| ()) - .inspect_err(|e| { - log::error!("insert_st_2pc_state: failed to insert prepare_id ({prepare_id}), error: {e}"); - }) - } - /// Write the `st_2pc_state` PREPARE marker directly to the committed state and allocate a /// `tx_offset` for it, **without** releasing the write lock or committing the pending /// reducer changes in `tx_state`. /// - /// Returns the `TxData` containing just the `st_2pc_state` insert together with its - /// assigned `tx_offset`. The caller is responsible for forwarding this to the durability - /// worker so that the PREPARE record becomes durable in the commitlog. + /// Stores all fields needed for crash recovery (coordinator identity, reducer name/args, + /// caller context) so that, on restart, the participant can re-run the reducer and poll + /// the coordinator for a COMMIT or ABORT decision. + /// + /// Returns the `TxData` for the marker row (with its own `tx_offset`). The caller must + /// forward this to the durability worker so the PREPARE record becomes durable. /// - /// Because the write lock remains held, no other transaction can begin between this call - /// and the eventual `commit` / `rollback` of the enclosing `MutTxId`. On ABORT the - /// caller must delete the `st_2pc_state` row in a subsequent transaction (the row was - /// inserted directly into the committed state and is not part of `tx_state`). 
- pub fn flush_2pc_prepare_marker(&mut self, prepare_id: &str) -> Result { + /// The write lock remains held after this call. On ABORT the caller must delete the + /// `st_2pc_state` row in a subsequent transaction. + #[allow(clippy::too_many_arguments)] + pub fn flush_2pc_prepare_marker( + &mut self, + prepare_id: &str, + coordinator_identity_hex: String, + reducer_name: String, + args_bsatn: Vec, + caller_identity_hex: String, + caller_connection_id_hex: String, + timestamp_micros: i64, + ) -> Result { let schema = self .committed_state_write_lock .get_schema(ST_2PC_STATE_ID) @@ -2725,6 +2723,12 @@ impl MutTxId { .expect("st_2pc_state system table must exist in committed state"); let row = ProductValue::from(St2pcStateRow { prepare_id: prepare_id.to_owned(), + coordinator_identity_hex, + reducer_name, + args_bsatn, + caller_identity_hex, + caller_connection_id_hex, + timestamp_micros, }); self.committed_state_write_lock .insert_row_and_consume_offset(ST_2PC_STATE_ID, &schema, &row) @@ -2742,55 +2746,58 @@ impl MutTxId { Ok(()) } - /// Return all `prepare_id`s currently in `st_2pc_state`. - /// Used on recovery to find prepared transactions that need to be resumed. - pub fn scan_st_2pc_state(&self) -> Result> { + /// Return all rows in `st_2pc_state` (prepared but not yet committed/aborted). + /// Used on recovery: each row describes a transaction to resume. + pub fn scan_st_2pc_state(&self) -> Result> { self.iter(ST_2PC_STATE_ID)? - .map(|row| St2pcStateRow::try_from(row).map(|r| r.prepare_id)) + .map(|row| St2pcStateRow::try_from(row)) .collect() } - /// Return the [`TxData`] that would result from committing this transaction, - /// without actually committing it. + /// Insert a row into `st_2pc_coordinator_log` recording that the coordinator has + /// decided COMMIT for `participant_prepare_id` on `participant_identity_hex`. /// - /// The write lock on the committed state remains held and the [`TxState`] is - /// left intact. 
This is used during 2PC PREPARE so that a durability record - /// can be written while still holding the exclusive lock, preventing any - /// interleaved transactions between PREPARE and COMMIT/ABORT. - pub fn peek_tx_data(&self) -> TxData { - let mut tx_data = TxData::default(); - - // Collect inserts: scan tx_state.insert_tables without mutating anything. - for (table_id, tx_table) in &self.tx_state.insert_tables { - let rows: std::sync::Arc<[ProductValue]> = tx_table - .scan_rows(&self.tx_state.blob_store) - .map(|row| row.to_product_value()) - .collect(); - if !rows.is_empty() { - tx_data.set_inserts_for_table(*table_id, &tx_table.get_schema().table_name, rows); - } - } + /// Called inside the coordinator's open `MutTxId` so the entry is committed + /// atomically with the coordinator's own changes. + pub fn insert_st_2pc_coordinator_log( + &mut self, + participant_prepare_id: &str, + participant_identity_hex: &str, + ) -> Result<()> { + let row = &St2pcCoordinatorLogRow { + participant_prepare_id: participant_prepare_id.to_owned(), + participant_identity_hex: participant_identity_hex.to_owned(), + }; + self.insert_via_serialize_bsatn(ST_2PC_COORDINATOR_LOG_ID, row) + .map(|_| ()) + .inspect_err(|e| { + log::error!( + "insert_st_2pc_coordinator_log: failed for prepare_id ({participant_prepare_id}): {e}" + ); + }) + } - // Collect deletes: row pointers live in the committed state; read them - // without deleting. 
- for (table_id, delete_table) in &self.tx_state.delete_tables { - if let Ok((table, blob_store, _)) = self.committed_state_write_lock.get_table_and_blob_store(*table_id) { - let rows: std::sync::Arc<[ProductValue]> = delete_table - .iter() - .map(|row_ptr| { - table - .get_row_ref(blob_store, row_ptr) - .expect("delete_tables references non-existent row in committed state") - .to_product_value() - }) - .collect(); - if !rows.is_empty() { - tx_data.set_deletes_for_table(*table_id, &table.get_schema().table_name, rows); - } - } + /// Delete the coordinator log entry for `participant_prepare_id` once the participant + /// has acknowledged COMMIT. + pub fn delete_st_2pc_coordinator_log(&mut self, participant_prepare_id: &str) -> Result<()> { + if let Err(e) = self.delete_col_eq( + ST_2PC_COORDINATOR_LOG_ID, + St2pcCoordinatorLogFields::ParticipantPrepareId.col_id(), + &AlgebraicValue::String(participant_prepare_id.into()), + ) { + log::error!( + "delete_st_2pc_coordinator_log: no row for prepare_id ({participant_prepare_id}): {e}" + ); } + Ok(()) + } - tx_data + /// Return all entries in the coordinator log (COMMIT decisions not yet acknowledged). + /// Used on coordinator crash-recovery to retransmit COMMIT to participants. + pub fn scan_st_2pc_coordinator_log(&self) -> Result> { + self.iter(ST_2PC_COORDINATOR_LOG_ID)? + .map(|row| St2pcCoordinatorLogRow::try_from(row)) + .collect() } pub fn insert_via_serialize_bsatn<'a, T: Serialize>( diff --git a/crates/datastore/src/system_tables.rs b/crates/datastore/src/system_tables.rs index 171baa5ccc0..db1da0a8adf 100644 --- a/crates/datastore/src/system_tables.rs +++ b/crates/datastore/src/system_tables.rs @@ -91,6 +91,11 @@ pub const ST_COLUMN_ACCESSOR_ID: TableId = TableId(20); /// The static ID of the 2PC participant state table pub const ST_2PC_STATE_ID: TableId = TableId(21); pub(crate) const ST_2PC_STATE_NAME: &str = "st_2pc_state"; +/// The static ID of the 2PC coordinator log table. 
+/// A row is written atomically with the coordinator's commit, before sending COMMIT to participants. +/// Used on coordinator crash-recovery to retransmit COMMIT decisions. +pub const ST_2PC_COORDINATOR_LOG_ID: TableId = TableId(22); +pub(crate) const ST_2PC_COORDINATOR_LOG_NAME: &str = "st_2pc_coordinator_log"; pub(crate) const ST_CONNECTION_CREDENTIALS_NAME: &str = "st_connection_credentials"; pub const ST_TABLE_NAME: &str = "st_table"; @@ -208,7 +213,7 @@ pub enum SystemTable { st_table_accessor, } -pub fn system_tables() -> [TableSchema; 21] { +pub fn system_tables() -> [TableSchema; 22] { [ // The order should match the `id` of the system table, that start with [ST_TABLE_IDX]. st_table_schema(), @@ -232,6 +237,7 @@ pub fn system_tables() -> [TableSchema; 21] { st_index_accessor_schema(), st_column_accessor_schema(), st_2pc_state_schema(), + st_2pc_coordinator_log_schema(), ] } @@ -456,7 +462,19 @@ st_fields_enum!(enum StColumnAccessorFields { // WARNING: For a stable schema, don't change the field names and discriminants. st_fields_enum!(enum St2pcStateFields { - "prepare_id", PrepareId = 0, + "prepare_id", PrepareId = 0, + "coordinator_identity_hex",CoordinatorIdentityHex = 1, + "reducer_name", ReducerName = 2, + "args_bsatn", ArgsBsatn = 3, + "caller_identity_hex", CallerIdentityHex = 4, + "caller_connection_id_hex", CallerConnectionIdHex = 5, + "timestamp_micros", TimestampMicros = 6, +}); + +// WARNING: For a stable schema, don't change the field names and discriminants. +st_fields_enum!(enum St2pcCoordinatorLogFields { + "participant_prepare_id", ParticipantPrepareId = 0, + "participant_identity_hex", ParticipantIdentityHex = 1, }); /// Helper method to check that a system table has the correct fields. 
@@ -685,6 +703,17 @@ fn system_module_def() -> ModuleDef { .with_index_no_accessor_name(btree(St2pcStateFields::PrepareId)) .with_access(v9::TableAccess::Private); + let st_2pc_coordinator_log_type = builder.add_type::(); + builder + .build_table( + ST_2PC_COORDINATOR_LOG_NAME, + *st_2pc_coordinator_log_type.as_ref().expect("should be ref"), + ) + .with_type(TableType::System) + .with_unique_constraint(St2pcCoordinatorLogFields::ParticipantPrepareId) + .with_index_no_accessor_name(btree(St2pcCoordinatorLogFields::ParticipantPrepareId)) + .with_access(v9::TableAccess::Private); + let result = builder .finish() .try_into() @@ -711,6 +740,7 @@ fn system_module_def() -> ModuleDef { validate_system_table::(&result, ST_INDEX_ACCESSOR_NAME); validate_system_table::(&result, ST_COLUMN_ACCESSOR_NAME); validate_system_table::(&result, ST_2PC_STATE_NAME); + validate_system_table::(&result, ST_2PC_COORDINATOR_LOG_NAME); result } @@ -759,6 +789,8 @@ lazy_static::lazy_static! { m.insert("st_index_accessor_accessor_name_key", ConstraintId(23)); m.insert("st_column_accessor_table_name_col_name_key", ConstraintId(24)); m.insert("st_column_accessor_table_name_accessor_name_key", ConstraintId(25)); + m.insert("st_2pc_state_prepare_id_key", ConstraintId(26)); + m.insert("st_2pc_coordinator_log_participant_prepare_id_key", ConstraintId(27)); m }; } @@ -797,6 +829,8 @@ lazy_static::lazy_static! 
{ m.insert("st_index_accessor_accessor_name_idx_btree", IndexId(27)); m.insert("st_column_accessor_table_name_col_name_idx_btree", IndexId(28)); m.insert("st_column_accessor_table_name_accessor_name_idx_btree", IndexId(29)); + m.insert("st_2pc_state_prepare_id_idx_btree", IndexId(30)); + m.insert("st_2pc_coordinator_log_participant_prepare_id_idx_btree", IndexId(31)); m }; } @@ -914,6 +948,10 @@ fn st_2pc_state_schema() -> TableSchema { st_schema(ST_2PC_STATE_NAME, ST_2PC_STATE_ID) } +fn st_2pc_coordinator_log_schema() -> TableSchema { + st_schema(ST_2PC_COORDINATOR_LOG_NAME, ST_2PC_COORDINATOR_LOG_ID) +} + fn st_connection_credential_schema() -> TableSchema { st_schema(ST_CONNECTION_CREDENTIALS_NAME, ST_CONNECTION_CREDENTIALS_ID) } @@ -991,6 +1029,7 @@ pub(crate) fn system_table_schema(table_id: TableId) -> Option { ST_INDEX_ACCESSOR_ID => Some(st_index_accessor_schema()), ST_COLUMN_ACCESSOR_ID => Some(st_column_accessor_schema()), ST_2PC_STATE_ID => Some(st_2pc_state_schema()), + ST_2PC_COORDINATOR_LOG_ID => Some(st_2pc_coordinator_log_schema()), _ => None, } } @@ -1887,15 +1926,26 @@ impl From for ProductValue { /// Tracks in-flight 2PC participant transactions. /// A row is inserted when B enters PREPARE state and deleted on COMMIT or ABORT. /// On recovery, any row here indicates a pending prepared transaction that must -/// be resumed (retransmit PREPARED to the coordinator and await the decision). -/// -/// | prepare_id | -/// |-------------| -/// | "prepare-1" | +/// be resumed: B re-runs the reducer with the stored args, then polls the coordinator +/// for a COMMIT or ABORT decision (B never aborts on its own). #[derive(Clone, Debug, Eq, PartialEq, SpacetimeType)] #[sats(crate = spacetimedb_lib)] pub struct St2pcStateRow { + /// The unique prepare ID for this transaction, generated by this participant. pub prepare_id: String, + /// Hex-encoded identity of the coordinator database (A). 
Used on recovery to query + /// `GET /v1/database/{coordinator}/2pc/status/{prepare_id}` for the decision. + pub coordinator_identity_hex: String, + /// Name of the reducer that was prepared. + pub reducer_name: String, + /// BSATN-encoded reducer arguments. Re-used when replaying the reducer on recovery. + pub args_bsatn: Vec, + /// Hex-encoded identity of the original caller. + pub caller_identity_hex: String, + /// Hex-encoded connection ID of the original caller ("0" if none). + pub caller_connection_id_hex: String, + /// Timestamp of the original call (microseconds since Unix epoch). + pub timestamp_micros: i64, } impl TryFrom> for St2pcStateRow { @@ -1911,6 +1961,37 @@ impl From for ProductValue { } } +/// System table [ST_2PC_COORDINATOR_LOG_NAME] +/// +/// Written atomically with the coordinator's COMMIT transaction (one row per participant). +/// Used on coordinator crash-recovery to retransmit COMMIT decisions to participants. +/// Also serves as the authoritative answer for the status endpoint: +/// - present → COMMIT (coordinator decided COMMIT for this participant prepare_id) +/// - absent → ABORT (coordinator never committed, or already cleaned up) +/// +/// A row is deleted after the participant acknowledges the COMMIT. +#[derive(Clone, Debug, Eq, PartialEq, SpacetimeType)] +#[sats(crate = spacetimedb_lib)] +pub struct St2pcCoordinatorLogRow { + /// The participant's prepare_id (B's prepare_id). + pub participant_prepare_id: String, + /// Hex-encoded identity of the participant database (B). + pub participant_identity_hex: String, +} + +impl TryFrom> for St2pcCoordinatorLogRow { + type Error = DatastoreError; + fn try_from(row: RowRef<'_>) -> Result { + read_via_bsatn(row) + } +} + +impl From for ProductValue { + fn from(x: St2pcCoordinatorLogRow) -> Self { + to_product_value(&x) + } +} + thread_local! 
{ static READ_BUF: RefCell> = const { RefCell::new(Vec::new()) }; } From b1477a6a88b8f2b5982a4e7ff9dbf312e7ef929f Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 23:32:24 +0530 Subject: [PATCH 13/22] ack --- crates/client-api/src/routes/database.rs | 30 ++++++++++++- crates/core/src/host/module_host.rs | 57 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index 7f4dd08e378..c2adf71e111 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -374,6 +374,30 @@ pub async fn status_2pc( Ok((StatusCode::OK, decision)) } +/// 2PC commit-ack endpoint. +/// +/// Called by participant B after it commits via the status-poll recovery path, +/// so that the coordinator can delete its `st_2pc_coordinator_log` entry. +/// +/// `POST /v1/database/:name_or_identity/2pc/ack-commit/:prepare_id` +pub async fn ack_commit_2pc( + State(worker_ctx): State, + Extension(_auth): Extension, + Path(TwoPcParams { + name_or_identity, + prepare_id, + }): Path, +) -> axum::response::Result { + let (module, _database) = find_module_and_database(&worker_ctx, name_or_identity).await?; + + module.ack_2pc_coordinator_commit(&prepare_id).map_err(|e| { + log::error!("2PC ack-commit failed: {e}"); + (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response() + })?; + + Ok(StatusCode::OK) +} + fn reducer_outcome_response( module: &ModuleHost, owner_identity: &Identity, @@ -1388,6 +1412,8 @@ pub struct DatabaseRoutes { pub abort_2pc_post: MethodRouter, /// GET: /database/:name_or_identity/2pc/status/:prepare_id pub status_2pc_get: MethodRouter, + /// POST: /database/:name_or_identity/2pc/ack-commit/:prepare_id + pub ack_commit_2pc_post: MethodRouter, } impl Default for DatabaseRoutes @@ -1417,6 +1443,7 @@ where commit_2pc_post: post(commit_2pc::), abort_2pc_post: post(abort_2pc::), status_2pc_get: 
get(status_2pc::), + ack_commit_2pc_post: post(ack_commit_2pc::), } } } @@ -1445,7 +1472,8 @@ where .route("/prepare/:reducer", self.prepare_post) .route("/2pc/commit/:prepare_id", self.commit_2pc_post) .route("/2pc/abort/:prepare_id", self.abort_2pc_post) - .route("/2pc/status/:prepare_id", self.status_2pc_get); + .route("/2pc/status/:prepare_id", self.status_2pc_get) + .route("/2pc/ack-commit/:prepare_id", self.ack_commit_2pc_post); axum::Router::new() .route("/", self.root_post) diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 2816a856999..e78614d18e1 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1875,6 +1875,16 @@ impl ModuleHost { Ok(()) } + /// Delete a coordinator log entry for `prepare_id`. + /// Called when B has confirmed it committed, so A can stop retransmitting. + pub fn ack_2pc_coordinator_commit(&self, prepare_id: &str) -> Result<(), anyhow::Error> { + let db = self.relational_db().clone(); + db.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |tx| { + tx.delete_st_2pc_coordinator_log(prepare_id) + .map_err(anyhow::Error::from) + }) + } + /// Check whether `prepare_id` is present in the coordinator log of this database. /// Used by participant B to ask coordinator A: "did you commit?" pub fn has_2pc_coordinator_commit(&self, prepare_id: &str) -> bool { @@ -2039,6 +2049,14 @@ impl ModuleHost { Some(commit) => { if commit { let _ = this2.commit_prepared(&new_prepare_id); + // Tell A we committed so it can delete its coordinator log entry. + Self::send_ack_commit_to_coordinator( + &client, + &router, + auth_token.clone(), + coordinator_identity, + &original_prepare_id, + ).await; } else { let _ = this2.abort_prepared(&new_prepare_id); } @@ -2099,6 +2117,45 @@ impl ModuleHost { } } + /// POST `POST /v1/database/{coordinator}/2pc/ack-commit/{prepare_id}` to tell A that + /// B has committed, so A can delete its coordinator log entry. 
+ async fn send_ack_commit_to_coordinator( + client: &reqwest::Client, + router: &std::sync::Arc, + auth_token: Option, + coordinator_identity: Identity, + prepare_id: &str, + ) { + let base_url = match router.resolve_base_url(coordinator_identity).await { + Ok(url) => url, + Err(e) => { + log::warn!("2PC ack-commit: cannot resolve coordinator URL: {e}"); + return; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/ack-commit/{}", + base_url, + coordinator_identity.to_hex(), + prepare_id, + ); + let mut req = client.post(&url); + if let Some(token) = &auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC ack-commit: notified coordinator for {prepare_id}"); + } + Ok(resp) => { + log::warn!("2PC ack-commit: coordinator returned {} for {prepare_id}", resp.status()); + } + Err(e) => { + log::warn!("2PC ack-commit: transport error for {prepare_id}: {e}"); + } + } + } + pub async fn call_view_add_single_subscription( &self, sender: Arc, From 91a2a7ec638b8b4a24ba18c1588cc0dcb2eb137b Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Sun, 29 Mar 2026 23:47:04 +0530 Subject: [PATCH 14/22] persistence --- crates/core/src/host/module_host.rs | 50 +--------- .../src/host/wasm_common/module_host_actor.rs | 92 ++++++++++++++++++- 2 files changed, 90 insertions(+), 52 deletions(-) diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index e78614d18e1..97e9135718f 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -2049,14 +2049,9 @@ impl ModuleHost { Some(commit) => { if commit { let _ = this2.commit_prepared(&new_prepare_id); - // Tell A we committed so it can delete its coordinator log entry. 
- Self::send_ack_commit_to_coordinator( - &client, - &router, - auth_token.clone(), - coordinator_identity, - &original_prepare_id, - ).await; + // The actor thread (call_reducer_prepare_and_hold) will wait + // for B's commit to be durable and then send the ack-commit + // to the coordinator. Nothing to do here. } else { let _ = this2.abort_prepared(&new_prepare_id); } @@ -2117,45 +2112,6 @@ impl ModuleHost { } } - /// POST `POST /v1/database/{coordinator}/2pc/ack-commit/{prepare_id}` to tell A that - /// B has committed, so A can delete its coordinator log entry. - async fn send_ack_commit_to_coordinator( - client: &reqwest::Client, - router: &std::sync::Arc, - auth_token: Option, - coordinator_identity: Identity, - prepare_id: &str, - ) { - let base_url = match router.resolve_base_url(coordinator_identity).await { - Ok(url) => url, - Err(e) => { - log::warn!("2PC ack-commit: cannot resolve coordinator URL: {e}"); - return; - } - }; - let url = format!( - "{}/v1/database/{}/2pc/ack-commit/{}", - base_url, - coordinator_identity.to_hex(), - prepare_id, - ); - let mut req = client.post(&url); - if let Some(token) = &auth_token { - req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); - } - match req.send().await { - Ok(resp) if resp.status().is_success() => { - log::info!("2PC ack-commit: notified coordinator for {prepare_id}"); - } - Ok(resp) => { - log::warn!("2PC ack-commit: coordinator returned {} for {prepare_id}", resp.status()); - } - Err(e) => { - log::warn!("2PC ack-commit: transport error for {prepare_id}: {e}"); - } - } - } - pub async fn call_view_add_single_subscription( &self, sender: Arc, diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index f311f9be580..b8d4e66471e 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -399,6 +399,47 @@ impl WasmModuleHostActor { } } +/// Notify 
coordinator A that B has committed, so A can delete its coordinator log entry. +/// +/// Called AFTER B's commit is durable. Fire-and-forget: failure is tolerated because +/// `recover_2pc_coordinator` on A will retransmit COMMIT on restart. +async fn send_ack_commit_to_coordinator( + client: reqwest::Client, + router: std::sync::Arc, + auth_token: Option, + coordinator_identity: crate::identity::Identity, + prepare_id: String, +) { + let base_url = match router.resolve_base_url(coordinator_identity).await { + Ok(url) => url, + Err(e) => { + log::warn!("2PC ack-commit: cannot resolve coordinator URL: {e}"); + return; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/ack-commit/{}", + base_url, + coordinator_identity.to_hex(), + prepare_id, + ); + let mut req = client.post(&url); + if let Some(token) = &auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC ack-commit: notified coordinator for {prepare_id}"); + } + Ok(resp) => { + log::warn!("2PC ack-commit: coordinator returned {} for {prepare_id}", resp.status()); + } + Err(e) => { + log::warn!("2PC ack-commit: transport error for {prepare_id}: {e}"); + } + } +} + impl WasmModuleHostActor { fn make_from_instance(&self, mut instance: T::Instance) -> WasmModuleInstance { let common = InstanceCommon::new(&self.common); @@ -648,7 +689,17 @@ impl WasmModuleInstance { }; stdb.request_durability_for_tx_data(None, &marker_tx_data); - // Step 3: signal PREPARED. + // Step 3: wait for the PREPARE marker to be durable before signalling PREPARED. + // B must not claim PREPARED until the marker is on disk — if B crashes after + // claiming PREPARED but before the marker is durable, recovery has nothing to recover. 
+ if let Some(prepare_offset) = marker_tx_data.tx_offset() { + if let Some(mut durable) = stdb.durable_tx_offset() { + let handle = tokio::runtime::Handle::current(); + let _ = handle.block_on(durable.wait_for(prepare_offset)); + } + } + + // Step 4: signal PREPARED. let res = ReducerCallResult { outcome: ReducerOutcome::from(&event.status), energy_used: energy_quanta_used, @@ -666,7 +717,32 @@ impl WasmModuleInstance { if let Err(e) = tx.delete_st_2pc_state(&prepare_id) { log::error!("call_reducer_prepare_and_hold: failed to delete st_2pc_state for {prepare_id}: {e}"); } - commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); + let commit_result = commit_and_broadcast_event(&self.common.info.subscriptions, client, event, tx); + + // Wait for B's COMMIT to be durable before acking to coordinator. + // Without this, A could delete its coordinator log entry while B's commit + // is still in-memory — a B crash at that point would leave the tx uncommitted + // with no way to recover (A has already forgotten it committed). + let handle = tokio::runtime::Handle::current(); + if let Some(mut durable) = stdb.durable_tx_offset() { + if let Ok(offset) = handle.block_on(commit_result.tx_offset) { + let _ = handle.block_on(durable.wait_for(offset)); + } + } + + // Notify coordinator that B has committed so it can delete its coordinator log entry. + // Fire-and-forget: if this fails, coordinator's recover_2pc_coordinator will retry on + // restart, and B's commit_prepared will then return a harmless "not found" error. + let router = replica_ctx.call_reducer_router.clone(); + let client_http = replica_ctx.call_reducer_client.clone(); + let auth_token = replica_ctx.call_reducer_auth_token.clone(); + handle.spawn(send_ack_commit_to_coordinator( + client_http, + router, + auth_token, + coordinator_identity, + prepare_id, + )); } else { // ABORT: roll back reducer changes; clean up the already-committed marker. 
let _ = stdb.rollback_mut_tx(tx); @@ -1068,7 +1144,9 @@ impl InstanceCommon { } } - let event = commit_and_broadcast_event(&self.info.subscriptions, client, event, tx).event; + let commit_result = commit_and_broadcast_event(&self.info.subscriptions, client, event, tx); + let commit_tx_offset = commit_result.tx_offset; + let event = commit_result.event; // 2PC post-commit coordination: send COMMIT or ABORT to each participant. if !prepared_participants.is_empty() { @@ -1080,9 +1158,13 @@ impl InstanceCommon { std::thread::scope(|s| { s.spawn(|| { handle.block_on(async { + // Wait for A's coordinator log (committed atomically with the tx) to be + // durable before sending COMMIT to B. This guarantees that if A crashes + // after sending COMMIT, recovery can retransmit from the durable log. if committed && let Some(mut durable_offset) = stdb.durable_tx_offset() { - let current: u64 = durable_offset.last_seen().unwrap_or(0); - let _ = durable_offset.wait_for(current + 1).await; + if let Ok(offset) = commit_tx_offset.await { + let _ = durable_offset.wait_for(offset).await; + } } let client = replica_ctx.call_reducer_client.clone(); From e56e1fc084f6b24c8b6111bbd0bd96d3b651f2f1 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 00:03:39 +0530 Subject: [PATCH 15/22] unique prepare --- crates/core/src/host/module_host.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 97e9135718f..93d8804e70c 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1788,7 +1788,14 @@ impl ModuleHost { args, }; - let prepare_id = format!("prepare-{}", PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed)); + // Include the coordinator identity so prepare_ids from different coordinators + // cannot collide on the participant's st_2pc_state table. 
+ let coordinator_hex = caller_identity.to_hex(); + let prepare_id = format!( + "prepare-{}-{}", + &coordinator_hex.to_string()[..16], + PREPARE_COUNTER.fetch_add(1, Ordering::Relaxed), + ); // Channel for signalling PREPARED result back to this task. let (prepared_tx, prepared_rx) = tokio::sync::oneshot::channel::<(ReducerCallResult, Option)>(); From 8fb3e920a31b5a67081a714385e93dff56f900eb Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 00:30:56 +0530 Subject: [PATCH 16/22] smoketest --- .../tests/smoketests/cross_db_2pc_recovery.rs | 445 ++++++++++++++++++ crates/smoketests/tests/smoketests/mod.rs | 1 + 2 files changed, 446 insertions(+) create mode 100644 crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs new file mode 100644 index 00000000000..e7955ed28b6 --- /dev/null +++ b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs @@ -0,0 +1,445 @@ +use spacetimedb_guard::ensure_binaries_built; +use spacetimedb_smoketests::{require_local_server, Smoketest}; +use std::time::Duration; + +/// Module code used for all recovery tests. +/// +/// Extends the basic banking module with: +/// - `debit_slow`: same as `debit` but spins for ~2-3s first, giving the test +/// a reliable window in which to crash the server mid-2PC. +/// - `balance`: convenience reducer that returns alice's balance in the logs +/// so tests can detect completion by polling server logs. 
+const MODULE_CODE: &str = r#" +use spacetimedb::{log, ReducerContext, Table, Identity}; + +#[spacetimedb::table(accessor = ledger, public)] +pub struct Ledger { + #[primary_key] + account: String, + balance: i64, +} + +#[spacetimedb::reducer(init)] +pub fn init(ctx: &ReducerContext) { + ctx.db.ledger().insert(Ledger { account: "alice".to_string(), balance: 100 }); +} + +#[spacetimedb::reducer] +pub fn debit(ctx: &ReducerContext, account: String, amount: i64) { + let row = ctx.db.ledger().account().find(&account) + .unwrap_or_else(|| panic!("account '{}' not found", account)); + let new_balance = row.balance - amount; + if new_balance < 0 { + panic!("insufficient funds: account '{}' has {} but tried to debit {}", account, row.balance, amount); + } + ctx.db.ledger().account().update(Ledger { account, balance: new_balance }); +} + +/// Same as `debit` but wastes ~2-3 seconds of CPU first. +/// This creates a reliable timing window for crash recovery tests: +/// the server can be killed while this reducer is executing or just after. +#[spacetimedb::reducer] +pub fn debit_slow(ctx: &ReducerContext, account: String, amount: i64) { + // Busy-wait loop. ~100M multiply-add iterations ≈ 2-3s in WASM. + // Using the timestamp as the seed prevents the loop from being + // eliminated by the WASM optimizer. 
+ let mut x: u64 = ctx.timestamp.to_micros_since_unix_epoch() as u64; + for i in 0u64..100_000_000 { + x = x.wrapping_mul(6364136223846793005u64).wrapping_add(i | 1); + } + if x == 0 { panic!("impossible: loop result was zero"); } + debit(ctx, account, amount); +} + +#[spacetimedb::reducer] +pub fn credit(ctx: &ReducerContext, account: String, amount: i64) { + match ctx.db.ledger().account().find(&account) { + Some(row) => { + ctx.db.ledger().account().update(Ledger { account, balance: row.balance + amount }); + } + None => { + ctx.db.ledger().insert(Ledger { account, balance: amount }); + } + } +} + +#[spacetimedb::reducer] +pub fn transfer_funds(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { + credit(ctx, to_account.clone(), amount); + let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); + let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit", &args) { + Ok(()) => log::info!("transfer_funds: remote debit succeeded"), + Err(e) => panic!("remote debit failed: {e}"), + } +} + +/// Same as transfer_funds but calls debit_slow on the remote side. +#[spacetimedb::reducer] +pub fn transfer_funds_slow(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { + credit(ctx, to_account.clone(), amount); + let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); + let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit_slow", &args) { + Ok(()) => log::info!("transfer_funds_slow: remote debit_slow succeeded"), + Err(e) => panic!("remote debit_slow failed: {e}"), + } +} +"#; + +/// Spawn a background thread that fires `transfer_funds_slow` and ignores the result. 
+/// +/// This is used to start a long-running 2PC in the background so the main thread +/// can crash the server mid-flight. The call is expected to fail with a +/// connection error when the server is restarted. +fn spawn_transfer_funds_slow( + server_url: String, + config_path: std::path::PathBuf, + db_a_identity: String, + db_b_identity: String, + amount: i64, +) -> std::thread::JoinHandle<()> { + std::thread::spawn(move || { + let cli = ensure_binaries_built(); + let _ = std::process::Command::new(&cli) + .arg("--config-path") + .arg(&config_path) + .args([ + "call", + "--server", + &server_url, + "--", + &db_a_identity, + "transfer_funds_slow", + &db_b_identity, + "alice", + "alice", + &amount.to_string(), + ]) + .output(); + }) +} + +/// Query alice's balance on a specific database (by identity string). +fn alice_balance(test: &Smoketest, db_identity: &str) -> i64 { + let out = test + .spacetime(&[ + "sql", + "--server", + &test.server_url, + db_identity, + "SELECT balance FROM ledger WHERE account = 'alice'", + ]) + .unwrap_or_else(|e| panic!("sql query failed for {db_identity}: {e}")); + // Output looks like: " balance \n--------\n 100\n" + out.lines() + .filter_map(|l| l.trim().parse::().ok()) + .next() + .unwrap_or_else(|| panic!("could not parse balance from: {out}")) +} + +/// Set up two databases (A = coordinator, B = participant) on the same server +/// and return (db_a_identity, db_b_identity). `test.database_identity` points to A. 
+fn setup_two_banks(test: &mut Smoketest, pid: u32, suffix: &str) -> (String, String) { + let db_b_name = format!("2pc-rec-b-{pid}-{suffix}"); + let db_a_name = format!("2pc-rec-a-{pid}-{suffix}"); + + test.publish_module_named(&db_b_name, false) + .expect("failed to publish bank B"); + let db_b_identity = test.database_identity.clone().expect("bank B identity"); + + test.publish_module_named(&db_a_name, false) + .expect("failed to publish bank A"); + let db_a_identity = test.database_identity.clone().expect("bank A identity"); + + (db_a_identity, db_b_identity) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 1: committed data survives a full server restart. +// +// Rationale: verifies that every "persist" step in the 2PC protocol actually +// writes to durable storage. If any durability wait were missing, one side +// would lose its data on restart. +// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_committed_data_survives_restart() { + require_local_server!(); + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "dur"); + + // Successful 2PC: transfer 50 from B's alice to A's alice. + test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "50"]) + .expect("transfer_funds failed"); + + // Verify pre-restart state. + assert_eq!(alice_balance(&test, &db_a_identity), 150, "A should have 150 before restart"); + assert_eq!(alice_balance(&test, &db_b_identity), 50, "B should have 50 before restart"); + + // Restart the server — exercises recovery path even though there's nothing to recover. + test.restart_server(); + + // After restart, data must still be present and correct. 
+ assert_eq!( + alice_balance(&test, &db_a_identity), + 150, + "A's committed data should survive restart" + ); + assert_eq!( + alice_balance(&test, &db_b_identity), + 50, + "B's committed data should survive restart" + ); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 2: aborted 2PC rollback also survives a restart. +// +// Rationale: rollback (B's st_2pc_state deletion + reducer rollback) must also +// be durable. After restart, neither side should show the transfer. +// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_aborted_state_survives_restart() { + require_local_server!(); + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "abort-dur"); + + // Try to transfer 200 — B only has 100, so the remote debit panics → abort. + let _ = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); + + assert_eq!(alice_balance(&test, &db_a_identity), 100, "A should still be 100 after abort"); + assert_eq!(alice_balance(&test, &db_b_identity), 100, "B should still be 100 after abort"); + + test.restart_server(); + + assert_eq!( + alice_balance(&test, &db_a_identity), + 100, + "A's aborted rollback should survive restart" + ); + assert_eq!( + alice_balance(&test, &db_b_identity), + 100, + "B's aborted rollback should survive restart" + ); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 3: status endpoint returns "abort" for an unknown prepare_id. +// +// Rationale: tests that GET /v1/database/{db}/2pc/status/{id} is correctly wired +// and returns the right default when no coordinator log entry exists. 
+// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_status_endpoint_unknown_returns_abort() { + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, _db_b_identity) = setup_two_banks(&mut test, pid, "status"); + + let resp = test + .api_call( + "GET", + &format!("/v1/database/{db_a_identity}/2pc/status/nonexistent-prepare-id"), + ) + .expect("api_call failed"); + + assert_eq!(resp.status_code, 200, "status endpoint should return 200"); + let body_text = resp.text().expect("response body is not UTF-8"); + assert_eq!( + body_text.trim(), + "abort", + "unknown prepare_id should return 'abort', got: {:?}", + body_text + ); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 4: 2PC atomicity is maintained when the server crashes mid-flight. +// +// Strategy: `transfer_funds_slow` calls `debit_slow` on B, which burns ~2-3s +// of CPU. We crash the server after 1s (when the 2PC is definitely in flight) +// and verify that both databases are in a CONSISTENT state after restart: +// either both committed (alice_a=150, alice_b=50) or both rolled back +// (alice_a=100, alice_b=100). +// +// Note: we intentionally do NOT assert which outcome occurred, because that +// depends on whether the crash hit before or after A wrote its coordinator log. +// What we assert is that the two sides agree — this is the 2PC guarantee. +// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_atomicity_under_crash() { + require_local_server!(); + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "crash"); + + // Kick off the slow transfer in a background thread. 
It will block + // for ~2-3s inside debit_slow on B before completing. + let _call_thread = spawn_transfer_funds_slow( + test.server_url.clone(), + test.config_path.clone(), + db_a_identity.clone(), + db_b_identity.clone(), + 50, + ); + + // Give the 2PC time to get mid-flight (after B starts its slow reducer + // but before it finishes), then crash the server. + std::thread::sleep(Duration::from_millis(1000)); + test.restart_server(); + + // After restart, give recovery time to settle: coordinator recovery + // retransmits COMMIT if needed, participant recovery polls if needed. + std::thread::sleep(Duration::from_secs(5)); + + let bal_a = alice_balance(&test, &db_a_identity); + let bal_b = alice_balance(&test, &db_b_identity); + + // The 2PC guarantee: both sides must agree. + let both_committed = bal_a == 150 && bal_b == 50; + let both_rolled_back = bal_a == 100 && bal_b == 100; + assert!( + both_committed || both_rolled_back, + "2PC atomicity violated after crash: A={bal_a}, B={bal_b}. \ + Expected either (150, 50) or (100, 100)." + ); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 5: coordinator recovery — A crashes after writing its coordinator log, +// before B commits. +// +// Strategy: same crash-mid-flight approach, but we poll A's balance to detect +// the moment A has committed (alice_a=150), then immediately crash. At that +// point A's coordinator log is on disk, B has sent PREPARED, but B may not yet +// have received COMMIT. Recovery should bring B to the committed state. +// +// This test is inherently timing-sensitive (same-process loopback is fast). +// It uses `debit_slow` to widen the window: after A commits (detectable via +// alice_a=150), B is still inside `debit_slow` and has not yet received COMMIT. 
+// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_coordinator_recovery() { + require_local_server!(); + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "coord-rec"); + + let _call_thread = spawn_transfer_funds_slow( + test.server_url.clone(), + test.config_path.clone(), + db_a_identity.clone(), + db_b_identity.clone(), + 50, + ); + + // Poll A's alice balance until it reaches 150 — that means A has committed + // its tx (including the coordinator log entry) and B has sent PREPARED. + // At this point B is still inside debit_slow, so COMMIT hasn't reached B yet. + let deadline = std::time::Instant::now() + Duration::from_secs(30); + loop { + std::thread::sleep(Duration::from_millis(100)); + if alice_balance(&test, &db_a_identity) == 150 { + break; + } + if std::time::Instant::now() > deadline { + panic!("timed out waiting for A to commit"); + } + } + + // Crash immediately: A has coordinator log, B has st_2pc_state, B hasn't committed. + test.restart_server(); + + // Allow recovery to complete: A's recover_2pc_coordinator retransmits COMMIT to B. + std::thread::sleep(Duration::from_secs(5)); + + let bal_a = alice_balance(&test, &db_a_identity); + let bal_b = alice_balance(&test, &db_b_identity); + + assert_eq!( + bal_a, 150, + "A should have committed (alice_a=150) before crash" + ); + assert_eq!( + bal_b, 50, + "B should have committed via coordinator recovery (alice_b=50), got {bal_b}" + ); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Test 6: participant recovery — B crashes after writing st_2pc_state (PREPARE +// durable) but before receiving COMMIT. +// +// Strategy: since A and B are on the same server, we cannot crash B without +// also crashing A. 
So we crash the server right after the PREPARE is durable +// on B (detectable: B's st_2pc_state is non-empty) and before A commits. +// On restart: +// - B finds st_2pc_state → re-runs reducer → polls A's status endpoint +// - A has no coordinator log (A hadn't committed) → status = "abort" +// - B aborts → both sides return to 100 +// +// A fully committed scenario (B polls and gets "commit") is exercised by +// test_2pc_coordinator_recovery which covers the symmetric window. +// ───────────────────────────────────────────────────────────────────────────── +#[test] +fn test_2pc_participant_recovery_polls_and_aborts() { + require_local_server!(); + let pid = std::process::id(); + let mut test = Smoketest::builder() + .module_code(MODULE_CODE) + .autopublish(false) + .build(); + + let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "part-rec"); + + let _call_thread = spawn_transfer_funds_slow( + test.server_url.clone(), + test.config_path.clone(), + db_a_identity.clone(), + db_b_identity.clone(), + 50, + ); + + // Crash early: after ~500ms, B's slow reducer should be mid-execution. + // A has not yet received PREPARED, so A has no coordinator log. + // B's st_2pc_state may or may not be written yet (it's written after the + // reducer finishes). Either way, the final state must be consistent. + std::thread::sleep(Duration::from_millis(500)); + test.restart_server(); + + // Wait for participant recovery to settle. B polls A's status endpoint + // every 5s; allow up to 15s for it to act. 
+ std::thread::sleep(Duration::from_secs(15)); + + let bal_a = alice_balance(&test, &db_a_identity); + let bal_b = alice_balance(&test, &db_b_identity); + + let both_committed = bal_a == 150 && bal_b == 50; + let both_rolled_back = bal_a == 100 && bal_b == 100; + assert!( + both_committed || both_rolled_back, + "Inconsistent state after participant recovery: A={bal_a}, B={bal_b}" + ); +} diff --git a/crates/smoketests/tests/smoketests/mod.rs b/crates/smoketests/tests/smoketests/mod.rs index 52cf11c6107..f6acc606dd2 100644 --- a/crates/smoketests/tests/smoketests/mod.rs +++ b/crates/smoketests/tests/smoketests/mod.rs @@ -10,6 +10,7 @@ mod confirmed_reads; mod connect_disconnect_from_cli; mod create_project; mod cross_db_2pc; +mod cross_db_2pc_recovery; mod cross_db_reducer; mod csharp_module; mod default_module_clippy; From be4a56a23366ae08d8abe16abc0a56cfcc9de2f9 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 02:14:44 +0530 Subject: [PATCH 17/22] dead code --- crates/core/src/host/instance_env.rs | 88 ------------------- crates/core/src/host/module_host.rs | 21 +++-- .../src/host/wasm_common/module_host_actor.rs | 16 ++-- .../src/locking_tx_datastore/mut_tx.rs | 17 ++-- .../tests/smoketests/cross_db_2pc_recovery.rs | 59 ++++++------- 5 files changed, 59 insertions(+), 142 deletions(-) diff --git a/crates/core/src/host/instance_env.rs b/crates/core/src/host/instance_env.rs index eb5c9e19919..a211b5603fc 100644 --- a/crates/core/src/host/instance_env.rs +++ b/crates/core/src/host/instance_env.rs @@ -1125,94 +1125,6 @@ impl InstanceEnv { result } } - - /// Commit all prepared participants (called after coordinator's reducer succeeds). 
- pub fn commit_all_prepared(&mut self) -> impl Future + use<> { - let participants = mem::take(&mut self.prepared_participants); - let client = self.replica_ctx.call_reducer_client.clone(); - let router = self.replica_ctx.call_reducer_router.clone(); - let auth_token = self.replica_ctx.call_reducer_auth_token.clone(); - - async move { - for (db_identity, prepare_id) in participants { - let base_url = match router.resolve_base_url(db_identity).await { - Ok(url) => url, - Err(e) => { - log::error!("2PC commit: failed to resolve base URL for {db_identity}: {e}"); - continue; - } - }; - let url = format!( - "{}/v1/database/{}/2pc/commit/{}", - base_url, - db_identity.to_hex(), - prepare_id, - ); - let mut req = client.post(&url); - if let Some(ref token) = auth_token { - req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); - } - match req.send().await { - Ok(resp) if resp.status().is_success() => { - log::info!("2PC commit: committed {prepare_id} on {db_identity}"); - } - Ok(resp) => { - log::error!( - "2PC commit: failed for {prepare_id} on {db_identity}: status {}", - resp.status() - ); - } - Err(e) => { - log::error!("2PC commit: transport error for {prepare_id} on {db_identity}: {e}"); - } - } - } - } - } - - /// Abort all prepared participants (called when coordinator's reducer fails). 
- pub fn abort_all_prepared(&mut self) -> impl Future + use<> { - let participants = mem::take(&mut self.prepared_participants); - let client = self.replica_ctx.call_reducer_client.clone(); - let router = self.replica_ctx.call_reducer_router.clone(); - let auth_token = self.replica_ctx.call_reducer_auth_token.clone(); - - async move { - for (db_identity, prepare_id) in participants { - let base_url = match router.resolve_base_url(db_identity).await { - Ok(url) => url, - Err(e) => { - log::error!("2PC abort: failed to resolve base URL for {db_identity}: {e}"); - continue; - } - }; - let url = format!( - "{}/v1/database/{}/2pc/abort/{}", - base_url, - db_identity.to_hex(), - prepare_id, - ); - let mut req = client.post(&url); - if let Some(ref token) = auth_token { - req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); - } - match req.send().await { - Ok(resp) if resp.status().is_success() => { - log::info!("2PC abort: aborted {prepare_id} on {db_identity}"); - } - Ok(resp) => { - log::error!( - "2PC abort: failed for {prepare_id} on {db_identity}: status {}", - resp.status() - ); - } - Err(e) => { - log::error!("2PC abort: transport error for {prepare_id} on {db_identity}: {e}"); - } - } - } - } - } } /// Default timeout for HTTP requests performed by [`InstanceEnv::http_request`]. 
diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 93d8804e70c..ab0115dc986 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1928,7 +1928,10 @@ impl ModuleHost { let participant_identity = match Identity::from_hex(&row.participant_identity_hex) { Ok(id) => id, Err(e) => { - log::error!("recover_2pc_coordinator: invalid participant identity hex {}: {e}", row.participant_identity_hex); + log::error!( + "recover_2pc_coordinator: invalid participant identity hex {}: {e}", + row.participant_identity_hex + ); continue; } }; @@ -1959,7 +1962,10 @@ impl ModuleHost { } } Ok(resp) => { - log::warn!("recover_2pc_coordinator: commit for {prepare_id} returned {}", resp.status()); + log::warn!( + "recover_2pc_coordinator: commit for {prepare_id} returned {}", + resp.status() + ); } Err(e) => { log::warn!("recover_2pc_coordinator: transport error for {prepare_id}: {e}"); @@ -1994,14 +2000,18 @@ impl ModuleHost { let coordinator_identity = match Identity::from_hex(&row.coordinator_identity_hex) { Ok(id) => id, Err(e) => { - log::error!("recover_2pc_participant: invalid coordinator identity hex for {original_prepare_id}: {e}"); + log::error!( + "recover_2pc_participant: invalid coordinator identity hex for {original_prepare_id}: {e}" + ); continue; } }; let caller_identity = match Identity::from_hex(&row.caller_identity_hex) { Ok(id) => id, Err(e) => { - log::error!("recover_2pc_participant: invalid caller identity hex for {original_prepare_id}: {e}"); + log::error!( + "recover_2pc_participant: invalid caller identity hex for {original_prepare_id}: {e}" + ); continue; } }; @@ -2051,7 +2061,8 @@ impl ModuleHost { auth_token.clone(), coordinator_identity, &original_prepare_id, - ).await; + ) + .await; match decision { Some(commit) => { if commit { diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 
b8d4e66471e..aa704e43f3a 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -432,7 +432,10 @@ async fn send_ack_commit_to_coordinator( log::info!("2PC ack-commit: notified coordinator for {prepare_id}"); } Ok(resp) => { - log::warn!("2PC ack-commit: coordinator returned {} for {prepare_id}", resp.status()); + log::warn!( + "2PC ack-commit: coordinator returned {} for {prepare_id}", + resp.status() + ); } Err(e) => { log::warn!("2PC ack-commit: transport error for {prepare_id}: {e}"); @@ -709,8 +712,7 @@ impl WasmModuleInstance { let _ = prepared_tx.send((res, return_value)); // Step 4: wait for coordinator's decision (B never aborts on its own). - let commit = - Self::wait_for_2pc_decision(decision_rx, &prepare_id, coordinator_identity, &replica_ctx); + let commit = Self::wait_for_2pc_decision(decision_rx, &prepare_id, coordinator_identity, &replica_ctx); if commit { // Delete the marker in the same tx as the reducer changes (atomic commit). @@ -1196,9 +1198,11 @@ impl InstanceCommon { // B acknowledged COMMIT — remove coordinator log entry // (best-effort; recovery will clean up on restart if missed). if committed { - if let Err(e) = stdb.with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { - Ok(del_tx.delete_st_2pc_coordinator_log(prepare_id)?) - }) { + if let Err(e) = stdb + .with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { + Ok(del_tx.delete_st_2pc_coordinator_log(prepare_id)?) 
+ }) + { log::warn!("delete_st_2pc_coordinator_log failed for {prepare_id}: {e}"); } } diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index a2d2b306d92..b688c201054 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -22,11 +22,10 @@ use crate::{ system_tables::{ with_sys_table_buf, St2pcCoordinatorLogFields, St2pcCoordinatorLogRow, St2pcStateFields, St2pcStateRow, StClientFields, StClientRow, StColumnAccessorFields, StColumnAccessorRow, StColumnFields, StColumnRow, - StConstraintFields, StConstraintRow, StEventTableRow, - StFields as _, StIndexAccessorFields, StIndexAccessorRow, StIndexFields, StIndexRow, StRowLevelSecurityFields, - StRowLevelSecurityRow, StScheduledFields, StScheduledRow, StSequenceFields, StSequenceRow, - StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, SystemTable, - ST_2PC_COORDINATOR_LOG_ID, ST_2PC_STATE_ID, ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, + StConstraintFields, StConstraintRow, StEventTableRow, StFields as _, StIndexAccessorFields, StIndexAccessorRow, + StIndexFields, StIndexRow, StRowLevelSecurityFields, StRowLevelSecurityRow, StScheduledFields, StScheduledRow, + StSequenceFields, StSequenceRow, StTableAccessorFields, StTableAccessorRow, StTableFields, StTableRow, + SystemTable, ST_2PC_COORDINATOR_LOG_ID, ST_2PC_STATE_ID, ST_CLIENT_ID, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ID, ST_CONSTRAINT_ID, ST_EVENT_TABLE_ID, ST_INDEX_ACCESSOR_ID, ST_INDEX_ID, ST_ROW_LEVEL_SECURITY_ID, ST_SCHEDULED_ID, ST_SEQUENCE_ID, ST_TABLE_ACCESSOR_ID, ST_TABLE_ID, }, @@ -2771,9 +2770,7 @@ impl MutTxId { self.insert_via_serialize_bsatn(ST_2PC_COORDINATOR_LOG_ID, row) .map(|_| ()) .inspect_err(|e| { - log::error!( - "insert_st_2pc_coordinator_log: failed for prepare_id ({participant_prepare_id}): {e}" - ); + log::error!("insert_st_2pc_coordinator_log: failed for prepare_id 
({participant_prepare_id}): {e}"); }) } @@ -2785,9 +2782,7 @@ impl MutTxId { St2pcCoordinatorLogFields::ParticipantPrepareId.col_id(), &AlgebraicValue::String(participant_prepare_id.into()), ) { - log::error!( - "delete_st_2pc_coordinator_log: no row for prepare_id ({participant_prepare_id}): {e}" - ); + log::error!("delete_st_2pc_coordinator_log: no row for prepare_id ({participant_prepare_id}): {e}"); } Ok(()) } diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs index e7955ed28b6..dfd22aaf389 100644 --- a/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs +++ b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs @@ -166,10 +166,7 @@ fn setup_two_banks(test: &mut Smoketest, pid: u32, suffix: &str) -> (String, Str fn test_2pc_committed_data_survives_restart() { require_local_server!(); let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "dur"); @@ -178,8 +175,16 @@ fn test_2pc_committed_data_survives_restart() { .expect("transfer_funds failed"); // Verify pre-restart state. - assert_eq!(alice_balance(&test, &db_a_identity), 150, "A should have 150 before restart"); - assert_eq!(alice_balance(&test, &db_b_identity), 50, "B should have 50 before restart"); + assert_eq!( + alice_balance(&test, &db_a_identity), + 150, + "A should have 150 before restart" + ); + assert_eq!( + alice_balance(&test, &db_b_identity), + 50, + "B should have 50 before restart" + ); // Restart the server — exercises recovery path even though there's nothing to recover. 
test.restart_server(); @@ -207,18 +212,23 @@ fn test_2pc_committed_data_survives_restart() { fn test_2pc_aborted_state_survives_restart() { require_local_server!(); let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "abort-dur"); // Try to transfer 200 — B only has 100, so the remote debit panics → abort. let _ = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); - assert_eq!(alice_balance(&test, &db_a_identity), 100, "A should still be 100 after abort"); - assert_eq!(alice_balance(&test, &db_b_identity), 100, "B should still be 100 after abort"); + assert_eq!( + alice_balance(&test, &db_a_identity), + 100, + "A should still be 100 after abort" + ); + assert_eq!( + alice_balance(&test, &db_b_identity), + 100, + "B should still be 100 after abort" + ); test.restart_server(); @@ -243,10 +253,7 @@ fn test_2pc_aborted_state_survives_restart() { #[test] fn test_2pc_status_endpoint_unknown_returns_abort() { let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, _db_b_identity) = setup_two_banks(&mut test, pid, "status"); @@ -284,10 +291,7 @@ fn test_2pc_status_endpoint_unknown_returns_abort() { fn test_2pc_atomicity_under_crash() { require_local_server!(); let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "crash"); @@ -340,10 +344,7 @@ fn test_2pc_atomicity_under_crash() { fn 
test_2pc_coordinator_recovery() { require_local_server!(); let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "coord-rec"); @@ -378,10 +379,7 @@ fn test_2pc_coordinator_recovery() { let bal_a = alice_balance(&test, &db_a_identity); let bal_b = alice_balance(&test, &db_b_identity); - assert_eq!( - bal_a, 150, - "A should have committed (alice_a=150) before crash" - ); + assert_eq!(bal_a, 150, "A should have committed (alice_a=150) before crash"); assert_eq!( bal_b, 50, "B should have committed via coordinator recovery (alice_b=50), got {bal_b}" @@ -407,10 +405,7 @@ fn test_2pc_coordinator_recovery() { fn test_2pc_participant_recovery_polls_and_aborts() { require_local_server!(); let pid = std::process::id(); - let mut test = Smoketest::builder() - .module_code(MODULE_CODE) - .autopublish(false) - .build(); + let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "part-rec"); From 056f511710494b3e3bed106bcef0f37b95bf7f1c Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 02:15:43 +0530 Subject: [PATCH 18/22] crate --- crates/core/2PC-IMPLEMENTATION-PLAN.md | 182 ------------------------- 1 file changed, 182 deletions(-) delete mode 100644 crates/core/2PC-IMPLEMENTATION-PLAN.md diff --git a/crates/core/2PC-IMPLEMENTATION-PLAN.md b/crates/core/2PC-IMPLEMENTATION-PLAN.md deleted file mode 100644 index 55d2ddbaa61..00000000000 --- a/crates/core/2PC-IMPLEMENTATION-PLAN.md +++ /dev/null @@ -1,182 +0,0 @@ -# 2PC Implementation Plan (Pipelined) - -## Context - -The TPC-C benchmark on branch `origin/phoebe/tpcc/reducer-return-value` (public submodule) uses non-atomic HTTP calls for cross-database operations. 
We need 2PC so distributed transactions either commit on both databases or neither. Pipelined 2PC is chosen because it avoids blocking on persistence during lock-holding, and the codebase already separates in-memory commit from durability. - -## Protocol - -### Participant happy path: - -1. Receive CALL from coordinator (reducer name + args) -2. Execute reducer (write lock held) -3. Return result to coordinator (write lock still held, transaction still open) -4. Possibly receive more CALLs from coordinator (same transaction, same write lock) -5. Receive END_CALLS from coordinator ("no more reducer calls in this transaction") -6. Commit in-memory (release write lock) -7. Send PREPARE to durability worker -8. **Barrier up** -- no more durability requests go through -9. In background: wait for PREPARE to be durable -10. Once durable: send PREPARED to coordinator -11. Wait for COMMIT or ABORT from coordinator -12. Receive COMMIT -13. Send COMMIT to durability worker -14. **Barrier down** -- flush buffered requests - -### Coordinator happy path: - -1. Execute reducer, calling participant reducers along the way (participants hold write locks, return results, but don't commit) -2. Reducer succeeds -3. Send END_CALLS to all participants (they can now commit in-memory) -4. Commit coordinator in-memory (release write lock) -5. Send PREPARE to durability worker -6. **Barrier up** -- no more durability requests go through -7. Wait for coordinator's own PREPARE to be durable -8. Wait for all participants to report PREPARED -9. Send COMMIT to all participants -10. Send COMMIT to durability worker -11. **Barrier down** -- flush buffered requests - -## Key correctness properties - -- **Serializable isolation**: Participant holds write lock from CALL through END_CALLS. Multiple CALLs from the same coordinator transaction execute within the same MutTxId on the participant. The second call sees the first call's writes. 
-- **Persistence barrier**: After PREPARE is sent to durability (step 7/8 on participant, step 5/6 on coordinator), no speculative transactions can reach the durability worker until COMMIT or ABORT. Anything sent to the durability worker can eventually become persistent, so the barrier is required. -- **Two responses from participant**: The immediate result (step 3) and the later PREPARED notification (step 10). The coordinator collects both: results during reducer execution, PREPARED notifications before deciding COMMIT. -- **Pipelining benefit**: Locks are held only during reducer execution (steps 1-6), not during persistence (steps 7-14). The persistence and 2PC handshake happen after locks are released on both sides. - -## Holding MutTxId: reuse existing blocking pattern - -`MutTxId` is `!Send` (holds `SharedWriteGuard`). The participant must hold it across multiple CALL requests from the coordinator for serializable isolation. - -The codebase already has a blocking pattern: on the coordinator side, `call_reducer_on_db` uses `std::thread::scope` + `Handle::block_on` to block the WASM thread while making an async HTTP call. The same pattern works for the participant: instead of returning from the reducer execution, the participant's thread blocks on a channel (`blocking_recv`) waiting for the next command. The `MutTxId` stays alive on that same thread. No new threading model is needed. 
- -``` -Coordinator thread Participant thread -(WASM reducer running, (holds MutTxId, holds WASM instance) - holds coordinator MutTxId) - -call_reducer_on_db_2pc() - | - |-- HTTP POST /2pc/begin/debit -> spawn thread, create MutTxId - | execute reducer - | send result via channel - | <-- HTTP response (result block on channel (blocking_recv) - | + session_id) | - | | [MutTxId held, write lock held] - | | -call_reducer_on_db_2pc() (2nd call) | - | | - |-- HTTP POST /2pc/{sid}/call/x -> send command via channel - | wake up, execute reducer - | send result via channel - | <-- HTTP response block on channel - | | -reducer finishes | - | | -[post-commit coordination] | - | | - |-- HTTP POST /2pc/{sid}/end ---> wake up, commit in-memory - | release write lock - | send PREPARE to durability - | barrier up - | wait for PREPARE durable... - | <-- HTTP response (PREPARED) block on channel - | | - |-- HTTP POST /2pc/{sid}/commit -> wake up - | send COMMIT to durability - | barrier down, flush - | <-- HTTP response thread exits -``` - -### Implementation - -On first CALL for a new 2PC transaction: -1. The async HTTP handler spawns a blocking thread (via `std::thread::scope` or `tokio::task::spawn_blocking`) -2. The blocking thread takes a WASM instance from the module's instance pool -3. The blocking thread creates `MutTxId` (acquires write lock) -4. The blocking thread executes the first reducer -5. The blocking thread sends the result back via a `oneshot` channel -6. The async HTTP handler receives the result and returns the HTTP response with a `session_id` -7. The blocking thread blocks on a `mpsc::Receiver` waiting for the next command -8. The async HTTP handler stores the `mpsc::Sender` in a session map keyed by `session_id` - -Subsequent CALLs and END_CALLS look up the `session_id`, send commands on the channel. The blocking thread processes them sequentially on the same `MutTxId`. 
- -When the thread exits (after COMMIT or ABORT), it returns the WASM instance to the pool. - -```rust -enum TxCommand { - Call { reducer: String, args: Bytes, reply: oneshot::Sender }, - EndCalls { reply: oneshot::Sender }, - Commit { reply: oneshot::Sender<()> }, - Abort { reply: oneshot::Sender<()> }, -} -``` - -## Abort paths - -**Coordinator's reducer fails (step 2):** -- Send ABORT to all participants (they still hold write locks) -- Participants rollback their MutTxId (release write lock, no changes) -- No PREPARE was sent, no barrier needed - -**Participant's reducer fails (step 2):** -- Participant returns error to coordinator -- Coordinator's reducer fails (propagates error) -- Coordinator sends ABORT to all other participants that succeeded -- Those participants rollback their MutTxId - -**Coordinator's PREPARE persists but a participant's PREPARE fails to persist:** -- Participant cannot send PREPARED -- Coordinator times out waiting for PREPARED -- Coordinator sends ABORT to all participants -- Coordinator inverts its own in-memory state, discards buffered durability requests - -**Crash during protocol:** -- See proposal in `proposals/00XX-inter-database-communication.md` section 8 for recovery rules - -## Commitlog format - -- PREPARE record: includes all row changes (inserts/deletes) -- COMMIT record: follows PREPARE, marks transaction as committed -- ABORT record: follows PREPARE, marks transaction as aborted -- No other records can appear between PREPARE and COMMIT/ABORT in the durable log (persistence barrier enforces this) - -## Replay semantics - -On replay, when encountering a PREPARE: -- Do not apply it to the datastore -- Read the next record: - - COMMIT: apply the PREPARE's changes - - ABORT: skip the PREPARE - - No next record (crash): transaction is still in progress, wait for coordinator or timeout and abort - -## Persistence barrier - -The barrier in `relational_db.rs` has two states: `Inactive` and `Active`. 
- -- **Inactive**: normal operation, durability requests go through. -- **Active**: all durability requests are buffered. - -No race is possible because the barrier is activated on the same thread that holds the write lock. The sequence on both coordinator and participant is: - -1. Commit in-memory (releases write lock) -2. Send PREPARE to durability worker (direct call, bypasses barrier) -3. Activate barrier - -Steps 1-3 happen sequentially on one thread. No other transaction can commit between 1 and 3 because steps 2 and 3 are immediate (no async, no lock release between them). By the time another transaction acquires the write lock and commits, the barrier is already active and its durability request is buffered. - -- On COMMIT: deactivate, flush buffered requests -- On ABORT: deactivate, discard buffered requests - -## Key files - -- `crates/core/src/db/relational_db.rs` -- PersistenceBarrier (Inactive/Armed/Active), send_or_buffer_durability, finalize_prepare_commit/abort -- `crates/core/src/host/prepared_tx.rs` -- TxCommand, TxSession, PreparedTransactions registry, session map -- `crates/core/src/host/module_host.rs` -- begin_2pc_session, commit_prepared, abort_prepared -- `crates/core/src/host/wasm_common/module_host_actor.rs` -- coordinator post-commit coordination (END_CALLS, wait PREPARED, COMMIT) -- `crates/core/src/host/instance_env.rs` -- call_reducer_on_db_2pc, prepared_participants tracking -- `crates/core/src/host/wasmtime/wasm_instance_env.rs` -- WASM host function -- `crates/client-api/src/routes/database.rs` -- HTTP endpoints: /2pc/begin/:reducer, /2pc/:sid/call/:reducer, /2pc/:sid/end, /2pc/:sid/commit, /2pc/:sid/abort -- `crates/bindings-sys/src/lib.rs` -- FFI -- `crates/bindings/src/remote_reducer.rs` -- safe wrapper From 2c04a393f86a41028196492aa74d8e8c9073ca1e Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 03:43:53 +0530 Subject: [PATCH 19/22] fix deadlock --- crates/client-api/src/routes/database.rs | 17 ++------ 
crates/core/src/host/mod.rs | 26 +++++++++++ .../src/host/wasm_common/module_host_actor.rs | 43 ++++++++----------- .../src/host/wasmtime/wasm_instance_env.rs | 16 +------ .../locking_tx_datastore/committed_state.rs | 4 ++ crates/datastore/src/system_tables.rs | 2 + 6 files changed, 55 insertions(+), 53 deletions(-) diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index c2adf71e111..5077d6756cc 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -263,22 +263,13 @@ pub async fn prepare( let (module, Database { owner_identity, .. }) = find_module_and_database(&worker_ctx, name_or_identity).await?; - let connection_id = generate_random_connection_id(); - - module - .call_identity_connected(auth.into(), connection_id) - .await - .map_err(client_connected_error_to_response)?; - + // 2PC prepare is a server-to-server call; no client lifecycle management needed. + // call_identity_connected/disconnected submit jobs to the module's executor, which + // will be blocked holding the 2PC write lock after prepare_reducer returns — deadlock. 
let result = module - .prepare_reducer(caller_identity, Some(connection_id), &reducer, args) + .prepare_reducer(caller_identity, None, &reducer, args) .await; - module - .call_identity_disconnected(caller_identity, connection_id) - .await - .map_err(client_disconnected_error_to_response)?; - match result { Ok((prepare_id, rcr, return_value)) => { let (status, body) = diff --git a/crates/core/src/host/mod.rs b/crates/core/src/host/mod.rs index df6ec4d42f0..06e55de6444 100644 --- a/crates/core/src/host/mod.rs +++ b/crates/core/src/host/mod.rs @@ -1,6 +1,7 @@ use anyhow::Context; use bytes::Bytes; use bytestring::ByteString; +use core::future::Future; use derive_more::Display; use enum_map::Enum; use once_cell::sync::OnceCell; @@ -10,6 +11,31 @@ use spacetimedb_lib::ProductValue; use spacetimedb_schema::def::deserialize::{ArgsSeed, FunctionDef}; use spacetimedb_schema::def::ModuleDef; +/// Block on `fut` from a synchronous context that may be inside a Tokio runtime. +/// +/// `Handle::block_on` and `block_in_place` both panic when the calling thread is +/// a custom (`std::thread::spawn`) thread that has entered the runtime via +/// `Handle::enter()` — which is exactly the pattern used by `SingleCoreExecutor`. +/// +/// The fix (same as the non-2PC `call_reducer_on_db` path): spawn a **scoped** +/// OS thread. The scoped thread starts with no Tokio context, so `Handle::block_on` +/// works normally and drives the future using the **original** runtime's I/O reactor +/// and connection pools. +/// +/// Use this for every place in the 2PC / cross-DB call paths that needs to +/// synchronously drive a future from blocking (WASM executor) context. 
+pub(crate) fn block_on_scoped<F>(handle: &tokio::runtime::Handle, fut: F) -> F::Output +where + F: Future + Send, + F::Output: Send, +{ + std::thread::scope(|s| { + s.spawn(|| handle.block_on(fut)) + .join() + .expect("block_on_scoped: thread panicked") + }) +} + mod disk_storage; mod host_controller; mod module_common; diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index aa704e43f3a..1301eaf447a 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -13,6 +13,7 @@ use crate::host::module_host::{ ViewCallResult, ViewCommand, ViewCommandResult, ViewOutcome, }; use crate::host::scheduler::{CallScheduledFunctionResult, ScheduledFunctionParams}; +use crate::host::block_on_scoped; use crate::host::{ ArgsTuple, ModuleHost, ProcedureCallError, ProcedureCallResult, ReducerCallError, ReducerCallResult, ReducerId, ReducerOutcome, Scheduler, UpdateDatabaseResult, @@ -698,7 +699,7 @@ impl WasmModuleInstance { if let Some(prepare_offset) = marker_tx_data.tx_offset() { if let Some(mut durable) = stdb.durable_tx_offset() { let handle = tokio::runtime::Handle::current(); - let _ = handle.block_on(durable.wait_for(prepare_offset)); + let _ = block_on_scoped(&handle, durable.wait_for(prepare_offset)); } } @@ -725,11 +726,13 @@ impl WasmModuleInstance { // Without this, A could delete its coordinator log entry while B's commit // is still in-memory — a B crash at that point would leave the tx uncommitted // with no way to recover (A has already forgotten it committed).
- let handle = tokio::runtime::Handle::current(); if let Some(mut durable) = stdb.durable_tx_offset() { - if let Ok(offset) = handle.block_on(commit_result.tx_offset) { - let _ = handle.block_on(durable.wait_for(offset)); - } + let handle = tokio::runtime::Handle::current(); + block_on_scoped(&handle, async move { + if let Ok(offset) = commit_result.tx_offset.await { + let _ = durable.wait_for(offset).await; + } + }); } // Notify coordinator that B has committed so it can delete its coordinator log entry. @@ -738,7 +741,7 @@ impl WasmModuleInstance { let router = replica_ctx.call_reducer_router.clone(); let client_http = replica_ctx.call_reducer_client.clone(); let auth_token = replica_ctx.call_reducer_auth_token.clone(); - handle.spawn(send_ack_commit_to_coordinator( + tokio::runtime::Handle::current().spawn(send_ack_commit_to_coordinator( client_http, router, auth_token, @@ -787,19 +790,13 @@ impl WasmModuleInstance { let auth_token = replica_ctx.call_reducer_auth_token.clone(); let prepare_id_owned = prepare_id.to_owned(); loop { - let decision = std::thread::scope(|s| { - s.spawn(|| { - handle.block_on(Self::query_coordinator_status( - &client, - &router, - auth_token.clone(), - coordinator_identity, - &prepare_id_owned, - )) - }) - .join() - .expect("coordinator poll thread panicked") - }); + let decision = block_on_scoped(&handle, Self::query_coordinator_status( + &client, + &router, + auth_token.clone(), + coordinator_identity, + &prepare_id_owned, + )); match decision { Some(commit) => return commit, None => std::thread::sleep(Duration::from_secs(5)), @@ -1157,9 +1154,7 @@ impl InstanceCommon { let replica_ctx = inst.replica_ctx().clone(); let handle = tokio::runtime::Handle::current(); - std::thread::scope(|s| { - s.spawn(|| { - handle.block_on(async { + block_on_scoped(&handle, async { // Wait for A's coordinator log (committed atomically with the tx) to be // durable before sending COMMIT to B. 
This guarantees that if A crashes // after sending COMMIT, recovery can retransmit from the durable log. @@ -1218,10 +1213,6 @@ impl InstanceCommon { } } } - }); - }) - .join() - .expect("2PC coordination thread panicked"); }); } diff --git a/crates/core/src/host/wasmtime/wasm_instance_env.rs b/crates/core/src/host/wasmtime/wasm_instance_env.rs index a40ac3cfd14..e23e3da01bb 100644 --- a/crates/core/src/host/wasmtime/wasm_instance_env.rs +++ b/crates/core/src/host/wasmtime/wasm_instance_env.rs @@ -1996,19 +1996,11 @@ impl WasmInstanceEnv { let args_buf = mem.deref_slice(args_ptr, args_len)?; let args = bytes::Bytes::copy_from_slice(args_buf); - // Reducers run inside a tokio LocalSet (single-threaded), so block_in_place - // is unavailable and futures::executor::block_on can't drive tokio I/O. - // Spawn a new OS thread and call Handle::block_on from there, which is - // designed to be called from synchronous (non-async) contexts. let handle = tokio::runtime::Handle::current(); let fut = env .instance_env .call_reducer_on_db(database_identity, &reducer_name, args); - let result = std::thread::scope(|s| { - s.spawn(|| handle.block_on(fut)) - .join() - .expect("call_reducer_on_db: worker thread panicked") - }); + let result = super::super::block_on_scoped(&handle, fut); match result { Ok((status, body)) => { @@ -2067,11 +2059,7 @@ impl WasmInstanceEnv { let fut = env .instance_env .call_reducer_on_db_2pc(database_identity, &reducer_name, args); - let result = std::thread::scope(|s| { - s.spawn(|| handle.block_on(fut)) - .join() - .expect("call_reducer_on_db_2pc: worker thread panicked") - }); + let result = super::super::block_on_scoped(&handle, fut); match result { Ok((status, body, prepare_id)) => { diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index 5a27bd15ef4..925139ba6cd 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ 
b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -29,6 +29,7 @@ use crate::{ use crate::{ locking_tx_datastore::ViewCallInfo, system_tables::{ + ST_2PC_COORDINATOR_LOG_ID, ST_2PC_COORDINATOR_LOG_IDX, ST_2PC_STATE_ID, ST_2PC_STATE_IDX, ST_COLUMN_ACCESSOR_ID, ST_COLUMN_ACCESSOR_IDX, ST_CONNECTION_CREDENTIALS_ID, ST_CONNECTION_CREDENTIALS_IDX, ST_EVENT_TABLE_ID, ST_EVENT_TABLE_IDX, ST_INDEX_ACCESSOR_ID, ST_INDEX_ACCESSOR_IDX, ST_TABLE_ACCESSOR_ID, ST_TABLE_ACCESSOR_IDX, ST_VIEW_COLUMN_ID, ST_VIEW_COLUMN_IDX, ST_VIEW_ID, ST_VIEW_IDX, ST_VIEW_PARAM_ID, @@ -478,6 +479,9 @@ impl CommittedState { self.create_table(ST_INDEX_ACCESSOR_ID, schemas[ST_INDEX_ACCESSOR_IDX].clone()); self.create_table(ST_COLUMN_ACCESSOR_ID, schemas[ST_COLUMN_ACCESSOR_IDX].clone()); + self.create_table(ST_2PC_STATE_ID, schemas[ST_2PC_STATE_IDX].clone()); + self.create_table(ST_2PC_COORDINATOR_LOG_ID, schemas[ST_2PC_COORDINATOR_LOG_IDX].clone()); + // Insert the sequences into `st_sequences` let (st_sequences, blob_store, pool) = self.get_table_and_blob_store_or_create(ST_SEQUENCE_ID, &schemas[ST_SEQUENCE_IDX]); diff --git a/crates/datastore/src/system_tables.rs b/crates/datastore/src/system_tables.rs index db1da0a8adf..e19cc47fd74 100644 --- a/crates/datastore/src/system_tables.rs +++ b/crates/datastore/src/system_tables.rs @@ -286,6 +286,8 @@ pub(crate) const ST_EVENT_TABLE_IDX: usize = 16; pub(crate) const ST_TABLE_ACCESSOR_IDX: usize = 17; pub(crate) const ST_INDEX_ACCESSOR_IDX: usize = 18; pub(crate) const ST_COLUMN_ACCESSOR_IDX: usize = 19; +pub(crate) const ST_2PC_STATE_IDX: usize = 20; +pub(crate) const ST_2PC_COORDINATOR_LOG_IDX: usize = 21; macro_rules! 
st_fields_enum { ($(#[$attr:meta])* enum $ty_name:ident { $($name:expr, $var:ident = $discr:expr,)* }) => { From c2b4f97bbc8f108260b8cf6cb03a217fd52ae9b0 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 14:09:26 +0530 Subject: [PATCH 20/22] http2 --- Cargo.toml | 2 +- crates/core/src/replica_context.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7228dba44c9..d93fc13348b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -158,7 +158,7 @@ anymap = "0.12" arrayvec = "0.7.2" async-stream = "0.3.6" async-trait = "0.1.68" -axum = { version = "0.7", features = ["tracing"] } +axum = { version = "0.7", features = ["tracing", "http2"] } axum-extra = { version = "0.9", features = ["typed-header"] } backtrace = "0.3.66" base64 = "0.21.2" diff --git a/crates/core/src/replica_context.rs b/crates/core/src/replica_context.rs index 8c9f8804f24..307dcf9f70c 100644 --- a/crates/core/src/replica_context.rs +++ b/crates/core/src/replica_context.rs @@ -68,12 +68,16 @@ pub struct ReplicaContext { impl ReplicaContext { /// Build a warmed `reqwest::Client` from `config`. + /// + /// Uses HTTP/2 prior knowledge (h2c) for all connections. + /// The server must be configured to accept h2c (HTTP/2 cleartext) connections. 
pub fn new_call_reducer_client(config: &CallReducerOnDbConfig) -> reqwest::Client { reqwest::Client::builder() .tcp_keepalive(config.tcp_keepalive) .pool_idle_timeout(config.pool_idle_timeout) .pool_max_idle_per_host(config.pool_max_idle_per_host) .timeout(config.request_timeout) + .http2_prior_knowledge() .build() .expect("failed to build call_reducer_on_db HTTP client") } From 95d112ab1d98c2903c296cde6b9dfc50962e3670 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 15:47:00 +0530 Subject: [PATCH 21/22] avoid blocking co-ordinator --- crates/core/src/host/module_host.rs | 16 +- .../src/host/wasm_common/module_host_actor.rs | 142 ++++++++++-------- 2 files changed, 79 insertions(+), 79 deletions(-) diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index ab0115dc986..5358749903d 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1826,21 +1826,7 @@ impl ModuleHost { Ok::<(), ReducerCallError>(()) }, // JS modules: no 2PC support yet. 
- async |(p, _pid, _cid, ptx, _drx), inst| { - let (res, rv) = inst.call_reducer(p).await.map(|r| (r, None)).unwrap_or_else(|e| { - log::error!("prepare_reducer JS fallback: {e}"); - ( - ReducerCallResult { - outcome: ReducerOutcome::Failed(Box::new(Box::from("reducer error"))), - energy_used: EnergyQuanta::ZERO, - execution_duration: Default::default(), - }, - None, - ) - }); - let _ = ptx.send((res, rv)); - Ok(()) - }, + async |(p, _pid, _cid, ptx, _drx), inst| Err(ReducerCallError::NoSuchReducer), ) .await; }); diff --git a/crates/core/src/host/wasm_common/module_host_actor.rs b/crates/core/src/host/wasm_common/module_host_actor.rs index 1301eaf447a..1b0dd352641 100644 --- a/crates/core/src/host/wasm_common/module_host_actor.rs +++ b/crates/core/src/host/wasm_common/module_host_actor.rs @@ -4,6 +4,7 @@ use crate::client::ClientActorId; use crate::database_logger; use crate::energy::{EnergyMonitor, FunctionBudget, FunctionFingerprint}; use crate::error::DBError; +use crate::host::block_on_scoped; use crate::host::host_controller::CallProcedureReturn; use crate::host::instance_env::{InstanceEnv, TxSlot}; use crate::host::module_common::{build_common_module_from_raw, ModuleCommon}; @@ -13,7 +14,6 @@ use crate::host::module_host::{ ViewCallResult, ViewCommand, ViewCommandResult, ViewOutcome, }; use crate::host::scheduler::{CallScheduledFunctionResult, ScheduledFunctionParams}; -use crate::host::block_on_scoped; use crate::host::{ ArgsTuple, ModuleHost, ProcedureCallError, ProcedureCallResult, ReducerCallError, ReducerCallResult, ReducerId, ReducerOutcome, Scheduler, UpdateDatabaseResult, @@ -790,13 +790,16 @@ impl WasmModuleInstance { let auth_token = replica_ctx.call_reducer_auth_token.clone(); let prepare_id_owned = prepare_id.to_owned(); loop { - let decision = block_on_scoped(&handle, Self::query_coordinator_status( - &client, - &router, - auth_token.clone(), - coordinator_identity, - &prepare_id_owned, - )); + let decision = block_on_scoped( + &handle, + 
Self::query_coordinator_status( + &client, + &router, + auth_token.clone(), + coordinator_identity, + &prepare_id_owned, + ), + ); match decision { Some(commit) => return commit, None => std::thread::sleep(Duration::from_secs(5)), @@ -1151,68 +1154,79 @@ impl InstanceCommon { if !prepared_participants.is_empty() { let committed = matches!(event.status, EventStatus::Committed(_)); let stdb = self.info.subscriptions.relational_db().clone(); - - let replica_ctx = inst.replica_ctx().clone(); let handle = tokio::runtime::Handle::current(); - block_on_scoped(&handle, async { - // Wait for A's coordinator log (committed atomically with the tx) to be - // durable before sending COMMIT to B. This guarantees that if A crashes - // after sending COMMIT, recovery can retransmit from the durable log. - if committed && let Some(mut durable_offset) = stdb.durable_tx_offset() { - if let Ok(offset) = commit_tx_offset.await { - let _ = durable_offset.wait_for(offset).await; - } + + // Wait for A's coordinator log (committed atomically with the tx) to be + // durable before sending COMMIT to B. This guarantees that if A crashes + // after sending COMMIT, recovery can retransmit from the durable log. + // Only needed for COMMIT — ABORT carries no durability requirement. 
+ if committed { + if let Some(mut durable_offset) = stdb.durable_tx_offset() { + block_on_scoped(&handle, async move { + if let Ok(offset) = commit_tx_offset.await { + let _ = durable_offset.wait_for(offset).await; } + }); + } + } - let client = replica_ctx.call_reducer_client.clone(); - let router = replica_ctx.call_reducer_router.clone(); - let auth_token = replica_ctx.call_reducer_auth_token.clone(); - for (db_identity, prepare_id) in &prepared_participants { - let action = if committed { "commit" } else { "abort" }; - let base_url = match router.resolve_base_url(*db_identity).await { - Ok(url) => url, - Err(e) => { - log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); - continue; - } - }; - let url = format!( - "{}/v1/database/{}/2pc/{}/{}", - base_url, - db_identity.to_hex(), - action, - prepare_id, - ); - let mut req = client.post(&url); - if let Some(ref token) = auth_token { - req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); - } - match req.send().await { - Ok(resp) if resp.status().is_success() => { - log::info!("2PC {action}: {prepare_id} on {db_identity}"); - // B acknowledged COMMIT — remove coordinator log entry - // (best-effort; recovery will clean up on restart if missed). - if committed { - if let Err(e) = stdb - .with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { - Ok(del_tx.delete_st_2pc_coordinator_log(prepare_id)?) - }) - { - log::warn!("delete_st_2pc_coordinator_log failed for {prepare_id}: {e}"); - } - } - } - Ok(resp) => { - log::error!( - "2PC {action}: failed for {prepare_id} on {db_identity}: status {}", - resp.status() - ); - } - Err(e) => { - log::error!("2PC {action}: transport error for {prepare_id} on {db_identity}: {e}"); + // Fire-and-forget: send COMMIT/ABORT to each participant. 
+ // The coordinator log (written atomically with A's tx above) is the + // durability guarantee — if a send fails, recover_2pc_coordinator retransmits + // on restart and recover_2pc_participant polls the status endpoint. + // Blocking the executor here would stall A's next reducer for up to + // 30 s × number of participants with no correctness benefit. + let replica_ctx = inst.replica_ctx().clone(); + handle.spawn(async move { + let client = replica_ctx.call_reducer_client.clone(); + let router = replica_ctx.call_reducer_router.clone(); + let auth_token = replica_ctx.call_reducer_auth_token.clone(); + for (db_identity, prepare_id) in &prepared_participants { + let action = if committed { "commit" } else { "abort" }; + let base_url = match router.resolve_base_url(*db_identity).await { + Ok(url) => url, + Err(e) => { + log::error!("2PC {action}: failed to resolve base URL for {db_identity}: {e}"); + continue; + } + }; + let url = format!( + "{}/v1/database/{}/2pc/{}/{}", + base_url, + db_identity.to_hex(), + action, + prepare_id, + ); + let mut req = client.post(&url); + if let Some(ref token) = auth_token { + req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); + } + match req.send().await { + Ok(resp) if resp.status().is_success() => { + log::info!("2PC {action}: {prepare_id} on {db_identity}"); + // B acknowledged COMMIT — remove coordinator log entry + // (best-effort; recovery will clean up on restart if missed). + if committed { + if let Err(e) = stdb + .with_auto_commit::<_, _, anyhow::Error>(Workload::Internal, |del_tx| { + Ok(del_tx.delete_st_2pc_coordinator_log(prepare_id)?) 
+ }) + { + log::warn!("delete_st_2pc_coordinator_log failed for {prepare_id}: {e}"); } } } + Ok(resp) => { + log::error!( + "2PC {action}: failed for {prepare_id} on {db_identity}: status {}", + resp.status() + ); + } + Err(e) => { + log::error!("2PC {action}: transport error for {prepare_id} on {db_identity}: {e}"); + } + } + } }); } From 405a7a9f1a0a4c7d703c1bd20e6e2a048f76acb3 Mon Sep 17 00:00:00 2001 From: Shubham Mishra Date: Mon, 30 Mar 2026 16:28:18 +0530 Subject: [PATCH 22/22] update smoketests --- crates/client-api/src/routes/database.rs | 11 +- crates/core/src/host/instance_env.rs | 3 +- crates/core/src/host/module_host.rs | 29 +- .../tests/smoketests/cross_db_2pc.rs | 232 +++++++------ .../tests/smoketests/cross_db_2pc_recovery.rs | 309 ++++++++---------- 5 files changed, 286 insertions(+), 298 deletions(-) diff --git a/crates/client-api/src/routes/database.rs b/crates/client-api/src/routes/database.rs index 5077d6756cc..a4e1cdf1b7b 100644 --- a/crates/client-api/src/routes/database.rs +++ b/crates/client-api/src/routes/database.rs @@ -256,18 +256,27 @@ pub async fn prepare( reducer, }): Path, TypedHeader(content_type): TypedHeader, + headers: axum::http::HeaderMap, body: Bytes, ) -> axum::response::Result { let args = parse_call_args(content_type, body)?; let caller_identity = auth.claims.identity; + // The coordinator sends its actual database identity in `X-Coordinator-Identity`. + // Without this, `anon_auth_middleware` gives the HTTP caller an ephemeral random + // identity, which gets stored in `st_2pc_state` and breaks recovery polling. + let coordinator_identity = headers + .get("X-Coordinator-Identity") + .and_then(|v| v.to_str().ok()) + .and_then(|s| spacetimedb_lib::Identity::from_hex(s).ok()); + let (module, Database { owner_identity, .. }) = find_module_and_database(&worker_ctx, name_or_identity).await?; // 2PC prepare is a server-to-server call; no client lifecycle management needed. 
// call_identity_connected/disconnected submit jobs to the module's executor, which // will be blocked holding the 2PC write lock after prepare_reducer returns — deadlock. let result = module - .prepare_reducer(caller_identity, None, &reducer, args) + .prepare_reducer(caller_identity, None, &reducer, args, coordinator_identity) .await; match result { diff --git a/crates/core/src/host/instance_env.rs b/crates/core/src/host/instance_env.rs index a211b5603fc..a7b867baa2f 100644 --- a/crates/core/src/host/instance_env.rs +++ b/crates/core/src/host/instance_env.rs @@ -1038,7 +1038,7 @@ impl InstanceEnv { .bytes() .await .map_err(|e| NodesError::HttpError(e.to_string()))?; - Ok((status, body)) + Ok::<_, NodesError>((status, body)) } .await; @@ -1093,6 +1093,7 @@ impl InstanceEnv { let mut req = client .post(&url) .header(http::header::CONTENT_TYPE, "application/octet-stream") + .header("X-Coordinator-Identity", caller_identity.to_hex().to_string()) .body(args); if let Some(token) = auth_token { req = req.header(http::header::AUTHORIZATION, format!("Bearer {token}")); diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index 5358749903d..b3ecf0c28c7 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -1757,9 +1757,25 @@ impl ModuleHost { caller_connection_id: Option, reducer_name: &str, args: FunctionArgs, + // The actual coordinator database identity (from `X-Coordinator-Identity` header). + // When `Some`, used for `prepare_id` namespacing and stored in `st_2pc_state` for + // recovery. Falls back to `caller_identity` when `None` (e.g., internal calls). 
+ coordinator_identity_override: Option, ) -> Result<(String, ReducerCallResult, Option), ReducerCallError> { use std::sync::atomic::{AtomicU64, Ordering}; - static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(1); + use std::sync::OnceLock; + // Counter seeded from current time on first use so that restarts begin from a + // different value than any existing st_2pc_state entries (which hold IDs from + // previous sessions starting at much smaller counter values). + static PREPARE_COUNTER: AtomicU64 = AtomicU64::new(0); + static PREPARE_COUNTER_INIT: OnceLock<()> = OnceLock::new(); + PREPARE_COUNTER_INIT.get_or_init(|| { + let seed = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_micros() as u64; + PREPARE_COUNTER.store(seed, Ordering::Relaxed); + }); let (reducer_id, reducer_def) = self .info @@ -1788,9 +1804,13 @@ impl ModuleHost { args, }; + // Resolve the effective coordinator identity before generating the prepare_id so + // the prefix is namespaced correctly even when called from the HTTP prepare handler. + let coordinator_identity = coordinator_identity_override.unwrap_or(caller_identity); + // Include the coordinator identity so prepare_ids from different coordinators // cannot collide on the participant's st_2pc_state table. - let coordinator_hex = caller_identity.to_hex(); + let coordinator_hex = coordinator_identity.to_hex(); let prepare_id = format!( "prepare-{}-{}", &coordinator_hex.to_string()[..16], @@ -1815,7 +1835,6 @@ impl ModuleHost { let this = self.clone(); let reducer_name_owned = reducer_def.name.clone(); let prepare_id_clone = prepare_id.clone(); - let coordinator_identity = caller_identity; tokio::spawn(async move { let _ = this .call( @@ -1826,7 +1845,7 @@ impl ModuleHost { Ok::<(), ReducerCallError>(()) }, // JS modules: no 2PC support yet. 
- async |(p, _pid, _cid, ptx, _drx), inst| Err(ReducerCallError::NoSuchReducer), + async |(_p, _pid, _cid, _ptx, _drx), _inst| Err(ReducerCallError::NoSuchReducer), ) .await; }); @@ -2008,7 +2027,7 @@ impl ModuleHost { // Step 1: Re-run the reducer to reacquire the write lock. let new_prepare_id = match this - .prepare_reducer(caller_identity, Some(caller_connection_id), &row.reducer_name, args) + .prepare_reducer(caller_identity, Some(caller_connection_id), &row.reducer_name, args, Some(coordinator_identity)) .await { Ok((pid, result, _rv)) if !pid.is_empty() => { diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc.rs b/crates/smoketests/tests/smoketests/cross_db_2pc.rs index 4279852dfb0..3359fb115bd 100644 --- a/crates/smoketests/tests/smoketests/cross_db_2pc.rs +++ b/crates/smoketests/tests/smoketests/cross_db_2pc.rs @@ -2,19 +2,21 @@ use spacetimedb_smoketests::Smoketest; /// Module code for the 2PC test. /// -/// Both the "bank A" and "bank B" databases use the same module. +/// All three databases (A = coordinator, B and C = participants) use the same module. /// /// Tables: /// - `Ledger(account: String PK, balance: i64)` -- stores account balances. /// /// Reducers: /// - `init`: seeds "alice" with balance 100. +/// - `balance(account) -> i64`: returns the current balance for an account. /// - `debit(account, amount)`: decrements balance, panics if insufficient funds. /// - `credit(account, amount)`: increments balance (or inserts if absent). -/// - `transfer_funds(target_hex, from_account, to_account, amount)`: -/// Credits `to_account` locally, then calls `debit` on the remote database -/// using `call_reducer_on_db_2pc`. If the remote debit fails (panic/insufficient funds), -/// the local credit is also rolled back by the 2PC protocol. 
+/// - `transfer_funds(b_hex, c_hex, from_account, to_account, amount) -> TransferResult`: +/// Credits `amount * 2` to `to_account` locally (collecting `amount` from each of B and C), +/// then calls `debit(from_account, amount)` on both B and C via `call_reducer_on_db_2pc`. +/// If either remote debit fails, all three databases are rolled back atomically. +/// On success, returns the new local balance so the caller can verify without a second query. const MODULE_CODE: &str = r#" use spacetimedb::{log, ReducerContext, Table, Identity}; @@ -30,6 +32,14 @@ pub fn init(ctx: &ReducerContext) { ctx.db.ledger().insert(Ledger { account: "alice".to_string(), balance: 100 }); } +/// Returns the current balance for `account`. +#[spacetimedb::reducer] +pub fn balance(ctx: &ReducerContext, account: String) -> Result { + ctx.db.ledger().account().find(&account) + .map(|r| r.balance) + .ok_or_else(|| format!("account '{}' not found", account)) +} + #[spacetimedb::reducer] pub fn debit(ctx: &ReducerContext, account: String, amount: i64) { let row = ctx.db.ledger().account().find(&account) @@ -53,143 +63,131 @@ pub fn credit(ctx: &ReducerContext, account: String, amount: i64) { } } -/// Transfer `amount` from `from_account` on the remote database to `to_account` locally. +/// Transfer `amount` from `from_account` on both B and C to `to_account` on A (locally). +/// +/// Returns the new local balance of `to_account` so the caller can verify correctness +/// without issuing a separate query. /// -/// Uses 2PC: credits locally first, then calls debit on the remote database via -/// `call_reducer_on_db_2pc`. If the remote debit fails, the coordinator's reducer also -/// fails, triggering abort of all participants. +/// If either remote debit fails (insufficient funds), returns Err and the 2PC protocol +/// rolls back all three databases atomically. 
#[spacetimedb::reducer] -pub fn transfer_funds(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { - // Credit locally first. - credit(ctx, to_account.clone(), amount); - - // Now call debit on the remote database using 2PC. - let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); - let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); - match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit", &args) { - Ok(()) => { - log::info!("transfer_funds: remote debit succeeded"); - } - Err(e) => { - log::error!("transfer_funds: remote debit failed: {}", e); - panic!("remote debit failed: {e}"); - } - } +pub fn transfer_funds(ctx: &ReducerContext, b_hex: String, c_hex: String, from_account: String, to_account: String, amount: i64) -> Result { + credit(ctx, to_account.clone(), amount * 2); + + let b = Identity::from_hex(&b_hex).map_err(|e| format!("invalid B identity: {e}"))?; + let args_b = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account.clone(), amount)).map_err(|e| format!("failed to encode args: {e}"))?; + spacetimedb::remote_reducer::call_reducer_on_db_2pc(b, "debit", &args_b) + .map_err(|e| format!("debit on B failed: {e}"))?; + log::info!("transfer_funds: debit on B succeeded"); + + let c = Identity::from_hex(&c_hex).map_err(|e| format!("invalid C identity: {e}"))?; + let args_c = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).map_err(|e| format!("failed to encode args: {e}"))?; + spacetimedb::remote_reducer::call_reducer_on_db_2pc(c, "debit", &args_c) + .map_err(|e| format!("debit on C failed: {e}"))?; + log::info!("transfer_funds: debit on C succeeded"); + + // Return new local balance so the caller can assert correctness immediately. 
+ ctx.db.ledger().account().find(&to_account) + .map(|r| r.balance) + .ok_or_else(|| format!("account '{}' not found after credit", to_account)) } "#; -/// Happy path: transfer 50 from B's alice to A's alice. -/// After: A alice = 150, B alice = 50. +/// Call `balance(account)` on `db_identity` via the HTTP API and return the i64 result. +fn call_balance(test: &Smoketest, db_identity: &str, account: &str) -> i64 { + let resp = test + .api_call_json( + "POST", + &format!("/v1/database/{db_identity}/call/balance"), + &format!("[\"{account}\"]"), + ) + .unwrap_or_else(|e| panic!("balance call failed for {db_identity}: {e}")); + assert!(resp.is_success(), "balance reducer returned {}", resp.status_code); + resp.json() + .unwrap_or_else(|e| panic!("failed to parse balance JSON: {e}")) + .as_i64() + .unwrap_or_else(|| panic!("balance JSON was not an integer")) +} + +/// Happy path: transfer 30 from both B's alice and C's alice to A's alice. +/// +/// The coordinator reducer returns the new local balance (160), which is used directly +/// to assert A's result. B and C balances are verified via `balance` reducer calls. +/// +/// Expected: A=160, B=70, C=70. #[test] fn test_cross_db_2pc_happy_path() { let pid = std::process::id(); let db_a_name = format!("2pc-bank-a-{pid}"); let db_b_name = format!("2pc-bank-b-{pid}"); + let db_c_name = format!("2pc-bank-c-{pid}"); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - // Publish bank B (the participant that will be debited). - test.publish_module_named(&db_b_name, false) - .expect("failed to publish bank B"); + // Publish participants first, then coordinator. + test.publish_module_named(&db_b_name, false).expect("failed to publish bank B"); let db_b_identity = test.database_identity.clone().expect("bank B identity not set"); - // Publish bank A (the coordinator that will be credited). 
- test.publish_module_named(&db_a_name, false) - .expect("failed to publish bank A"); - let _db_a_identity = test.database_identity.clone().expect("bank A identity not set"); - - // Transfer 50 from B's alice to A's alice. - // The coordinator is bank A. It credits locally, then calls debit on B via 2PC. - test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "50"]) - .expect("transfer_funds failed"); - - // Verify bank A: alice should have 150. - let result_a = test - .spacetime(&[ - "sql", - "--server", - &test.server_url, - test.database_identity.as_ref().unwrap(), - "SELECT balance FROM ledger WHERE account = 'alice'", - ]) - .expect("sql query on bank A failed"); - assert!( - result_a.contains("150"), - "Expected bank A alice balance = 150, got:\n{result_a}" - ); - - // Verify bank B: alice should have 50. - let result_b = test - .spacetime(&[ - "sql", - "--server", - &test.server_url, - &db_b_identity, - "SELECT balance FROM ledger WHERE account = 'alice'", - ]) - .expect("sql query on bank B failed"); - assert!( - result_b.contains("50"), - "Expected bank B alice balance = 50, got:\n{result_b}" - ); + test.publish_module_named(&db_c_name, false).expect("failed to publish bank C"); + let db_c_identity = test.database_identity.clone().expect("bank C identity not set"); + + test.publish_module_named(&db_a_name, false).expect("failed to publish bank A"); + let db_a_identity = test.database_identity.clone().expect("bank A identity not set"); + + // Call transfer_funds; the return value is A's new alice balance. 
+ let resp = test + .api_call_json( + "POST", + &format!("/v1/database/{db_a_identity}/call/transfer_funds"), + &format!("[\"{db_b_identity}\", \"{db_c_identity}\", \"alice\", \"alice\", 30]"), + ) + .expect("transfer_funds call failed"); + assert!(resp.is_success(), "transfer_funds failed: {}", resp.status_code); + let new_a_balance = resp.json().expect("invalid JSON").as_i64().expect("not i64"); + assert_eq!(new_a_balance, 160, "transfer_funds return value: expected A alice=160"); + + // Verify B and C via balance reducer. + assert_eq!(call_balance(&test, &db_b_identity, "alice"), 70, "B alice should be 70"); + assert_eq!(call_balance(&test, &db_c_identity, "alice"), 70, "C alice should be 70"); } -/// Abort path: try to transfer 200, but B only has 100. -/// The remote debit should fail, causing the coordinator reducer to panic, -/// which should roll back the local credit. -/// After: both A and B should still have alice = 100. +/// Abort path: try to transfer 110 from B and C, but both only have 100. +/// +/// B's debit fails (insufficient funds), so the coordinator reducer panics and the +/// 2PC protocol rolls back all three databases. We verify via `balance` reducer calls +/// that every account is still at 100. +/// +/// Expected: A=100, B=100, C=100. #[test] fn test_cross_db_2pc_abort_insufficient_funds() { let pid = std::process::id(); let db_a_name = format!("2pc-abort-a-{pid}"); let db_b_name = format!("2pc-abort-b-{pid}"); + let db_c_name = format!("2pc-abort-c-{pid}"); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - // Publish bank B. - test.publish_module_named(&db_b_name, false) - .expect("failed to publish bank B"); + test.publish_module_named(&db_b_name, false).expect("failed to publish bank B"); let db_b_identity = test.database_identity.clone().expect("bank B identity not set"); - // Publish bank A. 
- test.publish_module_named(&db_a_name, false) - .expect("failed to publish bank A"); - - // Try to transfer 200 -- B only has 100, so the remote debit will fail. - let result = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); - // The call should fail because the remote debit panicked. - assert!( - result.is_err(), - "Expected transfer_funds to fail due to insufficient funds" - ); - - // Verify bank A: alice should still have 100 (the local credit was rolled back). - let result_a = test - .spacetime(&[ - "sql", - "--server", - &test.server_url, - test.database_identity.as_ref().unwrap(), - "SELECT balance FROM ledger WHERE account = 'alice'", - ]) - .expect("sql query on bank A failed"); - assert!( - result_a.contains("100"), - "Expected bank A alice balance = 100 after failed transfer, got:\n{result_a}" - ); - - // Verify bank B: alice should still have 100. - let result_b = test - .spacetime(&[ - "sql", - "--server", - &test.server_url, - &db_b_identity, - "SELECT balance FROM ledger WHERE account = 'alice'", - ]) - .expect("sql query on bank B failed"); - assert!( - result_b.contains("100"), - "Expected bank B alice balance = 100 after failed transfer, got:\n{result_b}" - ); + test.publish_module_named(&db_c_name, false).expect("failed to publish bank C"); + let db_c_identity = test.database_identity.clone().expect("bank C identity not set"); + + test.publish_module_named(&db_a_name, false).expect("failed to publish bank A"); + let db_a_identity = test.database_identity.clone().expect("bank A identity not set"); + + // Transfer 110 from each — both only have 100, so B's debit panics → 2PC aborts all. 
+ let resp = test + .api_call_json( + "POST", + &format!("/v1/database/{db_a_identity}/call/transfer_funds"), + &format!("[\"{db_b_identity}\", \"{db_c_identity}\", \"alice\", \"alice\", 110]"), + ) + .expect("api_call failed"); + assert!(!resp.is_success(), "Expected transfer_funds to fail due to insufficient funds"); + + // All three accounts must still be at 100. + assert_eq!(call_balance(&test, &db_a_identity, "alice"), 100, "A alice should still be 100"); + assert_eq!(call_balance(&test, &db_b_identity, "alice"), 100, "B alice should still be 100"); + assert_eq!(call_balance(&test, &db_c_identity, "alice"), 100, "C alice should still be 100"); } diff --git a/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs index dfd22aaf389..de1c06f79c1 100644 --- a/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs +++ b/crates/smoketests/tests/smoketests/cross_db_2pc_recovery.rs @@ -4,11 +4,10 @@ use std::time::Duration; /// Module code used for all recovery tests. /// -/// Extends the basic banking module with: -/// - `debit_slow`: same as `debit` but spins for ~2-3s first, giving the test -/// a reliable window in which to crash the server mid-2PC. -/// - `balance`: convenience reducer that returns alice's balance in the logs -/// so tests can detect completion by polling server logs. +/// All three databases (A = coordinator, B and C = participants) use the same module. +/// +/// `transfer_funds_slow` calls `debit_slow` on B and regular `debit` on C, creating +/// a reliable ~2-3s window while B's slow reducer is executing — useful for crash tests. const MODULE_CODE: &str = r#" use spacetimedb::{log, ReducerContext, Table, Identity}; @@ -24,6 +23,14 @@ pub fn init(ctx: &ReducerContext) { ctx.db.ledger().insert(Ledger { account: "alice".to_string(), balance: 100 }); } +/// Returns the current balance for `account`. 
+#[spacetimedb::reducer] +pub fn balance(ctx: &ReducerContext, account: String) -> Result { + ctx.db.ledger().account().find(&account) + .map(|r| r.balance) + .ok_or_else(|| format!("account '{}' not found", account)) +} + #[spacetimedb::reducer] pub fn debit(ctx: &ReducerContext, account: String, amount: i64) { let row = ctx.db.ledger().account().find(&account) @@ -41,8 +48,6 @@ pub fn debit(ctx: &ReducerContext, account: String, amount: i64) { #[spacetimedb::reducer] pub fn debit_slow(ctx: &ReducerContext, account: String, amount: i64) { // Busy-wait loop. ~100M multiply-add iterations ≈ 2-3s in WASM. - // Using the timestamp as the seed prevents the loop from being - // eliminated by the WASM optimizer. let mut x: u64 = ctx.timestamp.to_micros_since_unix_epoch() as u64; for i in 0u64..100_000_000 { x = x.wrapping_mul(6364136223846793005u64).wrapping_add(i | 1); @@ -63,40 +68,61 @@ pub fn credit(ctx: &ReducerContext, account: String, amount: i64) { } } +/// Transfer `amount` from `from_account` on both B and C to `to_account` on A. +/// A credits `amount * 2` locally, then calls `debit(from_account, amount)` on each +/// of B and C via 2PC. If either fails, all three roll back atomically. 
#[spacetimedb::reducer] -pub fn transfer_funds(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { - credit(ctx, to_account.clone(), amount); - let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); - let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); - match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit", &args) { - Ok(()) => log::info!("transfer_funds: remote debit succeeded"), - Err(e) => panic!("remote debit failed: {e}"), +pub fn transfer_funds(ctx: &ReducerContext, b_hex: String, c_hex: String, from_account: String, to_account: String, amount: i64) { + credit(ctx, to_account.clone(), amount * 2); + + let b = Identity::from_hex(&b_hex).expect("invalid B identity"); + let args_b = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account.clone(), amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(b, "debit", &args_b) { + Ok(()) => log::info!("transfer_funds: debit on B succeeded"), + Err(e) => panic!("debit on B failed: {e}"), + } + + let c = Identity::from_hex(&c_hex).expect("invalid C identity"); + let args_c = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(c, "debit", &args_c) { + Ok(()) => log::info!("transfer_funds: debit on C succeeded"), + Err(e) => panic!("debit on C failed: {e}"), } } -/// Same as transfer_funds but calls debit_slow on the remote side. +/// Same as `transfer_funds` but calls `debit_slow` on B and regular `debit` on C. +/// The slow call on B creates a ~2-3s window for crash recovery tests. 
#[spacetimedb::reducer] -pub fn transfer_funds_slow(ctx: &ReducerContext, target_hex: String, from_account: String, to_account: String, amount: i64) { - credit(ctx, to_account.clone(), amount); - let target = Identity::from_hex(&target_hex).expect("invalid target identity hex"); - let args = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); - match spacetimedb::remote_reducer::call_reducer_on_db_2pc(target, "debit_slow", &args) { - Ok(()) => log::info!("transfer_funds_slow: remote debit_slow succeeded"), - Err(e) => panic!("remote debit_slow failed: {e}"), +pub fn transfer_funds_slow(ctx: &ReducerContext, b_hex: String, c_hex: String, from_account: String, to_account: String, amount: i64) { + credit(ctx, to_account.clone(), amount * 2); + + let b = Identity::from_hex(&b_hex).expect("invalid B identity"); + let args_b = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account.clone(), amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(b, "debit_slow", &args_b) { + Ok(()) => log::info!("transfer_funds_slow: debit_slow on B succeeded"), + Err(e) => panic!("debit_slow on B failed: {e}"), + } + + let c = Identity::from_hex(&c_hex).expect("invalid C identity"); + let args_c = spacetimedb::spacetimedb_lib::bsatn::to_vec(&(from_account, amount)).expect("failed to encode args"); + match spacetimedb::remote_reducer::call_reducer_on_db_2pc(c, "debit", &args_c) { + Ok(()) => log::info!("transfer_funds_slow: debit on C succeeded"), + Err(e) => panic!("debit on C failed: {e}"), } } "#; /// Spawn a background thread that fires `transfer_funds_slow` and ignores the result. /// -/// This is used to start a long-running 2PC in the background so the main thread -/// can crash the server mid-flight. The call is expected to fail with a -/// connection error when the server is restarted. 
+/// Used to start a long-running 2PC in the background so the main thread can crash +/// the server mid-flight. The call is expected to fail with a connection error when +/// the server is restarted. fn spawn_transfer_funds_slow( server_url: String, config_path: std::path::PathBuf, db_a_identity: String, db_b_identity: String, + db_c_identity: String, amount: i64, ) -> std::thread::JoinHandle<()> { std::thread::spawn(move || { @@ -112,6 +138,7 @@ fn spawn_transfer_funds_slow( &db_a_identity, "transfer_funds_slow", &db_b_identity, + &db_c_identity, "alice", "alice", &amount.to_string(), @@ -120,47 +147,46 @@ fn spawn_transfer_funds_slow( }) } -/// Query alice's balance on a specific database (by identity string). +/// Call the `balance(account)` reducer on `db_identity` and return the i64 result. fn alice_balance(test: &Smoketest, db_identity: &str) -> i64 { - let out = test - .spacetime(&[ - "sql", - "--server", - &test.server_url, - db_identity, - "SELECT balance FROM ledger WHERE account = 'alice'", - ]) - .unwrap_or_else(|e| panic!("sql query failed for {db_identity}: {e}")); - // Output looks like: " balance \n--------\n 100\n" - out.lines() - .filter_map(|l| l.trim().parse::().ok()) - .next() - .unwrap_or_else(|| panic!("could not parse balance from: {out}")) + let resp = test + .api_call_json( + "POST", + &format!("/v1/database/{db_identity}/call/balance"), + "[\"alice\"]", + ) + .unwrap_or_else(|e| panic!("balance call failed for {db_identity}: {e}")); + assert!(resp.is_success(), "balance reducer returned {}", resp.status_code); + resp.json() + .unwrap_or_else(|e| panic!("failed to parse balance JSON: {e}")) + .as_i64() + .unwrap_or_else(|| panic!("balance JSON was not an integer")) } -/// Set up two databases (A = coordinator, B = participant) on the same server -/// and return (db_a_identity, db_b_identity). `test.database_identity` points to A. 
-fn setup_two_banks(test: &mut Smoketest, pid: u32, suffix: &str) -> (String, String) { +/// Set up three databases (A = coordinator, B and C = participants) on the same server. +/// Returns `(db_a_identity, db_b_identity, db_c_identity)`. `test.database_identity` points to A. +fn setup_three_banks(test: &mut Smoketest, pid: u32, suffix: &str) -> (String, String, String) { let db_b_name = format!("2pc-rec-b-{pid}-{suffix}"); + let db_c_name = format!("2pc-rec-c-{pid}-{suffix}"); let db_a_name = format!("2pc-rec-a-{pid}-{suffix}"); test.publish_module_named(&db_b_name, false) .expect("failed to publish bank B"); let db_b_identity = test.database_identity.clone().expect("bank B identity"); + test.publish_module_named(&db_c_name, false) + .expect("failed to publish bank C"); + let db_c_identity = test.database_identity.clone().expect("bank C identity"); + test.publish_module_named(&db_a_name, false) .expect("failed to publish bank A"); let db_a_identity = test.database_identity.clone().expect("bank A identity"); - (db_a_identity, db_b_identity) + (db_a_identity, db_b_identity, db_c_identity) } // ───────────────────────────────────────────────────────────────────────────── // Test 1: committed data survives a full server restart. -// -// Rationale: verifies that every "persist" step in the 2PC protocol actually -// writes to durable storage. If any durability wait were missing, one side -// would lose its data on restart. 
// ───────────────────────────────────────────────────────────────────────────── #[test] fn test_2pc_committed_data_survives_restart() { @@ -168,45 +194,25 @@ fn test_2pc_committed_data_survives_restart() { let pid = std::process::id(); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "dur"); + let (db_a_identity, db_b_identity, db_c_identity) = setup_three_banks(&mut test, pid, "dur"); - // Successful 2PC: transfer 50 from B's alice to A's alice. - test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "50"]) + // Successful 2PC: transfer 30 from both B and C to A. + test.call("transfer_funds", &[&db_b_identity, &db_c_identity, "alice", "alice", "30"]) .expect("transfer_funds failed"); - // Verify pre-restart state. - assert_eq!( - alice_balance(&test, &db_a_identity), - 150, - "A should have 150 before restart" - ); - assert_eq!( - alice_balance(&test, &db_b_identity), - 50, - "B should have 50 before restart" - ); + assert_eq!(alice_balance(&test, &db_a_identity), 160, "A should have 160 before restart"); + assert_eq!(alice_balance(&test, &db_b_identity), 70, "B should have 70 before restart"); + assert_eq!(alice_balance(&test, &db_c_identity), 70, "C should have 70 before restart"); - // Restart the server — exercises recovery path even though there's nothing to recover. test.restart_server(); - // After restart, data must still be present and correct. 
- assert_eq!( - alice_balance(&test, &db_a_identity), - 150, - "A's committed data should survive restart" - ); - assert_eq!( - alice_balance(&test, &db_b_identity), - 50, - "B's committed data should survive restart" - ); + assert_eq!(alice_balance(&test, &db_a_identity), 160, "A's committed data should survive restart"); + assert_eq!(alice_balance(&test, &db_b_identity), 70, "B's committed data should survive restart"); + assert_eq!(alice_balance(&test, &db_c_identity), 70, "C's committed data should survive restart"); } // ───────────────────────────────────────────────────────────────────────────── // Test 2: aborted 2PC rollback also survives a restart. -// -// Rationale: rollback (B's st_2pc_state deletion + reducer rollback) must also -// be durable. After restart, neither side should show the transfer. // ───────────────────────────────────────────────────────────────────────────── #[test] fn test_2pc_aborted_state_survives_restart() { @@ -214,48 +220,31 @@ fn test_2pc_aborted_state_survives_restart() { let pid = std::process::id(); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "abort-dur"); + let (db_a_identity, db_b_identity, db_c_identity) = setup_three_banks(&mut test, pid, "abort-dur"); - // Try to transfer 200 — B only has 100, so the remote debit panics → abort. - let _ = test.call("transfer_funds", &[&db_b_identity, "alice", "alice", "200"]); + // Transfer 110 from each — both only have 100, so B's debit panics → abort. 
+ let _ = test.call("transfer_funds", &[&db_b_identity, &db_c_identity, "alice", "alice", "110"]); - assert_eq!( - alice_balance(&test, &db_a_identity), - 100, - "A should still be 100 after abort" - ); - assert_eq!( - alice_balance(&test, &db_b_identity), - 100, - "B should still be 100 after abort" - ); + assert_eq!(alice_balance(&test, &db_a_identity), 100, "A should still be 100 after abort"); + assert_eq!(alice_balance(&test, &db_b_identity), 100, "B should still be 100 after abort"); + assert_eq!(alice_balance(&test, &db_c_identity), 100, "C should still be 100 after abort"); test.restart_server(); - assert_eq!( - alice_balance(&test, &db_a_identity), - 100, - "A's aborted rollback should survive restart" - ); - assert_eq!( - alice_balance(&test, &db_b_identity), - 100, - "B's aborted rollback should survive restart" - ); + assert_eq!(alice_balance(&test, &db_a_identity), 100, "A's aborted rollback should survive restart"); + assert_eq!(alice_balance(&test, &db_b_identity), 100, "B's aborted rollback should survive restart"); + assert_eq!(alice_balance(&test, &db_c_identity), 100, "C's aborted rollback should survive restart"); } // ───────────────────────────────────────────────────────────────────────────── // Test 3: status endpoint returns "abort" for an unknown prepare_id. -// -// Rationale: tests that GET /v1/database/{db}/2pc/status/{id} is correctly wired -// and returns the right default when no coordinator log entry exists. 
// ───────────────────────────────────────────────────────────────────────────── #[test] fn test_2pc_status_endpoint_unknown_returns_abort() { let pid = std::process::id(); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - let (db_a_identity, _db_b_identity) = setup_two_banks(&mut test, pid, "status"); + let (db_a_identity, _db_b_identity, _db_c_identity) = setup_three_banks(&mut test, pid, "status"); let resp = test .api_call( @@ -277,15 +266,10 @@ fn test_2pc_status_endpoint_unknown_returns_abort() { // ───────────────────────────────────────────────────────────────────────────── // Test 4: 2PC atomicity is maintained when the server crashes mid-flight. // -// Strategy: `transfer_funds_slow` calls `debit_slow` on B, which burns ~2-3s -// of CPU. We crash the server after 1s (when the 2PC is definitely in flight) -// and verify that both databases are in a CONSISTENT state after restart: -// either both committed (alice_a=150, alice_b=50) or both rolled back -// (alice_a=100, alice_b=100). -// -// Note: we intentionally do NOT assert which outcome occurred, because that -// depends on whether the crash hit before or after A wrote its coordinator log. -// What we assert is that the two sides agree — this is the 2PC guarantee. +// `transfer_funds_slow` calls `debit_slow` on B (~2-3s) then `debit` on C. +// We crash after 1s (B is definitely mid-execution). After restart, all three +// databases must agree: either all committed (A=160, B=70, C=70) or all rolled +// back (A=100, B=100, C=100). 
// ───────────────────────────────────────────────────────────────────────────── #[test] fn test_2pc_atomicity_under_crash() { @@ -293,52 +277,43 @@ fn test_2pc_atomicity_under_crash() { let pid = std::process::id(); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "crash"); + let (db_a_identity, db_b_identity, db_c_identity) = setup_three_banks(&mut test, pid, "crash"); - // Kick off the slow transfer in a background thread. It will block - // for ~2-3s inside debit_slow on B before completing. let _call_thread = spawn_transfer_funds_slow( test.server_url.clone(), test.config_path.clone(), db_a_identity.clone(), db_b_identity.clone(), - 50, + db_c_identity.clone(), + 30, ); - // Give the 2PC time to get mid-flight (after B starts its slow reducer - // but before it finishes), then crash the server. std::thread::sleep(Duration::from_millis(1000)); test.restart_server(); - // After restart, give recovery time to settle: coordinator recovery - // retransmits COMMIT if needed, participant recovery polls if needed. std::thread::sleep(Duration::from_secs(5)); let bal_a = alice_balance(&test, &db_a_identity); let bal_b = alice_balance(&test, &db_b_identity); + let bal_c = alice_balance(&test, &db_c_identity); - // The 2PC guarantee: both sides must agree. - let both_committed = bal_a == 150 && bal_b == 50; - let both_rolled_back = bal_a == 100 && bal_b == 100; + let both_committed = bal_a == 160 && bal_b == 70 && bal_c == 70; + let both_rolled_back = bal_a == 100 && bal_b == 100 && bal_c == 100; assert!( both_committed || both_rolled_back, - "2PC atomicity violated after crash: A={bal_a}, B={bal_b}. \ - Expected either (150, 50) or (100, 100)." + "2PC atomicity violated after crash: A={bal_a}, B={bal_b}, C={bal_c}. \ + Expected either (160, 70, 70) or (100, 100, 100)." 
); } // ───────────────────────────────────────────────────────────────────────────── // Test 5: coordinator recovery — A crashes after writing its coordinator log, -// before B commits. -// -// Strategy: same crash-mid-flight approach, but we poll A's balance to detect -// the moment A has committed (alice_a=150), then immediately crash. At that -// point A's coordinator log is on disk, B has sent PREPARED, but B may not yet -// have received COMMIT. Recovery should bring B to the committed state. +// before B and C commit. // -// This test is inherently timing-sensitive (same-process loopback is fast). -// It uses `debit_slow` to widen the window: after A commits (detectable via -// alice_a=150), B is still inside `debit_slow` and has not yet received COMMIT. +// `transfer_funds_slow` calls `debit_slow` on B (~2-3s) then `debit` on C. +// We poll until A=160 (A committed, coordinator log written for both B and C), +// then crash. At this point B is still inside `debit_slow` awaiting COMMIT. +// Recovery must bring all three to the committed state: A=160, B=70, C=70. // ───────────────────────────────────────────────────────────────────────────── #[test] fn test_2pc_coordinator_recovery() { @@ -346,23 +321,23 @@ fn test_2pc_coordinator_recovery() { let pid = std::process::id(); let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build(); - let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "coord-rec"); + let (db_a_identity, db_b_identity, db_c_identity) = setup_three_banks(&mut test, pid, "coord-rec"); let _call_thread = spawn_transfer_funds_slow( test.server_url.clone(), test.config_path.clone(), db_a_identity.clone(), db_b_identity.clone(), - 50, + db_c_identity.clone(), + 30, ); - // Poll A's alice balance until it reaches 150 — that means A has committed - // its tx (including the coordinator log entry) and B has sent PREPARED. - // At this point B is still inside debit_slow, so COMMIT hasn't reached B yet. 
+ // Wait until A has committed (alice_a=160), meaning both B and C have sent PREPARED + // and A's coordinator log entries for both are on disk. let deadline = std::time::Instant::now() + Duration::from_secs(30); loop { std::thread::sleep(Duration::from_millis(100)); - if alice_balance(&test, &db_a_identity) == 150 { + if alice_balance(&test, &db_a_identity) == 160 { break; } if std::time::Instant::now() > deadline { @@ -370,36 +345,24 @@ fn test_2pc_coordinator_recovery() { } } - // Crash immediately: A has coordinator log, B has st_2pc_state, B hasn't committed. + // Crash: A has coordinator log for both B and C; B is waiting in decision_rx for COMMIT. test.restart_server(); - // Allow recovery to complete: A's recover_2pc_coordinator retransmits COMMIT to B. + // Allow recovery to complete. std::thread::sleep(Duration::from_secs(5)); - let bal_a = alice_balance(&test, &db_a_identity); - let bal_b = alice_balance(&test, &db_b_identity); - - assert_eq!(bal_a, 150, "A should have committed (alice_a=150) before crash"); - assert_eq!( - bal_b, 50, - "B should have committed via coordinator recovery (alice_b=50), got {bal_b}" - ); + assert_eq!(alice_balance(&test, &db_a_identity), 160, "A should remain committed"); + assert_eq!(alice_balance(&test, &db_b_identity), 70, "B should have committed via coordinator recovery"); + assert_eq!(alice_balance(&test, &db_c_identity), 70, "C should have committed via coordinator recovery"); } // ───────────────────────────────────────────────────────────────────────────── -// Test 6: participant recovery — B crashes after writing st_2pc_state (PREPARE -// durable) but before receiving COMMIT. -// -// Strategy: since A and B are on the same server, we cannot crash B without -// also crashing A. So we crash the server right after the PREPARE is durable -// on B (detectable: B's st_2pc_state is non-empty) and before A commits. 
-// On restart:
-// - B finds st_2pc_state → re-runs reducer → polls A's status endpoint
-// - A has no coordinator log (A hadn't committed) → status = "abort"
-// - B aborts → both sides return to 100
+// Test 6: participant recovery — crash before A commits.
 //
-// A fully committed scenario (B polls and gets "commit") is exercised by
-// test_2pc_coordinator_recovery which covers the symmetric window.
+// We crash early (~500ms into the slow debit on B). A has not yet received
+// PREPARED from B, so A has no coordinator log. After restart B (and possibly C)
+// recover by polling A's status endpoint, which returns "abort". All three
+// databases must end up consistent.
 // ─────────────────────────────────────────────────────────────────────────────
 #[test]
 fn test_2pc_participant_recovery_polls_and_aborts() {
@@ -407,34 +370,32 @@ fn test_2pc_participant_recovery_polls_and_aborts() {
     let pid = std::process::id();
     let mut test = Smoketest::builder().module_code(MODULE_CODE).autopublish(false).build();
 
-    let (db_a_identity, db_b_identity) = setup_two_banks(&mut test, pid, "part-rec");
+    let (db_a_identity, db_b_identity, db_c_identity) = setup_three_banks(&mut test, pid, "part-rec");
 
     let _call_thread = spawn_transfer_funds_slow(
        test.server_url.clone(),
        test.config_path.clone(),
        db_a_identity.clone(),
        db_b_identity.clone(),
-        50,
+        db_c_identity.clone(),
+        30,
    );
 
-    // Crash early: after ~500ms, B's slow reducer should be mid-execution.
-    // A has not yet received PREPARED, so A has no coordinator log.
-    // B's st_2pc_state may or may not be written yet (it's written after the
-    // reducer finishes). Either way, the final state must be consistent.
+    // Crash early: B's slow reducer is mid-execution, A has no coordinator log yet.
     std::thread::sleep(Duration::from_millis(500));
     test.restart_server();
 
-    // Wait for participant recovery to settle. B polls A's status endpoint
-    // every 5s; allow up to 15s for it to act.
+ // Allow participant recovery to settle (polls status every 5s). std::thread::sleep(Duration::from_secs(15)); let bal_a = alice_balance(&test, &db_a_identity); let bal_b = alice_balance(&test, &db_b_identity); + let bal_c = alice_balance(&test, &db_c_identity); - let both_committed = bal_a == 150 && bal_b == 50; - let both_rolled_back = bal_a == 100 && bal_b == 100; + let both_committed = bal_a == 160 && bal_b == 70 && bal_c == 70; + let both_rolled_back = bal_a == 100 && bal_b == 100 && bal_c == 100; assert!( both_committed || both_rolled_back, - "Inconsistent state after participant recovery: A={bal_a}, B={bal_b}" + "Inconsistent state after participant recovery: A={bal_a}, B={bal_b}, C={bal_c}" ); }